Tesseract and Leptonica

정의

Tesseract

Tesseract is an optical character recognition engine for various operating systems.^[2] It is free software, released under the Apache License, Version 2.0,^[1]^[3]^[4] and development has been sponsored by Google since 2006.^[5] Tesseract is considered one of the most accurate open-source OCR engines currently available.^[4]^[6]

Leptonica

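Leptonica is an open-source C library for image processing and image analysis. Tesseract uses it to read images, and Leptonica types such as PIX (image), BOX (rectangle) and BOXA (array of boxes) appear in the Tesseract API.

Boxa struct reference: https://tpgit.github.io/Leptonica/struct_boxa.html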

Tesseract API를 활용한 이미지의 문자 검출

#include <string>
#include <iostream>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>

using namespace std;

// string 치환
string str_replace_all(string &str, const string& from, const string& to)
{
    size_t start_pos = 0; //string처음부터 검사
    while((start_pos = str.find(from, start_pos)) != string::npos)  //from을 찾을 수 없을 때까지
    {
        str.replace(start_pos, from.length(), to);
        start_pos += to.length(); // 중복검사를 피하고 from.length() > to.length()인 경우를 위해서
    }
    return str;
}

// 문구 전처리
string preprocessing_text (string str, string delimiter)
{
        str = str_replace_all(str, string("\n"), delimiter);
        str = str_replace_all(str, string(" "),  delimiter);
        str = str_replace_all(str, string("\""), string("\\\""));
        return str;
}

// 구분자 제외 단어 개수
int count_word (string str, string delimiter, bool flag_except_white_space)
{
        size_t pos = 0;
        string token;
        int    word_count = 0;
        int    total_count = 0;
        while ((pos = str.find(delimiter)) != string::npos) {
                total_count++;
                token = str.substr(0, pos);
                word_count += (token.length() != 0) ? 1 : 0;
                str.erase(0, pos + delimiter.length());
        }

        return (flag_except_white_space) ? word_count : total_count;
}

// 메인 함수
int main(int argc, char* argv[])
{
//      const char* inputfile = "/home/csryu/ocrsample.jpg";
        tesseract::Orientation orientation;
        tesseract::WritingDirection direction;
        tesseract::TextlineOrder order;
        float deskew_angle;
        long area = 0;
        long ratio = 0;

//      PIX *image = pixRead(inputfile);
        PIX *image = pixRead(argv[1]);
        tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
        api->Init("/usr/share/tesseract-ocr/tessdata", "kor+eng");
        api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
        api->SetImage(image);
        api->Recognize(0);

        // 텍스트 영역 비율 계산
        Boxa* boxes = api->GetComponentImages(tesseract::RIL_TEXTLINE, true, NULL, NULL);
        for (int i = 0; i < boxes->n; i++) {
                BOX* box = boxaGetBox(boxes, i, L_CLONE);
                area += (box->w * box->h);
        }
        ratio = area * 100 / (long)(image->w*image->h);

        // 테스트 정보 수집
        char* text = api->GetUTF8Text();
        int conf   = api->MeanTextConf();
        area *= 100;
        ratio = area / (long)(image->w*image->h);

        string ocr_text  = string(text);
        string delimiter = "|";
        string pre_text  = preprocessing_text(ocr_text, delimiter);
        int total_word_count = count_word(pre_text, delimiter, true);

        cout << "{\"conf\":\"" << conf << "\",\"line_count\":\"" << boxes->n << "\",\"word_count\":\"" << total_word_count << "\",\"ratio\":\"" << ratio << "\"}" << endl;
        return 0;
}

[2]

[1]

[3]

[4]

[5]

[6]

페이지 트리

Tesseract and Leptonica

정의

Tesseract

Leptonica