Tesseract와 Leptonica를 이용한 문자 라인 및 단어 개수 검출 방법

정의

Tesseract

Tesseract is an optical character recognition engine for various operating systems. It is free software, released under the Apache License, Version 2.0, and development has been sponsored by Google since 2006. Tesseract is considered one of the most accurate open-source OCR engines currently available.

Leptonica

Leptonica is an open source C library for efficient image processing and image analysis operations.

데이터 구조체 링크, Boxa 구조체 레퍼런스

사용 방법

해당 설치 방법은 Ubuntu 16.04 기준으로 작성되어 있습니다.

설치

tesseract api 개발을 위한 설치 방법

sudo apt-get install tesseract-ocr
sudo apt-get install libtesseract-dev
sudo apt-get install tesseract-ocr-kor tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-chi-tra
sudo apt-get install libleptonica-dev

문자 검출 로직 작성

Tesseract API를 활용한 이미지의 텍스트 라인 및 단어 정보 수집

#include <string>
#include <iostream>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>

using namespace std;

// Replaces every occurrence of `from` inside `str` with `to`.
//
// `str` is modified in place, and a copy of the modified string is also
// returned so the call can be used as an expression.
//
// Parameters:
//   str  - string to edit (modified in place)
//   from - substring to search for; if empty, `str` is left unchanged
//   to   - replacement text (may be empty, which deletes the matches)
//
// Returns: a copy of `str` after all replacements.
std::string str_replace_all(std::string &str, const std::string& from, const std::string& to)
{
    // An empty pattern matches at every position, so the loop below would
    // never terminate; treat it as "nothing to replace".
    if (from.empty())
        return str;

    std::string::size_type pos = 0; // scan from the beginning of the string
    while ((pos = str.find(from, pos)) != std::string::npos)
    {
        str.replace(pos, from.length(), to);
        // Resume after the inserted text so that a `to` which itself
        // contains `from` is not matched again.
        pos += to.length();
    }
    return str;
}

// Text preprocessing: turns line breaks and spaces into `delimiter` and
// backslash-escapes double quotes.
//
// The three substitutions run in this order: "\n" -> delimiter,
// " " -> delimiter, then "\"" -> "\\\"". Because they run sequentially, any
// space or double quote contained in `delimiter` itself is rewritten by the
// later steps.
//
// Returns the processed copy; the caller's string is not touched because
// `str` is taken by value.
string preprocessing_text (string str, string delimiter)
{
        const string line_break("\n");
        const string blank(" ");
        const string quote("\"");
        const string escaped_quote("\\\"");

        // str_replace_all edits its first argument in place, so the returned
        // copy does not need to be assigned back.
        str_replace_all(str, line_break, delimiter);
        str_replace_all(str, blank, delimiter);
        str_replace_all(str, quote, escaped_quote);

        return str;
}

// Counts the tokens of `str` when split on `delimiter`.
//
// Parameters:
//   str                     - text to split
//   delimiter               - separator between tokens; when empty, the
//                             whole of `str` is treated as a single token
//   flag_except_white_space - true:  count only non-empty tokens (words)
//                             false: count every delimiter-terminated
//                                    segment, including empty ones
//
// A non-empty token left after the last delimiter (input that does not end
// with a delimiter) is counted in both modes.
//
// Returns: the selected count.
int count_word (std::string str, std::string delimiter, bool flag_except_white_space)
{
        int word_count = 0;   // non-empty tokens
        int total_count = 0;  // all segments, empty ones included
        std::string::size_type start = 0; // beginning of the current token

        // Searching for an empty delimiter would match forever at the same
        // position, so splitting is skipped in that case.
        if (!delimiter.empty()) {
                std::string::size_type hit;
                // Walk forward with a moving start offset instead of
                // erasing the consumed prefix on every step.
                while ((hit = str.find(delimiter, start)) != std::string::npos) {
                        total_count++;
                        if (hit > start)
                                word_count++;
                        start = hit + delimiter.length();
                }
        }

        // Trailing token that is not terminated by a delimiter.
        if (start < str.length()) {
                total_count++;
                word_count++;
        }

        return flag_except_white_space ? word_count : total_count;
}

// 메인 함수
int main(int argc, char* argv[])
{
        tesseract::Orientation orientation;
        tesseract::WritingDirection direction;
        tesseract::TextlineOrder order;
        float deskew_angle;
        long area = 0;
        long ratio = 0;

        PIX *image = pixRead(argv[1]);
        tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
        api->Init("/usr/share/tesseract-ocr/tessdata", "kor+eng");
        api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
        api->SetImage(image);
        api->Recognize(0);

        // 텍스트 영역 비율 계산
        Boxa* boxes = api->GetComponentImages(tesseract::RIL_TEXTLINE, true, NULL, NULL);
        for (int i = 0; i < boxes->n; i++) {
                BOX* box = boxaGetBox(boxes, i, L_CLONE);
                area += (box->w * box->h);
        }
        ratio = area * 100 / (long)(image->w*image->h);

        // 테스트 정보 수집
        char* text = api->GetUTF8Text();
        int conf   = api->MeanTextConf();
        area *= 100;
        ratio = area / (long)(image->w*image->h);

        string ocr_text  = string(text);
        string delimiter = "|";
        string pre_text  = preprocessing_text(ocr_text, delimiter);
        int total_word_count = count_word(pre_text, delimiter, true);

        cout << "{\"conf\":\"" << conf << "\",\"line_count\":\"" << boxes->n << "\",\"word_count\":\"" << total_word_count << "\",\"ratio\":\"" << ratio << "\"}" << endl;
        return 0;
}

컴파일

컴파일

g++ -o text_detect text_detect.cpp -llept -ltesseract

실행

바이너리 실행

./text_detect 이미지_파일명

참고사항

OSD만 이용해서 문자 영역을 검출하는 용도로 사용하는 게 좋으며, 문자 인식(추출)은 별도의 OCR 데이터셋을 만들지 않는 한 좋은 결과를 얻기 어렵습니다.

한글 검출

한글 추출은 가능하지만, 속도가 느려지고, 품질이 좋지 않습니다. ocr 데이터셋을 본인에 맞는 형태로 제작해야 합니다.

한글+영문(kor+eng)처럼 여러 언어를 함께 지정할 경우, 두 언어 데이터를 모두 이용해 찾지만 그만큼 처리 시간이 더 필요하게 됩니다.

페이지 트리