정의
Tesseract
Tesseract is an optical character recognition (OCR) engine for various operating systems. It is free software, released under the Apache License, Version 2.0, and its development has been sponsored by Google since 2006. Tesseract is considered one of the most accurate open-source OCR engines currently available.
Leptonica
https://tpgit.github.io/Leptonica/struct_boxa.html
Tesseract API를 활용한 이미지의 문자 검출
#include <string> #include <iostream> #include <tesseract/baseapi.h> #include <leptonica/allheaders.h> using namespace std; // string 치환 string str_replace_all(string &str, const string& from, const string& to) { size_t start_pos = 0; //string처음부터 검사 while((start_pos = str.find(from, start_pos)) != string::npos) //from을 찾을 수 없을 때까지 { str.replace(start_pos, from.length(), to); start_pos += to.length(); // 중복검사를 피하고 from.length() > to.length()인 경우를 위해서 } return str; } // 문구 전처리 string preprocessing_text (string str, string delimiter) { str = str_replace_all(str, string("\n"), delimiter); str = str_replace_all(str, string(" "), delimiter); str = str_replace_all(str, string("\""), string("\\\"")); return str; } // 구분자 제외 단어 개수 int count_word (string str, string delimiter, bool flag_except_white_space) { size_t pos = 0; string token; int word_count = 0; int total_count = 0; while ((pos = str.find(delimiter)) != string::npos) { total_count++; token = str.substr(0, pos); word_count += (token.length() != 0) ? 1 : 0; str.erase(0, pos + delimiter.length()); } return (flag_except_white_space) ? 
word_count : total_count; } // 메인 함수 int main(int argc, char* argv[]) { // const char* inputfile = "/home/csryu/ocrsample.jpg"; tesseract::Orientation orientation; tesseract::WritingDirection direction; tesseract::TextlineOrder order; float deskew_angle; long area = 0; long ratio = 0; // PIX *image = pixRead(inputfile); PIX *image = pixRead(argv[1]); tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI(); api->Init("/usr/share/tesseract-ocr/tessdata", "kor+eng"); api->SetPageSegMode(tesseract::PSM_AUTO_OSD); api->SetImage(image); api->Recognize(0); // 텍스트 영역 비율 계산 Boxa* boxes = api->GetComponentImages(tesseract::RIL_TEXTLINE, true, NULL, NULL); for (int i = 0; i < boxes->n; i++) { BOX* box = boxaGetBox(boxes, i, L_CLONE); area += (box->w * box->h); } ratio = area * 100 / (long)(image->w*image->h); // 테스트 정보 수집 char* text = api->GetUTF8Text(); int conf = api->MeanTextConf(); area *= 100; ratio = area / (long)(image->w*image->h); string ocr_text = string(text); string delimiter = "|"; string pre_text = preprocessing_text(ocr_text, delimiter); int total_word_count = count_word(pre_text, delimiter, true); cout << "{\"conf\":\"" << conf << "\",\"line_count\":\"" << boxes->n << "\",\"word_count\":\"" << total_word_count << "\",\"ratio\":\"" << ratio << "\"}" << endl; return 0; }