#include <string>
#include <iostream>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>
using namespace std;
// String substitution.
//
// Replaces every non-overlapping occurrence of `from` in `str` with `to`,
// scanning left to right. `str` is modified in place and a copy of the
// result is also returned.
//
// An empty `from` is treated as "nothing to replace": searching for an empty
// pattern matches at every position, which would otherwise make the loop
// insert `to` forever.
std::string str_replace_all(std::string &str, const std::string& from, const std::string& to)
{
    if (from.empty()) {
        return str;
    }

    std::size_t start_pos = 0; // begin scanning at the start of the string
    while ((start_pos = str.find(from, start_pos)) != std::string::npos) // until `from` is no longer found
    {
        str.replace(start_pos, from.length(), to);
        // Resume after the inserted text so a replacement that itself
        // contains `from` is never re-scanned; this also keeps the position
        // valid when `to` is shorter or longer than `from`.
        start_pos += to.length();
    }
    return str;
}
// 문구 전처리
string preprocessing_text (string str, string delimiter)
{
str = str_replace_all(str, string("\n"), delimiter);
str = str_replace_all(str, string(" "), delimiter);
str = str_replace_all(str, string("\""), string("\\\""));
return str;
}
// Counts the delimiter-separated tokens in `str`.
//
// Parameters:
//   str                     - text to scan
//   delimiter               - separator between tokens
//   flag_except_white_space - true:  count only non-empty tokens (words)
//                             false: count every token, empty ones included
//
// Every token that is terminated by `delimiter` is counted. Text left over
// after the last delimiter is counted as one more token when it is non-empty,
// so "a|b" and "a|b|" both yield two words.
//
// An empty `delimiter` cannot split anything: the whole string is treated as
// a single token (0 if `str` is empty) instead of searching forever.
int count_word (std::string str, std::string delimiter, bool flag_except_white_space)
{
    if (delimiter.empty()) {
        return str.empty() ? 0 : 1;
    }

    int word_count = 0;   // non-empty tokens
    int total_count = 0;  // all tokens, including empty ones
    std::size_t token_start = 0;
    std::size_t pos = 0;

    // Advance a search position instead of erasing the consumed prefix, so
    // the scan is linear in the length of `str`.
    while ((pos = str.find(delimiter, token_start)) != std::string::npos) {
        ++total_count;
        if (pos > token_start) {
            ++word_count;
        }
        token_start = pos + delimiter.length();
    }

    // Trailing token that is not followed by a delimiter.
    if (token_start < str.length()) {
        ++total_count;
        ++word_count;
    }

    return flag_except_white_space ? word_count : total_count;
}
// 메인 함수
int main(int argc, char* argv[])
{
// const char* inputfile = "/home/csryu/ocrsample.jpg";
tesseract::Orientation orientation;
tesseract::WritingDirection direction;
tesseract::TextlineOrder order;
float deskew_angle;
long area = 0;
long ratio = 0;
// PIX *image = pixRead(inputfile);
PIX *image = pixRead(argv[1]);
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
api->Init("/usr/share/tesseract-ocr/tessdata", "kor+eng");
api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
api->SetImage(image);
api->Recognize(0);
// 텍스트 영역 비율 계산
Boxa* boxes = api->GetComponentImages(tesseract::RIL_TEXTLINE, true, NULL, NULL);
for (int i = 0; i < boxes->n; i++) {
BOX* box = boxaGetBox(boxes, i, L_CLONE);
area += (box->w * box->h);
}
ratio = area * 100 / (long)(image->w*image->h);
// 테스트 정보 수집
char* text = api->GetUTF8Text();
int conf = api->MeanTextConf();
area *= 100;
ratio = area / (long)(image->w*image->h);
string ocr_text = string(text);
string delimiter = "|";
string pre_text = preprocessing_text(ocr_text, delimiter);
int total_word_count = count_word(pre_text, delimiter, true);
cout << "{\"conf\":\"" << conf << "\",\"line_count\":\"" << boxes->n << "\",\"word_count\":\"" << total_word_count << "\",\"ratio\":\"" << ratio << "\"}" << endl;
return 0;
} |