diff options
| author | Akshay Nair <phenax5@gmail.com> | 2024-12-25 19:49:10 +0530 |
|---|---|---|
| committer | Akshay Nair <phenax5@gmail.com> | 2024-12-25 19:49:16 +0530 |
| commit | fb24e589290f7ffbee04972eed35fca37facdf1c (patch) | |
| tree | 49a467cf35f9ac3469164a6134d0a0a3b5b3074a | |
| parent | 580439bf8dd33e00f6a668a4828eab01d24d7abf (diff) | |
| download | chelleport-fb24e589290f7ffbee04972eed35fca37facdf1c.tar.gz chelleport-fb24e589290f7ffbee04972eed35fca37facdf1c.zip | |
Minor tesseract optimizations
| -rw-r--r-- | chelleport.cabal | 1 | ||||
| -rw-r--r-- | cpp/libchelleport.cpp | 89 | ||||
| -rw-r--r-- | include/libchelleport.h | 22 | ||||
| -rw-r--r-- | src/Chelleport/OCR.hs | 2 |
4 files changed, 72 insertions, 42 deletions
diff --git a/chelleport.cabal b/chelleport.cabal index f273a39..2d03b63 100644 --- a/chelleport.cabal +++ b/chelleport.cabal @@ -51,6 +51,7 @@ common extension extra-libraries: stdc++ Xtst X11 tesseract leptonica include-dirs: include c-sources: cpp/libchelleport.cpp + cxx-options: -O3 -ffast-math -march=native extra-source-files: cpp/*.cpp include/*.h diff --git a/cpp/libchelleport.cpp b/cpp/libchelleport.cpp index 5653068..923df6c 100644 --- a/cpp/libchelleport.cpp +++ b/cpp/libchelleport.cpp @@ -5,42 +5,33 @@ #include <iostream> #include <leptonica/allheaders.h> #include <tesseract/baseapi.h> -#include <tesseract/publictypes.h> #include <vector> #include "../include/libchelleport.h" OCRMatch *findWordCoordinates(const char *image_path, int *size) { - auto matches = extractTextCoordinates(image_path); + std::vector<OCRMatch> matches; + MEASURE("OCR", { matches = extractTextCoordinates(image_path); }); + + std::cout << "Word count: " << matches.size() << std::endl; static OCRMatch *ptr = new OCRMatch[matches.size()]; std::copy(matches.begin(), matches.end(), ptr); - // for (const auto &match : matches) - // showMatch(match); - - printf("Count: %ld\n", matches.size()); - *size = matches.size(); return ptr; } std::vector<OCRMatch> extractTextCoordinates(const char *imagePath) { std::vector<OCRMatch> results; - tesseract::TessBaseAPI *tesseract = new tesseract::TessBaseAPI(); - if (tesseract->Init(nullptr, "eng")) { - std::cerr << "Could not initialize tesseract." << std::endl; + auto tesseract = initializeTesseract(); + if (tesseract == nullptr) return results; - } - Pix *image = pixRead(imagePath); - if (!image) { - std::cerr << "Could not load image " << imagePath << std::endl; + Pix *image = loadImage(imagePath); + if (image == nullptr) return results; - } - - preprocessImage(&image); // printf("imagePath: %s\n", imagePath); // pixWrite(imagePath, image, IFF_JFIF_JPEG); @@ -53,16 +44,17 @@ std::vector<OCRMatch> extractTextCoordinates(const char *imagePath) { if (iterator != 0) { do { - float conf = iterator->Confidence(level); - const char *word = iterator->GetUTF8Text(level); + if (iterator->Confidence(level) > CONFIDENCE_THRESHOLD) { + const char *word = iterator->GetUTF8Text(level); - if (conf > CONFIDENCE_THRESHOLD && word != nullptr && - strlen(word) >= MIN_CHARACTER_COUNT) { - int x1, y1, x2, y2; - iterator->BoundingBox(level, &x1, &y1, &x2, &y2); - results.push_back( - OCRMatch{(int)(x1 / scaleFactor), (int)(y1 / scaleFactor), - (int)(x2 / scaleFactor), (int)(y2 / scaleFactor), word}); + if (word != nullptr && strlen(word) >= MIN_CHARACTER_COUNT) { + int x1, y1, x2, y2; + iterator->BoundingBox(level, &x1, &y1, &x2, &y2); + OCRMatch match({(int)(x1 / scaleFactor), (int)(y1 / scaleFactor), + (int)(x2 / scaleFactor), (int)(y2 / scaleFactor), + word}); + results.push_back(match); + } } } while (iterator->Next(level)); } @@ -75,36 +67,55 @@ std::vector<OCRMatch> extractTextCoordinates(const char *imagePath) { return results; } +inline tesseract::TessBaseAPI *initializeTesseract() { + auto *tesseract = new tesseract::TessBaseAPI(); + tesseract->SetPageSegMode(tesseract::PSM_AUTO); + + if (tesseract->Init(nullptr, "eng", tesseract::OEM_LSTM_ONLY)) { + std::cerr << "Could not initialize tesseract." << std::endl; + return nullptr; + } + + return tesseract; +} + +inline Pix *loadImage(const char *imagePath) { + Pix *image = pixRead(imagePath); + if (!image) { + std::cerr << "Could not load image " << imagePath << std::endl; + return nullptr; + } + + preprocessImage(&image); + + return image; +} + void preprocessImage(Pix **image) { Pix *temp; // Scale if (scaleFactor != 1) { - temp = pixScale(*image, scaleFactor, scaleFactor); - pixDestroy(image); - *image = temp; + INLINE_IMAGE_PROC(pixScale(*image, scaleFactor, scaleFactor)); } // Grayscale if (pixGetDepth(*image) > 8) { - temp = pixConvertRGBToGray(*image, grayscaleWeightRed, grayscaleWeightGreen, - grayscaleWeightBlue); - pixDestroy(image); - *image = temp; + INLINE_IMAGE_PROC(pixConvertRGBToGray( + *image, grayscaleWeightRed, grayscaleWeightGreen, grayscaleWeightBlue)); } // Contrast pixContrastTRC(*image, *image, contrast); // Sharpness - // temp = pixUnsharpMaskingGrayFast(*image, 1, sharpness, 1); - temp = pixUnsharpMasking(*image, 1, sharpness); - pixDestroy(image); - *image = temp; + // INLINE_IMAGE_PROC(pixUnsharpMaskingGrayFast(*image, 1, sharpness, 1)); + INLINE_IMAGE_PROC(pixUnsharpMasking(*image, 1, sharpness)); } -void showMatch(const OCRMatch &match) { +void printMatch(const OCRMatch &match) { std::cout << "Text: " << match.text << "; Position: (" << match.startX << "," << match.startY << ") -> (" << match.endX << "," << match.endY - << ")" << "\n\n"; + << ")" << std::endl + << std::endl; } diff --git a/include/libchelleport.h b/include/libchelleport.h index c74058d..e6a074d 100644 --- a/include/libchelleport.h +++ b/include/libchelleport.h @@ -1,5 +1,6 @@ +#include <chrono> #include <leptonica/allheaders.h> -#include <tesseract/publictypes.h> +#include <tesseract/baseapi.h> #include <vector> // NOTE: Remember to update size and alignment in ocr hs module on change @@ -26,8 +27,25 @@ extern "C" { OCRMatch *findWordCoordinates(const char *image_path, /* returns */ int *size); } +tesseract::TessBaseAPI *initializeTesseract(); + +Pix *loadImage(const char *imagePath); + std::vector<OCRMatch> extractTextCoordinates(const char *imagePath); -void showMatch(const OCRMatch &match); +void printMatch(const OCRMatch &match); void preprocessImage(Pix **image); + +#define INLINE_IMAGE_PROC(process) \ + temp = process; \ + pixDestroy(image); \ + *image = temp; + +#define MEASURE(label, stmts) \ + auto start = std::chrono::high_resolution_clock::now(); \ + stmts; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto duration = \ + std::chrono::duration_cast<std::chrono::microseconds>(end - start); \ + std::cout << label << ": " << duration.count() / 1000.0 << " ms" << std::endl; diff --git a/src/Chelleport/OCR.hs b/src/Chelleport/OCR.hs index 87cad62..5ee331c 100644 --- a/src/Chelleport/OCR.hs +++ b/src/Chelleport/OCR.hs @@ -1,6 +1,7 @@ module Chelleport.OCR (MonadOCR (..)) where import Chelleport.Types +import Chelleport.Utils (benchmark) import Control.Concurrent (threadDelay) import Control.Monad.IO.Class (MonadIO (liftIO)) import Control.Monad.RWS (MonadReader (ask)) @@ -31,7 +32,6 @@ instance (MonadIO m) => MonadOCR (AppM m) where pure path getWordsInImage filePath = liftIO $ do - print filePath findWordCoordinates filePath <* removeFile filePath findWordCoordinates :: String -> IO [OCRMatch] |
