diff options
| author | Akshay Nair <phenax5@gmail.com> | 2024-12-24 18:51:17 +0530 |
|---|---|---|
| committer | Akshay Nair <phenax5@gmail.com> | 2024-12-24 18:51:17 +0530 |
| commit | 70e3920556496e5fecb5fedddf1067b2522fcac7 (patch) | |
| tree | 7e4e88023bdcd5b13dde738f7afedd533a6d7fcd | |
| parent | dfdf1600ba251f6b3cfef85f6904d79a1c60b49d (diff) | |
| download | chelleport-70e3920556496e5fecb5fedddf1067b2522fcac7.tar.gz chelleport-70e3920556496e5fecb5fedddf1067b2522fcac7.zip | |
Add setup for ocr with tesseract
Diffstat (limited to '')
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | chelleport.cabal | 26 | ||||
| -rw-r--r-- | cpp/libchelleport.cpp | 74 | ||||
| -rw-r--r-- | flake.nix | 19 | ||||
| -rw-r--r-- | include/libchelleport.h | 12 | ||||
| -rw-r--r-- | justfile | 12 | ||||
| -rw-r--r-- | specs/Specs/AppStateUpdateSpec.hs | 3 | ||||
| -rw-r--r-- | specs/Specs/ViewSpec.hs | 3 | ||||
| -rw-r--r-- | src/Chelleport.hs | 11 | ||||
| -rw-r--r-- | src/Chelleport/Context.hs | 8 | ||||
| -rw-r--r-- | src/Chelleport/OCR.hs | 84 | ||||
| -rw-r--r-- | src/Chelleport/Types.hs | 37 | ||||
| -rw-r--r-- | src/Chelleport/Utils.hs | 11 |
13 files changed, 279 insertions, 22 deletions
@@ -1,3 +1,4 @@ result dist-newstyle/ +dist-lib/* *.AppImage diff --git a/chelleport.cabal b/chelleport.cabal index adef7b0..f273a39 100644 --- a/chelleport.cabal +++ b/chelleport.cabal @@ -9,11 +9,16 @@ build-type: Simple synopsis: Mouse control description: Mouse control +source-repository head + type: git + location: https://github.com/phenax/chelleport.git + common common-config default-extensions: ExplicitForAll FlexibleContexts FlexibleInstances + ForeignFunctionInterface GADTs GeneralizedNewtypeDeriving LambdaCase @@ -31,6 +36,9 @@ common common-config time, mtl == 2.3.1, sdl2 == 2.5.5.0, + array, + temporary, + directory, containers common warnings @@ -38,17 +46,24 @@ common warnings -Wall -Wincomplete-record-updates -Wincomplete-uni-patterns -Wunused-foralls -Wextra -Wno-unused-do-bind -Wname-shadowing -fwarn-tabs -fprint-explicit-foralls -fprint-explicit-kinds - extra-libraries: Xtst X11 + +common extension + extra-libraries: stdc++ Xtst X11 tesseract leptonica + include-dirs: include + c-sources: cpp/libchelleport.cpp + extra-source-files: + cpp/*.cpp + include/*.h + static/font.ttf executable chelleport - import: common-config, warnings + import: common-config, warnings, extension hs-source-dirs: bin main-is: Main.hs build-depends: lib-chelleport - -- other-modules: library lib-chelleport - import: common-config, warnings + import: common-config, warnings, extension hs-source-dirs: src build-depends: bytestring, @@ -64,12 +79,13 @@ library lib-chelleport Chelleport.Control Chelleport.Draw Chelleport.KeySequence + Chelleport.OCR Chelleport.Types Chelleport.Utils Chelleport.View test-suite specs - import: common-config, warnings + import: common-config, warnings, extension type: exitcode-stdio-1.0 hs-source-dirs: specs main-is: Main.hs diff --git a/cpp/libchelleport.cpp b/cpp/libchelleport.cpp new file mode 100644 index 0000000..8f2e9f2 --- /dev/null +++ b/cpp/libchelleport.cpp @@ -0,0 +1,74 @@ +#include <algorithm> +#include <cstdio> +#include <cstdlib> 
+#include <cstring>
+#include <iostream>
+#include <leptonica/allheaders.h>
+#include <memory>
+#include <tesseract/baseapi.h>
+#include <tesseract/publictypes.h>
+#include <vector>
+
+#include "../include/libchelleport.h"
+
+std::vector<OCRMatch> extractTextCoordinates(const char *imagePath);
+
+#define CONFIDENCE_THRESHOLD 30.
+
+namespace {
+
+// Releases a leptonica image when its owning unique_ptr goes out of scope.
+struct PixDeleter {
+  void operator()(Pix *pix) const { pixDestroy(&pix); }
+};
+
+} // namespace
+
+// Runs OCR on the image at `image_path` and returns the recognised matches.
+//
+// `size` receives the number of entries in the returned array.
+//
+// Ownership: the array and every `text` string in it belong to this library.
+// They stay valid until the next call to findWordCoordinates, which frees
+// them, so callers must copy whatever they need before calling again. The
+// returned pointer may be null when there are no matches. Because the result
+// is kept in function-local static storage, this function is not safe to call
+// from several threads at once.
+OCRMatch *findWordCoordinates(const char *image_path, int *size) {
+  // A single reusable buffer: the previous implementation allocated a static
+  // array once, sized for the first call, and overflowed it on later calls
+  // that produced more matches.
+  static std::vector<OCRMatch> matches;
+
+  // Text buffers come from tesseract's GetUTF8Text and must go through
+  // delete[]; release the ones handed out by the previous call.
+  for (const OCRMatch &match : matches) {
+    delete[] match.text;
+  }
+  matches.clear();
+
+  matches = extractTextCoordinates(image_path);
+
+  if (size != nullptr) {
+    *size = static_cast<int>(matches.size());
+  }
+  return matches.data();
+}
+
+// Recognises text in the image at `imagePath` and returns one OCRMatch per
+// text line whose confidence exceeds CONFIDENCE_THRESHOLD and whose text is at
+// least two bytes long.
+//
+// Each returned `text` pointer is a buffer allocated by tesseract; the caller
+// takes ownership and must release it with delete[]. On any failure an error
+// is written to stderr and an empty vector is returned.
+std::vector<OCRMatch> extractTextCoordinates(const char *imagePath) {
+  std::vector<OCRMatch> results;
+
+  if (imagePath == nullptr) {
+    std::cerr << "Could not load image: no path given" << std::endl;
+    return results;
+  }
+
+  // Stack object: its destructor ends the engine on every return path.
+  tesseract::TessBaseAPI api;
+  if (api.Init(nullptr, "eng")) {
+    std::cerr << "Could not initialize tesseract." << std::endl;
+    return results;
+  }
+
+  std::unique_ptr<Pix, PixDeleter> image(pixRead(imagePath));
+  if (!image) {
+    std::cerr << "Could not load image " << imagePath << std::endl;
+    return results;
+  }
+
+  api.SetImage(image.get());
+  if (api.Recognize(nullptr) != 0) {
+    std::cerr << "Could not recognize text in " << imagePath << std::endl;
+    return results;
+  }
+
+  const tesseract::PageIteratorLevel level = tesseract::RIL_TEXTLINE;
+  std::unique_ptr<tesseract::ResultIterator> iterator(api.GetIterator());
+
+  if (iterator) {
+    do {
+      const float confidence = iterator->Confidence(level);
+      // Owned here so that rejected lines are freed instead of leaked.
+      std::unique_ptr<char[]> text(iterator->GetUTF8Text(level));
+      int x1 = 0, y1 = 0, x2 = 0, y2 = 0;
+      const bool hasBox = iterator->BoundingBox(level, &x1, &y1, &x2, &y2);
+
+      if (hasBox && confidence > CONFIDENCE_THRESHOLD && text &&
+          std::strlen(text.get()) >= 2) {
+        results.push_back(OCRMatch{x1, y1, x2, y2, text.get()});
+        // Ownership of the buffer now travels with the stored match.
+        text.release();
+      }
+    } while (iterator->Next(level));
+  }
+
+  // Keep the original teardown order: iterator first, then the engine, and
+  // only then the image (released when `image` goes out of scope).
+  iterator.reset();
+  api.End();
+
+  return results;
+}
@@ -19,6 +19,8 @@ ./bin ./src ./specs + ./include + ./cpp ./chelleport.cabal ]; }); @@ -33,6 +35,18 @@ xorg.libX11 SDL2 SDL2_ttf + tesseract + leptonica + imagemagick + + gcc + pkg-config + ]; + + devPackages = with pkgs; [ + just + nodemon + clang-tools ]; in { haskellProjects.default = { @@ -69,10 +83,7 @@ inputsFrom = [ config.haskellProjects.default.outputs.devShell ]; - packages = with pkgs; [ - just - nodemon - ]; + packages = devPackages; inherit buildInputs; LD_LIBRARY_PATH = "${pkgs.lib.makeLibraryPath buildInputs}"; diff --git a/include/libchelleport.h b/include/libchelleport.h new file mode 100644 index 0000000..8a9a17e --- /dev/null +++ b/include/libchelleport.h @@ -0,0 +1,12 @@ +#include <string> + +// NOTE: Remember to update size and alignment in ocr hs module on change +struct OCRMatch { + int startX, startY; + int endX, endY; + const char *text; +}; + +extern "C" { +OCRMatch *findWordCoordinates(const char *image_path, /* returns */ int *size); +} @@ -4,11 +4,13 @@ default: run *args: cabal run chelleport -- {{args}} +runw *args: + nodemon -e .hs,.cpp -w bin -w src -w cpp --exec 'clear && just run 
{{args}}' + test *args: cabal test {{args}} testw *args: - # nodemon -e .hs -w src --exec 'ghcid -c "cabal repl test:specs" -T :main' nodemon -e .hs -w src -w specs --exec 'clear && just test {{args}}' build: @@ -16,3 +18,11 @@ build: appimage: nix bundle --bundler github:ralismark/nix-appimage + +# lib: +# @mkdir -p dist-lib; +# gcc -o dist-lib/libchelleport.so \ +# cpp/libchelleport.cpp \ +# -shared \ +# -lstdc++ \ +# $(pkg-config --libs tesseract lept); diff --git a/specs/Specs/AppStateUpdateSpec.hs b/specs/Specs/AppStateUpdateSpec.hs index 67f6fb2..e54f50f 100644 --- a/specs/Specs/AppStateUpdateSpec.hs +++ b/specs/Specs/AppStateUpdateSpec.hs @@ -35,7 +35,8 @@ test = do stateIsMatched = False, stateGrid = [["ABC", "DEF"], ["DJK", "JKL"]], stateRepetition = 1, - stateIsDragging = False + stateIsDragging = False, + stateMode = ModeHints } context "with action HandleKeyInput" $ do diff --git a/specs/Specs/ViewSpec.hs b/specs/Specs/ViewSpec.hs index 7bca567..b8418d6 100644 --- a/specs/Specs/ViewSpec.hs +++ b/specs/Specs/ViewSpec.hs @@ -15,7 +15,8 @@ test = do stateIsMatched = False, stateGrid = [["ABC", "DEF"], ["DJK", "JKL"]], stateRepetition = 1, - stateIsDragging = False + stateIsDragging = False, + stateMode = ModeHints } let drawTextCalls = filter (\case Mock_drawText {} -> True; _ -> False) . calls diff --git a/src/Chelleport.hs b/src/Chelleport.hs index fa21c28..96c9fb6 100644 --- a/src/Chelleport.hs +++ b/src/Chelleport.hs @@ -23,6 +23,14 @@ import Control.Monad.Reader (ReaderT (runReaderT)) import Data.Maybe (fromMaybe, isJust) import qualified SDL +-- run :: IO () +-- run = do +-- ctx <- initializeContext +-- benchmark "ocr" $ do +-- res <- (`runReaderT` ctx) . 
runAppM $ getWordsOnScreen +-- print $ "---" ++ show (length res) +-- pure () + run :: IO () run = do ctx <- initializeContext @@ -47,7 +55,8 @@ initialState = do stateIsMatched = False, stateIsShiftPressed = False, stateIsDragging = False, - stateRepetition = 1 + stateRepetition = 1, + stateMode = ModeSearch } where rows = 9 diff --git a/src/Chelleport/Context.hs b/src/Chelleport/Context.hs index b23c1c2..30d8516 100644 --- a/src/Chelleport/Context.hs +++ b/src/Chelleport/Context.hs @@ -11,14 +11,6 @@ import SDL (($=)) import qualified SDL import qualified SDL.Font as TTF --- benchmark :: String -> IO a -> IO a --- benchmark msg m = do --- start <- systemNanoseconds <$> getSystemTime --- result <- m --- end <- systemNanoseconds <$> getSystemTime --- Debug.traceM $ msg ++ ": " ++ show (end - start) --- pure result - initializeContext :: IO DrawContext initializeContext = do -- Initialize SDL diff --git a/src/Chelleport/OCR.hs b/src/Chelleport/OCR.hs new file mode 100644 index 0000000..496b6a0 --- /dev/null +++ b/src/Chelleport/OCR.hs @@ -0,0 +1,84 @@
+module Chelleport.OCR (getWordsOnScreen) where
+
+import Chelleport.Types
+import Control.Exception (bracket, finally, onException)
+import Control.Monad.IO.Class (MonadIO (liftIO))
+import Control.Monad.RWS (asks)
+import qualified Data.ByteString as BS
+import Data.Word (Word8)
+import Foreign (Bits (shiftR), Ptr, Storable (peek, pokeByteOff), alloca, allocaBytes, peekArray, (.&.))
+import Foreign.C (CInt, CString, withCString)
+import GHC.IO.Handle.FD (withFile)
+import GHC.IO.IOMode (IOMode (WriteMode))
+import qualified Graphics.X11 as X11
+import qualified SDL
+import System.Directory (removeFile)
+import System.IO (hPutStrLn)
+import System.IO.Temp (emptySystemTempFile)
+
+-- | Binding to the C++ OCR entry point. It reads the image at the given path,
+-- stores the number of matches through the 'CInt' pointer and returns a
+-- pointer to the first match.
+--
+-- Imported as a @safe@ call because recognition is long-running; an @unsafe@
+-- call would hold up the Haskell runtime for the whole duration.
+foreign import ccall safe "libchelleport.h findWordCoordinates"
+  c_findWordCoordinates :: CString -> Ptr CInt -> IO (Ptr OCRMatch)
+
+-- | Monads that can run text recognition on what is currently on screen.
+class (Monad m) => MonadOCR m where
+  getWordsOnScreen :: m [OCRMatch]
+
+instance (MonadIO m) => MonadOCR (AppM m) where
+  -- Captures the screen area covered by the application window into a
+  -- temporary file, runs OCR on it and removes the file afterwards, even when
+  -- recognition throws.
+  getWordsOnScreen = do
+    SDL.V2 width height <- asks ctxWindow >>= SDL.get . SDL.windowSize
+    SDL.V2 x y <- asks ctxWindow >>= SDL.getWindowAbsolutePosition
+    liftIO $ do
+      imgFilePath <- createTemporaryScreenshot (x, y) (width, height)
+      findWordCoordinates imgFilePath `finally` removeFile imgFilePath
+
+-- | Run OCR on the image at the given path and copy the matches into Haskell
+-- values. The path is marshalled with 'withCString' so the C string is freed
+-- once the call returns.
+findWordCoordinates :: String -> IO [OCRMatch]
+findWordCoordinates imgPath =
+  withCString imgPath $ \imgPathC ->
+    alloca $ \sizePtr -> do
+      arrayPtr <- c_findWordCoordinates imgPathC sizePtr
+      size <- peek sizePtr
+      peekArray (fromIntegral size) arrayPtr
+
+-- | Capture the given screen region (offset, size) into a fresh temporary
+-- file and return its path. The file is removed again if the capture fails;
+-- otherwise the caller is responsible for deleting it.
+createTemporaryScreenshot :: (CInt, CInt) -> (CInt, CInt) -> IO String
+createTemporaryScreenshot offset size = do
+  tmpFilePath <- emptySystemTempFile "chelleport-screenshot.png"
+  screenshot tmpFilePath offset size `onException` removeFile tmpFilePath
+  pure tmpFilePath
+
+-- | Grab a rectangle of the X11 root window and write it to @filename@ as a
+-- binary PPM image. The display connection and the captured image are
+-- released even if an exception is thrown part-way through.
+screenshot :: String -> (CInt, CInt) -> (CInt, CInt) -> IO ()
+screenshot filename (offsetX, offsetY) (width, height) =
+  bracket (X11.openDisplay "") X11.closeDisplay $ \dpy -> do
+    root <- X11.rootWindow dpy (X11.defaultScreen dpy)
+    bracket (captureImage dpy root) X11.destroyImage $ \image ->
+      allocaBytes bufferSize $ \ptr -> do
+        let putPixel :: CInt -> CInt -> IO ()
+            putPixel x y = do
+              pixel <- X11.xGetPixel image x y
+              let offset = fromIntegral (y * width + x) * 3
+                  -- Narrow to a single byte: poking the pixel-sized value
+                  -- itself would write eight bytes per channel and run past
+                  -- the end of the buffer on the last pixels.
+                  channel :: Int -> Word8
+                  channel bits = fromIntegral ((pixel `shiftR` bits) .&. 0xFF)
+              pokeByteOff ptr offset (channel 16)
+              pokeByteOff ptr (offset + 1) (channel 8)
+              pokeByteOff ptr (offset + 2) (channel 0)
+
+        sequence_ [putPixel x y | y <- [0 .. height - 1], x <- [0 .. width - 1]]
+        rgbData <- BS.packCStringLen (ptr, bufferSize)
+        savePPMFile filename (fromIntegral width) (fromIntegral height) rgbData
+  where
+    -- Three bytes (red, green, blue) per pixel.
+    bufferSize :: Int
+    bufferSize = fromIntegral width * fromIntegral height * 3
+
+    captureImage dpy root =
+      X11.getImage
+        dpy
+        root
+        offsetX
+        offsetY
+        (fromIntegral width)
+        (fromIntegral height)
+        (fromIntegral X11.allPlanes_aux)
+        X11.zPixmap
+
+-- | Write raw RGB bytes to @path@ as a binary PPM: a @P6@ header with the
+-- dimensions and a maximum channel value of 255, followed by the pixel data.
+savePPMFile :: FilePath -> Int -> Int -> BS.ByteString -> IO ()
+savePPMFile path width height rgbData = withFile path WriteMode $ \h -> do
+  hPutStrLn h "P6"
+  hPutStrLn h $ show width ++ " " ++ show height
+  hPutStrLn h "255"
+  BS.hPut h rgbData
diff --git a/src/Chelleport/Types.hs b/src/Chelleport/Types.hs index 3c52909..cb580e0 100644 --- a/src/Chelleport/Types.hs +++ b/src/Chelleport/Types.hs @@ -1,7 +1,10 @@ module Chelleport.Types where import Control.Monad.Reader (MonadIO, MonadReader, ReaderT) +import Data.Vector.Storable (Storable) import Data.Word (Word8) +import Foreign (Ptr, Storable (alignment, peek, poke, sizeOf), castPtr, nullPtr, plusPtr) +import Foreign.C (CChar, CInt, peekCString) import qualified Graphics.X11 as X11 import qualified SDL import qualified SDL.Font as TTF @@ -14,13 +17,17 @@ type KeySequence = [Char] type KeyGrid = [[Cell]] +data Mode = ModeHints | ModeSearch + deriving (Show, Eq) + data State = State { stateGrid :: KeyGrid, stateKeySequence :: KeySequence, stateIsMatched :: Bool, stateIsShiftPressed :: Bool, stateIsDragging :: Bool, - stateRepetition :: Int + stateRepetition :: Int, + stateMode :: Mode } deriving (Show, Eq) @@ -51,3 +58,31 @@ data MouseButtonType = LeftClick | RightClick newtype AppM m a = AppM {runAppM :: ReaderT DrawContext m a} deriving (Functor, Applicative, Monad, MonadIO, MonadReader DrawContext) + +data OCRMatch = OCRMatch + { matchStartX :: !CInt, + matchStartY :: !CInt, + matchEndX :: !CInt, + matchEndY :: !CInt, + matchText :: !String + } + deriving (Show) + +instance Storable OCRMatch where + sizeOf _ = 4 * sizeOf (undefined :: CInt) + sizeOf (undefined :: Ptr CChar) + + -- TODO: Remove hardcoding later + alignment _ = 8 + + peek ptr = do + let cintSize = sizeOf (undefined :: CInt) + startX <- 
peek $ castPtr ptr + startY <- peek $ castPtr ptr `plusPtr` cintSize + endX <- peek $ castPtr ptr `plusPtr` (2 * cintSize) + endY <- peek $ castPtr ptr `plusPtr` (3 * cintSize) + text <- peek $ castPtr ptr `plusPtr` (4 * cintSize) + textStr <- if text == nullPtr then pure "" else peekCString text + pure $ OCRMatch startX startY endX endY textStr + + -- NOTE: Dont need poke + poke _ _ = undefined diff --git a/src/Chelleport/Utils.hs b/src/Chelleport/Utils.hs index 0e7dabc..c15a3e8 100644 --- a/src/Chelleport/Utils.hs +++ b/src/Chelleport/Utils.hs @@ -1,6 +1,9 @@ module Chelleport.Utils where +import Control.Monad.IO.Class (MonadIO (liftIO)) import Data.List (nub) +import Data.Time.Clock.System (SystemTime (systemNanoseconds, systemSeconds), getSystemTime) +import qualified Debug.Trace as Debug import Foreign.C (CInt) intToCInt :: Int -> CInt @@ -24,3 +27,11 @@ isEmpty = null isNotEmpty :: [a] -> Bool isNotEmpty = not . isEmpty
+
+-- | Run an action and trace how long it took, in milliseconds, prefixed with
+-- the given label. The action's result is returned unchanged.
+benchmark :: (MonadIO m) => String -> m a -> m a
+benchmark msg m = do
+  start <- liftIO getSystemTime
+  result <- m
+  end <- liftIO getSystemTime
+  let elapsedNanos = toNanos end - toNanos start
+  Debug.traceM $ msg ++ " (ms): " ++ show (fromIntegral elapsedNanos / 1_000_000.0 :: Double)
+  pure result
+  where
+    -- 'systemNanoseconds' is only the sub-second part of the timestamp, so the
+    -- whole seconds must be folded in; otherwise the difference wraps around
+    -- whenever the measured action crosses a second boundary.
+    toNanos :: SystemTime -> Integer
+    toNanos t = toInteger (systemSeconds t) * 1_000_000_000 + toInteger (systemNanoseconds t)
|
