aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Akshay Nair <phenax5@gmail.com> 2024-12-24 18:51:17 +0530
committer Akshay Nair <phenax5@gmail.com> 2024-12-24 18:51:17 +0530
commit 70e3920556496e5fecb5fedddf1067b2522fcac7 (patch)
tree 7e4e88023bdcd5b13dde738f7afedd533a6d7fcd
parent dfdf1600ba251f6b3cfef85f6904d79a1c60b49d (diff)
download chelleport-70e3920556496e5fecb5fedddf1067b2522fcac7.tar.gz
chelleport-70e3920556496e5fecb5fedddf1067b2522fcac7.zip
Add setup for ocr with tesseract
Diffstat (limited to '')
-rw-r--r-- .gitignore 1
-rw-r--r-- chelleport.cabal 26
-rw-r--r-- cpp/libchelleport.cpp 74
-rw-r--r-- flake.nix 19
-rw-r--r-- include/libchelleport.h 12
-rw-r--r-- justfile 12
-rw-r--r-- specs/Specs/AppStateUpdateSpec.hs 3
-rw-r--r-- specs/Specs/ViewSpec.hs 3
-rw-r--r-- src/Chelleport.hs 11
-rw-r--r-- src/Chelleport/Context.hs 8
-rw-r--r-- src/Chelleport/OCR.hs 84
-rw-r--r-- src/Chelleport/Types.hs 37
-rw-r--r-- src/Chelleport/Utils.hs 11
13 files changed, 279 insertions, 22 deletions
diff --git a/.gitignore b/.gitignore
index aefeff9..f1b67c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
result
dist-newstyle/
+dist-lib/*
*.AppImage
diff --git a/chelleport.cabal b/chelleport.cabal
index adef7b0..f273a39 100644
--- a/chelleport.cabal
+++ b/chelleport.cabal
@@ -9,11 +9,16 @@ build-type: Simple
synopsis: Mouse control
description: Mouse control
+source-repository head
+ type: git
+ location: https://github.com/phenax/chelleport.git
+
common common-config
default-extensions:
ExplicitForAll
FlexibleContexts
FlexibleInstances
+ ForeignFunctionInterface
GADTs
GeneralizedNewtypeDeriving
LambdaCase
@@ -31,6 +36,9 @@ common common-config
time,
mtl == 2.3.1,
sdl2 == 2.5.5.0,
+ array,
+ temporary,
+ directory,
containers
common warnings
@@ -38,17 +46,24 @@ common warnings
-Wall -Wincomplete-record-updates -Wincomplete-uni-patterns
-Wunused-foralls -Wextra -Wno-unused-do-bind -Wname-shadowing
-fwarn-tabs -fprint-explicit-foralls -fprint-explicit-kinds
- extra-libraries: Xtst X11
+
+common extension
+ extra-libraries: stdc++ Xtst X11 tesseract leptonica
+ include-dirs: include
+ c-sources: cpp/libchelleport.cpp
+ extra-source-files:
+ cpp/*.cpp
+ include/*.h
+ static/font.ttf
executable chelleport
- import: common-config, warnings
+ import: common-config, warnings, extension
hs-source-dirs: bin
main-is: Main.hs
build-depends: lib-chelleport
- -- other-modules:
library lib-chelleport
- import: common-config, warnings
+ import: common-config, warnings, extension
hs-source-dirs: src
build-depends:
bytestring,
@@ -64,12 +79,13 @@ library lib-chelleport
Chelleport.Control
Chelleport.Draw
Chelleport.KeySequence
+ Chelleport.OCR
Chelleport.Types
Chelleport.Utils
Chelleport.View
test-suite specs
- import: common-config, warnings
+ import: common-config, warnings, extension
type: exitcode-stdio-1.0
hs-source-dirs: specs
main-is: Main.hs
diff --git a/cpp/libchelleport.cpp b/cpp/libchelleport.cpp
new file mode 100644
index 0000000..8f2e9f2
--- /dev/null
+++ b/cpp/libchelleport.cpp
@@ -0,0 +1,74 @@
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <leptonica/allheaders.h>
+#include <tesseract/baseapi.h>
+#include <tesseract/publictypes.h>
+#include <vector>
+
+#include "../include/libchelleport.h"
+
+std::vector<OCRMatch> extractTextCoordinates(const char *imagePath);
+
+#define CONFIDENCE_THRESHOLD 30.
+
+// Runs OCR over the image at `image_path` and exposes the matches to C callers.
+//
+// Returns a pointer to the first match and stores the number of matches in
+// `*size` (when `size` is non-null). The array and the `text` strings it refers
+// to are owned by this function: they stay valid until the next call, which
+// releases them, so callers must copy what they need and must not free
+// anything. Not safe to call from several threads at once (shared static
+// buffer).
+OCRMatch *findWordCoordinates(const char *image_path, int *size) {
+  // A single buffer reused across calls. The previous
+  // `static OCRMatch *ptr = new OCRMatch[n]` was sized by the first call only,
+  // so any later call that produced more matches wrote past the allocation.
+  static std::vector<OCRMatch> matches;
+
+  // The strings were handed out by tesseract's GetUTF8Text as new[]
+  // allocations; release the ones kept from the previous call.
+  for (const OCRMatch &match : matches) {
+    delete[] match.text;
+  }
+  // Drop the stale pointers before refilling, so a failure below cannot lead
+  // to a double delete on the next call.
+  matches.clear();
+
+  matches = extractTextCoordinates(image_path);
+
+  if (size != nullptr) {
+    *size = static_cast<int>(matches.size());
+  }
+  return matches.data();
+}
+
+// Recognises text in the image stored at `imagePath` with tesseract (English
+// model) and returns one OCRMatch per text line whose confidence exceeds
+// CONFIDENCE_THRESHOLD and whose text is at least two bytes long.
+//
+// Each returned match owns its `text` buffer, which tesseract allocated with
+// new[]; whoever consumes the vector is responsible for delete[]-ing it.
+// Returns an empty vector when tesseract cannot be initialised or the image
+// cannot be loaded (a message is written to stderr in both cases).
+std::vector<OCRMatch> extractTextCoordinates(const char *imagePath) {
+  std::vector<OCRMatch> results;
+
+  // Stack-allocated so that every return path releases the engine; the
+  // heap-allocated instance used before leaked on both early returns.
+  tesseract::TessBaseAPI api;
+
+  if (api.Init(nullptr, "eng")) {
+    std::cerr << "Could not initialize tesseract." << std::endl;
+    return results;
+  }
+
+  Pix *image = pixRead(imagePath);
+  if (!image) {
+    // Streaming a null `const char *` is undefined behaviour, so guard it.
+    std::cerr << "Could not load image " << (imagePath ? imagePath : "(null)")
+              << std::endl;
+    return results;
+  }
+
+  api.SetImage(image);
+  api.Recognize(nullptr);
+
+  const tesseract::PageIteratorLevel level = tesseract::RIL_TEXTLINE;
+  tesseract::ResultIterator *iterator = api.GetIterator();
+
+  if (iterator != nullptr) {
+    do {
+      const float conf = iterator->Confidence(level);
+      // GetUTF8Text returns a new[]-allocated copy that the caller owns.
+      char *word = iterator->GetUTF8Text(level);
+      int x1 = 0, y1 = 0, x2 = 0, y2 = 0;
+      iterator->BoundingBox(level, &x1, &y1, &x2, &y2);
+
+      if (conf > CONFIDENCE_THRESHOLD && word != nullptr && strlen(word) >= 2) {
+        // Ownership of `word` moves into the match.
+        results.push_back(OCRMatch{x1, y1, x2, y2, word});
+      } else {
+        // Rejected text was previously leaked.
+        delete[] word;
+      }
+    } while (iterator->Next(level));
+  }
+
+  // Same teardown order as before: iterator, engine, then the image.
+  delete iterator;
+  api.End();
+  pixDestroy(&image);
+
+  return results;
+}
diff --git a/flake.nix b/flake.nix
index e0c4579..48557f5 100644
--- a/flake.nix
+++ b/flake.nix
@@ -19,6 +19,8 @@
./bin
./src
./specs
+ ./include
+ ./cpp
./chelleport.cabal
];
});
@@ -33,6 +35,18 @@
xorg.libX11
SDL2
SDL2_ttf
+ tesseract
+ leptonica
+ imagemagick
+
+ gcc
+ pkg-config
+ ];
+
+ devPackages = with pkgs; [
+ just
+ nodemon
+ clang-tools
];
in {
haskellProjects.default = {
@@ -69,10 +83,7 @@
inputsFrom = [
config.haskellProjects.default.outputs.devShell
];
- packages = with pkgs; [
- just
- nodemon
- ];
+ packages = devPackages;
inherit buildInputs;
LD_LIBRARY_PATH = "${pkgs.lib.makeLibraryPath buildInputs}";
diff --git a/include/libchelleport.h b/include/libchelleport.h
new file mode 100644
index 0000000..8a9a17e
--- /dev/null
+++ b/include/libchelleport.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <string>
+
+// One piece of text located by OCR, together with its bounding box in image
+// pixel coordinates as reported by tesseract's BoundingBox.
+//
+// NOTE: This layout is mirrored by hand in the `Storable OCRMatch` instance in
+// src/Chelleport/Types.hs. Remember to update the size, alignment and field
+// offsets there whenever this struct changes.
+struct OCRMatch {
+  int startX, startY; // first corner of the bounding box
+  int endX, endY;     // opposite corner of the bounding box
+  const char *text;   // NUL-terminated UTF-8 text of the match
+};
+
+extern "C" {
+// Runs OCR on the image stored at `image_path`. Returns a pointer to an array
+// of matches and writes the number of elements in that array to `*size`.
+OCRMatch *findWordCoordinates(const char *image_path, /* returns */ int *size);
+}
diff --git a/justfile b/justfile
index 8fc38cd..b1ad23d 100644
--- a/justfile
+++ b/justfile
@@ -4,11 +4,13 @@ default:
run *args:
cabal run chelleport -- {{args}}
+runw *args:
+ nodemon -e .hs,.cpp -w bin -w src -w cpp --exec 'clear && just run {{args}}'
+
test *args:
cabal test {{args}}
testw *args:
- # nodemon -e .hs -w src --exec 'ghcid -c "cabal repl test:specs" -T :main'
nodemon -e .hs -w src -w specs --exec 'clear && just test {{args}}'
build:
@@ -16,3 +18,11 @@ build:
appimage:
nix bundle --bundler github:ralismark/nix-appimage
+
+# lib:
+# @mkdir -p dist-lib;
+# gcc -o dist-lib/libchelleport.so \
+# cpp/libchelleport.cpp \
+# -shared \
+# -lstdc++ \
+# $(pkg-config --libs tesseract lept);
diff --git a/specs/Specs/AppStateUpdateSpec.hs b/specs/Specs/AppStateUpdateSpec.hs
index 67f6fb2..e54f50f 100644
--- a/specs/Specs/AppStateUpdateSpec.hs
+++ b/specs/Specs/AppStateUpdateSpec.hs
@@ -35,7 +35,8 @@ test = do
stateIsMatched = False,
stateGrid = [["ABC", "DEF"], ["DJK", "JKL"]],
stateRepetition = 1,
- stateIsDragging = False
+ stateIsDragging = False,
+ stateMode = ModeHints
}
context "with action HandleKeyInput" $ do
diff --git a/specs/Specs/ViewSpec.hs b/specs/Specs/ViewSpec.hs
index 7bca567..b8418d6 100644
--- a/specs/Specs/ViewSpec.hs
+++ b/specs/Specs/ViewSpec.hs
@@ -15,7 +15,8 @@ test = do
stateIsMatched = False,
stateGrid = [["ABC", "DEF"], ["DJK", "JKL"]],
stateRepetition = 1,
- stateIsDragging = False
+ stateIsDragging = False,
+ stateMode = ModeHints
}
let drawTextCalls = filter (\case Mock_drawText {} -> True; _ -> False) . calls
diff --git a/src/Chelleport.hs b/src/Chelleport.hs
index fa21c28..96c9fb6 100644
--- a/src/Chelleport.hs
+++ b/src/Chelleport.hs
@@ -23,6 +23,14 @@ import Control.Monad.Reader (ReaderT (runReaderT))
import Data.Maybe (fromMaybe, isJust)
import qualified SDL
+-- run :: IO ()
+-- run = do
+-- ctx <- initializeContext
+-- benchmark "ocr" $ do
+-- res <- (`runReaderT` ctx) . runAppM $ getWordsOnScreen
+-- print $ "---" ++ show (length res)
+-- pure ()
+
run :: IO ()
run = do
ctx <- initializeContext
@@ -47,7 +55,8 @@ initialState = do
stateIsMatched = False,
stateIsShiftPressed = False,
stateIsDragging = False,
- stateRepetition = 1
+ stateRepetition = 1,
+ stateMode = ModeSearch
}
where
rows = 9
diff --git a/src/Chelleport/Context.hs b/src/Chelleport/Context.hs
index b23c1c2..30d8516 100644
--- a/src/Chelleport/Context.hs
+++ b/src/Chelleport/Context.hs
@@ -11,14 +11,6 @@ import SDL (($=))
import qualified SDL
import qualified SDL.Font as TTF
--- benchmark :: String -> IO a -> IO a
--- benchmark msg m = do
--- start <- systemNanoseconds <$> getSystemTime
--- result <- m
--- end <- systemNanoseconds <$> getSystemTime
--- Debug.traceM $ msg ++ ": " ++ show (end - start)
--- pure result
-
initializeContext :: IO DrawContext
initializeContext = do
-- Initialize SDL
diff --git a/src/Chelleport/OCR.hs b/src/Chelleport/OCR.hs
new file mode 100644
index 0000000..496b6a0
--- /dev/null
+++ b/src/Chelleport/OCR.hs
@@ -0,0 +1,84 @@
+module Chelleport.OCR (getWordsOnScreen) where
+
+import Chelleport.Types
+import Control.Exception (finally, onException)
+import Control.Monad.IO.Class (MonadIO (liftIO))
+import Control.Monad.RWS (asks)
+import qualified Data.ByteString as BS
+import Foreign (Bits (shiftR), Ptr, Storable (peek, pokeByteOff), Word8, alloca, allocaBytes, peekArray, (.&.))
+import Foreign.C (CInt, CString, newCString, withCString)
+import GHC.IO.Handle.FD (withFile)
+import GHC.IO.IOMode (IOMode (WriteMode))
+import qualified Graphics.X11 as X11
+import qualified SDL
+import System.Directory (removeFile)
+import System.IO (hPutStrLn)
+import System.IO.Temp (emptySystemTempFile)
+
+-- | C entry point declared in @include/libchelleport.h@. Takes the path of an
+-- image and an out-pointer that receives the number of matches, and returns a
+-- pointer to the array of matches.
+--
+-- Imported as @safe@: OCR is a long-running call, and an @unsafe@ foreign call
+-- stalls the whole GHC runtime (garbage collection and every other Haskell
+-- thread) until it returns.
+foreign import ccall safe "libchelleport.h findWordCoordinates"
+  c_findWordCoordinates :: CString -> Ptr CInt -> IO (Ptr OCRMatch)
+
+-- | Effect for locating text on the screen.
+class (Monad m) => MonadOCR m where
+  -- | Every OCR match found in the screen area covered by the window.
+  getWordsOnScreen :: m [OCRMatch]
+
+instance (MonadIO m) => MonadOCR (AppM m) where
+  -- Captures the region of the root window that lies under the application
+  -- window into a temporary file, runs OCR on that file and deletes it.
+  getWordsOnScreen = do
+    window <- asks ctxWindow
+    SDL.V2 width height <- SDL.get (SDL.windowSize window)
+    SDL.V2 x y <- SDL.getWindowAbsolutePosition window
+    liftIO $ do
+      imgFilePath <- createTemporaryScreenshot (x, y) (width, height)
+      -- `finally` so the screenshot is removed even when OCR throws.
+      findWordCoordinates imgFilePath `finally` removeFile imgFilePath
+
+-- | Runs the C OCR routine on the image at @imgPath@ and copies the resulting
+-- matches into Haskell values.
+--
+-- The path is marshalled with 'withCString', so the temporary C string is
+-- released when the call returns (the previous 'newCString' buffer was never
+-- freed). The array returned by the C side is only read here; it is never
+-- freed from Haskell.
+findWordCoordinates :: String -> IO [OCRMatch]
+findWordCoordinates imgPath =
+  withCString imgPath $ \imgPathC ->
+    alloca $ \sizePtr -> do
+      arrayPtr <- c_findWordCoordinates imgPathC sizePtr
+      size <- peek sizePtr
+      peekArray (fromIntegral size) arrayPtr
+
+-- | Captures the screen region described by @offset@ and @size@ into a fresh
+-- file in the system temporary directory and returns that file's path. The
+-- caller is responsible for deleting the file.
+--
+-- NOTE: the file name carries a @.png@ suffix although 'screenshot' writes PPM
+-- data.
+createTemporaryScreenshot :: (CInt, CInt) -> (CInt, CInt) -> IO String
+createTemporaryScreenshot offset size = do
+  tmpFilePath <- emptySystemTempFile "chelleport-screenshot.png"
+  -- Do not leave the (already created) file behind when the capture fails.
+  screenshot tmpFilePath offset size `onException` removeFile tmpFilePath
+  pure tmpFilePath
+
+-- | Captures a @width@ x @height@ region of the X11 root window, starting at
+-- (@offsetX@, @offsetY@), and writes it to @filename@ as a binary PPM.
+--
+-- Pixels are unpacked as 0xRRGGBB.
+-- NOTE(review): that assumes a 24/32-bit TrueColor visual - confirm.
+--
+-- The display connection and the XImage are released even when an exception
+-- is thrown part-way through.
+screenshot :: String -> (CInt, CInt) -> (CInt, CInt) -> IO ()
+screenshot filename (offsetX, offsetY) (width, height) = do
+  dpy <- X11.openDisplay ""
+  capture dpy `finally` X11.closeDisplay dpy
+  where
+    cols = fromIntegral width :: Int
+    rows = fromIntegral height :: Int
+    -- Computed in Int so that large captures cannot overflow CInt arithmetic.
+    byteCount = cols * rows * 3
+
+    capture dpy = do
+      root <- X11.rootWindow dpy (X11.defaultScreen dpy)
+      image <-
+        X11.getImage
+          dpy
+          root
+          offsetX
+          offsetY
+          (fromIntegral width)
+          (fromIntegral height)
+          (fromIntegral X11.allPlanes_aux)
+          X11.zPixmap
+      writeImage image `finally` X11.destroyImage image
+
+    writeImage image =
+      allocaBytes byteCount $ \ptr -> do
+        let putPixel :: CInt -> CInt -> IO ()
+            putPixel x y = do
+              pixel <- X11.xGetPixel image x y
+              let base = (fromIntegral y * cols + fromIntegral x) * 3
+                  -- Each channel must be stored as a single byte. Poking the
+                  -- full-width pixel value wrote several bytes per channel
+                  -- and ran past the end of the buffer on the last pixels.
+                  channel bits = fromIntegral (pixel `shiftR` bits .&. 0xFF) :: Word8
+              pokeByteOff ptr base (channel 16)
+              pokeByteOff ptr (base + 1) (channel 8)
+              pokeByteOff ptr (base + 2) (channel 0)
+
+        sequence_ [putPixel x y | y <- [0 .. height - 1], x <- [0 .. width - 1]]
+        rgbData <- BS.packCStringLen (ptr, byteCount)
+        savePPMFile filename cols rows rgbData
+
+-- | Writes @rgbData@ to @path@ as a binary PPM image: the three header lines
+-- (@P6@ magic number, "width height", maximum channel value 255) followed by
+-- the raw bytes exactly as given.
+savePPMFile :: FilePath -> Int -> Int -> BS.ByteString -> IO ()
+savePPMFile path width height rgbData =
+  withFile path WriteMode $ \handle -> do
+    mapM_ (hPutStrLn handle) headerLines
+    BS.hPut handle rgbData
+  where
+    headerLines = ["P6", unwords [show width, show height], "255"]
diff --git a/src/Chelleport/Types.hs b/src/Chelleport/Types.hs
index 3c52909..cb580e0 100644
--- a/src/Chelleport/Types.hs
+++ b/src/Chelleport/Types.hs
@@ -1,7 +1,10 @@
module Chelleport.Types where
import Control.Monad.Reader (MonadIO, MonadReader, ReaderT)
+import Data.Vector.Storable (Storable)
import Data.Word (Word8)
+import Foreign (Ptr, Storable (alignment, peek, poke, sizeOf), castPtr, nullPtr, plusPtr)
+import Foreign.C (CChar, CInt, peekCString)
import qualified Graphics.X11 as X11
import qualified SDL
import qualified SDL.Font as TTF
@@ -14,13 +17,17 @@ type KeySequence = [Char]
type KeyGrid = [[Cell]]
+data Mode = ModeHints | ModeSearch
+ deriving (Show, Eq)
+
data State = State
{ stateGrid :: KeyGrid,
stateKeySequence :: KeySequence,
stateIsMatched :: Bool,
stateIsShiftPressed :: Bool,
stateIsDragging :: Bool,
- stateRepetition :: Int
+ stateRepetition :: Int,
+ stateMode :: Mode
}
deriving (Show, Eq)
@@ -51,3 +58,31 @@ data MouseButtonType = LeftClick | RightClick
newtype AppM m a = AppM {runAppM :: ReaderT DrawContext m a}
deriving (Functor, Applicative, Monad, MonadIO, MonadReader DrawContext)
+
+-- | A piece of text located by OCR together with its bounding box. Mirrors
+-- @struct OCRMatch@ in @include/libchelleport.h@.
+data OCRMatch = OCRMatch
+  { matchStartX :: !CInt,
+    matchStartY :: !CInt,
+    matchEndX :: !CInt,
+    matchEndY :: !CInt,
+    matchText :: !String
+  }
+  deriving (Show)
+
+-- | Read-only marshalling of the C struct: four consecutive @int@ fields
+-- followed by a @const char *@.
+instance Storable OCRMatch where
+  -- Assumes the compiler inserts no padding between the fourth int and the
+  -- pointer, i.e. that four ints end on a pointer-aligned offset.
+  sizeOf _ = 4 * sizeOf (undefined :: CInt) + sizeOf (undefined :: Ptr CChar)
+
+  -- A struct is aligned like its most strictly aligned member; derived from
+  -- the field types instead of being hard-coded to 8.
+  alignment _ = max (alignment (undefined :: CInt)) (alignment (undefined :: Ptr CChar))
+
+  peek ptr = do
+    let cintSize = sizeOf (undefined :: CInt)
+    startX <- peek $ castPtr ptr
+    startY <- peek $ castPtr ptr `plusPtr` cintSize
+    endX <- peek $ castPtr ptr `plusPtr` (2 * cintSize)
+    endY <- peek $ castPtr ptr `plusPtr` (3 * cintSize)
+    text <- peek $ castPtr ptr `plusPtr` (4 * cintSize)
+    -- A null text pointer is read as the empty string; otherwise the C
+    -- string is copied into a Haskell String.
+    textStr <- if text == nullPtr then pure "" else peekCString text
+    pure $ OCRMatch startX startY endX endY textStr
+
+  -- Matches are only ever read from C, never written back.
+  poke _ _ = error "Storable OCRMatch: poke is not supported"
diff --git a/src/Chelleport/Utils.hs b/src/Chelleport/Utils.hs
index 0e7dabc..c15a3e8 100644
--- a/src/Chelleport/Utils.hs
+++ b/src/Chelleport/Utils.hs
@@ -1,6 +1,9 @@
module Chelleport.Utils where
+import Control.Monad.IO.Class (MonadIO (liftIO))
import Data.List (nub)
+import Data.Time.Clock.System (SystemTime (systemNanoseconds), getSystemTime)
+import qualified Debug.Trace as Debug
import Foreign.C (CInt)
intToCInt :: Int -> CInt
@@ -24,3 +27,11 @@ isEmpty = null
isNotEmpty :: [a] -> Bool
isNotEmpty = not . isEmpty
+
+-- | Runs the action @m@, traces how long it took under the label @msg@ (in
+-- milliseconds) and returns the action's result unchanged.
+--
+-- Only the sub-second component of the clock is sampled, so a run of one
+-- second or more is reported modulo 1000 ms.
+benchmark :: (MonadIO m) => String -> m a -> m a
+benchmark msg m = do
+  start <- systemNanoseconds <$> liftIO getSystemTime
+  result <- m
+  end <- systemNanoseconds <$> liftIO getSystemTime
+  Debug.traceM $ msg ++ " (ms): " ++ show (fromIntegral (elapsedNs start end) / 1_000_000.0 :: Double)
+  pure result
+  where
+    -- The samples are unsigned sub-second counters. Subtracting them directly
+    -- wrapped around to a value in the billions whenever the run crossed a
+    -- second boundary, so the difference is taken in Integer and a negative
+    -- result is corrected by one second.
+    elapsedNs from to =
+      let delta = toInteger to - toInteger from
+       in if delta < 0 then delta + 1_000_000_000 else delta