From 02e0f7bc51acfa06e7299919b54b56a3c7eca02b Mon Sep 17 00:00:00 2001
From: Samo Penic <samo.penic@gmail.com>
Date: Fri, 16 Nov 2018 18:22:33 +0000
Subject: [PATCH] Initial version of sid recognition. Cutting the numerals is not perfect yet.

---
 aoiOcr.py             |    6 ++
 sid_process.py        |   33 ++++++++++++++--
 .idea/sonarIssues.xml |    5 ++
 Ocr.py                |   21 +++++++---
 4 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/.idea/sonarIssues.xml b/.idea/sonarIssues.xml
index c014979..60f73dd 100644
--- a/.idea/sonarIssues.xml
+++ b/.idea/sonarIssues.xml
@@ -13,6 +13,11 @@
             <set />
           </value>
         </entry>
+        <entry key="/a.dummy">
+          <value>
+            <set />
+          </value>
+        </entry>
         <entry key="$PROJECT_DIR$/Ocr.py">
           <value>
             <set />
diff --git a/Ocr.py b/Ocr.py
index 0da4497..35c6729 100644
--- a/Ocr.py
+++ b/Ocr.py
@@ -1,17 +1,18 @@
 from pyzbar.pyzbar import decode
-from sid_process import enhanceSID
+from sid_process import getSID
 import cv2
 import numpy as np
 import math
 
 
 class Paper:
-    def __init__(self, filename=None):
+    def __init__(self, filename=None, sid_classifier=None):
         self.filename = filename
         self.invalid = None
         self.QRData = None
         self.errors = []
         self.warnings = []
+        self.sid_classifier=sid_classifier  # handed to getSID() by get_enhanced_sid(); None makes that method return "x"
         if filename is not None:
             self.loadImage(filename)
             self.runOcr()
@@ -137,8 +138,8 @@
             loc_filtered_x, loc_filtered_y = zip(
                 *sorted(zip(loc_filtered_x, loc_filtered_y))
             )
-        # loc=[loc_filtered_y,loc_filtered_x]
-        # remove duplicates
+            # loc=[loc_filtered_y,loc_filtered_x]
+            # remove duplicates
             a = np.diff(loc_filtered_x) > 40
             a = np.append(a, True)
             loc_filtered_x = np.array(loc_filtered_x)
@@ -213,5 +214,13 @@
             self.answerMatrix.append(oneline)
 
     def get_enhanced_sid(self):
-        es= enhanceSID(self.img[int(0.04*self.imgHeight):int(0.08*self.imgHeight), int(0.7*self.imgWidth):int(0.99*self.imgWidth)])
-        cv2.imwrite("enhancedSID.png",es)
\ No newline at end of file
+        if self.sid_classifier is None:
+            return "x"  # placeholder result when no classifier was supplied
+        es = getSID(
+            self.img[
+                int(0.045 * self.imgHeight) : int(0.085 * self.imgHeight),  # rows: 4.5%-8.5% of the image height
+                int(0.7 * self.imgWidth) : int(0.99 * self.imgWidth),  # columns: 70%-99% of the image width
+            ],
+            self.sid_classifier,
+        )
+        return es  # whatever getSID() returned, passed through unchanged
diff --git a/aoiOcr.py b/aoiOcr.py
index c8bc97b..72ed3b9 100644
--- a/aoiOcr.py
+++ b/aoiOcr.py
@@ -1,8 +1,12 @@
 from Ocr import Paper
+from sklearn.externals import joblib  # NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23; newer versions need `import joblib` - confirm the pinned version
 
+classifier = joblib.load('filename.joblib')  # NOTE: joblib.load unpickles the file and can execute arbitrary code; load only trusted model files
 
 #p=Paper(filename='testpage300dpi_scan1.png')
-p=Paper(filename='sizif111.tif')
+p=Paper(filename='sizif111.tif', sid_classifier=classifier)
+#p=Paper(filename='processed_scans/20141016095134535_0028.tif')
+
 print(p.QRData)
 print(p.errors)
 
diff --git a/sid_process.py b/sid_process.py
index 210cfe7..90d9b33 100644
--- a/sid_process.py
+++ b/sid_process.py
@@ -1,6 +1,10 @@
 import cv2
 import numpy as np
 from skimage import morphology,img_as_ubyte
+from sklearn import svm
+from sklearn.externals import joblib
+
+
 
 """
   (1) The text is an array of chars (in row-major order) where
@@ -57,19 +61,18 @@
     return np.ones((x, y), np.uint8)
 
 
-def enhanceSID(image):
+def getSID(image, classifier):  # classifier: object whose predict() is applied to each 32x32 digit image cut from `image`
     image=255-image
     image=img_as_ubyte(image>100)
     cv2.imwrite("enSID0.png", image)
     # Remove noise
     image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(2,2), iterations=1)
     # Closing. Connect non connected parts
-    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(5, 5), iterations=2)
+    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(5, 3), iterations=4)
     # Again noise removal after closing
+
     image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(8,8), iterations=1)
     # Skeletonization
-    ##For thinning I am using erosion
-    ##image = cv2.erode(image,kernel(4,4),iterations = 40)
     image = img_as_ubyte(morphology.thin(image>128))
     cv2.imwrite("enSID1.png",image)
     # Stub removal (might not be necessary if thinning instead of skeletonize is used above
@@ -80,4 +83,29 @@
     # Thining again
     image = img_as_ubyte(morphology.skeletonize(image>0.5))
     image = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(10, 10))
+
+    # Outer contours are the digit candidates. findContours returns 3 values in
+    # OpenCV 3.x but 2 in 2.x/4.x; the contour list is always second to last.
+    ctrs = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+    # Order the candidates by the x coordinate of their bounding box (left to right).
+    sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])
+
+    sid_no = ""
+    for i, ctr in enumerate(sorted_ctrs):
+        # Get bounding box
+        x, y, w, h = cv2.boundingRect(ctr)
+        # A box less than half as wide as it is tall is taken to be the digit "1".
+        if w < h / 2:
+            sid_no = sid_no + "1"
+            continue
+        # Cut out the candidate, invert it and scale it to 32x32 for the classifier.
+        roi = image[y:y+h, x:x+w]
+        roi = img_as_ubyte(roi < 128)
+        roi = cv2.resize(roi, (32, 32))
+
+        # Debug dump of the digit image handed to the classifier.
+        cv2.imwrite('sid_no_{}.png'.format(i), roi)
+        sid_no = sid_no + str(classifier.predict(roi.reshape(1, -1) / 255.0)[0])
+    print(sid_no)
-    return image
+    # Return the recognised digits: Paper.get_enhanced_sid() returns this value as the SID.
+    return sid_no

--
Gitblit v1.9.3