Development of the ocr part of AOI
Samo Penic
2018-11-16 762a5e258a90387922d6c6eb3ecc9a7ca7c96144
refactored and debugged.
3 files modified
80 ■■■■■ changed files
Ocr.py 8 ●●●● patch | view | raw | blame | history
aoiOcr.py 4 ●●● patch | view | raw | blame | history
sid_process.py 68 ●●●●● patch | view | raw | blame | history
Ocr.py
@@ -6,13 +6,14 @@
class Paper:
    def __init__(self, filename=None, sid_classifier=None):
    def __init__(self, filename=None, sid_classifier=None, settings=None):
        self.filename = filename
        self.invalid = None
        self.QRData = None
        self.settings = settings
        self.errors = []
        self.warnings = []
        self.sid_classifier=sid_classifier
        self.sid_classifier = sid_classifier
        if filename is not None:
            self.loadImage(filename)
            self.runOcr()
@@ -216,11 +217,14 @@
    def get_enhanced_sid(self):
        if self.sid_classifier is None:
            return "x"
        if self.settings is not None:
            sid_mask=self.settings.get("sid_mask", None)
        es = getSID(
            self.img[
                int(0.045 * self.imgHeight) : int(0.085 * self.imgHeight),
                int(0.7 * self.imgWidth) : int(0.99 * self.imgWidth),
            ],
            self.sid_classifier,
            sid_mask
        )
        return es
aoiOcr.py
@@ -1,10 +1,12 @@
from Ocr import Paper
from sklearn.externals import joblib
settings={'sid_mask':'11xx0xxx',}
classifier = joblib.load('filename.joblib')
#p=Paper(filename='testpage300dpi_scan1.png')
p=Paper(filename='sizif111.tif', sid_classifier=classifier)
p=Paper(filename='sizif111.tif', sid_classifier=classifier, settings=settings)
#p=Paper(filename='processed_scans/20141016095134535_0028.tif')
print(p.QRData)
sid_process.py
@@ -1,9 +1,8 @@
import cv2
import numpy as np
from skimage import morphology,img_as_ubyte
from skimage import morphology, img_as_ubyte
from sklearn import svm
from sklearn.externals import joblib
"""
@@ -61,48 +60,59 @@
    return np.ones((x, y), np.uint8)
def getSID(image, classifier):
    image=255-image
    image=img_as_ubyte(image>100)
def segment_by_contours(image, sorted_ctrs, classifier):
    """Turn a sequence of contours into a string of recognised digits.

    Contours are processed in the order given. A contour whose bounding box
    is less than half as wide as it is tall is taken to be the digit "1"
    without consulting the classifier. Every other contour is cut out of
    ``image``, thresholded (pixels below 128 become foreground), resized to
    32x32, written to ``sid_no_<index>.png`` and passed to
    ``classifier.predict`` as a single flattened row scaled by 1/255.

    Returns the concatenation of the per-contour results; an empty string
    when ``sorted_ctrs`` is empty.
    """
    digits = []
    for index, contour in enumerate(sorted_ctrs):
        left, top, width, height = cv2.boundingRect(contour)

        # Narrow, tall boxes are assumed to be a "1"; skip classification.
        if width < height / 2:
            digits.append("1")
            continue

        patch = image[top : top + height, left : left + width]
        patch = cv2.resize(img_as_ubyte(patch < 128), (32, 32))
        cv2.imwrite("sid_no_{}.png".format(index), patch)

        features = patch.reshape(1, -1) / 255.0
        digits.append(str(classifier.predict(features)[0]))
    return "".join(digits)
def getSID(image, classifier, sid_mask):
    image = 255 - image
    image = img_as_ubyte(image > 100)
    cv2.imwrite("enSID0.png", image)
    # Remove noise
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(2,2), iterations=1)
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(2, 2), iterations=1)
    # Closing. Connect non connected parts
    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(5, 3), iterations=4)
    # Again noise removal after closing
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(8,8), iterations=1)
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(8, 8), iterations=1)
    # Skeletonization
    image = img_as_ubyte(morphology.thin(image>128))
    cv2.imwrite("enSID1.png",image)
    image = img_as_ubyte(morphology.thin(image > 128))
    cv2.imwrite("enSID1.png", image)
    # Stub removal (might not be necessary if thinning instead of skeletonize is used above)
    # Making lines stronger
    image = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(5, 5), iterations=1)
    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(10, 10))
    # Thinning again
    image = img_as_ubyte(morphology.skeletonize(image>0.5))
    image = img_as_ubyte(morphology.skeletonize(image > 0.5))
    image = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(10, 10))
    im2,ctrs, hier = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    im2, ctrs, hier = cv2.findContours(
        image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])
    #classifier = joblib.load('filename.joblib')
    sid_no=""
    for i, ctr in enumerate(sorted_ctrs):
        # Get bounding box
        x, y, w, h = cv2.boundingRect(ctr)
        # Getting ROI
        if(w<h/2):
            sid_no=sid_no+"1"
            continue
        roi = image[y:y+h, x:x+w]
        roi = img_as_ubyte(roi < 128)
        roi = cv2.resize(roi,(32,32))
        #cv2.rectangle(image,(x,y),( x + w, y + h ),(0,255,0),2)
        cv2.imwrite('sid_no_{}.png'.format(i), roi)
        sid_no=sid_no+str(classifier.predict(roi.reshape(1,-1)/255.0)[0])
    sid_no = ""
    sid_len = 0
    if sid_mask is not None:
        if len(sid_mask)==len(sorted_ctrs):
            sid_no=segment_by_contours(image,sorted_ctrs,classifier)
        else:
            print("Ooops have to find another way")
    print(sid_no)
    return image
    return sid_no