havoc/sizif-ocr.git

			@@ -6,13 +6,14 @@


			class Paper:
			def __init__(self, filename=None, sid_classifier=None):
			def __init__(self, filename=None, sid_classifier=None, settings=None):
			self.filename = filename
			self.invalid = None
			self.QRData = None
			self.settings = settings
			self.errors = []
			self.warnings = []
			self.sid_classifier=sid_classifier
			self.sid_classifier = sid_classifier
			if filename is not None:
			self.loadImage(filename)
			self.runOcr()
			@@ -216,11 +217,14 @@
			def get_enhanced_sid(self):
			    """Run the SID classifier on the fixed SID region of the page image.

			    The region spans rows 4.5%-8.5% of the image height and columns
			    70%-99% of the image width.

			    Returns:
			        ``"x"`` when no ``sid_classifier`` was supplied; otherwise the
			        value returned by ``getSID`` for the cropped region.
			    """
			    if self.sid_classifier is None:
			        return "x"

			    # ``settings`` is optional. ``sid_mask`` used to be bound only when
			    # settings were present, so the ``getSID`` call below failed with an
			    # UnboundLocalError for the default ``settings=None``.
			    sid_mask = None
			    if self.settings is not None:
			        sid_mask = self.settings.get("sid_mask", None)

			    top = int(0.045 * self.imgHeight)
			    bottom = int(0.085 * self.imgHeight)
			    left = int(0.7 * self.imgWidth)
			    right = int(0.99 * self.imgWidth)
			    return getSID(
			        self.img[top:bottom, left:right],
			        self.sid_classifier,
			        sid_mask,
			    )

			@@ -1,9 +1,8 @@
			import cv2
			import numpy as np
			from skimage import morphology,img_as_ubyte
			from skimage import morphology, img_as_ubyte
			from sklearn import svm
			from sklearn.externals import joblib



			"""
			@@ -61,48 +60,59 @@
			return np.ones((x, y), np.uint8)


			def getSID(image, classifier):
			image=255-image
			image=img_as_ubyte(image>100)
			def segment_by_contours(image, sorted_ctrs, classifier, debug_pattern="sid_no_{}.png"):
			    """Classify one digit per contour and return the digits as a string.

			    Args:
			        image: Image the contours were extracted from; each contour's
			            bounding box is cropped out of it.
			        sorted_ctrs: Contours in the order the digits should be read.
			        classifier: Object with a ``predict`` method; it receives the
			            flattened 32x32 crop scaled by 1/255 as a single-row array.
			        debug_pattern: Filename pattern (formatted with the contour
			            index) under which each classified crop is written. Pass
			            ``None`` to skip writing debug images. The default keeps
			            the previous hard-coded behaviour.

			    Returns:
			        The concatenated digit string; ``""`` when there are no contours.
			    """
			    digits = []
			    for i, ctr in enumerate(sorted_ctrs):
			        x, y, w, h = cv2.boundingRect(ctr)
			        # A box less than half as wide as it is tall is taken to be a
			        # "1" without consulting the classifier.
			        if w < h / 2:
			            digits.append("1")
			            continue
			        roi = image[y : y + h, x : x + w]
			        roi = img_as_ubyte(roi < 128)
			        roi = cv2.resize(roi, (32, 32))
			        if debug_pattern is not None:
			            cv2.imwrite(debug_pattern.format(i), roi)
			        prediction = classifier.predict(roi.reshape(1, -1) / 255.0)[0]
			        digits.append(str(prediction))
			    return "".join(digits)


			def getSID(image, classifier, sid_mask=None):
			    """Extract the SID string from a cropped image of the SID field.

			    The crop is inverted, thresholded and cleaned with a sequence of
			    morphological operations; the outer contours of the result are
			    sorted by the x coordinate of their bounding boxes and handed to
			    ``segment_by_contours``.

			    Two intermediate images are written to ``enSID0.png`` and
			    ``enSID1.png`` as debugging aids.

			    Args:
			        image: Cropped SID region (presumably 8-bit grayscale, given
			            the ``255 - image`` inversion -- TODO confirm with caller).
			        classifier: Digit classifier forwarded to
			            ``segment_by_contours``.
			        sid_mask: Optional mask string. Only its length is used here:
			            contour segmentation is trusted only when the number of
			            contours equals ``len(sid_mask)``. With ``None`` no length
			            check is possible and the contours are classified as found.

			    Returns:
			        The recognised SID string, or ``""`` when the contour count does
			        not match the mask length.
			    """
			    image = 255 - image
			    image = img_as_ubyte(image > 100)
			    cv2.imwrite("enSID0.png", image)
			    # Remove noise
			    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(2, 2), iterations=1)
			    # Closing: connect parts that are not connected
			    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(5, 3), iterations=4)
			    # Remove noise again after closing
			    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(8, 8), iterations=1)
			    # Thin the strokes
			    image = img_as_ubyte(morphology.thin(image > 128))
			    cv2.imwrite("enSID1.png", image)
			    # Make the lines stronger
			    image = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(5, 5), iterations=1)
			    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(10, 10))
			    # Thin again, then thicken for contour detection
			    image = img_as_ubyte(morphology.skeletonize(image > 0.5))
			    image = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(10, 10))

			    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
			    # (contours, hierarchy); the contours are second-to-last in both.
			    ctrs = cv2.findContours(
			        image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
			    )[-2]
			    sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])

			    sid_no = ""
			    if sid_mask is None or len(sid_mask) == len(sorted_ctrs):
			        sid_no = segment_by_contours(image, sorted_ctrs, classifier)
			    else:
			        print("Ooops have to find another way")
			    print(sid_no)
			    # Return the SID string; a leftover ``return image`` used to come
			    # first and made the string result unreachable.
			    return sid_no

	Ocr.py	8 ●●●●● patch \| view \| raw \| blame \| history
	aoiOcr.py	4 ●●●●● patch \| view \| raw \| blame \| history
	sid_process.py	68 ●●●●● patch \| view \| raw \| blame \| history

			@@ -1,10 +1,12 @@
			from Ocr import Paper
			from sklearn.externals import joblib


			# Settings forwarded to Paper; "sid_mask" is read by Paper.get_enhanced_sid.
			settings = {"sid_mask": "11xx0xxx"}
			classifier = joblib.load("filename.joblib")

			# Alternative inputs kept for manual testing:
			# p = Paper(filename="testpage300dpi_scan1.png")
			# p = Paper(filename="processed_scans/20141016095134535_0028.tif")

			# Construct the paper once. Building it first without settings ran the
			# whole load + OCR pass only to discard the result immediately.
			p = Paper(filename="sizif111.tif", sid_classifier=classifier, settings=settings)

			print(p.QRData)