havoc/sizif-ocr.git

Samo Penic

2019-01-24 8dad6520729d26d90ebee48939d7f2a1fd73dd38

commit \| author \| age
9efc18	1	import cv2
SP	2	import numpy as np
762a5e	3	from skimage import morphology, img_as_ubyte
02e0f7	4
0d97e9	5	import pkg_resources
SP	6
# Resource name of the template image, relative to this module's package.
templatefile = "/template-8.png"  # always use slash
# Filesystem path of the template image as resolved by pkg_resources; it is
# loaded with cv2.imread() in segment_by_7segments().
template8 = pkg_resources.resource_filename(__name__, templatefile)
SP	9
9efc18	10
def kernel(x, y):
    """
    Build a rectangular structuring element filled with ones.

    :param x: size of the first axis of the kernel
    :param y: size of the second axis of the kernel
    :return: ``uint8`` numpy array of shape ``(x, y)`` containing only ones
    """
    shape = (x, y)
    return np.ones(shape=shape, dtype=np.uint8)
SP	16
6fde5f	17
def find_biggest_blob(image, original_image, sid_mask):
    """
    Crop ``image`` to the bounding box of its largest blob.

    The image is dilated with ``kernel(5, 25)`` and thresholded, the external
    contour with the largest area is selected, and its bounding box (adjusted
    by fixed horizontal margins) is cut out of ``image``.

    :param image: single-channel image to search and crop
    :param original_image: not used by the cropping; kept so that callers do
        not have to change
    :param sid_mask: student id mask; its first character selects how far the
        left crop edge is moved to the left
    :return: cropped region of ``image``
    :raises IndexError: if no contour is found in the dilated image
    """
    first = sid_mask[0]
    if first == "1":
        move_left = 45
    elif first == "x":
        move_left = 50
    else:
        move_left = 0

    # find biggest block of pixels
    image1 = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(5, 25), iterations=3)
    image1 = img_as_ubyte(image1 > 50)
    cv2.imwrite("/tmp/sidblock1.png", image1)

    # OpenCV 3 returns (image, contours, hierarchy) while OpenCV 2 and 4 return
    # (contours, hierarchy); the contours are the second-to-last item in both.
    ctrs = cv2.findContours(
        image1.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]
    sorted_ctrs = sorted(ctrs, key=cv2.contourArea)  # biggest contour is last
    x, y, w, h = cv2.boundingRect(sorted_ctrs[-1])

    # Clamp the left edge: a negative start index would make numpy count from
    # the right border of the image instead of starting at the left border.
    left = max(0, x + 25 - move_left)
    image = image[y : y + h, left : x + w - 30]  # +25,-25
    return image
SP	42
5460bf	43
def sid_compare(sid_no, sid_mask):
    """
    Check whether a recognised student id number is valid according to the mask.

    A mask position holding ``"x"`` accepts any character; every other mask
    position must equal the character at the same position of ``sid_no``.
    A number whose length differs from the mask length never matches.

    :param sid_no: recognised student id number
    :param sid_mask: student id mask
    :return: True if they match, else False
    """
    # zip() stops at the shorter argument, so without this guard an empty or
    # truncated number would be accepted.
    if len(sid_no) != len(sid_mask):
        return False
    return all(m == "x" or m == d for m, d in zip(sid_mask, sid_no))
	55
	56
def segment_by_contours(image, original_image, classifier, sid_mask):
    """
    First algorithm. It segments numerals by their contours, so it works with
    numbers whose individual numerals do not touch each other.

    :param image: preprocessed single-channel image
    :param original_image: passed through to find_biggest_blob()
    :param classifier: object whose ``predict`` method is called with one
        flattened 32x32 sample scaled by 1/255
    :param sid_mask: student id mask, passed through to find_biggest_blob()
    :return: student id as a string
    """
    digits = []
    image = find_biggest_blob(image, original_image, sid_mask)
    cv2.imwrite("/tmp/sid_contour1.png", image)

    # OpenCV 3 returns (image, contours, hierarchy) while OpenCV 2 and 4 return
    # (contours, hierarchy); the contours are the second-to-last item in both.
    ctrs = cv2.findContours(
        image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]
    # Process the contours in order of the x coordinate of their bounding box.
    sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])

    for i, ctr in enumerate(sorted_ctrs):
        # Get bounding box
        x, y, w, h = cv2.boundingRect(ctr)
        # A box narrower than a third of its height is taken to be a "1"
        # without asking the classifier.
        if w < h / 3:
            digits.append("1")
            continue
        # Getting ROI
        roi = image[y : y + h, x : x + w]
        roi = img_as_ubyte(roi < 128)
        roi = cv2.resize(roi, (32, 32))

        cv2.imwrite("/tmp/sid_no_{}.png".format(i), roi)
        sid_currno = str(classifier.predict(roi.reshape(1, -1) / 255.0)[0])
        digits.append(sid_currno)
        # Additionally store the sample under a directory named after the
        # predicted label.
        fname = "/tmp/SID/" + str(sid_currno) + "/sid" + str(np.random.randint(0, 1000000)) + ".png"
        print("Writing all the sid images ", fname)
        cv2.imwrite(fname, roi)
    return "".join(digits)
SP	93
	94
def segment_by_sid_len(image, original_image, sid_mask, classifier):
    """
    Third algorithm. It takes the biggest "blob" of the image and cuts it by
    force into ``len(sid_mask)`` slices of equal width, classifying each slice.
    It has some problems with finding individual numbers, so some tweaking
    must be done!

    :param image: preprocessed single-channel image
    :param original_image: passed through to find_biggest_blob()
    :param sid_mask: student id mask; its length gives the number of slices
    :param classifier: object whose ``predict`` method is called with one
        flattened 32x32 sample scaled by 1/255
    :return: student id as a string
    """
    digit_count = len(sid_mask)
    blob = find_biggest_blob(image, original_image, sid_mask)
    cv2.imwrite("/tmp/sidblock2.png", blob)

    blob_width = blob.shape[1]
    slice_width = int(blob_width / digit_count)

    recognised = []
    for idx in range(digit_count):
        left = idx * slice_width
        cell = blob[:, left : left + slice_width]
        cell = img_as_ubyte(cell < 128)
        cell = cv2.resize(cell, (32, 32))
        cv2.imwrite("/tmp/sid_no_{}.png".format(idx), cell)
        prediction = classifier.predict(cell.reshape(1, -1) / 255.0)[0]
        recognised.append(str(prediction))
    return "".join(recognised)
	119
6fde5f	120
def segment_by_7segments(image, original_image, sid_mask, classifier):
    """
    Second attempt. It closes the original image so that all 7 segments become
    visible as 8888888, then it matches the result against the template image
    of an 8. It works if the scanned gray level is high enough.

    :param image: preprocessed single-channel image the numerals are cut from
    :param original_image: image that is closed, thresholded and matched
        against the template
    :param sid_mask: not used by this algorithm
    :param classifier: object whose ``predict`` method is called with one
        flattened 32x32 sample scaled by 1/255
    :return: student id number as a string; an empty string if the template is
        not found anywhere or a matched region cannot be resized
    """
    block_image = cv2.morphologyEx(
        original_image, cv2.MORPH_CLOSE, kernel(2, 2), iterations=10
    )
    block_image = img_as_ubyte(block_image < 50)
    cv2.imwrite("/tmp/sid_3rd1.png", block_image)

    template = cv2.imread(template8, 0)
    w, h = template.shape[::-1]
    res = cv2.matchTemplate(block_image, template, cv2.TM_CCOEFF_NORMED)
    loc = np.where(res >= 0.75)
    cimg = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

    # np.where yields (rows, cols); reversing gives (x, y) pairs. Every match
    # is shifted by 10 pixels up and to the left, then sorted by x (then y).
    matches = sorted((px - 10, py - 10) for px, py in zip(*loc[::-1]))
    if not matches:
        return ""

    # filter points: neighbouring matches whose x differs by at most half the
    # template width form one group; only the last match of each group is kept.
    xs = np.array([m[0] for m in matches])
    ys = np.array([m[1] for m in matches])
    keep = np.append(np.diff(xs) > int(w / 2), True)
    # plain Python ints for the drawing call and the slicing below
    corners = [(int(px), int(py)) for px, py in zip(xs[keep], ys[keep])]

    for px, py in corners:
        cv2.rectangle(cimg, (px, py), (px + w, py + h), (0, 255, 255), 2)
    cv2.imwrite("/tmp/sid_3rd2.png", cimg)

    digits = []
    for i, (px, py) in enumerate(corners):
        num = image[py : py + h, px : px + w]
        num = img_as_ubyte(num < 128)
        try:
            num = cv2.resize(num, (32, 32))
        except cv2.error:
            # The crop could not be resized (e.g. it is empty); give up so the
            # caller can fall back to another algorithm.
            return ""
        cv2.imwrite("/tmp/sid_3no_{}.png".format(i), num)
        digits.append(str(classifier.predict(num.reshape(1, -1) / 255.0)[0]))

    return "".join(digits)
ac766e	175
6fde5f	176
def getSID(image, classifier, sid_mask):
    """
    Tries different approaches on image to get the student id number. First it
    inverts and thresholds the image and removes noise, then it thins the
    numerals and thickens them again to get a normalized image. That image is
    sent to the segmentation and recognition functions.

    Tweak the MORPH_OPEN / MORPH_CLOSE lines if recognition is poor.

    :param image: single-channel image of the student id field
    :param classifier: classifier passed on to the segmentation functions
    :param sid_mask: student id mask the recognised number is validated against
    :return: (student_id, error, warning) student id as a string, list of
        errors and list of warnings during the recognition
    """
    sid_warn = []
    sid_err = []
    image = 255 - image
    image_original = image.copy()
    image = img_as_ubyte(image > 70)
    cv2.imwrite("/tmp/enSID0.png", image)

    # Remove noise
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel(3, 3), iterations=2)

    # Closing. Connect non connected parts
    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(5, 1), iterations=4)

    # No further MORPH_OPEN noise removal is done after the closing:
    # don't do too much noise removal.

    # Thinning (morphology.thin is used here rather than skeletonize)
    image = img_as_ubyte(morphology.thin(image > 128))
    cv2.imwrite("/tmp/enSID1.png", image)

    # Making lines stronger
    image = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(5, 2), iterations=1)
    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel(10, 10))

    # Thinning again, then thickening to the final stroke width
    image = img_as_ubyte(morphology.skeletonize(image > 0.5))
    image = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel(10, 10))
    cv2.imwrite("/tmp/enhancedSID.png", image)

    sid_no = segment_by_contours(image, image_original, classifier, sid_mask)

    if len(sid_no) != len(sid_mask) or not sid_compare(sid_no, sid_mask):
        sid_warn.append("Trying second SID algorithm.")
        sid_no = segment_by_7segments(image, image_original, sid_mask, classifier)

    if len(sid_no) != len(sid_mask):
        sid_warn.append("Trying third SID algorithm.")
        sid_no = segment_by_sid_len(image, image_original, sid_mask, classifier)

    # Check the length explicitly as well, so that a result of the wrong
    # length is always reported as an error.
    if len(sid_no) != len(sid_mask) or not sid_compare(sid_no, sid_mask):
        sid_err = ["Wrong SID!"]
    cv2.imwrite("/tmp/SID_" + sid_no + ".png", image)
    return sid_no, sid_err, sid_warn