# Cpp4App_test / CDM / detect_text / text_detection.py
# Author: HaochenGong
# Change: time cost count (commit 1c42b13)
import CDM.detect_text.ocr as ocr
from CDM.detect_text.Text import Text
import numpy as np
import cv2
import json
import time
import os
from os.path import join as pjoin
# from paddleocr import PaddleOCR
import pytesseract
# paddle_model = PaddleOCR(use_angle_cls=True, lang="en") #'ch' for chinese and english, 'en' for english
def save_detection_json(file_path, texts, img_shape):
    '''
    Write the detected texts and the source image shape to a JSON file.

    :param file_path: destination path of the JSON output
    :param texts: list of Text objects (read: id, content, location, width, height)
    :param img_shape: shape of the source image, stored alongside the texts
    '''
    output = {'img_shape': img_shape, 'texts': []}
    for text in texts:
        loc = text.location
        # flatten the location dict into the column/row naming used downstream
        output['texts'].append({
            'id': text.id,
            'content': text.content,
            'column_min': loc['left'],
            'row_min': loc['top'],
            'column_max': loc['right'],
            'row_max': loc['bottom'],
            'width': text.width,
            'height': text.height,
        })
    # context manager guarantees the file handle is closed (original leaked it)
    with open(file_path, 'w') as f_out:
        json.dump(output, f_out, indent=4)
def visualize_texts(org_img, texts, shown_resize_height=None, show=False, write_path=None):
    '''
    Draw the bounding boxes of all texts on a copy of the image.
    Optionally display the (resized) result on screen and/or save the
    full-resolution drawing to write_path.
    '''
    canvas = org_img.copy()
    for t in texts:
        t.visualize_element(canvas, line=2)
    # resize only affects the on-screen preview; the saved file keeps full size
    preview = canvas
    if shown_resize_height is not None:
        aspect = canvas.shape[1] / canvas.shape[0]
        preview = cv2.resize(canvas, (int(shown_resize_height * aspect), shown_resize_height))
    if show:
        cv2.imshow('texts', preview)
        cv2.waitKey(0)
        cv2.destroyWindow('texts')
    if write_path is not None:
        cv2.imwrite(write_path, canvas)
def text_sentences_recognition(texts):
    '''
    Merge separate words detected by Google ocr into a sentence
    '''
    merged_any = True
    # keep passing over the list until no word can be merged into a sentence
    while merged_any:
        merged_any = False
        result = []
        for word in texts:
            target = None
            for sentence in result:
                bias_justify = 0.2 * min(word.height, sentence.height)
                bias_gap = 2 * max(word.word_width, sentence.word_width)
                if word.is_on_same_line(sentence, 'h', bias_justify=bias_justify, bias_gap=bias_gap):
                    target = sentence
                    break
            if target is None:
                result.append(word)
            else:
                target.merge_text(word)
                merged_any = True
        texts = result.copy()
    # re-number ids after merging
    for idx, t in enumerate(texts):
        t.id = idx
    return texts
def merge_intersected_texts(texts):
    '''
    Merge intersected texts (sentences or words)
    '''
    stable = False
    # repeat until a full pass makes no merge
    while not stable:
        stable = True
        kept = []
        for candidate in texts:
            absorbed = False
            for existing in kept:
                if candidate.is_intersected(existing, bias=2):
                    existing.merge_text(candidate)
                    absorbed = True
                    stable = False
                    break
            if not absorbed:
                kept.append(candidate)
        texts = kept.copy()
    return texts
def text_cvt_orc_format(ocr_result):
    '''
    Convert a Google OCR response into a list of Text objects.
    Entries whose bounding polygon is missing an 'x' or 'y' coordinate
    are skipped; a None result yields an empty list.
    '''
    texts = []
    if ocr_result is None:
        return texts
    for idx, entry in enumerate(ocr_result):
        vertices = entry['boundingPoly']['vertices']
        # skip malformed polygons rather than crash on a missing coordinate
        if any('x' not in v or 'y' not in v for v in vertices):
            continue
        xs = [v['x'] for v in vertices]
        ys = [v['y'] for v in vertices]
        location = {'left': min(xs), 'top': min(ys),
                    'right': max(xs), 'bottom': max(ys)}
        texts.append(Text(idx, entry['description'], location))
    return texts
def text_cvt_orc_format_paddle(paddle_result):
    '''
    Convert Paddle OCR detections into Text objects.
    Each detection is (corner points, (content, confidence)).
    '''
    texts = []
    for idx, detection in enumerate(paddle_result):
        corners = np.array(detection[0])
        xs = corners[:, 0]
        ys = corners[:, 1]
        # axis-aligned bounding box of the (possibly rotated) quad
        location = {'left': int(xs.min()), 'top': int(ys.min()),
                    'right': int(xs.max()), 'bottom': int(ys.max())}
        texts.append(Text(idx, detection[1][0], location))
    return texts
def text_cvt_orc_format_tesseract(tesseract_result):
    '''
    Convert a pytesseract image_to_data dict into Text objects,
    keeping only line-level entries (level 4) with non-empty text.
    '''
    texts = []
    next_id = 0
    for idx in range(len(tesseract_result['level'])):
        if tesseract_result['level'][idx] != 4:
            continue
        content = tesseract_result['text'][idx].strip()
        if not content:
            continue
        left = int(tesseract_result['left'][idx])
        top = int(tesseract_result['top'][idx])
        location = {
            'left': left,
            'top': top,
            'right': left + int(tesseract_result['width'][idx]),
            'bottom': top + int(tesseract_result['height'][idx]),
        }
        texts.append(Text(next_id, content, location))
        next_id += 1
    return texts
def text_cvt_orc_format_tesseract_by_line(data):
    '''
    Group word-level pytesseract `image_to_data` results into lines of text.

    :param data: dict from pytesseract.image_to_data(..., output_type=Output.DICT)
    :return: list of Text objects, one per detected line
    '''
    texts = []
    text_id = 0
    current_key = None       # (block_num, par_num, line_num) of the line being built
    line_words = []
    line_box = [0, 0, 0, 0]  # left, top, right, bottom

    def _flush():
        # Emit the accumulated words as one Text line.
        nonlocal text_id
        content = ' '.join(line_words)
        location = {'left': line_box[0], 'top': line_box[1],
                    'right': line_box[2], 'bottom': line_box[3]}
        texts.append(Text(text_id, content, location))
        text_id += 1

    for i in range(len(data['level'])):
        if data['level'][i] != 5:  # 5 = word level in tesseract's hierarchy
            continue
        # BUGFIX: line_num restarts within each paragraph/block, so comparing
        # line_num alone merged lines across blocks; key on the full hierarchy
        key = (data['block_num'][i], data['par_num'][i], data['line_num'][i])
        if key != current_key:
            if current_key is not None:
                _flush()
            # start a new line with this word
            current_key = key
            line_words = [data['text'][i]]
            line_box = [
                data['left'][i],
                data['top'][i],
                data['left'][i] + data['width'][i],
                data['top'][i] + data['height'][i],
            ]
        else:
            # extend the current line rightward/downward with this word
            line_words.append(data['text'][i])
            line_box[2] = max(line_box[2], data['left'][i] + data['width'][i])
            line_box[3] = max(line_box[3], data['top'][i] + data['height'][i])
    # flush the last pending line, if any word was seen
    if line_words:
        _flush()
    return texts
def text_filter_noise(texts):
    '''
    Drop single-character (or empty) detections unless they are meaningful:
    a standalone 'a'/'A' or common punctuation and symbols.
    '''
    keep_single = {'a', ',', '.', '!', '?', '$', '%', ':', '&', '+'}
    return [t for t in texts
            if len(t.content) > 1 or t.content.lower() in keep_single]
def text_detection(input_file='../data/input/30800.jpg', output_file='../data/output', show=False, method='google', paddle_model=None):
    '''
    Run OCR on an image and save visualization + JSON results under <output_file>/ocr.

    :param input_file: path of the image to analyse
    :param output_file: root output directory
    :param show: display the visualization window if True
    :param method: 'google', 'paddle' or 'pytesseract'
    :param paddle_model: the preload paddle model for paddle ocr
    :return: OCR time cost in seconds
    :raises FileNotFoundError: if the input image cannot be read
    :raises ValueError: if method is not a supported backend
    '''
    start = time.time()
    # robust basename extraction instead of slicing off a 3-char extension
    name = os.path.splitext(os.path.basename(input_file))[0]
    ocr_root = pjoin(output_file, 'ocr')
    img = cv2.imread(input_file)
    if img is None:
        # fail fast: the original only printed and then crashed later on None
        raise FileNotFoundError('cannot read image: %s' % input_file)
    if method == 'google':
        print('*** Detect Text through Google OCR ***')
        ocr_result = ocr.ocr_detection_google(input_file)
        texts = text_cvt_orc_format(ocr_result)
        texts = merge_intersected_texts(texts)
        texts = text_filter_noise(texts)
        texts = text_sentences_recognition(texts)
        # google's reported cost includes the post-processing above
        ocr_time_cost = time.time() - start
    elif method == 'paddle':
        print('*** Detect Text through Paddle OCR ***')
        result = paddle_model.ocr(input_file, cls=True)
        # paddle/pytesseract report only the raw OCR call time
        ocr_time_cost = time.time() - start
        texts = text_cvt_orc_format_paddle(result)
    elif method == 'pytesseract':
        # tesseract expects RGB; cv2 loads BGR
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        result = pytesseract.image_to_data(img_rgb, output_type=pytesseract.Output.DICT)
        ocr_time_cost = time.time() - start
        texts = text_cvt_orc_format_tesseract_by_line(result)
    else:
        raise ValueError('Method has to be "google" or "paddle" or "pytesseract"')
    # make sure the output directory exists before writing results
    os.makedirs(ocr_root, exist_ok=True)
    visualize_texts(img, texts, shown_resize_height=800, show=show, write_path=pjoin(ocr_root, name+'.png'))
    save_detection_json(pjoin(ocr_root, name+'.json'), texts, img.shape)
    print("[Text Detection Completed in %.3f s] Input: %s Output: %s" % (ocr_time_cost, input_file, pjoin(ocr_root, name+'.json')))
    return ocr_time_cost
# text_detection()