Spaces:

genelify
/

tesseract-ocr-bbox

Sleeping

App Files Files Community

tesseract-ocr-bbox / app.py

genelify

Update app.py

193d94a verified 3 months ago

raw

history blame contribute delete

3.96 kB

	import gradio as gr, pytesseract, cv2, numpy as np, os
	from PIL import Image, ImageDraw

	def process(image_path: str, lang: str) -> list[Image.Image, str]:
	    """OCR an image and return it annotated with word boxes plus the text.

	    Args:
	        image_path: Path of the image to read. The file is deleted once its
	            pixels have been loaded into memory.
	        lang: Language passed to ``pytesseract.image_to_data``.

	    Returns:
	        A two-item list: the RGB image with a yellow rectangle drawn around
	        every kept word, and the recognised text with one row of words per
	        line. When ``image_path`` is empty the list is ``[None, '']`` so the
	        result always has the same two slots.
	    """
	    if not image_path:
	        # Keep the two-slot shape even when nothing was supplied.
	        return [None, '']

	    # Load the pixels and release the file handle before deleting the file.
	    with Image.open(image_path) as source:
	        image_data = source.convert('RGB')
	    os.remove(image_path)

	    # The PIL array is RGB-ordered, so the RGB (not BGR) conversion applies.
	    gray = cv2.cvtColor(np.array(image_data), cv2.COLOR_RGB2GRAY)
	    _, threshold = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)
	    data = pytesseract.image_to_data(threshold, output_type=pytesseract.Output.DICT, lang=lang)

	    words = _drop_overlapping_words(_confident_words(data))

	    # Sort the words by bbox position: top first, then left.
	    words.sort(key=lambda bw: (bw['box'][1], bw['box'][0]))

	    sorted_text = "\n".join(
	        " ".join(bw['word'] for bw in line) for line in _group_into_lines(words)
	    )

	    draw = ImageDraw.Draw(image_data)
	    for bw in words:
	        x, y, w, h = bw['box']
	        draw.rectangle([x, y, x + w, y + h], outline='yellow', width=2)

	    return [image_data, sorted_text.strip()]


	def _confident_words(data, min_conf=50):
	    """Collect non-blank words whose confidence is above *min_conf*."""
	    words = []
	    for text, conf, left, top, width, height in zip(
	        data['text'], data['conf'], data['left'], data['top'], data['width'], data['height']
	    ):
	        # float() tolerates confidences given as int, float or numeric string
	        # (assumed to vary with the pytesseract version; verify if pinned).
	        if text.strip() and float(conf) > min_conf:
	            words.append({'box': (left, top, width, height), 'word': text})
	    return words


	def _drop_overlapping_words(words, iou_threshold=0.5):
	    """Drop the smaller word of each pair whose boxes overlap above the threshold."""
	    to_remove = set()
	    for i, first in enumerate(words):
	        for j in range(i + 1, len(words)):
	            box1, box2 = first['box'], words[j]['box']
	            if calculate_iou(box1, box2) > iou_threshold:
	                # Keep the larger box; on equal areas the later word survives.
	                if box1[2] * box1[3] > box2[2] * box2[3]:
	                    to_remove.add(j)
	                else:
	                    to_remove.add(i)
	    return [bw for i, bw in enumerate(words) if i not in to_remove]


	def _group_into_lines(words):
	    """Split top-to-bottom sorted words into rows, each ordered left to right."""
	    lines = []
	    current_line = []
	    current_top = None
	    for bw in words:
	        _, y, _, h = bw['box']
	        # A word opens a new row when it lies more than its own height below
	        # the top of the word that opened the current row.
	        if not current_line or y > current_top + h:
	            if current_line:
	                lines.append(current_line)
	            current_line = [bw]
	            current_top = y
	        else:
	            current_line.append(bw)
	    if current_line:
	        lines.append(current_line)

	    for line in lines:
	        line.sort(key=lambda bw: bw['box'][0])
	    return lines

	def calculate_iou(box1, box2):
	    """Return the intersection-over-union of two axis-aligned boxes.

	    Args:
	        box1: Box as ``(x, y, w, h)`` - left, top, width, height.
	        box2: Second box in the same format.

	    Returns:
	        Intersection area divided by union area as a float. Returns ``0.0``
	        when the union has no area (e.g. both boxes are degenerate) instead
	        of dividing by zero.
	    """
	    x1, y1, w1, h1 = box1[:4]
	    x2, y2, w2, h2 = box2[:4]

	    # Corners of the overlap rectangle; it is empty when right < left or
	    # bottom < top, which the max(0, ...) clamps below turn into zero area.
	    left = max(x1, x2)
	    top = max(y1, y2)
	    right = min(x1 + w1, x2 + w2)
	    bottom = min(y1 + h1, y2 + h2)

	    intersection_area = max(0, right - left) * max(0, bottom - top)
	    union_area = w1 * h1 + w2 * h2 - intersection_area
	    if union_area <= 0:
	        return 0.0
	    return intersection_area / float(union_area)

	# Language codes reported by pytesseract populate the language dropdown.
	languages = pytesseract.get_languages()

	# Two inputs (image path, language) feed process(); its two return values
	# fill the annotated-image and text outputs.
	iface = gr.Interface(
	    fn=process,
	    inputs=[
	        gr.Image(label='Upload Image', type='filepath'),
	        gr.Dropdown(label="Select Language", choices=languages, type="value"),
	    ],
	    outputs=[
	        gr.Image(type='filepath', label="Image with Bounding Boxes"),
	        gr.Textbox(label='Output Text'),
	    ],
	    css="footer {visibility: hidden}",
	    # Plain pipe: "\|" is an invalid escape and would show a stray backslash.
	    title="OCR | PyTesseract with bbox",
	    article="""<p style='text-align: center;'>Hello, thanks for coming, visit: <a href="https://www.genelify.com" target="_blank">Genelify</a>, <a href="https://www.tubtic.com" target="_blank">Tubtic</a></p>""",
	)
	iface.launch(show_api=False, inline=False)