Spaces:

MLBench
/

OCR_Term_Exctraction

Sleeping

App Files Files Community

OCR_Term_Exctraction / app.py

ArslanRobo

Upload 2 files

fdb32ca verified about 1 year ago

raw

history blame contribute delete

7 kB

	# import os
	# import cv2
	# import re
	# import numpy as np
	# from PIL import Image, ImageDraw, ImageFont
	# from paddleocr import PaddleOCR
	# from pdf2image import convert_from_path
	# import gradio as gr

	# # Specify the path to the Poppler bin directory
	# poppler_path = r"C:\\poppler\\poppler-24.08.0\\Library\\bin"

	# # Function to check proximity of bounding boxes
	# def are_boxes_close(box1, box2, y_threshold=50):
	# y1_center = (box1[0][1] + box1[2][1]) / 2
	# y2_center = (box2[0][1] + box2[2][1]) / 2
	# return abs(y1_center - y2_center) <= y_threshold

	# # Function to extract terms with specific rules
	# def extract_specific_terms(ocr_results):
	# extracted_terms = []

	# for line in ocr_results[0]:
	# detected_text = line[1][0] # Extracted text
	# box = line[0] # Bounding box of the detected text

	# if re.match(r"Bill of Lading:\s*\d+", detected_text):
	# extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

	# elif re.match(r"Page:\s*\w+", detected_text):
	# extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

	# elif detected_text in ["Shipper", "Receiver", "Carrier"]:
	# extracted_terms.append({'detected_text': detected_text + " Signature", 'bounding_box': box})

	# elif detected_text == "Signature":
	# extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

	# return extracted_terms

	# # Function to annotate image with detected terms
	# def annotate_image_with_terms(image, terms):
	# pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
	# draw = ImageDraw.Draw(pil_image)

	# font_size = 40
	# try:
	# font = ImageFont.truetype("arial.ttf", font_size)
	# except IOError:
	# font = ImageFont.load_default()

	# for term in terms:
	# box = term['bounding_box']
	# detected_text = term['detected_text']

	# points = [(int(x[0]), int(x[1])) for x in box]
	# draw.polygon(points, outline="blue", width=2)
	# position = (points[0][0], points[0][1] - font_size - 5)
	# draw.text(position, detected_text, fill="red", font=font)

	# return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

	# # Main processing function
	# def process_file(file):
	# ocr = PaddleOCR(lang='en')
	# extracted_terms = []

	# if file.name.endswith(".pdf"):
	# images = convert_from_path(file.name, poppler_path=poppler_path)
	# processed_images = []
	# for image in images:
	# image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
	# ocr_results = ocr.ocr(image_np, cls=True)
	# extracted_terms = extract_specific_terms(ocr_results)
	# annotated_image = annotate_image_with_terms(image_np, extracted_terms)
	# processed_images.append(annotated_image)

	# return [Image.fromarray(img) for img in processed_images]

	# else:
	# image = cv2.imread(file.name)
	# ocr_results = ocr.ocr(image, cls=True)
	# extracted_terms = extract_specific_terms(ocr_results)
	# annotated_image = annotate_image_with_terms(image, extracted_terms)
	# return Image.fromarray(annotated_image)

	# # Gradio Interface
	# def gradio_interface(file):
	# result = process_file(file)
	# if isinstance(result, list):
	# return result[0] # Display only the first page
	# return result

	# iface = gr.Interface(
	# fn=gradio_interface,
	# inputs=gr.File(label="Upload an Image or PDF", file_types=[".pdf", ".png", ".jpg", ".jpeg"]),
	# outputs="image",
	# live=True,
	# title="OCR Term Extraction",
	# description="Upload an image or PDF containing text to detect and annotate terms such as 'Bill of Lading', 'Page', and signatures.",
	# allow_flagging="never"
	# )
	# iface.launch()



	import os
	import cv2
	import re
	import numpy as np
	from PIL import Image, ImageDraw, ImageFont
	from paddleocr import PaddleOCR
	import gradio as gr

	# Function to check proximity of bounding boxes
	def are_boxes_close(box1, box2, y_threshold=50):
	y1_center = (box1[0][1] + box1[2][1]) / 2
	y2_center = (box2[0][1] + box2[2][1]) / 2
	return abs(y1_center - y2_center) <= y_threshold

	# Function to extract terms with specific rules
	def extract_specific_terms(ocr_results):
	extracted_terms = []

	for line in ocr_results[0]:
	detected_text = line[1][0] # Extracted text
	box = line[0] # Bounding box of the detected text

	if re.match(r"Bill of Lading:\s*\d+", detected_text):
	extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

	elif re.match(r"Page:\s*\w+", detected_text):
	extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

	elif detected_text in ["Shipper", "Receiver", "Carrier"]:
	extracted_terms.append({'detected_text': detected_text + " Signature", 'bounding_box': box})

	elif detected_text == "Signature":
	extracted_terms.append({'detected_text': detected_text, 'bounding_box': box})

	return extracted_terms

	# Function to annotate image with detected terms
	def annotate_image_with_terms(image, terms):
	pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
	draw = ImageDraw.Draw(pil_image)

	font_size = 20
	try:
	font = ImageFont.truetype("arial.ttf", font_size)
	except IOError:
	font = ImageFont.load_default()

	for term in terms:
	box = term['bounding_box']
	detected_text = term['detected_text']

	points = [(int(x[0]), int(x[1])) for x in box]
	draw.polygon(points, outline="blue", width=2)
	position = (points[0][0], points[0][1] - font_size - 5)
	draw.text(position, detected_text, fill="red", font=font)

	return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

	# Main processing function
	def process_file(file):
	ocr = PaddleOCR(lang='en')
	extracted_terms = []

	# Handle image files (PNG, JPG, JPEG)
	image = cv2.imread(file.name)
	ocr_results = ocr.ocr(image, cls=True)
	extracted_terms = extract_specific_terms(ocr_results)
	annotated_image = annotate_image_with_terms(image, extracted_terms)
	return Image.fromarray(annotated_image)

	# Gradio Interface
	def gradio_interface(file):
	result = process_file(file)
	return result

	iface = gr.Interface(
	fn=gradio_interface,
	inputs=gr.File(label="Upload an Image", file_types=[".png", ".jpg", ".jpeg"]),
	outputs="image",
	live=True,
	title="OCR Term Extraction",
	description="Upload an image containing text to detect and annotate terms such as 'Bill of Lading', 'Page', and signatures.",
	allow_flagging="never"
	)
	iface.launch()