Spaces:

IotaCluster
/

NeuralDrafter

Paused

App Files Files Community

NeuralDrafter / doc_llm_agent.py

IotaCluster

Update doc_llm_agent.py

2d67d15 verified 9 months ago

raw

history blame contribute delete

5.06 kB

	from gradio_client import Client, handle_file
	from docx import Document
	from docx.shared import Inches
	import ast
	import re
	from dotenv import load_dotenv
	from langchain.tools import Tool
	from langchain_experimental.tools import PythonREPLTool
	from langchain.agents import initialize_agent
	from langchain_groq import ChatGroq
	import tempfile
	from PIL import Image
	import os

	# Load variables from a local .env file into the process environment; the
	# Groq key is read later in this file with os.getenv("GROQ_API_KEY2").
	load_dotenv()

	# Gradio OCR Tool
	def ocr_tool(image_path: str) -> str:
	    """Run OCR on an image through the ``IotaCluster/OCR`` Gradio Space.

	    Args:
	        image_path: Local file path or http(s) URL of the image. Surrounding
	            whitespace and one pair of wrapping quotes/backticks are removed
	            first, since this function is registered as an agent tool and
	            receives free-form text as input.

	    Returns:
	        The Space's prediction converted with ``str()``, or a string that
	        starts with ``"OCR failed: "`` when anything goes wrong. Errors are
	        returned instead of raised.
	    """
	    try:
	        cleaned_path = image_path.strip()
	        # Drop one matching pair of quotes around the path, if present.
	        wrapped = (
	            len(cleaned_path) >= 2
	            and cleaned_path[0] == cleaned_path[-1]
	            and cleaned_path[0] in "'\"`"
	        )
	        if wrapped:
	            cleaned_path = cleaned_path[1:-1].strip()

	        client = Client("IotaCluster/OCR")
	        # handle_file takes local paths and URLs alike, so no scheme check
	        # is needed before wrapping the input.
	        result = client.predict(
	            image=handle_file(cleaned_path),
	            language=["eng"],
	            api_name="/predict",
	        )
	        return str(result)
	    except Exception as e:
	        return f"OCR failed: {str(e)}"

	# Word Document Builder
	class DocumentBuilder:
	    """Collects pictures and key/value tables into one Word document."""

	    def __init__(self):
	        # Start from a new, empty document.
	        self.doc = Document()

	    def add_image_with_text(self, image_path, matched_dict):
	        """Append a picture, then an optional two-row table, then a spacer.

	        The picture is inserted 5 inches wide. When ``matched_dict`` is
	        non-empty, its keys fill the first table row and its values the
	        second, both converted with ``str()``. A paragraph containing a
	        newline is appended last.
	        """
	        document = self.doc
	        document.add_picture(image_path, width=Inches(5))

	        if matched_dict:
	            grid = document.add_table(rows=2, cols=len(matched_dict))
	            grid.style = 'Table Grid'  # built-in style that draws cell borders
	            header_cells = grid.rows[0].cells
	            value_cells = grid.rows[1].cells
	            columns = zip(header_cells, value_cells, matched_dict.items())
	            for header_cell, value_cell, (key, value) in columns:
	                header_cell.text = str(key)
	                value_cell.text = str(value)

	        document.add_paragraph("\n")

	    def save(self, path="output.docx"):
	        """Write the document to ``path`` and return ``path``."""
	        self.doc.save(path)
	        return path

	# Core Function with LLM Matching

	# Matches the first innermost {...} group (no nested braces). The negated
	# character class already spans newlines, so no regex flags are needed.
	_DICT_LITERAL_RE = re.compile(r'\{[^\{\}]*\}')


	def _extract_matched_dict(llm_output):
	    """Turn the agent's reply into a dict that can be rendered as a table."""
	    found = _DICT_LITERAL_RE.search(llm_output)
	    if found is None:
	        # No brace group at all: keep a truncated copy of the raw reply.
	        return {"Match": llm_output.strip()[:200]}

	    literal = found.group(0)
	    try:
	        parsed = ast.literal_eval(literal)
	    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	        return {"Match": literal}

	    # A reply such as "{'a', 'b'}" parses to a set, which has no .items();
	    # fall back to the raw text instead of failing later.
	    if not isinstance(parsed, dict):
	        return {"Match": literal}
	    return parsed


	def _save_temp_png(img, temp_files):
	    """Write ``img`` to a new temporary PNG file and return its path."""
	    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
	        # Register the path before writing so the caller's cleanup also
	        # removes the file when img.save() raises.
	        temp_files.append(tmp.name)
	        img.save(tmp, format="PNG")
	    return tmp.name


	def compile_images_and_text_to_doc(image_objs, text_objects, output_path="output.docx"):
	    """Build a Word document pairing each image with its best-matching entry.

	    For every image: save it to a temporary PNG, ask the agent to OCR it with
	    the ImageOCR tool, ask the agent which entry of ``text_objects`` matches
	    the OCR text, and append the picture plus the parsed match to the
	    document. The document is saved after all images are processed, and the
	    temporary PNG files are removed afterwards even if an error occurred.

	    The Groq API key is read from the ``GROQ_API_KEY2`` environment variable.

	    Args:
	        image_objs: list of PIL.Image.Image objects.
	        text_objects: list of dicts; interpolated verbatim into the prompt.
	        output_path: destination of the .docx file (default "output.docx").

	    Returns:
	        The path the document was saved to.
	    """
	    ocr_tool_instance = Tool(
	        name="ImageOCR",
	        func=ocr_tool,
	        description="Extracts text content from an image using OCR via Gradio"
	    )

	    doc_builder = DocumentBuilder()

	    llm = ChatGroq(temperature=0, model_name="qwen/qwen3-32b", api_key=os.getenv("GROQ_API_KEY2"))
	    # NOTE(security): PythonREPLTool lets the agent execute arbitrary Python,
	    # and the prompts below embed OCR text taken from the images. Remove the
	    # tool if the images are not fully trusted.
	    # NOTE(review): LangChain documents this keyword as `agent`; confirm that
	    # `agent_type` really selects the chat agent rather than being ignored.
	    agent = initialize_agent(
	        tools=[ocr_tool_instance, PythonREPLTool()],
	        llm=llm,
	        agent_type="chat-zero-shot-react-description",
	        verbose=True,
	        handle_parsing_errors=True,
	        max_iterations = 2
	    )

	    temp_files = []
	    try:
	        for img in image_objs:
	            # The OCR tool takes a path, so the in-memory image goes to disk first.
	            image_path = _save_temp_png(img, temp_files)

	            print(f"🔍 Processing: {image_path}")
	            ocr_text = agent.run(f"Use ImageOCR to extract text from image: {image_path}")

	            prompt = (
	                f"You are a helpful AI. Here is some OCR text extracted from an image:\n{ocr_text}\n\n"
	                f"Below is a list of product entries (each as a dictionary).\n{text_objects}\n"
	                f"Your task is to match the OCR text with the most relevant dictionary based on the fields."
	                f" Return only the best matching dictionary in raw Python dictionary format."
	            )

	            llm_match = agent.run(prompt)
	            matched_dict = _extract_matched_dict(llm_match)

	            doc_builder.add_image_with_text(image_path, matched_dict)

	        path = doc_builder.save(output_path)
	        print(f"✅ Final document saved at: {path}")
	        return path
	    finally:
	        # Best-effort cleanup: a file that is already gone or locked must not
	        # mask the real outcome of the run.
	        for temp_path in temp_files:
	            try:
	                os.remove(temp_path)
	            except OSError:
	                pass

	# Example usage:
	if __name__ == "__main__":
	    # Demo run: three sample images matched against four sample entries.
	    from PIL import Image

	    sample_files = ("4090.png", "eg_img.png", "5080.png")
	    image_objs = [Image.open(file_name) for file_name in sample_files]

	    text_objects = [
	        {"Name": "RTX 5080 GPU", "Cost": "126999", "Quantity": 3},
	        {
	            "Name": "RTX 4090",
	            "Cost": "292999",
	            "Quantity": 5,
	            "Description": "High-end GPU for gaming and AI",
	        },
	        {
	            "Name": "groq",
	            "Cost": "N/A",
	            "Quantity": "N/A",
	            "Description": "Service provider for AI models",
	        },
	        {
	            "Name": "T 1000",
	            "Cost": "N/A",
	            "Quantity": "N/A",
	            "Description": "High-end GPU for gaming and AI",
	        },
	    ]

	    compile_images_and_text_to_doc(image_objs, text_objects)