NeuralDrafter / doc_llm_agent.py
IotaCluster's picture
Update doc_llm_agent.py
2d67d15 verified
from gradio_client import Client, handle_file
from docx import Document
from docx.shared import Inches
import ast
import re
from dotenv import load_dotenv
from langchain.tools import Tool
from langchain_experimental.tools import PythonREPLTool
from langchain.agents import initialize_agent
from langchain_groq import ChatGroq
import tempfile
from PIL import Image
import os
load_dotenv()
# Gradio OCR Tool
def ocr_tool(image_path: str) -> str:
try:
client = Client("IotaCluster/OCR")
# Accept both local file paths and URLs
image_input = image_path
if not (image_path.startswith('http://') or image_path.startswith('https://')):
image_input = handle_file(image_path)
else:
image_input = handle_file(image_path) # handle_file also works with URLs
result = client.predict(
image=image_input,
language=["eng"],
api_name="/predict"
)
return str(result)
except Exception as e:
return f"OCR failed: {str(e)}"
# Word Document Builder
class DocumentBuilder:
def __init__(self):
self.doc = Document()
def add_image_with_text(self, image_path, matched_dict):
self.doc.add_picture(image_path, width=Inches(5))
if matched_dict:
# Add a table with keys as headers and values as a row, and set borders
table = self.doc.add_table(rows=2, cols=len(matched_dict))
table.style = 'Table Grid' # Adds borders to the table
hdr_cells = table.rows[0].cells
val_cells = table.rows[1].cells
for i, (k, v) in enumerate(matched_dict.items()):
hdr_cells[i].text = str(k)
val_cells[i].text = str(v)
self.doc.add_paragraph("\n")
def save(self, path="output.docx"):
self.doc.save(path)
return path
# Core Function with LLM Matching
def compile_images_and_text_to_doc(image_objs, text_objects):
"""
image_objs: list of PIL.Image.Image objects
text_objects: list of dicts
"""
ocr_tool_instance = Tool(
name="ImageOCR",
func=ocr_tool,
description="Extracts text content from an image using OCR via Gradio"
)
doc_builder = DocumentBuilder()
llm = ChatGroq(temperature=0, model_name="qwen/qwen3-32b", api_key=os.getenv("GROQ_API_KEY2"))
agent = initialize_agent(
tools=[ocr_tool_instance, PythonREPLTool()],
llm=llm,
agent_type="chat-zero-shot-react-description",
verbose=True,
handle_parsing_errors=True,
max_iterations = 2
)
temp_files = []
try:
for img in image_objs:
# Save PIL image to a temporary file
temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
img.save(temp_file, format="PNG")
temp_file.close()
temp_files.append(temp_file.name)
image_path = temp_file.name
print(f"🔍 Processing: {image_path}")
ocr_text = agent.run(f"Use ImageOCR to extract text from image: {image_path}")
prompt = (
f"You are a helpful AI. Here is some OCR text extracted from an image:\n{ocr_text}\n\n"
f"Below is a list of product entries (each as a dictionary).\n{text_objects}\n"
f"Your task is to match the OCR text with the most relevant dictionary based on the fields."
f" Return only the best matching dictionary in raw Python dictionary format."
)
llm_match = agent.run(prompt)
# Extract the first dictionary from the LLM output using regex
dict_match = re.search(r'\{[^\{\}]*\}', llm_match, re.DOTALL)
if dict_match:
dict_str = dict_match.group(0)
try:
matched_dict = ast.literal_eval(dict_str)
except Exception:
matched_dict = {"Match": dict_str}
else:
matched_dict = {"Match": llm_match.strip()[:200]}
doc_builder.add_image_with_text(image_path, matched_dict)
path = doc_builder.save()
print(f"✅ Final document saved at: {path}")
finally:
# Clean up temp files
for f in temp_files:
try:
os.remove(f)
except Exception:
pass
# Example usage:
if __name__ == "__main__":
from PIL import Image
image_objs = [Image.open("4090.png"), Image.open("eg_img.png"), Image.open("5080.png")]
text_objects = [
{"Name": "RTX 5080 GPU", "Cost": "126999", "Quantity": 3},
{"Name": "RTX 4090", "Cost": "292999", "Quantity": 5, "Description": "High-end GPU for gaming and AI"},
{"Name": "groq", "Cost": "N/A", "Quantity": "N/A", "Description": "Service provider for AI models"},
{"Name": "T 1000", "Cost": "N/A", "Quantity": "N/A", "Description": "High-end GPU for gaming and AI"}
]
compile_images_and_text_to_doc(image_objs, text_objects)