Spaces:

not0x100
/

InsightOCR

Sleeping

App Files Files Community

InsightOCR / app.py

0X100

Add Gradio app for OCR and insight generation with Qwen and GPT-4

a7ce53f 10 months ago

raw

history blame contribute delete

3.13 kB

	import os
	import base64
	import json
	import gradio as gr
	from openai import OpenAI
	from dotenv import load_dotenv

	# Read credentials from the process environment; load_dotenv() runs first
	# so values from a local .env file are available to os.getenv().
	load_dotenv()
	HF_TOKEN = os.getenv("HF_TOKEN")
	OPENAI_TOKEN = os.getenv("OPENAI_API_KEY")

	# Fail fast at import time when either credential is missing or empty.
	if not (HF_TOKEN and OPENAI_TOKEN):
	    raise EnvironmentError("HF_TOKEN and OPENAI_API_KEY must be set in the .env file")

	# Two OpenAI-style clients: one pointed at the Hugging Face router (used for
	# the Qwen model), one using the library's default endpoint.
	client_qwen = OpenAI(
	    api_key=HF_TOKEN,
	    base_url="https://router.huggingface.co/v1",
	)
	client_openai = OpenAI(api_key=OPENAI_TOKEN)

	def encode_image_to_base64(image_path: str) -> str:
	    """Return the contents of the file at ``image_path`` as base64 text.

	    The file is opened in binary mode, its bytes are base64-encoded, and
	    the encoded bytes are decoded to ``str`` as UTF-8.
	    """
	    with open(image_path, "rb") as source:
	        raw_bytes = source.read()
	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode("utf-8")

	def extract_text_with_qwen(base64_image: str) -> str:
	    """Ask the Qwen vision model to transcribe the text in an image.

	    A single user message is sent through the module-level ``client_qwen``
	    client. It carries the extraction prompt plus the image embedded as a
	    ``data:image/jpeg;base64,...`` URL.

	    Args:
	        base64_image: Base64 text of the image bytes. The data URL labels
	            the payload as JPEG.

	    Returns:
	        The content of the first completion choice with surrounding
	        whitespace stripped, or an empty string when the reply carries no
	        text content.
	    """
	    prompt = (
	        "Extract all visible printed or handwritten text from this image as accurately and cleanly as possible. "
	        "Do not summarize or explain. Just return the extracted text clearly."
	    )

	    response = client_qwen.chat.completions.create(
	        model="Qwen/Qwen2.5-VL-7B-Instruct:hyperbolic",
	        messages=[
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": prompt},
	                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
	                ],
	            }
	        ],
	    )

	    # message.content is optional in the chat-completions schema; calling
	    # .strip() on None would raise AttributeError, so fall back to "".
	    content = response.choices[0].message.content
	    return (content or "").strip()

	def analyze_text_with_openai(ocr_text: str) -> str:
	    """Ask the OpenAI model for a structured analysis of OCR text.

	    The prompt instructs the model to answer in Arabic and to return JSON
	    covering a summary, insights/risks/anomalies, topics, and language/tone.
	    The request goes through the module-level ``client_openai`` client.

	    Args:
	        ocr_text: Text previously extracted from an image; it is appended
	            verbatim to the end of the prompt.

	    Returns:
	        The content of the first completion choice with surrounding
	        whitespace stripped, or an empty string when the reply carries no
	        text content. The reply is returned as-is; it is not parsed or
	        validated as JSON here.
	    """
	    prompt = (
	        "answer in arabic. "
	        "You are an expert assistant. The following is text extracted via OCR from an image. "
	        "Please analyze it carefully. Return a structured JSON output that includes:\n"
	        "- A concise summary of the content\n"
	        "- Any critical insights, risks, or anomalies\n"
	        "- Detected topics or categories\n"
	        "- Language and tone characteristics (if relevant)\n\n"
	        f"OCR TEXT:\n{ocr_text}"
	    )

	    response = client_openai.chat.completions.create(
	        model="gpt-4.1-mini",
	        messages=[{"role": "user", "content": prompt}],
	    )

	    # message.content is optional in the chat-completions schema; calling
	    # .strip() on None would raise AttributeError, so fall back to "".
	    content = response.choices[0].message.content
	    return (content or "").strip()

	def process_image(image) -> str:
	    """Run OCR and analysis on an uploaded image and return a text report.

	    The image is written to a private temporary JPEG file, base64-encoded
	    with ``encode_image_to_base64``, transcribed with
	    ``extract_text_with_qwen`` and analysed with
	    ``analyze_text_with_openai``. Both results are also written to
	    ``ocr_summary_output.json`` in the current working directory; that file
	    is overwritten on every call.

	    Args:
	        image: An image object exposing ``save(path)`` (the UI passes a PIL
	            image), or ``None`` when nothing was uploaded.

	    Returns:
	        A report containing the OCR text followed by the analysis, or a
	        short notice when ``image`` is ``None``.
	    """
	    # Local import: tempfile is not among the module's top-level imports.
	    import tempfile

	    # Submitting the form without an upload yields None; answer with a
	    # message instead of failing on image.save().
	    if image is None:
	        return "No image provided. Please upload an image first."

	    # JPEG cannot store alpha or palette modes (e.g. RGBA, P); convert so
	    # save() does not raise OSError. Objects without a mode are left alone.
	    if getattr(image, "mode", "RGB") not in ("RGB", "L"):
	        image = image.convert("RGB")

	    # A unique scratch file per call keeps concurrent requests from
	    # overwriting each other's upload; it is removed once encoded.
	    fd, image_path = tempfile.mkstemp(suffix=".jpg")
	    os.close(fd)
	    try:
	        image.save(image_path)
	        base64_image = encode_image_to_base64(image_path)
	    finally:
	        os.remove(image_path)

	    ocr_text = extract_text_with_qwen(base64_image)
	    openai_analysis = analyze_text_with_openai(ocr_text)

	    result = {
	        "ocr_text": ocr_text,
	        "openai_analysis": openai_analysis,
	    }

	    # Persist the latest result; ensure_ascii=False keeps non-ASCII text
	    # readable in the file.
	    with open("ocr_summary_output.json", "w", encoding="utf-8") as f:
	        json.dump(result, f, ensure_ascii=False, indent=2)

	    return f"--- OCR TEXT ---\n{ocr_text}\n\n--- GPT ANALYSIS ---\n{openai_analysis}"

	# Gradio front end: a single image upload wired to process_image, with the
	# combined report shown in one text box.
	demo = gr.Interface(
	    title="OCR and Insight Generator",
	    description="Upload an image with printed or handwritten text. Qwen extracts the text. GPT-4 summarizes and analyzes it.",
	    fn=process_image,
	    inputs=gr.Image(label="Upload Image", type="pil"),
	    outputs=gr.Textbox(label="OCR + GPT Result"),
	)

	# Launch only when executed as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch(share=True)