InsightOCR / app.py
0X100's picture
Add Gradio app for OCR and insight generation with Qwen and GPT-4
a7ce53f
import os
import base64
import json
import gradio as gr
from openai import OpenAI
from dotenv import load_dotenv
# Read the two API credentials from the environment. load_dotenv() runs first
# so that values from a local .env file (if any) are visible to os.environ.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
OPENAI_TOKEN = os.environ.get("OPENAI_API_KEY")

# Fail fast at import time when either credential is missing or empty.
if not (HF_TOKEN and OPENAI_TOKEN):
    raise EnvironmentError("HF_TOKEN and OPENAI_API_KEY must be set in the .env file")

# Two OpenAI-SDK clients: one pointed at the Hugging Face router (used for the
# Qwen vision model below), one left on the SDK's default endpoint.
client_qwen = OpenAI(api_key=HF_TOKEN, base_url="https://router.huggingface.co/v1")
client_openai = OpenAI(api_key=OPENAI_TOKEN)
def encode_image_to_base64(image_path: str) -> str:
    """Return the contents of the file at *image_path* as base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def extract_text_with_qwen(base64_image: str) -> str:
    """Extract the text visible in an image using the Qwen vision model.

    Sends one user message through ``client_qwen`` containing the extraction
    instruction plus the image embedded as a ``data:image/jpeg;base64,...``
    URL, and returns the text of the first choice.

    Args:
        base64_image: Base64 text of the image bytes, without any ``data:``
            prefix. The data URL always declares ``image/jpeg``.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    prompt = (
        "Extract all visible printed or handwritten text from this image as accurately and cleanly as possible. "
        "Do not summarize or explain. Just return the extracted text clearly."
    )
    image_url = f"data:image/jpeg;base64,{base64_image}"
    response = client_qwen.chat.completions.create(
        model="Qwen/Qwen2.5-VL-7B-Instruct:hyperbolic",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
    )
    # `message.content` is nullable in the chat-completions response schema;
    # calling .strip() on it unconditionally would raise AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""
def analyze_text_with_openai(ocr_text: str) -> str:
    """Ask the OpenAI model to analyse OCR output and return its reply.

    The prompt instructs the model to answer in Arabic and to return a
    structured JSON output (summary, insights/risks/anomalies, topics,
    language and tone). The reply is returned as raw text; it is not parsed
    or validated as JSON here.

    Args:
        ocr_text: Text previously extracted from the image; appended verbatim
            to the end of the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    prompt = (
        "answer in arabic. "
        "You are an expert assistant. The following is text extracted via OCR from an image. "
        "Please analyze it carefully. Return a structured JSON output that includes:\n"
        "- A concise summary of the content\n"
        "- Any critical insights, risks, or anomalies\n"
        "- Detected topics or categories\n"
        "- Language and tone characteristics (if relevant)\n\n"
        f"OCR TEXT:\n{ocr_text}"
    )
    response = client_openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    # `message.content` is nullable in the chat-completions response schema;
    # calling .strip() on it unconditionally would raise AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""
def process_image(image) -> str:
    """Run the OCR + analysis pipeline on one uploaded image.

    Steps: save the image as a JPEG file, base64-encode it, extract its text
    with ``extract_text_with_qwen``, analyse that text with
    ``analyze_text_with_openai``, write both results to
    ``ocr_summary_output.json``, and return them as one display string.

    Args:
        image: The uploaded image; it must provide ``save(path)``. The UI is
            configured to pass a PIL image. ``None`` means nothing was
            uploaded.

    Returns:
        A text report containing the OCR text and the analysis, or a short
        notice when no image was provided.
    """
    import tempfile  # local import: only this function needs it

    # Submitting the form without an upload yields None; answer with a notice
    # instead of crashing on `image.save`.
    if image is None:
        return "No image provided. Please upload an image."

    # Use a private scratch directory per call so that concurrent requests
    # cannot overwrite each other's upload, and so the file is removed
    # afterwards. The .jpg suffix makes `save` write JPEG data, matching the
    # MIME type declared by the OCR step.
    with tempfile.TemporaryDirectory() as scratch_dir:
        image_path = os.path.join(scratch_dir, "temp_uploaded_image.jpg")
        image.save(image_path)
        base64_image = encode_image_to_base64(image_path)

    ocr_text = extract_text_with_qwen(base64_image)
    openai_analysis = analyze_text_with_openai(ocr_text)

    result = {
        "ocr_text": ocr_text,
        "openai_analysis": openai_analysis,
    }

    # Persist the latest result. The path is fixed, so each call overwrites
    # the previous file (last writer wins).
    with open("ocr_summary_output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return f"--- OCR TEXT ---\n{ocr_text}\n\n--- GPT ANALYSIS ---\n{openai_analysis}"
# Gradio UI: a single image upload feeding process_image, whose returned
# string is shown in a text box.
demo = gr.Interface(
    title="OCR and Insight Generator",
    description="Upload an image with printed or handwritten text. Qwen extracts the text. GPT-4 summarizes and analyzes it.",
    fn=process_image,
    inputs=gr.Image(label="Upload Image", type="pil"),
    outputs=gr.Textbox(label="OCR + GPT Result"),
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch(share=True)