Spaces:

not0x100
/

InsightOCR

Sleeping

File size: 3,126 Bytes

a7ce53f

import base64
import io
import json
import os

import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables, then read the two API credentials.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
OPENAI_TOKEN = os.environ.get("OPENAI_API_KEY")

# Fail at import time if either credential is missing or empty, since both
# clients below are constructed unconditionally.
if not (HF_TOKEN and OPENAI_TOKEN):
    raise EnvironmentError("HF_TOKEN and OPENAI_API_KEY must be set in the .env file")

# Initialize clients. The Qwen client reuses the OpenAI SDK class but points
# it at the Hugging Face router URL and authenticates with the HF token.
client_qwen = OpenAI(
    api_key=HF_TOKEN,
    base_url="https://router.huggingface.co/v1",
)
client_openai = OpenAI(api_key=OPENAI_TOKEN)

def encode_image_to_base64(image_path: str) -> str:
    """Read the file at *image_path* and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's contents, decoded as UTF-8 text.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def extract_text_with_qwen(base64_image: str) -> str:
    """Ask the Qwen vision model to transcribe the text visible in an image.

    Args:
        base64_image: Base64-encoded image bytes. The data is embedded in a
            ``data:image/jpeg`` URL, so JPEG content is expected.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    prompt = (
        "Extract all visible printed or handwritten text from this image as accurately and cleanly as possible. "
        "Do not summarize or explain. Just return the extracted text clearly."
    )

    response = client_qwen.chat.completions.create(
        model="Qwen/Qwen2.5-VL-7B-Instruct:hyperbolic",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            }
        ],
    )

    # The SDK types message.content as optional; calling .strip() on None
    # would raise AttributeError, so treat a missing reply as empty text.
    content = response.choices[0].message.content
    return content.strip() if content else ""

def analyze_text_with_openai(ocr_text: str) -> str:
    """Send OCR text to the OpenAI chat model and return its analysis.

    The prompt asks for an Arabic answer structured as JSON covering a
    summary, critical insights, detected topics, and language/tone traits.
    The reply is returned as raw text; it is not parsed or validated here.

    Args:
        ocr_text: Text previously extracted from an image.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    prompt = (
        "answer in arabic. "
        "You are an expert assistant. The following is text extracted via OCR from an image. "
        "Please analyze it carefully. Return a structured JSON output that includes:\n"
        "- A concise summary of the content\n"
        "- Any critical insights, risks, or anomalies\n"
        "- Detected topics or categories\n"
        "- Language and tone characteristics (if relevant)\n\n"
        f"OCR TEXT:\n{ocr_text}"
    )

    response = client_openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
    )

    # The SDK types message.content as optional; calling .strip() on None
    # would raise AttributeError, so treat a missing reply as empty text.
    content = response.choices[0].message.content
    return content.strip() if content else ""

def process_image(image) -> str:
    """Run OCR and analysis on an uploaded image and return a text report.

    The image is encoded as JPEG in memory, passed to
    ``extract_text_with_qwen``, and the extracted text is then passed to
    ``analyze_text_with_openai``. Both results are also written to
    ``ocr_summary_output.json`` in the current working directory.

    Args:
        image: The uploaded picture as a PIL image (the Gradio input is
            declared with ``type="pil"``), or ``None`` when the form is
            submitted without an image.

    Returns:
        A plain-text report containing the OCR text followed by the analysis.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    # JPEG cannot store alpha or palette modes, so uploads such as PNGs with
    # transparency are converted first instead of failing on save.
    if image.mode not in ("L", "RGB", "CMYK"):
        image = image.convert("RGB")

    # Encode in memory: a fixed on-disk filename would be overwritten by
    # concurrent requests and was never cleaned up.
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

    ocr_text = extract_text_with_qwen(base64_image)
    openai_analysis = analyze_text_with_openai(ocr_text)

    result = {
        "ocr_text": ocr_text,
        "openai_analysis": openai_analysis,
    }

    # Save to file; ensure_ascii=False keeps non-ASCII text readable.
    with open("ocr_summary_output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return f"--- OCR TEXT ---\n{ocr_text}\n\n--- GPT ANALYSIS ---\n{openai_analysis}"

# Gradio UI: a single image input and a single text output, wired to
# process_image.
image_input = gr.Image(type="pil", label="Upload Image")
result_output = gr.Textbox(label="OCR + GPT Result")

demo = gr.Interface(
    title="OCR and Insight Generator",
    description="Upload an image with printed or handwritten text. Qwen extracts the text. GPT-4 summarizes and analyzes it.",
    fn=process_image,
    inputs=image_input,
    outputs=result_output,
)

# Launch only when run as a script, with share=True passed to Gradio.
if __name__ == "__main__":
    demo.launch(share=True)