"""Gradio app that OCRs an uploaded image and analyzes the extracted text.

Pipeline, as implemented below:

1. The uploaded PIL image is JPEG-encoded in memory and base64-encoded.
2. The image is sent to ``Qwen/Qwen2.5-VL-7B-Instruct:hyperbolic`` through the
   Hugging Face router (OpenAI-compatible endpoint) to extract its text.
3. The extracted text is sent to ``gpt-4.1-mini`` for analysis.
4. Both results are written to ``ocr_summary_output.json`` and returned as a
   single formatted string for display.

``HF_TOKEN`` and ``OPENAI_API_KEY`` must be available in the environment
(``load_dotenv()`` is called first); otherwise importing this module raises
``EnvironmentError``.
"""
import base64
import io
import json
import os

import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_TOKEN = os.getenv("OPENAI_API_KEY")

if not HF_TOKEN or not OPENAI_TOKEN:
    raise EnvironmentError("HF_TOKEN and OPENAI_API_KEY must be set in the .env file")

# Initialize clients
client_qwen = OpenAI(base_url="https://router.huggingface.co/v1", api_key=HF_TOKEN)
client_openai = OpenAI(api_key=OPENAI_TOKEN)

# Pillow modes passed to the JPEG encoder unchanged; anything else (e.g. RGBA
# or palette images) is converted to RGB first because JPEG cannot store it.
_JPEG_PASSTHROUGH_MODES = ("L", "RGB", "CMYK")


def encode_image_to_base64(image_path: str) -> str:
    """Return the contents of the file at *image_path* as a base64 string.

    Kept as a public helper for callers that already have an image on disk;
    ``process_image`` itself encodes in memory and does not call it.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def _pil_image_to_base64_jpeg(image) -> str:
    """JPEG-encode a PIL image in memory and return it as a base64 string."""
    if image.mode not in _JPEG_PASSTHROUGH_MODES:
        # Saving e.g. an RGBA image as JPEG raises OSError in Pillow.
        image = image.convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def _first_message_text(response) -> str:
    """Return the stripped text of the first choice, or "" if it has none.

    ``message.content`` is optional in the chat-completions response, so it
    must not be stripped unconditionally.
    """
    content = response.choices[0].message.content
    return (content or "").strip()


def extract_text_with_qwen(base64_image: str) -> str:
    """Ask the Qwen vision model to transcribe the text in an image.

    Args:
        base64_image: Base64-encoded image bytes. They are sent as a
            ``data:image/jpeg;base64,...`` URL, so JPEG data is expected.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string if the reply carried no text content.
    """
    prompt = (
        "Extract all visible printed or handwritten text from this image as accurately and cleanly as possible. "
        "Do not summarize or explain. Just return the extracted text clearly."
    )

    response = client_qwen.chat.completions.create(
        model="Qwen/Qwen2.5-VL-7B-Instruct:hyperbolic",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )
    return _first_message_text(response)


def analyze_text_with_openai(ocr_text: str) -> str:
    """Ask the OpenAI model for a structured analysis of OCR output.

    The prompt instructs the model to answer in Arabic and to return JSON
    covering a summary, insights/risks, topics, and language/tone. The reply
    is returned as-is (it is not parsed or validated as JSON here).

    Args:
        ocr_text: Text previously extracted from the image.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string if the reply carried no text content.
    """
    prompt = (
        "answer in arabic. "
        "You are an expert assistant. The following is text extracted via OCR from an image. "
        "Please analyze it carefully. Return a structured JSON output that includes:\n"
        "- A concise summary of the content\n"
        "- Any critical insights, risks, or anomalies\n"
        "- Detected topics or categories\n"
        "- Language and tone characteristics (if relevant)\n\n"
        f"OCR TEXT:\n{ocr_text}"
    )

    response = client_openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return _first_message_text(response)


def process_image(image) -> str:
    """Run OCR and analysis on an uploaded image and format the result.

    Args:
        image: The PIL image supplied by the Gradio ``Image`` input, or
            ``None`` when the handler is invoked without an image.

    Returns:
        A string containing the OCR text followed by the analysis, or a
        short prompt to upload an image when *image* is ``None``.

    Side effects:
        Overwrites ``ocr_summary_output.json`` in the current working
        directory with the OCR text and the analysis.
    """
    if image is None:
        # Nothing to process; avoid an AttributeError on image.save().
        return "Please upload an image first."

    # Encode in memory rather than through a fixed-name temp file, which
    # would be left behind and could be overwritten by an overlapping call.
    base64_image = _pil_image_to_base64_jpeg(image)

    ocr_text = extract_text_with_qwen(base64_image)
    openai_analysis = analyze_text_with_openai(ocr_text)

    result = {
        "ocr_text": ocr_text,
        "openai_analysis": openai_analysis,
    }

    # Save to file
    with open("ocr_summary_output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return f"--- OCR TEXT ---\n{ocr_text}\n\n--- GPT ANALYSIS ---\n{openai_analysis}"


# Gradio UI
demo = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="OCR + GPT Result"),
    title="OCR and Insight Generator",
    description="Upload an image with printed or handwritten text. Qwen extracts the text. GPT-4 summarizes and analyzes it.",
)

if __name__ == "__main__":
    # NOTE(review): share=True requests a publicly reachable link, so anyone
    # with the URL can spend the configured API credits - confirm this is intended.
    demo.launch(share=True)