akashraut committed on
Commit
77a55a1
·
verified ·
1 Parent(s): d1c6be0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -51
app.py CHANGED
@@ -1,45 +1,56 @@
1
  import gradio as gr
2
- import requests
3
- import base64
4
  import json
5
  from PIL import Image
6
- import io
7
  import os
8
 
9
- OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
10
-
11
- MODEL = "qwen/qwen-2.5-vl-72b-instruct"
12
-
13
- def image_to_base64(image: Image.Image):
14
- buf = io.BytesIO()
15
- image.save(buf, format="PNG")
16
- return base64.b64encode(buf.getvalue()).decode()
 
 
 
 
17
 
 
 
 
18
  def extract_document(image: Image.Image):
19
  if image is None:
20
- return {"error": "No image uploaded"}
21
 
22
- img_b64 = image_to_base64(image)
 
 
 
 
 
 
23
 
24
  prompt = """
25
- You are a universal document understanding system.
26
 
27
  Rules:
28
  - Be document-agnostic
29
- - Detect document type if possible
30
- - Extract ALL visible structured data
31
- - Extract tables completely (columns + rows)
32
  - Preserve numbers exactly
33
  - Use null for missing values
34
  - Do NOT hallucinate
35
- - Return ONLY valid JSON
36
 
37
- Schema:
38
  {
39
  "document_type": string | null,
40
- "confidence": number (0-1),
41
  "summary": string,
42
- "fields": { "<key>": "<value | null>" },
43
  "tables": [
44
  {
45
  "table_name": string,
@@ -56,47 +67,71 @@ Schema:
56
  {
57
  "role": "user",
58
  "content": [
59
- {"type": "input_text", "text": prompt},
60
- {
61
- "type": "input_image",
62
- "image_base64": img_b64
63
- }
64
  ]
65
  }
66
  ],
67
- "temperature": 0
68
- }
69
-
70
- headers = {
71
- "Authorization": f"Bearer {OPENROUTER_API_KEY}",
72
- "Content-Type": "application/json"
73
  }
74
 
75
- r = requests.post(
76
- "https://openrouter.ai/api/v1/chat/completions",
77
- headers=headers,
78
- json=payload,
79
- timeout=120
80
- )
 
 
81
 
82
- response = r.json()
83
- text = response["choices"][0]["message"]["content"]
84
 
85
- try:
86
  start = text.find("{")
87
  end = text.rfind("}") + 1
88
- return json.loads(text[start:end])
89
- except Exception:
90
- return {"raw_output": text}
91
 
92
- with gr.Blocks(title="DocAI – Universal Document Extractor") as demo:
93
- gr.Markdown("# πŸ“„ DocAI – Universal Document Intelligence")
94
- gr.Markdown("Vision-LLM powered. No templates. Any document.")
95
 
96
- with gr.Row():
97
- img = gr.Image(type="pil", label="Upload document")
98
- out = gr.JSON(label="Extracted JSON")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- gr.Button("Extract").click(extract_document, img, out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  demo.launch()
 
1
  import gradio as gr
 
 
2
  import json
3
  from PIL import Image
4
+ import requests
5
  import os
6
 
7
# ================================
# CONFIG
# ================================
# API key is read from the environment (set in HF Spaces Secrets).
# NOTE(review): if unset this is None and the header becomes "Bearer None",
# which OpenRouter rejects with 401 — fail-fast validation may be preferable.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # set in HF Secrets

# Any vision-capable model slug on OpenRouter works here.
MODEL = "qwen/qwen-2.5-vl-72b-instruct"

# Shared request headers; HTTP-Referer and X-Title identify the app
# to OpenRouter for attribution/rankings.
HEADERS = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json",
    "HTTP-Referer": "https://huggingface.co",
    "X-Title": "DocAI",
}
19
 
20
# ================================
# CORE EXTRACTION FUNCTION
# ================================
def extract_document(image: Image.Image):
    """Extract structured data from a document image via a vision LLM.

    Encodes the image as a base64 PNG data URL, sends it with an
    extraction prompt to OpenRouter's chat-completions endpoint, and
    returns pretty-printed JSON text suitable for a gr.Code component.

    Parameters:
        image: PIL image of the uploaded document, or None when nothing
            was uploaded.

    Returns:
        str: indented JSON — either the parsed extraction result, or an
        {"error": ..., "details": ...} payload on any failure.
    """
    if image is None:
        return json.dumps({"error": "No image uploaded"}, indent=2)

    # Convert image to base64 (local imports keep module top-level lean).
    import base64
    from io import BytesIO

    buffer = BytesIO()
    image.save(buffer, format="PNG")
    img_b64 = base64.b64encode(buffer.getvalue()).decode()

    # NOTE(review): the schema tail ("columns"/"rows" and the closing
    # braces) fell between diff hunks in the reviewed source and was
    # reconstructed here — confirm against the deployed app.py.
    prompt = """
You are a universal document understanding AI.

Rules:
- Be document-agnostic
- Detect document type
- Extract all visible structured data
- Extract tables fully (columns + rows)
- Preserve numbers exactly
- Use null for missing values
- Do NOT hallucinate
- Output ONLY valid JSON

JSON schema:
{
  "document_type": string | null,
  "confidence": number between 0 and 1,
  "summary": string,
  "fields": { "<field_name>": "<value or null>" },
  "tables": [
    {
      "table_name": string,
      "columns": [string],
      "rows": [[string | null]]
    }
  ]
}
"""

    payload = {
        "model": MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    # FIX: per the OpenAI/OpenRouter chat-completions
                    # schema, "image_url" must be an object with a "url"
                    # key, not a bare string.
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_b64}"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        "temperature": 0,      # deterministic extraction
        "max_tokens": 2000,
    }

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=HEADERS,
            json=payload,
            timeout=120,
        )
        response.raise_for_status()

        text = response.json()["choices"][0]["message"]["content"]

        # Trim any model chatter around the JSON object before parsing.
        start = text.find("{")
        end = text.rfind("}") + 1
        parsed = json.loads(text[start:end])

        return json.dumps(parsed, indent=2)

    except Exception as e:
        # Broad catch is deliberate: any network/HTTP/parse failure is
        # surfaced as a displayable error payload instead of crashing
        # the Gradio UI.
        return json.dumps(
            {
                "error": "Extraction failed",
                "details": str(e),
            },
            indent=2,
        )
104
+
105
+ # ================================
106
+ # UI (STABLE β€” NO VIBRATION)
107
+ # ================================
108
+ with gr.Blocks(css=".container { max-width: 1200px; margin: auto; }") as demo:
109
+ gr.Markdown(
110
+ """
111
+ # πŸ“„ DocAI β€” Universal Document Intelligence
112
+ **Vision-LLM powered. No templates. Any document.**
113
+ """
114
+ )
115
 
116
+ with gr.Row():
117
+ with gr.Column(scale=1):
118
+ input_img = gr.Image(
119
+ type="pil",
120
+ label="Upload Document",
121
+ height=420
122
+ )
123
+ extract_btn = gr.Button("Extract", variant="primary")
124
+
125
+ with gr.Column(scale=1):
126
+ output_json = gr.Code(
127
+ label="Extracted JSON",
128
+ language="json"
129
+ )
130
+
131
+ extract_btn.click(
132
+ fn=extract_document,
133
+ inputs=input_img,
134
+ outputs=output_json
135
+ )
136
 
137
  demo.launch()