| | import os |
| | import json |
| | import time |
| | import gradio as gr |
| | import google.generativeai as genai |
| | from PIL import Image |
| |
|
| | |
| | |
| | |
| |
|
# --- Gemini client configuration -------------------------------------------
# The API key is read from the environment; an unset or empty value aborts
# start-up immediately instead of failing later on the first request.
API_KEY = os.environ.get("GEMINI_API_KEY")
if API_KEY is None or API_KEY == "":
    raise RuntimeError("GEMINI_API_KEY missing in Hugging Face Secrets")

genai.configure(api_key=API_KEY)

# A single model handle is created once and shared by every request.
MODEL_NAME = "gemini-2.5-flash"
model = genai.GenerativeModel(MODEL_NAME)

# --- Rate-limit state --------------------------------------------------------
# LAST_CALL_TS holds the timestamp of the most recently accepted request
# (0 means none yet); MIN_INTERVAL is the minimum gap, in seconds, that
# extract_document enforces between accepted requests.
LAST_CALL_TS = 0
MIN_INTERVAL = 3
| |
|
| |
|
| | |
| | |
| | |
| |
|
# Instruction text sent to the model together with the uploaded image.
_EXTRACTION_PROMPT = """
You are a document intelligence system.

Your job is to analyze ANY document image and produce a
Docsumo-compatible structured JSON output.

DOCUMENT TYPES MAY INCLUDE (but are not limited to):
- Financial statements
- Invoices
- Forms
- Reports
- Letters
- Tables-only documents

--------------------------------
TASKS
--------------------------------
1. Identify document_type and document_subtype.
2. Extract all key-value fields visible in the document.
3. Extract ALL tables with exact row/column structure.
4. If charts/graphs exist, summarize insights textually.
5. Do NOT hallucinate missing data.
6. Preserve numbers exactly as shown.

--------------------------------
OUTPUT RULES
--------------------------------
- Output ONLY valid JSON
- No markdown
- No explanations
- Follow the schema EXACTLY

--------------------------------
DOCSUMO-COMPATIBLE JSON SCHEMA
--------------------------------
{
  "document_metadata": {
    "document_type": string,
    "document_subtype": string,
    "page_count": number,
    "language": string
  },
  "extraction": {
    "fields": {
      "<field_name>": {
        "value": string,
        "normalized_value": string | null,
        "type": "string" | "number" | "date" | "currency" | "enum"
      }
    },
    "tables": {
      "<table_id>": {
        "table_label": string,
        "headers": [string],
        "rows": [
          { "<header>": string }
        ]
      }
    },
    "derived_insights": {
      "<insight_name>": {
        "value": string
      }
    }
  }
}
"""


def extract_document(image: "Image.Image"):
    """Send a document image to Gemini and return the parsed JSON result.

    Args:
        image: The uploaded document as a PIL image. May be ``None`` when the
            handler is triggered before an image has been uploaded.

    Returns:
        The JSON value decoded from the model's response, or a dict of the
        form ``{"error": "<message>"}`` when no image was supplied, the call
        arrived within ``MIN_INTERVAL`` seconds of the previous accepted
        call, or the request / JSON decoding raised an exception.
    """
    global LAST_CALL_TS

    # Reject an empty submission before touching the rate limiter, so a stray
    # click neither consumes the rate-limit window nor reaches the API.
    if image is None:
        return {"error": "No image provided. Please upload a document image."}

    # Use a monotonic clock so wall-clock adjustments cannot stretch or skip
    # the window. A falsy LAST_CALL_TS means no call has been accepted yet;
    # the monotonic clock's reference point is arbitrary, so the initial 0
    # must not be compared against it.
    now = time.monotonic()
    if LAST_CALL_TS and now - LAST_CALL_TS < MIN_INTERVAL:
        return {"error": "Rate limited. Please wait a few seconds."}
    LAST_CALL_TS = now

    try:
        response = model.generate_content(
            [_EXTRACTION_PROMPT, image],
            generation_config={
                "temperature": 0,
                "response_mime_type": "application/json",
            },
        )
        return json.loads(response.text)
    except Exception as e:
        # UI boundary: report any failure (API error, blocked response,
        # malformed JSON) in the output panel instead of raising.
        return {"error": str(e)}
|
| |
|
| | |
| | |
| | |
| |
|
# Introductory text rendered at the top of the page.
# NOTE(review): the title and heading previously contained mis-decoded glyphs
# ("β" and "π"); the em dash and page emoji below are a presumed restoration
# -- confirm against the intended text.
_INTRO_MARKDOWN = """
# 📄 DocAI — Docsumo-Compatible Document Intelligence

Upload **any document image** (invoice, statement, report, form).

This demo returns a **Docsumo-compatible JSON contract**:
- Document metadata
- Key-value fields
- Tables
- Derived insights
"""

# Page layout: intro text, image upload, trigger button and JSON result panel.
with gr.Blocks(title="DocAI — Docsumo Compatible") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    image_input = gr.Image(type="pil", label="Upload Document Image")
    extract_btn = gr.Button("Extract Document")
    output = gr.JSON(label="Docsumo-Compatible JSON Output")

    # Clicking the button passes the uploaded image to extract_document and
    # shows whatever it returns in the JSON panel.
    extract_btn.click(
        fn=extract_document,
        inputs=image_input,
        outputs=output,
    )

demo.launch()
| |
|