File size: 3,672 Bytes
d1c6be0
0d9ba16
 
 
 
53b400b
 
25ca7ed
 
 
53b400b
25ca7ed
 
 
53b400b
25ca7ed
53b400b
25ca7ed
0d9ba16
d1c6be0
25ca7ed
0d9ba16
 
1614ed7
d1c6be0
25ca7ed
 
 
 
 
0d9ba16
77a55a1
25ca7ed
0d9ba16
 
 
 
d1c6be0
 
25ca7ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d9ba16
 
25ca7ed
0d9ba16
25ca7ed
 
 
d1c6be0
25ca7ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1c6be0
25ca7ed
d1c6be0
 
 
77a55a1
0d9ba16
 
 
 
 
 
77a55a1
d1c6be0
0d9ba16
d1c6be0
77a55a1
0d9ba16
 
 
25ca7ed
 
 
 
 
0d9ba16
25ca7ed
cdacb08
25ca7ed
0d9ba16
25ca7ed
 
 
 
 
 
0d9ba16
25ca7ed
 
 
77a55a1
 
25ca7ed
0d9ba16
25ca7ed
77a55a1
cdacb08
d1c6be0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import json
import time
import gradio as gr
import google.generativeai as genai
from PIL import Image

# ============================================================
# Configuration
# ============================================================

# Gemini model identifier used for every extraction request.
MODEL_NAME = "gemini-2.5-flash"

# Simple global rate limit (HF protection): a call is rejected when it arrives
# less than MIN_INTERVAL seconds after the last accepted one.
MIN_INTERVAL = 3  # seconds
LAST_CALL_TS = 0

# The API key comes from the environment; refuse to start when it is absent
# or empty.
API_KEY = os.environ.get("GEMINI_API_KEY")
if API_KEY is None or API_KEY == "":
    raise RuntimeError("GEMINI_API_KEY missing in Hugging Face Secrets")

genai.configure(api_key=API_KEY)
model = genai.GenerativeModel(MODEL_NAME)


# ============================================================
# Core Extraction Logic (Doc-Agnostic)
# ============================================================

# Instruction prompt sent alongside every image. Kept at module level so the
# request-handling code in extract_document stays short.
_EXTRACTION_PROMPT = """
You are a document intelligence system.

Your job is to analyze ANY document image and produce a
Docsumo-compatible structured JSON output.

DOCUMENT TYPES MAY INCLUDE (but are not limited to):
- Financial statements
- Invoices
- Forms
- Reports
- Letters
- Tables-only documents

--------------------------------
TASKS
--------------------------------
1. Identify document_type and document_subtype.
2. Extract all key-value fields visible in the document.
3. Extract ALL tables with exact row/column structure.
4. If charts/graphs exist, summarize insights textually.
5. Do NOT hallucinate missing data.
6. Preserve numbers exactly as shown.

--------------------------------
OUTPUT RULES
--------------------------------
- Output ONLY valid JSON
- No markdown
- No explanations
- Follow the schema EXACTLY

--------------------------------
DOCSUMO-COMPATIBLE JSON SCHEMA
--------------------------------
{
  "document_metadata": {
    "document_type": string,
    "document_subtype": string,
    "page_count": number,
    "language": string
  },
  "extraction": {
    "fields": {
      "<field_name>": {
        "value": string,
        "normalized_value": string | null,
        "type": "string" | "number" | "date" | "currency" | "enum"
      }
    },
    "tables": {
      "<table_id>": {
        "table_label": string,
        "headers": [string],
        "rows": [
          { "<header>": string }
        ]
      }
    },
    "derived_insights": {
      "<insight_name>": {
        "value": string
      }
    }
  }
}
"""


def extract_document(image: "Image.Image | None"):
    """Send a document image to the Gemini model and return its JSON output.

    Args:
        image: The uploaded document as a PIL image, or ``None`` when no
            image was supplied (the Gradio image input is expected to pass
            ``None`` when nothing has been uploaded).

    Returns:
        The value parsed from the model's JSON response on success.
        Otherwise a dict of the form ``{"error": <message>}`` when no image
        was given, when the call is rejected by the global rate limit, when
        the model's reply is not valid JSON, or when the request fails for
        any other reason.
    """
    global LAST_CALL_TS

    # Reject empty submissions first so they neither consume the rate-limit
    # window nor reach the model.
    if image is None:
        return {"error": "No image provided. Please upload a document image."}

    # ---- Rate limiting ----
    now = time.time()
    if now - LAST_CALL_TS < MIN_INTERVAL:
        return {"error": "Rate limited. Please wait a few seconds."}
    LAST_CALL_TS = now

    try:
        response = model.generate_content(
            [_EXTRACTION_PROMPT, image],
            generation_config={
                "temperature": 0,
                "response_mime_type": "application/json",
            },
        )
        return json.loads(response.text)
    except json.JSONDecodeError as exc:
        # The model answered, but not with parseable JSON.
        return {"error": f"Model returned invalid JSON: {exc}"}
    except Exception as exc:
        # UI boundary: report any failure to the user instead of crashing
        # the handler.
        return {"error": str(exc)}


# ============================================================
# Gradio UI (HF)
# ============================================================

# Single-page Gradio interface: an image upload, a trigger button, and a JSON
# viewer wired to extract_document. The title and heading use real UTF-8
# punctuation/emoji (en dash, page emoji, em dash).
with gr.Blocks(title="DocAI – Docsumo Compatible") as demo:
    gr.Markdown("""
# 📄 DocAI — Docsumo-Compatible Document Intelligence

Upload **any document image** (invoice, statement, report, form).

This demo returns a **Docsumo-compatible JSON contract**:
- Document metadata
- Key-value fields
- Tables
- Derived insights
""")

    # type="pil" makes the handler receive a PIL image object.
    image_input = gr.Image(type="pil", label="Upload Document Image")
    extract_btn = gr.Button("Extract Document")
    output = gr.JSON(label="Docsumo-Compatible JSON Output")

    # Clicking the button runs the extraction and renders its result as JSON.
    extract_btn.click(
        fn=extract_document,
        inputs=image_input,
        outputs=output
    )

demo.launch()