akashraut committed on
Commit
0d9ba16
·
verified ·
1 Parent(s): 555a02b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -97
app.py CHANGED
@@ -1,129 +1,110 @@
1
- import gradio as gr
2
- import requests
3
- import json
4
  import os
5
- import base64
 
 
 
6
  from PIL import Image
7
- from io import BytesIO
8
 
9
- OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 
 
 
 
 
10
 
11
- MODEL_ID = "nvidia/nemotron-nano-12b-v2-vl:free"
12
- OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
13
 
 
14
 
15
- def image_to_base64(image: Image.Image) -> str:
16
- buffered = BytesIO()
17
- image.save(buffered, format="PNG")
18
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
19
 
 
 
 
20
 
21
- def extract_document(image: Image.Image):
22
- if image is None:
23
- return {"error": "Please upload an image"}
24
 
25
- if not OPENROUTER_API_KEY:
26
- return {"error": "OPENROUTER_API_KEY not set"}
27
 
28
- img_b64 = image_to_base64(image)
 
 
 
 
29
 
30
  prompt = """
31
- You are a universal document understanding AI.
32
-
33
- Return ONLY valid JSON.
34
- Do NOT include explanations or markdown.
35
-
36
- Rules:
37
- - Be document-agnostic
38
- - Detect document_type if possible
39
- - Extract all visible key-value fields
40
- - Extract tables with full rows and columns
41
- - Preserve numbers exactly
42
- - Use null for missing values
43
- - Do not hallucinate
44
-
45
- JSON schema:
 
46
  {
47
- "document_type": string | null,
48
- "summary": string,
49
- "fields": {
50
- "<field_name>": "<value or null>"
51
- },
52
- "tables": [
53
  {
54
  "table_name": string,
55
- "columns": [string],
56
- "rows": [[string | number | null]]
 
 
 
 
 
 
 
57
  }
58
  ]
59
  }
60
  """
61
 
62
- payload = {
63
- "model": MODEL_ID,
64
- "messages": [
65
- {
66
- "role": "user",
67
- "content": [
68
- {"type": "text", "text": prompt},
69
- {
70
- "type": "image_url",
71
- "image_url": {
72
- "url": f"data:image/png;base64,{img_b64}"
73
- }
74
- }
75
- ]
76
- }
77
- ],
78
- "temperature": 0.0,
79
- "max_tokens": 1200
80
- }
81
-
82
- headers = {
83
- "Authorization": f"Bearer {OPENROUTER_API_KEY}",
84
- "Content-Type": "application/json",
85
- "HTTP-Referer": "https://huggingface.co",
86
- "X-Title": "DocAI"
87
- }
88
-
89
  try:
90
- response = requests.post(
91
- OPENROUTER_URL,
92
- headers=headers,
93
- json=payload,
94
- timeout=90
 
95
  )
96
- response.raise_for_status()
97
- content = response.json()["choices"][0]["message"]["content"]
98
 
99
- start = content.find("{")
100
- end = content.rfind("}") + 1
101
- return json.loads(content[start:end])
102
 
103
  except Exception as e:
104
- return {
105
- "error": "Extraction failed",
106
- "details": str(e)
107
- }
108
-
109
-
110
- with gr.Blocks(title="DocAI – Universal Document Intelligence") as demo:
111
- gr.Markdown(
112
- """
113
- # 📄 DocAI – Universal Document Intelligence
114
- Vision-powered. No templates. Any document.
115
- **Model:** NVIDIA Nemotron Nano 12B VL (free)
116
- """
117
- )
118
 
119
  with gr.Row():
120
- input_img = gr.Image(type="pil", label="Upload document")
121
- output_json = gr.JSON(label="Extracted JSON")
 
 
 
 
122
 
123
- extract_btn = gr.Button("Extract", variant="primary")
124
  extract_btn.click(
125
- fn=extract_document,
126
- inputs=input_img,
127
  outputs=output_json
128
  )
129
 
 
 
 
 
1
  import os
2
+ import json
3
+ import time
4
+ import gradio as gr
5
+ import google.generativeai as genai
6
  from PIL import Image
 
7
 
8
+ # -----------------------------
9
+ # Gemini Configuration
10
+ # -----------------------------
11
+ GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
12
+ if not GEMINI_API_KEY:
13
+ raise RuntimeError("GEMINI_API_KEY not found in Hugging Face Secrets")
14
 
15
+ genai.configure(api_key=GEMINI_API_KEY)
 
16
 
17
+ MODEL_NAME = "gemini-1.5-flash-latest"
18
 
19
+ model = genai.GenerativeModel(MODEL_NAME)
 
 
 
20
 
21
+ # Simple rate limiter (protects your quota)
22
+ LAST_CALL_TS = 0
23
+ MIN_INTERVAL = 3 # seconds
24
 
 
 
 
25
 
26
+ def extract_financial_document(image: Image.Image):
27
+ global LAST_CALL_TS
28
 
29
+ # --- Rate limiting ---
30
+ now = time.time()
31
+ if now - LAST_CALL_TS < MIN_INTERVAL:
32
+ return {"error": "Rate limited. Please wait a few seconds."}
33
+ LAST_CALL_TS = now
34
 
35
  prompt = """
36
+ You are a financial document intelligence system.
37
+
38
+ TASKS:
39
+ 1. Identify the document type.
40
+ 2. Extract ALL tables exactly as they appear.
41
+ 3. Preserve row/column structure.
42
+ 4. Convert charts (pie/bar) into numeric insights.
43
+ 5. Do NOT hallucinate values.
44
+ 6. Numbers must be exact.
45
+
46
+ OUTPUT RULES:
47
+ - Return ONLY valid JSON
48
+ - No markdown
49
+ - No explanations
50
+
51
+ JSON SCHEMA:
52
  {
53
+ "document_type": string,
54
+ "summary_fields": { "key": "value" },
55
+ "table_data": [
 
 
 
56
  {
57
  "table_name": string,
58
+ "headers": [string],
59
+ "rows": [[string]]
60
+ }
61
+ ],
62
+ "visual_insights": [
63
+ {
64
+ "chart_title": string,
65
+ "chart_type": string,
66
+ "trends": string
67
  }
68
  ]
69
  }
70
  """
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  try:
73
+ response = model.generate_content(
74
+ [prompt, image],
75
+ generation_config={
76
+ "temperature": 0,
77
+ "response_mime_type": "application/json"
78
+ }
79
  )
 
 
80
 
81
+ # Ensure valid JSON
82
+ return json.loads(response.text)
 
83
 
84
  except Exception as e:
85
+ return {"error": str(e)}
86
+
87
+
88
+ # -----------------------------
89
+ # Gradio UI
90
+ # -----------------------------
91
+ with gr.Blocks(title="Financial DocAI (Gemini Vision)") as demo:
92
+ gr.Markdown("""
93
+ # 📄 Financial DocAI — Gemini Vision
94
+ Upload a financial document image (portfolio report, MF statement, etc.)
95
+ """)
 
 
 
96
 
97
  with gr.Row():
98
+ image_input = gr.Image(type="pil", label="Upload Document Image")
99
+
100
+ with gr.Row():
101
+ extract_btn = gr.Button("Extract Data")
102
+
103
+ output_json = gr.JSON(label="Extracted Structured Data")
104
 
 
105
  extract_btn.click(
106
+ fn=extract_financial_document,
107
+ inputs=image_input,
108
  outputs=output_json
109
  )
110