akashraut committed on
Commit
77a55a1
·
verified ·
1 Parent(s): d1c6be0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -51
app.py CHANGED
@@ -1,45 +1,56 @@
1
  import gradio as gr
2
- import requests
3
- import base64
4
  import json
5
  from PIL import Image
6
- import io
7
  import os
8
 
9
- OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
10
-
11
- MODEL = "qwen/qwen-2.5-vl-72b-instruct"
12
-
13
- def image_to_base64(image: Image.Image):
14
- buf = io.BytesIO()
15
- image.save(buf, format="PNG")
16
- return base64.b64encode(buf.getvalue()).decode()
 
 
 
 
17
 
 
 
 
18
  def extract_document(image: Image.Image):
19
  if image is None:
20
- return {"error": "No image uploaded"}
21
 
22
- img_b64 = image_to_base64(image)
 
 
 
 
 
 
23
 
24
  prompt = """
25
- You are a universal document understanding system.
26
 
27
  Rules:
28
  - Be document-agnostic
29
- - Detect document type if possible
30
- - Extract ALL visible structured data
31
- - Extract tables completely (columns + rows)
32
  - Preserve numbers exactly
33
  - Use null for missing values
34
  - Do NOT hallucinate
35
- - Return ONLY valid JSON
36
 
37
- Schema:
38
  {
39
  "document_type": string | null,
40
- "confidence": number (0-1),
41
  "summary": string,
42
- "fields": { "<key>": "<value | null>" },
43
  "tables": [
44
  {
45
  "table_name": string,
@@ -56,47 +67,71 @@ Schema:
56
  {
57
  "role": "user",
58
  "content": [
59
- {"type": "input_text", "text": prompt},
60
- {
61
- "type": "input_image",
62
- "image_base64": img_b64
63
- }
64
  ]
65
  }
66
  ],
67
- "temperature": 0
68
- }
69
-
70
- headers = {
71
- "Authorization": f"Bearer {OPENROUTER_API_KEY}",
72
- "Content-Type": "application/json"
73
  }
74
 
75
- r = requests.post(
76
- "https://openrouter.ai/api/v1/chat/completions",
77
- headers=headers,
78
- json=payload,
79
- timeout=120
80
- )
 
 
81
 
82
- response = r.json()
83
- text = response["choices"][0]["message"]["content"]
84
 
85
- try:
86
  start = text.find("{")
87
  end = text.rfind("}") + 1
88
- return json.loads(text[start:end])
89
- except Exception:
90
- return {"raw_output": text}
91
 
92
- with gr.Blocks(title="DocAI – Universal Document Extractor") as demo:
93
- gr.Markdown("# πŸ“„ DocAI – Universal Document Intelligence")
94
- gr.Markdown("Vision-LLM powered. No templates. Any document.")
95
 
96
- with gr.Row():
97
- img = gr.Image(type="pil", label="Upload document")
98
- out = gr.JSON(label="Extracted JSON")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- gr.Button("Extract").click(extract_document, img, out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  demo.launch()
 
1
  import gradio as gr
 
 
2
  import json
3
  from PIL import Image
4
+ import requests
5
  import os
6
 
7
# ================================
# CONFIG
# ================================
# API key is read from the environment (set in HF Spaces Secrets).
# NOTE(review): if unset this is None and the header becomes "Bearer None",
# which OpenRouter rejects with 401 — fail-fast validation may be preferable.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # set in HF Secrets

# Any vision-capable model slug on OpenRouter works here.
MODEL = "qwen/qwen-2.5-vl-72b-instruct"

# Shared request headers; HTTP-Referer and X-Title identify the app
# to OpenRouter for attribution/rankings.
HEADERS = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json",
    "HTTP-Referer": "https://huggingface.co",
    "X-Title": "DocAI",
}
19
 
20
# ================================
# CORE EXTRACTION FUNCTION
# ================================
def extract_document(image: Image.Image):
    """Extract structured data from a document image via a vision LLM.

    Encodes the image as a base64 PNG data URL, sends it with an
    extraction prompt to OpenRouter's chat-completions endpoint, and
    returns pretty-printed JSON text suitable for a gr.Code component.

    Parameters:
        image: PIL image of the uploaded document, or None when nothing
            was uploaded.

    Returns:
        str: indented JSON — either the parsed extraction result, or an
        {"error": ..., "details": ...} payload on any failure.
    """
    if image is None:
        return json.dumps({"error": "No image uploaded"}, indent=2)

    # Convert image to base64 (local imports keep module top-level lean).
    import base64
    from io import BytesIO

    buffer = BytesIO()
    image.save(buffer, format="PNG")
    img_b64 = base64.b64encode(buffer.getvalue()).decode()

    # NOTE(review): the schema tail ("columns"/"rows" and the closing
    # braces) fell between diff hunks in the reviewed source and was
    # reconstructed here — confirm against the deployed app.py.
    prompt = """
You are a universal document understanding AI.

Rules:
- Be document-agnostic
- Detect document type
- Extract all visible structured data
- Extract tables fully (columns + rows)
- Preserve numbers exactly
- Use null for missing values
- Do NOT hallucinate
- Output ONLY valid JSON

JSON schema:
{
  "document_type": string | null,
  "confidence": number between 0 and 1,
  "summary": string,
  "fields": { "<field_name>": "<value or null>" },
  "tables": [
    {
      "table_name": string,
      "columns": [string],
      "rows": [[string | null]]
    }
  ]
}
"""

    payload = {
        "model": MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    # FIX: per the OpenAI/OpenRouter chat-completions
                    # schema, "image_url" must be an object with a "url"
                    # key, not a bare string.
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_b64}"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        "temperature": 0,      # deterministic extraction
        "max_tokens": 2000,
    }

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=HEADERS,
            json=payload,
            timeout=120,
        )
        response.raise_for_status()

        text = response.json()["choices"][0]["message"]["content"]

        # Trim any model chatter around the JSON object before parsing.
        start = text.find("{")
        end = text.rfind("}") + 1
        parsed = json.loads(text[start:end])

        return json.dumps(parsed, indent=2)

    except Exception as e:
        # Broad catch is deliberate: any network/HTTP/parse failure is
        # surfaced as a displayable error payload instead of crashing
        # the Gradio UI.
        return json.dumps(
            {
                "error": "Extraction failed",
                "details": str(e),
            },
            indent=2,
        )
104
+
105
+ # ================================
106
+ # UI (STABLE β€” NO VIBRATION)
107
+ # ================================
108
+ with gr.Blocks(css=".container { max-width: 1200px; margin: auto; }") as demo:
109
+ gr.Markdown(
110
+ """
111
+ # πŸ“„ DocAI β€” Universal Document Intelligence
112
+ **Vision-LLM powered. No templates. Any document.**
113
+ """
114
+ )
115
 
116
+ with gr.Row():
117
+ with gr.Column(scale=1):
118
+ input_img = gr.Image(
119
+ type="pil",
120
+ label="Upload Document",
121
+ height=420
122
+ )
123
+ extract_btn = gr.Button("Extract", variant="primary")
124
+
125
+ with gr.Column(scale=1):
126
+ output_json = gr.Code(
127
+ label="Extracted JSON",
128
+ language="json"
129
+ )
130
+
131
+ extract_btn.click(
132
+ fn=extract_document,
133
+ inputs=input_img,
134
+ outputs=output_json
135
+ )
136
 
137
  demo.launch()