akashraut committed on
Commit
d1c6be0
·
verified ·
1 Parent(s): 9417519

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -75
app.py CHANGED
@@ -1,86 +1,102 @@
1
  import gradio as gr
2
- import torch
 
3
  import json
4
- import os
5
  from PIL import Image
6
- from transformers import AutoProcessor, AutoModelForVision2Seq
7
- from qwen_vl_utils import process_vision_info
8
-
9
- # Use 3B to stay within 16GB RAM limit
10
- MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
11
-
12
- print("Loading processor...")
13
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
14
-
15
- print(f"Loading model {MODEL_ID} on CPU...")
16
- # AutoModelForVision2Seq is the correct class for Vision-Language models
17
- model = AutoModelForVision2Seq.from_pretrained(
18
- MODEL_ID,
19
- trust_remote_code=True,
20
- torch_dtype=torch.float32,
21
- low_cpu_mem_usage=True,
22
- device_map="cpu"
23
- )
24
- model.eval()
25
- print("Model loaded successfully!")
26
 
27
def extract_document(image: Image.Image):
    """Run the local Qwen2.5-VL model on a document image and return
    structured data as a dict.

    Returns ``{"error": ...}`` when no image is given, the parsed JSON
    object when the model emits valid JSON, and ``{"raw_output": ...}``
    when the model text cannot be parsed as JSON.
    """
    if image is None:
        return {"error": "Please upload an image."}

    # Chat-format request for Qwen2.5-VL: one user turn with image + text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Extract all data from this document as a clean JSON object. Include document_type, fields, and tables."},
            ],
        }
    ]

    # Build model inputs: templated prompt text plus pixel tensors.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cpu")

    # Generation is slow on CPU (~1 min per page per the original comment).
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

    # Drop the echoed prompt tokens so only the new completion is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    try:
        # Take the outermost {...} span in case the model wrapped the JSON
        # in prose or markdown fences.
        start = output_text.find("{")
        end = output_text.rfind("}") + 1
        return json.loads(output_text[start:end])
    except Exception:
        # Fix: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        # Catch Exception only and fall back to returning the raw text.
        return {"raw_output": output_text}
72
-
73
# Gradio front-end: document image in, extracted JSON out.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ“„ DocAI β€” Universal Document Intelligence")
    gr.Markdown("Using Qwen2.5-VL-3B on CPU.")

    with gr.Row():
        with gr.Column():
            upload_box = gr.Image(type="pil", label="Upload Document")
            run_button = gr.Button("Extract Data", variant="primary")
        with gr.Column():
            result_view = gr.JSON(label="Extracted JSON")

    run_button.click(fn=extract_document, inputs=upload_box, outputs=result_view)

demo.launch()
 
1
  import gradio as gr
2
+ import requests
3
+ import base64
4
  import json
 
5
  from PIL import Image
6
+ import io
7
+ import os
8
+
9
# Read the OpenRouter credential from the environment (None when unset).
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Vision-language model served through OpenRouter.
MODEL = "qwen/qwen-2.5-vl-72b-instruct"

def image_to_base64(image: Image.Image):
    """Encode a PIL image as a base64 string of its PNG-serialized bytes."""
    with io.BytesIO() as stream:
        image.save(stream, format="PNG")
        png_bytes = stream.getvalue()
    return base64.b64encode(png_bytes).decode()
 
 
 
 
 
 
 
 
 
17
 
18
def extract_document(image: Image.Image):
    """Send the uploaded document image to a vision LLM via OpenRouter and
    return the extracted structure as a dict.

    Returns ``{"error": ...}`` for missing input or API failures, the parsed
    JSON object on success, and ``{"raw_output": ...}`` when the model reply
    is not parseable JSON.
    """
    if image is None:
        return {"error": "No image uploaded"}

    img_b64 = image_to_base64(image)

    prompt = """
You are a universal document understanding system.

Rules:
- Be document-agnostic
- Detect document type if possible
- Extract ALL visible structured data
- Extract tables completely (columns + rows)
- Preserve numbers exactly
- Use null for missing values
- Do NOT hallucinate
- Return ONLY valid JSON

Schema:
{
  "document_type": string | null,
  "confidence": number (0-1),
  "summary": string,
  "fields": { "<key>": "<value | null>" },
  "tables": [
    {
      "table_name": string,
      "columns": [string],
      "rows": [[string | number | null]]
    }
  ]
}
"""

    payload = {
        "model": MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    # Fix: OpenRouter's /chat/completions endpoint uses the
                    # OpenAI chat-completions content schema ("text" and
                    # "image_url" parts with a base64 data URL). The previous
                    # "input_text"/"input_image"/"image_base64" keys belong
                    # to the Responses API and are rejected here.
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_b64}"},
                    },
                ],
            }
        ],
        # Deterministic output for extraction tasks.
        "temperature": 0,
    }

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }

    try:
        r = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )
        # Surface HTTP-level failures (401 bad key, 429 rate limit, ...)
        # instead of crashing on a missing "choices" key below.
        r.raise_for_status()
        response = r.json()
        text = response["choices"][0]["message"]["content"]
    except requests.RequestException as e:
        return {"error": f"API request failed: {e}"}
    except (KeyError, IndexError, TypeError, ValueError):
        # The endpoint answered but not in the expected shape.
        return {"error": "Unexpected API response", "raw_response": r.text}

    try:
        # Take the outermost {...} span in case the model wrapped the JSON
        # in prose or markdown fences.
        start = text.find("{")
        end = text.rfind("}") + 1
        return json.loads(text[start:end])
    except Exception:
        return {"raw_output": text}
91
+
92
# Gradio UI: one row holding the upload widget and the JSON result view.
with gr.Blocks(title="DocAI – Universal Document Extractor") as demo:
    gr.Markdown("# πŸ“„ DocAI – Universal Document Intelligence")
    gr.Markdown("Vision-LLM powered. No templates. Any document.")

    with gr.Row():
        document_image = gr.Image(type="pil", label="Upload document")
        extracted_json = gr.JSON(label="Extracted JSON")

    extract_button = gr.Button("Extract")
    extract_button.click(fn=extract_document, inputs=document_image, outputs=extracted_json)

demo.launch()