Liviu16 committed on
Commit
d0c8e87
Β·
verified Β·
1 Parent(s): 7080e09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -18
app.py CHANGED
@@ -8,7 +8,7 @@ import io
8
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
9
  from qwen_vl_utils import process_vision_info
10
 
11
- # --- DETAILED SCHEMAS RESTORED ---
12
  SCHEMAS = {
13
  "VODAFONE": {
14
  "vendor": "VODAFONE ROMANIA",
@@ -45,7 +45,6 @@ SCHEMAS = {
45
  MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
46
 
47
  def load_model():
48
- # Keep 4-bit for speed even on ZeroGPU
49
  quant_config = BitsAndBytesConfig(
50
  load_in_4bit=True,
51
  bnb_4bit_compute_dtype=torch.float16,
@@ -55,7 +54,7 @@ def load_model():
55
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
56
  MODEL_ID,
57
  torch_dtype="auto",
58
- device_map="cuda", # Explicit for ZeroGPU
59
  quantization_config=quant_config
60
  )
61
  processor = AutoProcessor.from_pretrained(MODEL_ID, max_pixels=1280*1280)
@@ -66,26 +65,30 @@ model, processor = load_model()
66
  # --- PDF TO IMAGE HELPER ---
67
  def get_pdf_page_image(pdf_path):
68
  doc = fitz.open(pdf_path)
69
- page = doc.load_page(0) # First page only
70
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom
71
  img = Image.open(io.BytesIO(pix.tobytes()))
72
  doc.close()
73
  return img
74
 
75
  # --- INFERENCE ---
76
  @spaces.GPU(duration=60)
77
- def process_invoice(file_info):
78
- if file_info is None: return {"error": "No file uploaded"}
 
79
 
80
- # Handle File Type
 
81
  if file_info.name.lower().endswith(".pdf"):
82
  image = get_pdf_page_image(file_info.name)
83
  else:
84
  image = Image.open(file_info.name)
85
 
86
- # 1. Router
 
87
  decision_prompt = "Identify vendor: VODAFONE, DIGI, or GENERAL. Reply with one word."
88
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": decision_prompt}]}]
 
89
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
90
  image_inputs, _ = process_vision_info(messages)
91
  inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)
@@ -95,7 +98,8 @@ def process_invoice(file_info):
95
 
96
  vendor_key = "VODAFONE" if "VODAFONE" in raw_choice else ("DIGI" if "DIGI" in raw_choice else "GENERAL")
97
 
98
- # 2. Specialist
 
99
  schema_json = json.dumps(SCHEMAS[vendor_key], indent=2)
100
  extract_prompt = f"Extract details as JSON strictly following this schema: {schema_json}. Return ONLY valid JSON."
101
 
@@ -106,22 +110,37 @@ def process_invoice(file_info):
106
  generated_ids = model.generate(**inputs, max_new_tokens=1536)
107
  result = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
108
 
 
 
 
109
  try:
110
- return json.loads(result.strip().replace('```json', '').replace('```', ''))
 
 
111
  except:
112
- return {"raw_output": result}
 
113
 
114
  # --- INTERFACE ---
115
- with gr.Blocks(title="InvoiceRecon") as demo:
116
- gr.Markdown("# πŸ“‘ IntelliReceipt: Local AI Invoice Parser")
 
 
117
  with gr.Row():
118
  with gr.Column(scale=1):
119
- # Using gr.File for the PDF preview experience
120
- file_input = gr.File(label="Upload Invoice (PDF or Image)", file_types=[".pdf", ".png", ".jpg"])
 
121
  run_btn = gr.Button("πŸš€ Extract Data", variant="primary")
 
122
  with gr.Column(scale=1):
123
- json_output = gr.JSON(label="Extracted Result")
124
 
125
- run_btn.click(fn=process_invoice, inputs=file_input, outputs=json_output)
 
 
 
 
 
126
 
127
  demo.launch()
 
8
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
9
  from qwen_vl_utils import process_vision_info
10
 
11
+ # --- DETAILED SCHEMAS ---
12
  SCHEMAS = {
13
  "VODAFONE": {
14
  "vendor": "VODAFONE ROMANIA",
 
45
  MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
46
 
47
  def load_model():
 
48
  quant_config = BitsAndBytesConfig(
49
  load_in_4bit=True,
50
  bnb_4bit_compute_dtype=torch.float16,
 
54
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
55
  MODEL_ID,
56
  torch_dtype="auto",
57
+ device_map="cuda",
58
  quantization_config=quant_config
59
  )
60
  processor = AutoProcessor.from_pretrained(MODEL_ID, max_pixels=1280*1280)
 
65
# --- PDF TO IMAGE HELPER ---
def get_pdf_page_image(pdf_path):
    """Render the first page of a PDF as a PIL image.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        PIL.Image.Image: The first page rasterized at 2x zoom (twice the
        default 72 DPI), which keeps small invoice text legible for the VLM.
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(0)  # first page only
        # Matrix(2, 2) doubles the rendering resolution on both axes.
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        return Image.open(io.BytesIO(pix.tobytes()))
    finally:
        # Close the document even if rendering fails, so the file handle
        # is not leaked on malformed/corrupt PDFs.
        doc.close()
73
 
74
  # --- INFERENCE ---
75
  @spaces.GPU(duration=60)
76
+ def process_invoice(file_info, progress=gr.Progress()):
77
+ if file_info is None:
78
+ return None, {"error": "No file uploaded"}
79
 
80
+ # 1. Handle File Type and Preview
81
+ progress(0.1, desc="πŸ“„ Processing document...")
82
  if file_info.name.lower().endswith(".pdf"):
83
  image = get_pdf_page_image(file_info.name)
84
  else:
85
  image = Image.open(file_info.name)
86
 
87
+ # 2. Router (Identify Vendor)
88
+ progress(0.3, desc="πŸ” Identifying vendor (Router)...")
89
  decision_prompt = "Identify vendor: VODAFONE, DIGI, or GENERAL. Reply with one word."
90
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": decision_prompt}]}]
91
+
92
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
93
  image_inputs, _ = process_vision_info(messages)
94
  inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)
 
98
 
99
  vendor_key = "VODAFONE" if "VODAFONE" in raw_choice else ("DIGI" if "DIGI" in raw_choice else "GENERAL")
100
 
101
+ # 3. Specialist (Extract Data)
102
+ progress(0.6, desc=f"πŸ€– Extracting {vendor_key} details...")
103
  schema_json = json.dumps(SCHEMAS[vendor_key], indent=2)
104
  extract_prompt = f"Extract details as JSON strictly following this schema: {schema_json}. Return ONLY valid JSON."
105
 
 
110
  generated_ids = model.generate(**inputs, max_new_tokens=1536)
111
  result = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
112
 
113
+ progress(0.9, desc="βš™οΈ Finalizing result...")
114
+
115
+ # 4. Return Image for Preview and JSON for data
116
  try:
117
+ data = json.loads(result.strip().replace('```json', '').replace('```', ''))
118
+ progress(1.0, desc="βœ… Success!")
119
+ return image, data
120
  except:
121
+ progress(1.0, desc="⚠️ Extraction complete with formatting issues")
122
+ return image, {"raw_output": result}
123
 
124
# --- INTERFACE ---
# Two-column layout: upload + page preview on the left, extracted JSON on the right.
with gr.Blocks(title="InvoiceRecon", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ“‘ IntelliReceipt: Real-Time Invoice AI")
    gr.Markdown("Upload a Romanian invoice (PDF or Image) to extract structured data using Qwen2.5-VL.")

    with gr.Row():
        with gr.Column(scale=1):
            upload_box = gr.File(label="1. Upload Invoice", file_types=[".pdf", ".png", ".jpg"])
            # Shows the rendered first page so the user can sanity-check the upload.
            page_preview = gr.Image(label="2. Document Preview", type="pil")
            extract_btn = gr.Button("πŸš€ Extract Data", variant="primary")

        with gr.Column(scale=1):
            result_json = gr.JSON(label="3. Extracted JSON Result")

    # process_invoice returns a (preview image, parsed JSON) pair, so the
    # outputs list must contain exactly these two components in this order.
    extract_btn.click(
        fn=process_invoice,
        inputs=upload_box,
        outputs=[page_preview, result_json],
    )

demo.launch()