Liviu16 committed on
Commit
7080e09
·
verified ·
1 Parent(s): 4de3590

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -22
app.py CHANGED
@@ -2,39 +2,88 @@ import gradio as gr
2
  import torch
3
  import json
4
  import spaces
5
- import fitz # PyMuPDF
6
  from PIL import Image
7
  import io
8
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
9
  from qwen_vl_utils import process_vision_info
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # --- MODEL LOADING ---
12
  MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
13
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda")
14
- processor = AutoProcessor.from_pretrained(MODEL_ID, max_pixels=1280*1280)
15
 
16
- # --- PDF HELPER ---
17
- def pdf_to_image(pdf_path):
18
- """Converts the first page of a PDF to a PIL Image."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  doc = fitz.open(pdf_path)
20
- page = doc.load_page(0) # Extract only first page for demo
21
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for better OCR
22
  img = Image.open(io.BytesIO(pix.tobytes()))
23
  doc.close()
24
  return img
25
 
 
26
  @spaces.GPU(duration=60)
27
  def process_invoice(file_info):
28
  if file_info is None: return {"error": "No file uploaded"}
29
 
30
- # 1. Handle PDF vs Image
31
- file_path = file_info.name
32
- if file_path.lower().endswith(".pdf"):
33
- image = pdf_to_image(file_path)
34
  else:
35
- image = Image.open(file_path)
36
 
37
- # 2. Identify Vendor (Router)
38
  decision_prompt = "Identify vendor: VODAFONE, DIGI, or GENERAL. Reply with one word."
39
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": decision_prompt}]}]
40
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -44,11 +93,12 @@ def process_invoice(file_info):
44
  generated_ids = model.generate(**inputs, max_new_tokens=10)
45
  raw_choice = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0].strip().upper()
46
 
47
- # [Your Schema Logic Here...]
48
  vendor_key = "VODAFONE" if "VODAFONE" in raw_choice else ("DIGI" if "DIGI" in raw_choice else "GENERAL")
49
 
50
- # 3. Extract Data (Specialist)
51
- extract_prompt = f"Return ONLY valid JSON for {vendor_key} invoice."
 
 
52
  messages[0]["content"][1]["text"] = extract_prompt
53
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
54
  inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)
@@ -61,13 +111,13 @@ def process_invoice(file_info):
61
  except:
62
  return {"raw_output": result}
63
 
64
- # --- TRANSKRIBUS LAYOUT ---
65
  with gr.Blocks(title="InvoiceRecon") as demo:
66
- gr.Markdown("# 📑 IntelliReceipt: Real-Time Invoice AI")
67
  with gr.Row():
68
  with gr.Column(scale=1):
69
- # gr.File supports the PDF preview you want to see
70
- file_input = gr.File(label="Upload Invoice (PDF, PNG, JPG)", file_types=[".pdf", ".png", ".jpg"])
71
  run_btn = gr.Button("🚀 Extract Data", variant="primary")
72
  with gr.Column(scale=1):
73
  json_output = gr.JSON(label="Extracted Result")
 
2
  import torch
3
  import json
4
  import spaces
5
+ import fitz # PyMuPDF for PDF handling
6
  from PIL import Image
7
  import io
8
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
9
  from qwen_vl_utils import process_vision_info
10
 
11
+ # --- DETAILED SCHEMAS RESTORED ---
12
+ SCHEMAS = {
13
+ "VODAFONE": {
14
+ "vendor": "VODAFONE ROMANIA",
15
+ "invoice_number": "string",
16
+ "date": "string (DD-MM-YYYY)",
17
+ "client_name": "string",
18
+ "client_address": "string",
19
+ "account_id": "string",
20
+ "billing_period": "string",
21
+ "totals": {
22
+ "subtotal_no_vat": "number",
23
+ "vat_amount": "number",
24
+ "grand_total": "number",
25
+ "currency": "RON"
26
+ }
27
+ },
28
+ "DIGI": {
29
+ "vendor": "DIGI (RCS & RDS)",
30
+ "invoice_number": "string",
31
+ "contract_id": "string",
32
+ "total_amount": "number",
33
+ "iban": "string"
34
+ },
35
+ "GENERAL": {
36
+ "vendor_name": "string",
37
+ "invoice_id": "string",
38
+ "date": "string",
39
+ "total_with_vat": "number",
40
+ "client_name": "string"
41
+ }
42
+ }
43
+
44
  # --- MODEL LOADING ---
45
  MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
 
 
46
 
47
+ def load_model():
48
+ # Keep 4-bit for speed even on ZeroGPU
49
+ quant_config = BitsAndBytesConfig(
50
+ load_in_4bit=True,
51
+ bnb_4bit_compute_dtype=torch.float16,
52
+ bnb_4bit_quant_type="nf4",
53
+ bnb_4bit_use_double_quant=True
54
+ )
55
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
56
+ MODEL_ID,
57
+ torch_dtype="auto",
58
+ device_map="cuda", # Explicit for ZeroGPU
59
+ quantization_config=quant_config
60
+ )
61
+ processor = AutoProcessor.from_pretrained(MODEL_ID, max_pixels=1280*1280)
62
+ return model, processor
63
+
64
+ model, processor = load_model()
65
+
66
+ # --- PDF TO IMAGE HELPER ---
67
+ def get_pdf_page_image(pdf_path):
68
  doc = fitz.open(pdf_path)
69
+ page = doc.load_page(0) # First page only
70
+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom
71
  img = Image.open(io.BytesIO(pix.tobytes()))
72
  doc.close()
73
  return img
74
 
75
+ # --- INFERENCE ---
76
  @spaces.GPU(duration=60)
77
  def process_invoice(file_info):
78
  if file_info is None: return {"error": "No file uploaded"}
79
 
80
+ # Handle File Type
81
+ if file_info.name.lower().endswith(".pdf"):
82
+ image = get_pdf_page_image(file_info.name)
 
83
  else:
84
+ image = Image.open(file_info.name)
85
 
86
+ # 1. Router
87
  decision_prompt = "Identify vendor: VODAFONE, DIGI, or GENERAL. Reply with one word."
88
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": decision_prompt}]}]
89
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
93
  generated_ids = model.generate(**inputs, max_new_tokens=10)
94
  raw_choice = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0].strip().upper()
95
 
 
96
  vendor_key = "VODAFONE" if "VODAFONE" in raw_choice else ("DIGI" if "DIGI" in raw_choice else "GENERAL")
97
 
98
+ # 2. Specialist
99
+ schema_json = json.dumps(SCHEMAS[vendor_key], indent=2)
100
+ extract_prompt = f"Extract details as JSON strictly following this schema: {schema_json}. Return ONLY valid JSON."
101
+
102
  messages[0]["content"][1]["text"] = extract_prompt
103
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
104
  inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)
 
111
  except:
112
  return {"raw_output": result}
113
 
114
+ # --- INTERFACE ---
115
  with gr.Blocks(title="InvoiceRecon") as demo:
116
+ gr.Markdown("# 📑 IntelliReceipt: Local AI Invoice Parser")
117
  with gr.Row():
118
  with gr.Column(scale=1):
119
+ # Using gr.File for the PDF preview experience
120
+ file_input = gr.File(label="Upload Invoice (PDF or Image)", file_types=[".pdf", ".png", ".jpg"])
121
  run_btn = gr.Button("🚀 Extract Data", variant="primary")
122
  with gr.Column(scale=1):
123
  json_output = gr.JSON(label="Extracted Result")