Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import io
|
|
| 8 |
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
|
| 9 |
from qwen_vl_utils import process_vision_info
|
| 10 |
|
| 11 |
-
# --- DETAILED SCHEMAS
|
| 12 |
SCHEMAS = {
|
| 13 |
"VODAFONE": {
|
| 14 |
"vendor": "VODAFONE ROMANIA",
|
|
@@ -45,7 +45,6 @@ SCHEMAS = {
|
|
| 45 |
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
|
| 46 |
|
| 47 |
def load_model():
|
| 48 |
-
# Keep 4-bit for speed even on ZeroGPU
|
| 49 |
quant_config = BitsAndBytesConfig(
|
| 50 |
load_in_4bit=True,
|
| 51 |
bnb_4bit_compute_dtype=torch.float16,
|
|
@@ -55,7 +54,7 @@ def load_model():
|
|
| 55 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 56 |
MODEL_ID,
|
| 57 |
torch_dtype="auto",
|
| 58 |
-
device_map="cuda",
|
| 59 |
quantization_config=quant_config
|
| 60 |
)
|
| 61 |
processor = AutoProcessor.from_pretrained(MODEL_ID, max_pixels=1280*1280)
|
|
@@ -66,26 +65,30 @@ model, processor = load_model()
|
|
| 66 |
# --- PDF TO IMAGE HELPER ---
|
| 67 |
def get_pdf_page_image(pdf_path):
|
| 68 |
doc = fitz.open(pdf_path)
|
| 69 |
-
page = doc.load_page(0)
|
| 70 |
-
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 71 |
img = Image.open(io.BytesIO(pix.tobytes()))
|
| 72 |
doc.close()
|
| 73 |
return img
|
| 74 |
|
| 75 |
# --- INFERENCE ---
|
| 76 |
@spaces.GPU(duration=60)
|
| 77 |
-
def process_invoice(file_info):
|
| 78 |
-
if file_info is None:
|
|
|
|
| 79 |
|
| 80 |
-
# Handle File Type
|
|
|
|
| 81 |
if file_info.name.lower().endswith(".pdf"):
|
| 82 |
image = get_pdf_page_image(file_info.name)
|
| 83 |
else:
|
| 84 |
image = Image.open(file_info.name)
|
| 85 |
|
| 86 |
-
#
|
|
|
|
| 87 |
decision_prompt = "Identify vendor: VODAFONE, DIGI, or GENERAL. Reply with one word."
|
| 88 |
messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": decision_prompt}]}]
|
|
|
|
| 89 |
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 90 |
image_inputs, _ = process_vision_info(messages)
|
| 91 |
inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)
|
|
@@ -95,7 +98,8 @@ def process_invoice(file_info):
|
|
| 95 |
|
| 96 |
vendor_key = "VODAFONE" if "VODAFONE" in raw_choice else ("DIGI" if "DIGI" in raw_choice else "GENERAL")
|
| 97 |
|
| 98 |
-
#
|
|
|
|
| 99 |
schema_json = json.dumps(SCHEMAS[vendor_key], indent=2)
|
| 100 |
extract_prompt = f"Extract details as JSON strictly following this schema: {schema_json}. Return ONLY valid JSON."
|
| 101 |
|
|
@@ -106,22 +110,37 @@ def process_invoice(file_info):
|
|
| 106 |
generated_ids = model.generate(**inputs, max_new_tokens=1536)
|
| 107 |
result = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
|
| 108 |
|
|
|
|
|
|
|
|
|
|
| 109 |
try:
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
except:
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
# --- INTERFACE ---
|
| 115 |
-
with gr.Blocks(title="InvoiceRecon") as demo:
|
| 116 |
-
gr.Markdown("# π IntelliReceipt:
|
|
|
|
|
|
|
| 117 |
with gr.Row():
|
| 118 |
with gr.Column(scale=1):
|
| 119 |
-
|
| 120 |
-
|
|
|
|
| 121 |
run_btn = gr.Button("π Extract Data", variant="primary")
|
|
|
|
| 122 |
with gr.Column(scale=1):
|
| 123 |
-
json_output = gr.JSON(label="Extracted Result")
|
| 124 |
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
demo.launch()
|
|
|
|
| 8 |
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
|
| 9 |
from qwen_vl_utils import process_vision_info
|
| 10 |
|
| 11 |
+
# --- DETAILED SCHEMAS ---
|
| 12 |
SCHEMAS = {
|
| 13 |
"VODAFONE": {
|
| 14 |
"vendor": "VODAFONE ROMANIA",
|
|
|
|
| 45 |
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
|
| 46 |
|
| 47 |
def load_model():
|
|
|
|
| 48 |
quant_config = BitsAndBytesConfig(
|
| 49 |
load_in_4bit=True,
|
| 50 |
bnb_4bit_compute_dtype=torch.float16,
|
|
|
|
| 54 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 55 |
MODEL_ID,
|
| 56 |
torch_dtype="auto",
|
| 57 |
+
device_map="cuda",
|
| 58 |
quantization_config=quant_config
|
| 59 |
)
|
| 60 |
processor = AutoProcessor.from_pretrained(MODEL_ID, max_pixels=1280*1280)
|
|
|
|
| 65 |
# --- PDF TO IMAGE HELPER ---
|
| 66 |
def get_pdf_page_image(pdf_path):
|
| 67 |
doc = fitz.open(pdf_path)
|
| 68 |
+
page = doc.load_page(0)
|
| 69 |
+
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| 70 |
img = Image.open(io.BytesIO(pix.tobytes()))
|
| 71 |
doc.close()
|
| 72 |
return img
|
| 73 |
|
| 74 |
# --- INFERENCE ---
|
| 75 |
@spaces.GPU(duration=60)
|
| 76 |
+
def process_invoice(file_info, progress=gr.Progress()):
|
| 77 |
+
if file_info is None:
|
| 78 |
+
return None, {"error": "No file uploaded"}
|
| 79 |
|
| 80 |
+
# 1. Handle File Type and Preview
|
| 81 |
+
progress(0.1, desc="π Processing document...")
|
| 82 |
if file_info.name.lower().endswith(".pdf"):
|
| 83 |
image = get_pdf_page_image(file_info.name)
|
| 84 |
else:
|
| 85 |
image = Image.open(file_info.name)
|
| 86 |
|
| 87 |
+
# 2. Router (Identify Vendor)
|
| 88 |
+
progress(0.3, desc="π Identifying vendor (Router)...")
|
| 89 |
decision_prompt = "Identify vendor: VODAFONE, DIGI, or GENERAL. Reply with one word."
|
| 90 |
messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": decision_prompt}]}]
|
| 91 |
+
|
| 92 |
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 93 |
image_inputs, _ = process_vision_info(messages)
|
| 94 |
inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)
|
|
|
|
| 98 |
|
| 99 |
vendor_key = "VODAFONE" if "VODAFONE" in raw_choice else ("DIGI" if "DIGI" in raw_choice else "GENERAL")
|
| 100 |
|
| 101 |
+
# 3. Specialist (Extract Data)
|
| 102 |
+
progress(0.6, desc=f"🤖 Extracting {vendor_key} details...")
|
| 103 |
schema_json = json.dumps(SCHEMAS[vendor_key], indent=2)
|
| 104 |
extract_prompt = f"Extract details as JSON strictly following this schema: {schema_json}. Return ONLY valid JSON."
|
| 105 |
|
|
|
|
| 110 |
generated_ids = model.generate(**inputs, max_new_tokens=1536)
|
| 111 |
result = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
|
| 112 |
|
| 113 |
+
progress(0.9, desc="βοΈ Finalizing result...")
|
| 114 |
+
|
| 115 |
+
# 4. Return Image for Preview and JSON for data
|
| 116 |
try:
|
| 117 |
+
data = json.loads(result.strip().replace('```json', '').replace('```', ''))
|
| 118 |
+
progress(1.0, desc="β
Success!")
|
| 119 |
+
return image, data
|
| 120 |
except:
|
| 121 |
+
progress(1.0, desc="⚠️ Extraction complete with formatting issues")
|
| 122 |
+
return image, {"raw_output": result}
|
| 123 |
|
| 124 |
# --- INTERFACE ---
|
| 125 |
+
with gr.Blocks(title="InvoiceRecon", theme=gr.themes.Soft()) as demo:
|
| 126 |
+
gr.Markdown("# π IntelliReceipt: Real-Time Invoice AI")
|
| 127 |
+
gr.Markdown("Upload a Romanian invoice (PDF or Image) to extract structured data using Qwen2.5-VL.")
|
| 128 |
+
|
| 129 |
with gr.Row():
|
| 130 |
with gr.Column(scale=1):
|
| 131 |
+
file_input = gr.File(label="1. Upload Invoice", file_types=[".pdf", ".png", ".jpg"])
|
| 132 |
+
# The preview component to show the first page
|
| 133 |
+
preview_output = gr.Image(label="2. Document Preview", type="pil")
|
| 134 |
run_btn = gr.Button("π Extract Data", variant="primary")
|
| 135 |
+
|
| 136 |
with gr.Column(scale=1):
|
| 137 |
+
json_output = gr.JSON(label="3. Extracted JSON Result")
|
| 138 |
|
| 139 |
+
# Important: Ensure inputs and outputs match function signature
|
| 140 |
+
run_btn.click(
|
| 141 |
+
fn=process_invoice,
|
| 142 |
+
inputs=file_input,
|
| 143 |
+
outputs=[preview_output, json_output]
|
| 144 |
+
)
|
| 145 |
|
| 146 |
demo.launch()
|