# Invoice_parser / app.py
# ashvin-savani's picture
# Test
# 338c5eb
import os
import time
import base64
import json
import gc
import torch
import io
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from qwen_vl_utils import process_vision_info
import gradio as gr
import spaces
# Model setup: the checkpoint and its processor are loaded once at import
# time and reused by every request.
MODEL_NAME = "numind/NuExtract-2.0-4B"
device = "cuda"

# Weights are requested in bfloat16; the model is moved to `device` later,
# inside the request handler.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    dtype=torch.bfloat16,
)

# The processor is configured for left padding and the fast implementation.
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left",
    use_fast=True,
)
# Invoice schema: the default extraction template. Every field is an empty
# string; "items" holds a single line-item template.
_line_item_template = {
    "description": "",
    "quantity": "",
    "unit_price": "",
    "total_price": "",
}

invoice_schema = {
    "invoice_number": "",
    "invoice_date": "",
    "supplier_name": "",
    "supplier_address": "",
    "total_amount": "",
    "currency": "",
    "items": [_line_item_template],
}
def encode_image_from_pil(image):
    """Return *image* serialised as PNG and encoded as a base64 text string.

    Args:
        image: An object with a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The base64 encoding (as ``str``) of the bytes written by ``save``.
    """
    # The context manager releases the in-memory buffer as soon as the
    # encoded text has been produced.
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
@spaces.GPU
def process_images(files, schema_str):
    """Run schema-guided extraction on each uploaded image.

    Args:
        files: Uploaded files from the Gradio ``File`` input. Each entry may
            be a path (``str`` / ``os.PathLike``) or an object exposing the
            path as ``.name``.
        schema_str: JSON text used as the extraction template.

    Returns:
        A JSON string (indent 4) with one entry per upload, in upload order.
        Each entry is ``{"filename", "output"}``; an upload that cannot be
        opened as an image yields ``{"filename", "error"}`` instead, so one
        bad file does not abort the batch. When no files are given or the
        schema is not valid JSON, a short plain-text message is returned.
    """
    if not files:
        return "No images provided."

    try:
        custom_schema = json.loads(schema_str)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a missing (None) schema value.
        return "Invalid JSON schema."

    # The template is the same for every image, so serialise it only once.
    template = json.dumps(custom_schema, indent=4)

    results = []
    model.to(device)

    for file_obj in files:
        # Accept plain paths as well as file-like wrappers that carry the
        # path in `.name`.
        if isinstance(file_obj, (str, os.PathLike)):
            path = os.fspath(file_obj)
        else:
            path = file_obj.name
        filename = os.path.basename(path)

        try:
            # `convert` returns an independent image, so the underlying file
            # handle can be closed right away.
            with Image.open(path) as raw_image:
                image = raw_image.convert("RGB")
        except (OSError, ValueError) as exc:
            results.append({
                "filename": filename,
                "error": f"Could not read image: {exc}",
            })
            continue

        base64_str = encode_image_from_pil(image)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"data:image;base64,{base64_str}"}
                ]
            }
        ]

        text = processor.tokenizer.apply_chat_template(
            messages,
            template=template,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs = process_vision_info(messages)[0] or []

        inputs = processor(
            text=[text],
            images=image_inputs,
            padding=True,
            return_tensors="pt",
        ).to(device)

        # Greedy decoding: no sampling, a single beam.
        generated_ids = model.generate(
            **inputs,
            do_sample=False,
            num_beams=1,
            max_new_tokens=2048,
        )

        # Drop the prompt tokens so only the newly generated part is decoded.
        trimmed = [
            out[len(in_ids):]
            for in_ids, out in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        results.append({
            "filename": filename,
            "output": output_text,
        })

    return json.dumps(results, indent=4)
# Gradio UI: a multi-file upload plus an editable schema box feed
# `process_images`; the result is shown in a copyable text area.
_upload_input = gr.File(
    label="Upload Invoice Images",
    type="filepath",
    file_count="multiple",
)

# Pre-filled with the default invoice schema.
_schema_input = gr.Textbox(
    label="Custom Schema (JSON)",
    value=json.dumps(invoice_schema, indent=4),
    lines=12,
)

_result_output = gr.Textbox(
    label="Extracted JSON Data",
    lines=40,
    max_lines=200,
    autoscroll=True,
    interactive=True,
    show_copy_button=True,
)

iface = gr.Interface(
    fn=process_images,
    inputs=[_upload_input, _schema_input],
    outputs=_result_output,
    title="Invoice Parser with NuExtract (Multi-Image)",
    description="Upload one or more invoice images. Each will be processed independently with your custom JSON schema.",
)

iface.launch()