Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import time | |
| import base64 | |
| import json | |
| import gc | |
| import torch | |
| import io | |
| from transformers import AutoProcessor, AutoModelForImageTextToText | |
| from qwen_vl_utils import process_vision_info | |
| import gradio as gr | |
| import spaces | |
# Model setup: checkpoint identifier and the device inference tensors target.
MODEL_NAME = "numind/NuExtract-2.0-4B"
device = "cuda"  # ZeroGPU provides GPU

# Weights are requested in bfloat16. device_map=None asks for no automatic
# placement here, so any move onto `device` has to happen elsewhere.
# trust_remote_code=True allows code shipped with the checkpoint repo to run.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME,
    device_map=None,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)

# The processor is configured for left padding and the fast implementation.
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    padding_side="left",
    trust_remote_code=True,
)
# Invoice schema: the template serialized to JSON and handed to the chat
# template. Every field starts as an empty string; "items" holds a single
# example line-item record. Key order is significant for the serialized form.
invoice_schema = dict.fromkeys(
    (
        "invoice_number",
        "invoice_date",
        "supplier_name",
        "supplier_address",
        "total_amount",
        "currency",
    ),
    "",
)
invoice_schema["items"] = [
    dict.fromkeys(
        ("description", "quantity", "unit_price", "total_price"),
        "",
    ),
]
def encode_image_to_base64(image_path):
    """Return the contents of the file at *image_path* as base64 text.

    The file is opened in binary mode and read in full; the base64 bytes
    are decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def encode_image_from_pil(image):
    """Serialize *image* as PNG in memory and return it as base64 text.

    *image* only needs a ``save(fp, format=...)`` method that writes to a
    binary file-like object; nothing is written to disk.
    """
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")
def prepare_prompt(image_path):
    """Build the chat messages and prompt text for an invoice image file.

    The image file is embedded as a base64 data URI in a single user turn,
    and the tokenizer's chat template is rendered (not tokenized) with the
    JSON-serialized invoice schema passed as ``template``.

    Returns:
        A ``(messages, text)`` tuple: the message list and the rendered
        prompt string.
    """
    image_uri = f"data:image;base64,{encode_image_to_base64(image_path)}"
    user_turn = {
        "role": "user",
        "content": [{"type": "image", "image": image_uri}],
    }
    messages = [user_turn]
    schema_json = json.dumps(invoice_schema, indent=4)
    text = processor.tokenizer.apply_chat_template(
        messages,
        template=schema_json,
        tokenize=False,
        add_generation_prompt=True,
    )
    return messages, text
@spaces.GPU
def _extract_invoice(image):
    """Run schema-guided extraction on *image* and return the decoded text.

    Decorated with ``spaces.GPU`` because on ZeroGPU hardware CUDA is only
    available inside functions carrying that decorator.
    """
    # The model is loaded without a device map, so it must be moved to the
    # same device as the inputs before generation; otherwise generate()
    # fails on a CPU/CUDA device mismatch. A repeat call is a no-op once the
    # weights already live on `device`.
    model.to(device)

    base64_str = encode_image_from_pil(image)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image;base64,{base64_str}"}
            ],
        }
    ]
    text = processor.tokenizer.apply_chat_template(
        messages,
        template=json.dumps(invoice_schema, indent=4),
        tokenize=False,
        add_generation_prompt=True,
    )
    # First element of process_vision_info's result is the image inputs;
    # fall back to an empty list when it is falsy.
    image_inputs = process_vision_info(messages)[0] or []
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Greedy decoding: no sampling, single beam.
    generation_config = {
        "do_sample": False,
        "num_beams": 1,
        "max_new_tokens": 2048,
    }
    generated_ids = model.generate(**inputs, **generation_config)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def process_image(image):
    """Extract invoice data from an uploaded image.

    Args:
        image: The uploaded image (the interface supplies a PIL image), or
            ``None`` when nothing was uploaded.

    Returns:
        The model's decoded output text, or ``"No image provided."`` when
        *image* is ``None``.
    """
    # Guard before entering the GPU-decorated helper so an empty submission
    # never requests a GPU.
    if image is None:
        return "No image provided."
    return _extract_invoice(image)
# Gradio interface: one PIL image in, one text box out, wired to
# process_image; the app is launched as soon as this module runs.
_image_input = gr.Image(type="pil", label="Upload Invoice Image")
_json_output = gr.Textbox(label="Extracted Invoice Data (JSON)")

iface = gr.Interface(
    fn=process_image,
    inputs=_image_input,
    outputs=_json_output,
    title="Invoice Parser with NuExtract",
    description="Upload an invoice image to extract structured data using AI.",
)

iface.launch()