"""Gradio app that extracts structured invoice data from images with NuExtract."""

import base64
import gc
import io
import json
import os
import time

import gradio as gr
import spaces
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForImageTextToText, AutoProcessor

# Model setup
MODEL_NAME = "numind/NuExtract-2.0-4B"
device = "cuda"

model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left",
    use_fast=True,
)

# Invoice schema: default extraction template pre-filled in the UI textbox.
invoice_schema = {
    "invoice_number": "",
    "invoice_date": "",
    "supplier_name": "",
    "supplier_address": "",
    "total_amount": "",
    "currency": "",
    "items": [
        {
            "description": "",
            "quantity": "",
            "unit_price": "",
            "total_price": "",
        }
    ],
}


def encode_image_from_pil(image):
    """Return *image* serialized as PNG and encoded as a base64 text string."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def _upload_path(file_obj):
    """Return the filesystem path for one entry of the uploaded file list."""
    # The File component is configured with type="filepath", which is
    # documented to pass path strings; older Gradio releases passed objects
    # carrying the path in `.name`. Accept either form.
    return getattr(file_obj, "name", file_obj)


def _extract_from_image(path, template):
    """Run the model on the image at *path* and return the decoded output text.

    Args:
        path: Filesystem path of the image to read.
        template: JSON text of the extraction schema, forwarded to the chat
            template as its ``template`` argument.

    Raises:
        OSError: If the file cannot be opened or decoded as an image.
    """
    # Close the underlying file as soon as the pixels are converted; the
    # converted copy no longer depends on the open handle.
    with Image.open(path) as raw_image:
        image = raw_image.convert("RGB")

    base64_str = encode_image_from_pil(image)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image;base64,{base64_str}"}
            ],
        }
    ]

    text = processor.tokenizer.apply_chat_template(
        messages,
        template=template,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs = process_vision_info(messages)[0] or []

    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    generated_ids = model.generate(
        **inputs,
        do_sample=False,
        num_beams=1,
        max_new_tokens=2048,
    )

    # Drop the prompt tokens so only the newly generated tokens are decoded.
    trimmed = [
        out[len(in_ids):]
        for in_ids, out in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


@spaces.GPU
def process_images(files, schema_str):
    """Extract data from each uploaded image using the given JSON schema.

    Args:
        files: Uploaded files as delivered by the Gradio File component.
        schema_str: JSON text describing the fields to extract.

    Returns:
        A JSON string holding one ``{"filename", "output"}`` entry per file,
        or a plain error message when no files were given or the schema is
        not valid JSON. Entries for files that could not be read as images
        have an empty ``"output"`` and an additional ``"error"`` key.
    """
    if not files:
        return "No images provided."

    try:
        custom_schema = json.loads(schema_str)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a missing (None) schema value.
        return "Invalid JSON schema."

    # The schema is the same for every image, so serialize it once.
    template = json.dumps(custom_schema, indent=4)

    model.to(device)

    results = []
    for file_obj in files:
        path = _upload_path(file_obj)
        entry = {"filename": os.path.basename(path)}
        try:
            entry["output"] = _extract_from_image(path, template)
        except OSError as exc:
            # One unreadable upload should not discard the other results.
            entry["output"] = ""
            entry["error"] = f"Could not read image: {exc}"
        results.append(entry)

    # Keep non-ASCII text (currency symbols, accented names) readable.
    return json.dumps(results, indent=4, ensure_ascii=False)


# Gradio UI
iface = gr.Interface(
    fn=process_images,
    inputs=[
        gr.File(
            label="Upload Invoice Images",
            type="filepath",
            file_count="multiple",
        ),
        gr.Textbox(
            label="Custom Schema (JSON)",
            value=json.dumps(invoice_schema, indent=4),
            lines=12,
        ),
    ],
    outputs=gr.Textbox(
        label="Extracted JSON Data",
        lines=40,
        max_lines=200,
        autoscroll=True,
        interactive=True,
        show_copy_button=True,
    ),
    title="Invoice Parser with NuExtract (Multi-Image)",
    description="Upload one or more invoice images. Each will be processed independently with your custom JSON schema.",
)

iface.launch()