# Hugging Face Spaces app — runs on ZeroGPU ("Running on Zero")
import base64
import gc
import io
import json
import os
import time

import gradio as gr
import spaces
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, AutoModelForImageTextToText
# Model setup: NuExtract-2.0 (Qwen-VL based) for structured extraction from images.
MODEL_NAME = "numind/NuExtract-2.0-4B"
# Target inference device. The model is loaded on CPU here and moved to this
# device inside the request handler (see process_images).
# NOTE(review): assumes a CUDA GPU is present at request time (e.g. a ZeroGPU
# Space) — there is no CPU fallback; confirm the deployment target.
device = "cuda"
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,  # NuExtract ships custom modeling code
    dtype=torch.bfloat16,    # 2 bytes/param vs 4 for fp32; bf16 keeps fp32 exponent range
)
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side='left',  # left-pad so generation continues from the prompt's end
    use_fast=True,        # use the fast (Rust) tokenizer implementation
)
# Invoice schema: default extraction template pre-filled in the UI textbox.
# Empty strings mark fields to be extracted as free text; the "items" list
# contains one example element describing the fields of each invoice line item.
# NOTE(review): exact template semantics follow the NuExtract template format —
# confirm against the model card before extending.
invoice_schema = {
    "invoice_number": "",
    "invoice_date": "",
    "supplier_name": "",
    "supplier_address": "",
    "total_amount": "",
    "currency": "",
    "items": [
        {
            "description": "",
            "quantity": "",
            "unit_price": "",
            "total_price": ""
        }
    ]
}
def encode_image_from_pil(image):
    """Serialize *image* to PNG and return the bytes as a base64 ASCII string.

    *image* only needs a PIL-style ``save(buffer, format=...)`` method.
    """
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    raw_png = png_buffer.getvalue()
    return base64.b64encode(raw_png).decode("utf-8")
def process_images(files, schema_str):
    """Run NuExtract structured extraction over each uploaded invoice image.

    Parameters
    ----------
    files : list
        Values from ``gr.File(file_count="multiple")``. With
        ``type="filepath"`` current Gradio passes plain ``str`` paths; older
        versions pass file-like objects exposing ``.name`` — both are accepted.
    schema_str : str
        JSON extraction template (NuExtract format) from the UI textbox.

    Returns
    -------
    str
        JSON array with one ``{"filename", "output"}`` entry per image, or a
        plain error message for empty input / invalid schema.
    """
    if not files:
        return "No images provided."
    try:
        custom_schema = json.loads(schema_str)
    except json.JSONDecodeError:
        return "Invalid JSON schema."
    # NOTE(review): on a ZeroGPU Space this handler presumably needs the
    # @spaces.GPU decorator ("spaces" is imported but never used) — confirm.
    results = []
    model.to(device)
    # Hoisted out of the loop: the template depends only on the schema.
    template = json.dumps(custom_schema, indent=4)
    for file_obj in files:
        # gr.File(type="filepath") yields plain str paths; fall back to
        # .name for file-like objects from older Gradio versions.
        path = file_obj if isinstance(file_obj, str) else file_obj.name
        # Context manager closes the underlying file handle promptly;
        # convert("RGB") returns an in-memory copy that outlives it.
        with Image.open(path) as img:
            image = img.convert("RGB")
        base64_str = encode_image_from_pil(image)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"data:image;base64,{base64_str}"}
                ]
            }
        ]
        text = processor.tokenizer.apply_chat_template(
            messages,
            template=template,
            tokenize=False,
            add_generation_prompt=True
        )
        image_inputs = process_vision_info(messages)[0] or []
        inputs = processor(
            text=[text],
            images=image_inputs,
            padding=True,
            return_tensors="pt",
        ).to(device)
        # inference_mode: no autograd bookkeeping during generation.
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                do_sample=False,   # greedy decoding for deterministic extraction
                num_beams=1,
                max_new_tokens=2048,
            )
        # Strip the prompt tokens so only newly generated text is decoded.
        trimmed = [
            out[len(in_ids):] for in_ids, out in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        results.append({
            "filename": os.path.basename(path),
            "output": output_text
        })
    # Release per-image tensors between requests to keep memory bounded.
    gc.collect()
    return json.dumps(results, indent=4)
# Gradio UI: multi-file upload + editable JSON schema in, extracted JSON out.
iface = gr.Interface(
    fn=process_images,
    inputs=[
        gr.File(
            label="Upload Invoice Images",
            # NOTE(review): type="filepath" makes current Gradio pass plain
            # str paths to fn — make sure the handler accepts str values,
            # not only file-like objects exposing .name.
            type="filepath",
            file_count="multiple",
        ),
        gr.Textbox(
            label="Custom Schema (JSON)",
            # Pre-fill with the default invoice template so users can edit it.
            value=json.dumps(invoice_schema, indent=4),
            lines=12,
        )
    ],
    outputs=gr.Textbox(
        label="Extracted JSON Data",
        lines=40,
        max_lines=200,
        autoscroll=True,
        interactive=True,       # let users tweak the result before copying
        show_copy_button=True,
    ),
    title="Invoice Parser with NuExtract (Multi-Image)",
    description="Upload one or more invoice images. Each will be processed independently with your custom JSON schema.",
)
iface.launch()