Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,376 Bytes
294474c 3889a50 294474c 3889a50 294474c 3889a50 294474c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import os
import time
import base64
import json
import gc
import torch
import io
from transformers import AutoProcessor, AutoModelForImageTextToText
from qwen_vl_utils import process_vision_info
import gradio as gr
import spaces
# Model setup
MODEL_NAME = "numind/NuExtract-2.0-4B"
device = "cuda"  # ZeroGPU provides GPU

# Weights are loaded in bfloat16 with no device map; code that runs inference
# is responsible for placing the model on `device`.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_NAME, device_map=None, dtype=torch.bfloat16, trust_remote_code=True
)

# The processor is created with left-side padding and the fast implementation.
processor = AutoProcessor.from_pretrained(
    MODEL_NAME, use_fast=True, padding_side='left', trust_remote_code=True
)
# Invoice schema: every field starts out as an empty string; "items" holds a
# single example line item. Key order is kept because the schema is serialized
# with json.dumps when the prompt is built.
invoice_schema = dict.fromkeys(
    (
        "invoice_number",
        "invoice_date",
        "supplier_name",
        "supplier_address",
        "total_amount",
        "currency",
    ),
    "",
)
invoice_schema["items"] = [
    dict.fromkeys(("description", "quantity", "unit_price", "total_price"), ""),
]
def encode_image_to_base64(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def encode_image_from_pil(image):
    """Save *image* as PNG into memory and return the bytes as base64 text.

    *image* only needs a ``save(file_obj, format=...)`` method.
    """
    with io.BytesIO() as png_stream:
        image.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")
def prepare_prompt(image_path):
    """Build the chat messages and prompt text for an invoice image file.

    The image file is embedded in a single user message as a base64 data URI,
    and the prompt is rendered by the tokenizer's chat template with
    ``invoice_schema`` (serialized as indented JSON) as the template.

    Returns:
        A ``(messages, text)`` tuple.
    """
    image_entry = {
        "type": "image",
        "image": f"data:image;base64,{encode_image_to_base64(image_path)}",
    }
    messages = [{"role": "user", "content": [image_entry]}]
    schema_json = json.dumps(invoice_schema, indent=4)
    text = processor.tokenizer.apply_chat_template(
        messages,
        template=schema_json,
        tokenize=False,
        add_generation_prompt=True,
    )
    return messages, text
@spaces.GPU
def process_image(image):
    """Run NuExtract on an uploaded invoice image and return the model output.

    Args:
        image: The image from the Gradio input (configured with
            ``type="pil"``), or ``None`` when nothing was uploaded.

    Returns:
        The decoded text generated for the ``invoice_schema`` template, or
        ``"No image provided."`` when *image* is ``None``.
    """
    if image is None:
        return "No image provided."

    # The model is loaded without a device map, so it must be moved explicitly
    # to the device the inputs are sent to; otherwise generate() receives
    # inputs and weights on different devices.
    model.to(device)

    base64_str = encode_image_from_pil(image)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image;base64,{base64_str}"}
            ],
        }
    ]
    text = processor.tokenizer.apply_chat_template(
        messages,
        template=json.dumps(invoice_schema, indent=4),
        tokenize=False,
        add_generation_prompt=True,
    )

    # First element of process_vision_info's result holds the image inputs;
    # fall back to an empty list when it is None/empty.
    image_inputs = process_vision_info(messages)[0] or []
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Greedy decoding: no sampling, single beam.
    generation_config = {
        "do_sample": False,
        "num_beams": 1,
        "max_new_tokens": 2048,
    }
    generated_ids = model.generate(**inputs, **generation_config)

    # Drop the prompt tokens so that only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return output_text
# Gradio interface: a single image input routed through process_image, with
# the model output shown in a text box.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload Invoice Image"),
    outputs=gr.Textbox(label="Extracted Invoice Data (JSON)"),
    title="Invoice Parser with NuExtract",
    description="Upload an invoice image to extract structured data using AI.",
)
iface.launch()