ashvin-savani committed on
Commit
338c5eb
·
1 Parent(s): a9a2fa5
Files changed (1) hide show
  1. app.py +84 -85
app.py CHANGED
@@ -5,6 +5,7 @@ import json
5
  import gc
6
  import torch
7
  import io
 
8
  from transformers import AutoProcessor, AutoModelForImageTextToText
9
  from qwen_vl_utils import process_vision_info
10
  import gradio as gr
@@ -12,15 +13,13 @@ import spaces
12
 
13
  # Model setup
14
  MODEL_NAME = "numind/NuExtract-2.0-4B"
15
- device = "cuda" # ZeroGPU provides GPU
16
 
17
  model = AutoModelForImageTextToText.from_pretrained(
18
  MODEL_NAME,
19
  trust_remote_code=True,
20
  dtype=torch.bfloat16,
21
- device_map=None, # Load on CPU, move to GPU in function
22
  )
23
-
24
  processor = AutoProcessor.from_pretrained(
25
  MODEL_NAME,
26
  trust_remote_code=True,
@@ -46,106 +45,106 @@ invoice_schema = {
46
  ]
47
  }
48
 
49
- def encode_image_to_base64(image_path):
50
- with open(image_path, "rb") as img_file:
51
- return base64.b64encode(img_file.read()).decode("utf-8")
52
 
53
  def encode_image_from_pil(image):
54
  buffer = io.BytesIO()
55
  image.save(buffer, format="PNG")
56
  return base64.b64encode(buffer.getvalue()).decode("utf-8")
57
 
58
- def prepare_prompt(image_path):
59
- base64_image = encode_image_to_base64(image_path)
60
- messages = [
61
- {
62
- "role": "user",
63
- "content": [
64
- {"type": "image", "image": f"data:image;base64,{base64_image}"}
65
- ]
66
- }
67
- ]
68
- text = processor.tokenizer.apply_chat_template(
69
- messages,
70
- template=json.dumps(invoice_schema, indent=4),
71
- tokenize=False,
72
- add_generation_prompt=True
73
- )
74
- return messages, text
75
 
76
  @spaces.GPU
77
- def process_image(image, schema_str):
78
- if image is None:
79
- return "No image provided."
80
-
81
  try:
82
  custom_schema = json.loads(schema_str)
83
  except json.JSONDecodeError:
84
- return "Invalid JSON schema provided."
85
-
86
- base64_str = encode_image_from_pil(image)
87
- messages = [
88
- {
89
- "role": "user",
90
- "content": [
91
- {"type": "image", "image": f"data:image;base64,{base64_str}"}
92
- ]
93
- }
94
- ]
95
- text = processor.tokenizer.apply_chat_template(
96
- messages,
97
- template=json.dumps(custom_schema, indent=4),
98
- tokenize=False,
99
- add_generation_prompt=True
100
- )
101
-
102
- image_inputs = process_vision_info(messages)[0] or []
103
-
104
- inputs = processor(
105
- text=[text],
106
- images=image_inputs,
107
- padding=True,
108
- return_tensors="pt",
109
- ).to(device)
110
-
111
- # Move model to GPU
112
  model.to(device)
113
-
114
- generation_config = {
115
- "do_sample": False,
116
- "num_beams": 1,
117
- "max_new_tokens": 2048,
118
- }
119
-
120
- generated_ids = model.generate(**inputs, **generation_config)
121
-
122
- generated_ids_trimmed = [
123
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
124
- ]
125
-
126
- output_text = processor.batch_decode(
127
- generated_ids_trimmed,
128
- skip_special_tokens=True,
129
- clean_up_tokenization_spaces=False,
130
- )[0]
131
-
132
- return output_text
133
-
134
- # Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  iface = gr.Interface(
136
- fn=process_image,
137
  inputs=[
138
- gr.Image(type="pil", label="Upload Invoice Image"),
 
 
 
 
139
  gr.Textbox(
140
  label="Custom Schema (JSON)",
141
  value=json.dumps(invoice_schema, indent=4),
142
- lines=10,
143
- placeholder="Enter your custom JSON schema here..."
144
  )
145
  ],
146
- outputs=gr.Textbox(label="Extracted Data (JSON)"),
147
- title="Invoice Parser with NuExtract",
148
- description="Upload an invoice image and provide a custom JSON schema to extract structured data using AI."
 
 
 
 
 
 
 
149
  )
150
 
151
- iface.launch()
 
5
  import gc
6
  import torch
7
  import io
8
+ from PIL import Image
9
  from transformers import AutoProcessor, AutoModelForImageTextToText
10
  from qwen_vl_utils import process_vision_info
11
  import gradio as gr
 
13
 
14
  # Model setup
15
  MODEL_NAME = "numind/NuExtract-2.0-4B"
16
+ device = "cuda"
17
 
18
  model = AutoModelForImageTextToText.from_pretrained(
19
  MODEL_NAME,
20
  trust_remote_code=True,
21
  dtype=torch.bfloat16,
 
22
  )
 
23
  processor = AutoProcessor.from_pretrained(
24
  MODEL_NAME,
25
  trust_remote_code=True,
 
45
  ]
46
  }
47
 
 
 
 
48
 
49
  def encode_image_from_pil(image):
50
  buffer = io.BytesIO()
51
  image.save(buffer, format="PNG")
52
  return base64.b64encode(buffer.getvalue()).decode("utf-8")
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  @spaces.GPU
56
+ def process_images(files, schema_str):
57
+ if not files:
58
+ return "No images provided."
59
+
60
  try:
61
  custom_schema = json.loads(schema_str)
62
  except json.JSONDecodeError:
63
+ return "Invalid JSON schema."
64
+
65
+ results = []
66
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  model.to(device)
68
+
69
+ for file_obj in files:
70
+ image = Image.open(file_obj.name).convert("RGB")
71
+ base64_str = encode_image_from_pil(image)
72
+
73
+ messages = [
74
+ {
75
+ "role": "user",
76
+ "content": [
77
+ {"type": "image", "image": f"data:image;base64,{base64_str}"}
78
+ ]
79
+ }
80
+ ]
81
+
82
+ text = processor.tokenizer.apply_chat_template(
83
+ messages,
84
+ template=json.dumps(custom_schema, indent=4),
85
+ tokenize=False,
86
+ add_generation_prompt=True
87
+ )
88
+
89
+ image_inputs = process_vision_info(messages)[0] or []
90
+
91
+ inputs = processor(
92
+ text=[text],
93
+ images=image_inputs,
94
+ padding=True,
95
+ return_tensors="pt",
96
+ ).to(device)
97
+
98
+ generated_ids = model.generate(
99
+ **inputs,
100
+ do_sample=False,
101
+ num_beams=1,
102
+ max_new_tokens=2048,
103
+ )
104
+
105
+ trimmed = [
106
+ out[len(in_ids):] for in_ids, out in zip(inputs.input_ids, generated_ids)
107
+ ]
108
+
109
+ output_text = processor.batch_decode(
110
+ trimmed,
111
+ skip_special_tokens=True,
112
+ clean_up_tokenization_spaces=False,
113
+ )[0]
114
+
115
+ results.append({
116
+ "filename": os.path.basename(file_obj.name),
117
+ "output": output_text
118
+ })
119
+
120
+ return json.dumps(results, indent=4)
121
+
122
+
123
+ # Gradio UI
124
  iface = gr.Interface(
125
+ fn=process_images,
126
  inputs=[
127
+ gr.File(
128
+ label="Upload Invoice Images",
129
+ type="filepath",
130
+ file_count="multiple",
131
+ ),
132
  gr.Textbox(
133
  label="Custom Schema (JSON)",
134
  value=json.dumps(invoice_schema, indent=4),
135
+ lines=12,
 
136
  )
137
  ],
138
+ outputs=gr.Textbox(
139
+ label="Extracted JSON Data",
140
+ lines=40,
141
+ max_lines=200,
142
+ autoscroll=True,
143
+ interactive=True,
144
+ show_copy_button=True,
145
+ ),
146
+ title="Invoice Parser with NuExtract (Multi-Image)",
147
+ description="Upload one or more invoice images. Each will be processed independently with your custom JSON schema.",
148
  )
149
 
150
+ iface.launch()