github-actions[bot] committed on
Commit
d062149
·
1 Parent(s): 50fd07f

Sync from GitHub: 10945f8bcad8f91e0ef20a88f2630fa1409bb1e5

Browse files
.gitignore CHANGED
@@ -37,4 +37,6 @@ frontend/.env.local
37
  test*
38
  executable.py
39
  client_example.py
40
- Docs
 
 
 
37
  test*
38
  executable.py
39
  client_example.py
40
+ Docs
41
+
42
+ prompt.txt
app.py CHANGED
@@ -99,7 +99,8 @@ async def health_check():
99
  async def extract_invoice(
100
  file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
101
  doc_id: Optional[str] = Form(None, description="Optional document identifier"),
102
- enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing")
 
103
  ):
104
  """
105
  Extract information from invoice image
@@ -172,7 +173,7 @@ async def extract_invoice(
172
  doc_id = os.path.splitext(file.filename)[0]
173
 
174
  # Process invoice
175
- result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image)
176
 
177
  # Add total request time (includes file I/O)
178
  result['total_request_time_sec'] = round(time.time() - request_start, 2)
@@ -201,7 +202,8 @@ async def extract_invoice(
201
  @app.post("/process-invoice")
202
  async def process_invoice(
203
  file: UploadFile = File(..., description="Invoice image file"),
204
- enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing")
 
205
  ):
206
  """
207
  Process a single invoice and return extracted information
@@ -210,6 +212,7 @@ async def process_invoice(
210
  **Parameters:**
211
  - **file**: Invoice image file (required)
212
  - **enhance_image**: Apply OpenCV enhancement preprocessing (optional)
 
213
 
214
  **Returns:**
215
  - JSON with extracted_text, signature_coords, stamp_coords
@@ -241,7 +244,7 @@ async def process_invoice(
241
  doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
242
 
243
  # Process invoice
244
- result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image)
245
 
246
  # Extract fields from result
247
  fields = result.get("fields", {})
 
99
  async def extract_invoice(
100
  file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
101
  doc_id: Optional[str] = Form(None, description="Optional document identifier"),
102
+ enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing"),
103
+ reasoning_mode: Optional[str] = Form("simple", description="VLM reasoning mode: 'simple' or 'reason'")
104
  ):
105
  """
106
  Extract information from invoice image
 
173
  doc_id = os.path.splitext(file.filename)[0]
174
 
175
  # Process invoice
176
+ result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image, reasoning_mode)
177
 
178
  # Add total request time (includes file I/O)
179
  result['total_request_time_sec'] = round(time.time() - request_start, 2)
 
202
  @app.post("/process-invoice")
203
  async def process_invoice(
204
  file: UploadFile = File(..., description="Invoice image file"),
205
+ enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing"),
206
+ reasoning_mode: Optional[str] = Form("simple", description="VLM reasoning mode: 'simple' or 'reason'")
207
  ):
208
  """
209
  Process a single invoice and return extracted information
 
212
  **Parameters:**
213
  - **file**: Invoice image file (required)
214
  - **enhance_image**: Apply OpenCV enhancement preprocessing (optional)
215
+ - **reasoning_mode**: VLM reasoning mode: 'simple' for single-step, 'reason' for Chain of Thought (optional)
216
 
217
  **Returns:**
218
  - JSON with extracted_text, signature_coords, stamp_coords
 
244
  doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
245
 
246
  # Process invoice
247
+ result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image, reasoning_mode)
248
 
249
  # Extract fields from result
250
  fields = result.get("fields", {})
frontend/src/App.jsx CHANGED
@@ -18,6 +18,7 @@ function App() {
18
  const [resolutionMap, setResolutionMap] = useState({});
19
  const [resultResolutionMap, setResultResolutionMap] = useState({});
20
  const [enhancedMap, setEnhancedMap] = useState({}); // Track which images are enhanced
 
21
 
22
  const handleFilesSelected = async (files) => {
23
  setProcessing(false);
@@ -27,6 +28,7 @@ function App() {
27
  setPreviewImages([]);
28
  setResolutionMap({});
29
  setEnhancedMap({}); // Reset enhanced state
 
30
 
31
  try {
32
  // Step 1: Convert all files to images and show previews
@@ -95,8 +97,9 @@ function App() {
95
  const processData = resolutionMap[preview.key] || { dataUrl: preview.dataUrl, resolution: 100 };
96
  const blob = dataUrlToBlob(processData.dataUrl);
97
  const isEnhanced = enhancedMap[preview.key] || false;
 
98
 
99
- const result = await processSingleInvoice(blob, preview.filename, isEnhanced);
100
 
101
  const resultWithMetadata = {
102
  ...result,
@@ -143,8 +146,9 @@ function App() {
143
  // Use resolution-adjusted image from ResultCard
144
  const blob = dataUrlToBlob(adjustedDataUrl || imageDataMap[result.key]);
145
  const isEnhanced = enhancedMap[result.key] || false;
 
146
 
147
- const newResult = await processSingleInvoice(blob, result.filename, isEnhanced);
148
 
149
  const resultWithMetadata = {
150
  ...newResult,
@@ -183,6 +187,13 @@ function App() {
183
  }));
184
  };
185
 
 
 
 
 
 
 
 
186
  return (
187
  <div className="min-h-screen py-8 px-4 sm:px-6 lg:px-8">
188
  <div className="max-w-7xl mx-auto">
@@ -230,6 +241,8 @@ function App() {
230
  {previewImages.map((preview, idx) => (
231
  <ImagePreview
232
  key={preview.key}
 
 
233
  imageData={preview.dataUrl}
234
  fileName={preview.filename}
235
  onResolutionChange={(dataUrl, resolution) =>
 
18
  const [resolutionMap, setResolutionMap] = useState({});
19
  const [resultResolutionMap, setResultResolutionMap] = useState({});
20
  const [enhancedMap, setEnhancedMap] = useState({}); // Track which images are enhanced
21
+ const [reasoningMap, setReasoningMap] = useState({}); // Track which images use reasoning mode
22
 
23
  const handleFilesSelected = async (files) => {
24
  setProcessing(false);
 
28
  setPreviewImages([]);
29
  setResolutionMap({});
30
  setEnhancedMap({}); // Reset enhanced state
31
+ setReasoningMap({}); // Reset reasoning state
32
 
33
  try {
34
  // Step 1: Convert all files to images and show previews
 
97
  const processData = resolutionMap[preview.key] || { dataUrl: preview.dataUrl, resolution: 100 };
98
  const blob = dataUrlToBlob(processData.dataUrl);
99
  const isEnhanced = enhancedMap[preview.key] || false;
100
+ const reasoningMode = reasoningMap[preview.key] ? "reason" : "simple";
101
 
102
+ const result = await processSingleInvoice(blob, preview.filename, isEnhanced, reasoningMode);
103
 
104
  const resultWithMetadata = {
105
  ...result,
 
146
  // Use resolution-adjusted image from ResultCard
147
  const blob = dataUrlToBlob(adjustedDataUrl || imageDataMap[result.key]);
148
  const isEnhanced = enhancedMap[result.key] || false;
149
+ const reasoningMode = reasoningMap[result.key] ? "reason" : "simple";
150
 
151
+ const newResult = await processSingleInvoice(blob, result.filename, isEnhanced, reasoningMode);
152
 
153
  const resultWithMetadata = {
154
  ...newResult,
 
187
  }));
188
  };
189
 
190
+ const handleReasoningModeToggle = (key) => {
191
+ setReasoningMap(prev => ({
192
+ ...prev,
193
+ [key]: !prev[key]
194
+ }));
195
+ };
196
+
197
  return (
198
  <div className="min-h-screen py-8 px-4 sm:px-6 lg:px-8">
199
  <div className="max-w-7xl mx-auto">
 
241
  {previewImages.map((preview, idx) => (
242
  <ImagePreview
243
  key={preview.key}
244
+ onReasoningModeToggle={() => handleReasoningModeToggle(preview.key)}
245
+ useReasoning={reasoningMap[preview.key] || false}
246
  imageData={preview.dataUrl}
247
  fileName={preview.filename}
248
  onResolutionChange={(dataUrl, resolution) =>
frontend/src/components/ImagePreview.jsx CHANGED
@@ -1,7 +1,7 @@
1
  import React, { useState, useEffect, useRef } from 'react';
2
- import { SlidersHorizontal, Sparkles } from 'lucide-react';
3
 
4
- const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle, isEnhanced }) => {
5
  const [resolution, setResolution] = useState(100);
6
  const canvasRef = useRef(null);
7
  const [originalDimensions, setOriginalDimensions] = useState({ width: 0, height: 0 });
@@ -87,6 +87,25 @@ const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle
87
  </div>
88
  )}
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  <div className="space-y-2">
91
  <div className="flex items-center justify-between">
92
  <label className="text-sm font-medium text-gray-700 flex items-center gap-2">
 
1
  import React, { useState, useEffect, useRef } from 'react';
2
+ import { SlidersHorizontal, Sparkles, Brain } from 'lucide-react';
3
 
4
+ const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle, isEnhanced, onReasoningModeToggle, useReasoning }) => {
5
  const [resolution, setResolution] = useState(100);
6
  const canvasRef = useRef(null);
7
  const [originalDimensions, setOriginalDimensions] = useState({ width: 0, height: 0 });
 
87
  </div>
88
  )}
89
 
90
+ {/* Reasoning Mode Toggle */}
91
+ <button
92
+ onClick={() => onReasoningModeToggle && onReasoningModeToggle()}
93
+ className={`w-full py-2 px-4 rounded-lg font-medium transition-all flex items-center justify-center gap-2 ${
94
+ useReasoning
95
+ ? 'bg-blue-600 hover:bg-blue-700 text-white shadow-lg'
96
+ : 'bg-gradient-to-r from-blue-500 to-cyan-500 hover:from-blue-600 hover:to-cyan-600 text-white shadow-md'
97
+ }`}
98
+ >
99
+ <Brain className="w-4 h-4" />
100
+ {useReasoning ? 'Chain of Thought ✓' : 'Simple Mode'}
101
+ </button>
102
+
103
+ {useReasoning && (
104
+ <div className="bg-blue-50 border border-blue-200 rounded p-2 text-xs text-blue-700">
105
+ 🧠 VLM will use 2-step reasoning: first analyze document structure, then extract fields
106
+ </div>
107
+ )}
108
+
109
  <div className="space-y-2">
110
  <div className="flex items-center justify-between">
111
  <label className="text-sm font-medium text-gray-700 flex items-center gap-2">
frontend/src/utils/api.js CHANGED
@@ -8,12 +8,14 @@ const API_BASE_URL = import.meta.env.VITE_API_URL || window.location.origin;
8
  * @param {Blob} imageBlob - Image blob
9
  * @param {string} filename - Original filename
10
  * @param {boolean} enhanceImage - Whether to apply OpenCV enhancement
 
11
  * @returns {Promise<Object>} Processed result
12
  */
13
- export async function processSingleInvoice(imageBlob, filename, enhanceImage = false) {
14
  const formData = new FormData();
15
  formData.append('file', imageBlob, filename);
16
  formData.append('enhance_image', enhanceImage);
 
17
 
18
  const response = await axios.post(`${API_BASE_URL}/process-invoice`, formData, {
19
  headers: {
 
8
  * @param {Blob} imageBlob - Image blob
9
  * @param {string} filename - Original filename
10
  * @param {boolean} enhanceImage - Whether to apply OpenCV enhancement
11
+ * @param {string} reasoningMode - VLM reasoning mode: "simple" or "reason"
12
  * @returns {Promise<Object>} Processed result
13
  */
14
+ export async function processSingleInvoice(imageBlob, filename, enhanceImage = false, reasoningMode = "simple") {
15
  const formData = new FormData();
16
  formData.append('file', imageBlob, filename);
17
  formData.append('enhance_image', enhanceImage);
18
+ formData.append('reasoning_mode', reasoningMode);
19
 
20
  const response = await axios.post(`${API_BASE_URL}/process-invoice`, formData, {
21
  headers: {
inference.py CHANGED
@@ -22,6 +22,7 @@ from config import (
22
  from model_manager import model_manager
23
 
24
 
 
25
  EXTRACTION_PROMPT = """
26
  You are an expert at reading noisy, handwritten Indian invoices and quotations.
27
 
@@ -62,6 +63,161 @@ Output rules:
62
  """
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  class InferenceProcessor:
66
  """Handles VLM inference, validation, and result processing"""
67
 
@@ -184,6 +340,143 @@ class InferenceProcessor:
184
 
185
  return output_text, latency
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  @staticmethod
188
  def extract_json_from_output(text: str) -> Dict:
189
  """Extract JSON from model output"""
@@ -328,7 +621,7 @@ class InferenceProcessor:
328
  return validated, field_confidence, warnings
329
 
330
  @staticmethod
331
- def process_invoice(image_path: str, doc_id: str = None, enhance_image: bool = False) -> Dict:
332
  """
333
  Complete invoice processing pipeline
334
 
@@ -336,6 +629,7 @@ class InferenceProcessor:
336
  image_path: Path to invoice image
337
  doc_id: Document identifier (optional)
338
  enhance_image: Whether to apply OpenCV enhancement (optional)
 
339
 
340
  Returns:
341
  dict: Complete JSON output with all fields
@@ -364,10 +658,28 @@ class InferenceProcessor:
364
  signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
365
  timing_breakdown['yolo_detection'] = round(time.time() - t2, 3)
366
 
367
- # Step 3: VLM Extraction
368
  t3 = time.time()
369
- vlm_output, vlm_latency = InferenceProcessor.run_vlm_extraction(image)
370
- timing_breakdown['vlm_inference'] = round(vlm_latency, 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
  # Clean up image
373
  image.close()
 
22
  from model_manager import model_manager
23
 
24
 
25
+ # Single-step extraction prompt (original "simple" mode)
26
  EXTRACTION_PROMPT = """
27
  You are an expert at reading noisy, handwritten Indian invoices and quotations.
28
 
 
63
  """
64
 
65
 
66
+ # Two-step Chain of Thought prompts (reasoning mode)
67
+ REASONING_PROMPT = """
68
+ You are an expert at analyzing noisy, handwritten Indian invoices and quotations for tractors.
69
+
70
+ Your task is to carefully observe and describe the document structure WITHOUT extracting yet.
71
+
72
+ Analyze this tractor invoice image and provide detailed observations about:
73
+
74
+ 1. DEALER/COMPANY NAME
75
+ - Where is it located? (top header, letterhead, stamp, footer)
76
+ - What language is it written in?
77
+ - Is it printed or handwritten?
78
+ - Exact text you see (preserve original language)
79
+
80
+ 2. MODEL INFORMATION
81
+ - Where is the model mentioned? (checkbox list, handwritten field, printed table, near "Model:" label)
82
+ - Are there multiple model options shown?
83
+ - If checkboxes exist, which one is marked? (look for ✓, ✗, [X], ●, ☑, filled boxes)
84
+ - Is the model name in English or regional language?
85
+ - Exact text you see for the selected/mentioned model
86
+
87
+ 3. HORSE POWER (HP)
88
+ - Where is HP information located? (separate field, within model name, checkbox list, specifications table)
89
+ - Is HP explicitly written or implied from model code?
90
+ - If there's a checkbox list with HP options, which one is selected?
91
+ - Are there multiple HP values shown? Which one corresponds to the selected model?
92
+ - Exact HP text you see (e.g., "49 HP", "63hp", "HP-30")
93
+
94
+ 4. TOTAL AMOUNT/ASSET COST
95
+ - Where is the final total located? (bottom of page, after tax section, grand total line)
96
+ - What label is used? (Total, Grand Total, Final Amount, कुल राशि, etc.)
97
+ - Are there multiple amount fields? Which is the final one after all taxes/charges?
98
+ - Exact amount you see with any currency symbols
99
+
100
+ 5. CHECKBOX SELECTIONS (if applicable)
101
+ - Are there any checkbox lists on the page?
102
+ - What options are available in these lists?
103
+ - Which options are clearly marked/selected? (describe the selection mark)
104
+ - Which options are clearly unmarked/unselected?
105
+
106
+ 6. AMBIGUITIES OR CHALLENGES
107
+ - Is any handwriting difficult to read?
108
+ - Are any fields unclear or could have multiple interpretations?
109
+ - Are there any conflicting pieces of information?
110
+
111
+ Return ONLY valid JSON in this exact format:
112
+
113
+ {
114
+ "dealer_location": string,
115
+ "dealer_text_observed": string,
116
+ "dealer_language": string,
117
+ "model_location": string,
118
+ "model_format": string,
119
+ "model_text_observed": string,
120
+ "model_is_checkbox": boolean,
121
+ "model_selected_option": string,
122
+ "hp_location": string,
123
+ "hp_format": string,
124
+ "hp_text_observed": string,
125
+ "hp_is_checkbox": boolean,
126
+ "hp_value_observed": string,
127
+ "amount_location": string,
128
+ "amount_label": string,
129
+ "amount_text_observed": string,
130
+ "checkboxes_present": boolean,
131
+ "checkbox_details": string,
132
+ "ambiguities": string,
133
+ "overall_document_quality": string
134
+ }
135
+
136
+ Guidelines:
137
+ - Be extremely specific about locations (e.g., "top-left header", "middle section below tractor image", "bottom-right in total box")
138
+ - Preserve original language text in observations
139
+ - Describe what you see, don't interpret or extract yet
140
+ - If something is unclear, describe why
141
+ - Focus on SELECTED/MARKED options when checkboxes are present
142
+
143
+ Output rules:
144
+ - Output ONLY valid JSON
145
+ - Do NOT include markdown, explanations, or extra text
146
+ """
147
+
148
+
149
+ EXTRACTION_WITH_CONTEXT_PROMPT = """
150
+ You are an expert at extracting structured data from Indian invoices and quotations.
151
+
152
+ You have already analyzed this document. Here is your previous analysis:
153
+
154
+ CONTEXT FROM REASONING:
155
+ {reasoning_output}
156
+
157
+ Based on your previous analysis, now extract the exact field values.
158
+
159
+ Return ONLY valid JSON in this exact format:
160
+
161
+ {{
162
+ "dealer_name": string,
163
+ "model_name": string,
164
+ "horse_power": number,
165
+ "asset_cost": number
166
+ }}
167
+
168
+ Critical extraction rules:
169
+
170
+ 1. DEALER NAME
171
+ - Copy EXACTLY as it appears in the original language and spelling
172
+ - Do NOT translate from Hindi/Marathi/Kannada to English
173
+ - Do NOT correct spelling or expand abbreviations
174
+ - Include any punctuation or special characters as shown
175
+
176
+ 2. MODEL NAME
177
+ - Copy EXACTLY as it appears in the original language
178
+ - If from checkbox selection, extract ONLY the selected/marked option
179
+ - Do NOT translate or normalize
180
+ - Preserve numbers, hyphens, and spacing exactly
181
+ - Do NOT include HP value within model name
182
+
183
+ 3. HORSE POWER
184
+ - Must be a number only (integer or decimal)
185
+ - Extract from explicit HP mentions only (never infer from model codes)
186
+ - If from checkbox, use only the selected option's HP value
187
+ - Remove text like "HP", "hp", "हॉर्स पावर" - keep only the number
188
+ - If HP appears as "49 HP" → extract: 49
189
+ - If HP appears as "63.5hp" → extract: 63.5
190
+ - If multiple HP values exist, use the one for the selected model
191
+
192
+ 4. ASSET COST
193
+ - Must be a number only (integer or decimal)
194
+ - Use the FINAL total amount after all taxes and charges
195
+ - Remove currency symbols (₹, Rs, INR)
196
+ - Remove commas (e.g., "1,50,000" → 150000)
197
+ - If amount is "₹ 1,75,500.00" → extract: 175500
198
+ - Use the largest/final amount if multiple totals exist
199
+
200
+ Data validation:
201
+ - dealer_name: Must be non-empty string in original language
202
+ - model_name: Must be non-empty string in original language
203
+ - horse_power: Must be positive number (typically between 15-100 for tractors)
204
+ - asset_cost: Must be positive number (typically between 100000-3000000 for tractors)
205
+
206
+ Special handling based on your reasoning:
207
+ - If you noted checkboxes: Extract ONLY marked/selected options
208
+ - If you noted ambiguities: Make best judgment and use most likely value
209
+ - If you noted poor handwriting: Interpret characters as best as possible while preserving language
210
+ - If you noted multiple values: Use the one that matches the selected/final configuration
211
+
212
+ Output rules:
213
+ - Output ONLY valid JSON
214
+ - Do NOT include markdown code fences
215
+ - Do NOT include explanations or extra text
216
+ - Ensure all four fields are present
217
+ - Ensure numbers are actual numbers, not strings with currency/commas
218
+ """
219
+
220
+
221
  class InferenceProcessor:
222
  """Handles VLM inference, validation, and result processing"""
223
 
 
340
 
341
  return output_text, latency
342
 
343
+ @staticmethod
344
+ def run_vlm_reasoning(image: Image.Image) -> Tuple[str, float]:
345
+ """
346
+ Run VLM model for Chain of Thought reasoning phase (step 1 of 2)
347
+ Analyzes document structure and observes field locations
348
+ """
349
+ if not model_manager.is_loaded():
350
+ raise RuntimeError("Models not loaded")
351
+
352
+ model = model_manager.vlm_model
353
+ processor = model_manager.processor
354
+
355
+ messages = [
356
+ {
357
+ "role": "user",
358
+ "content": [
359
+ {"type": "image", "image": image},
360
+ {"type": "text", "text": REASONING_PROMPT}
361
+ ]
362
+ }
363
+ ]
364
+
365
+ # Apply chat template
366
+ text = processor.apply_chat_template(
367
+ messages,
368
+ tokenize=False,
369
+ add_generation_prompt=True
370
+ )
371
+
372
+ # Process vision input
373
+ image_inputs, video_inputs = process_vision_info(messages)
374
+ inputs = processor(
375
+ text=[text],
376
+ images=image_inputs,
377
+ videos=video_inputs,
378
+ padding=True,
379
+ return_tensors="pt",
380
+ )
381
+ inputs = inputs.to("cuda")
382
+
383
+ start = time.time()
384
+
385
+ # Generate (allow more tokens for detailed reasoning)
386
+ generated_ids = model.generate(**inputs, max_new_tokens=512)
387
+
388
+ latency = time.time() - start
389
+
390
+ # Decode output
391
+ generated_ids_trimmed = [
392
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
393
+ ]
394
+ output_text = processor.batch_decode(
395
+ generated_ids_trimmed,
396
+ skip_special_tokens=True,
397
+ clean_up_tokenization_spaces=False
398
+ )
399
+
400
+ output_text = output_text[0] if isinstance(output_text, list) else output_text
401
+
402
+ # Clean up GPU memory
403
+ del inputs, generated_ids, generated_ids_trimmed
404
+ if torch.cuda.is_available():
405
+ torch.cuda.empty_cache()
406
+
407
+ print(f"🧠 Reasoning phase completed in {latency:.2f}s")
408
+ return output_text, latency
409
+
410
+ @staticmethod
411
+ def run_vlm_extraction_with_context(image: Image.Image, reasoning_output: str) -> Tuple[str, float]:
412
+ """
413
+ Run VLM model for extraction phase (step 2 of 2) using reasoning context
414
+ Extracts structured fields based on previous reasoning
415
+ """
416
+ if not model_manager.is_loaded():
417
+ raise RuntimeError("Models not loaded")
418
+
419
+ model = model_manager.vlm_model
420
+ processor = model_manager.processor
421
+
422
+ # Format the extraction prompt with reasoning context
423
+ extraction_prompt = EXTRACTION_WITH_CONTEXT_PROMPT.format(reasoning_output=reasoning_output)
424
+
425
+ messages = [
426
+ {
427
+ "role": "user",
428
+ "content": [
429
+ {"type": "image", "image": image},
430
+ {"type": "text", "text": extraction_prompt}
431
+ ]
432
+ }
433
+ ]
434
+
435
+ # Apply chat template
436
+ text = processor.apply_chat_template(
437
+ messages,
438
+ tokenize=False,
439
+ add_generation_prompt=True
440
+ )
441
+
442
+ # Process vision input
443
+ image_inputs, video_inputs = process_vision_info(messages)
444
+ inputs = processor(
445
+ text=[text],
446
+ images=image_inputs,
447
+ videos=video_inputs,
448
+ padding=True,
449
+ return_tensors="pt",
450
+ )
451
+ inputs = inputs.to("cuda")
452
+
453
+ start = time.time()
454
+
455
+ # Generate
456
+ generated_ids = model.generate(**inputs, max_new_tokens=256)
457
+
458
+ latency = time.time() - start
459
+
460
+ # Decode output
461
+ generated_ids_trimmed = [
462
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
463
+ ]
464
+ output_text = processor.batch_decode(
465
+ generated_ids_trimmed,
466
+ skip_special_tokens=True,
467
+ clean_up_tokenization_spaces=False
468
+ )
469
+
470
+ output_text = output_text[0] if isinstance(output_text, list) else output_text
471
+
472
+ # Clean up GPU memory
473
+ del inputs, generated_ids, generated_ids_trimmed
474
+ if torch.cuda.is_available():
475
+ torch.cuda.empty_cache()
476
+
477
+ print(f"📝 Extraction phase completed in {latency:.2f}s")
478
+ return output_text, latency
479
+
480
  @staticmethod
481
  def extract_json_from_output(text: str) -> Dict:
482
  """Extract JSON from model output"""
 
621
  return validated, field_confidence, warnings
622
 
623
  @staticmethod
624
+ def process_invoice(image_path: str, doc_id: str = None, enhance_image: bool = False, reasoning_mode: str = "simple") -> Dict:
625
  """
626
  Complete invoice processing pipeline
627
 
 
629
  image_path: Path to invoice image
630
  doc_id: Document identifier (optional)
631
  enhance_image: Whether to apply OpenCV enhancement (optional)
632
+ reasoning_mode: "simple" for single-step extraction, "reason" for Chain of Thought (optional)
633
 
634
  Returns:
635
  dict: Complete JSON output with all fields
 
658
  signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
659
  timing_breakdown['yolo_detection'] = round(time.time() - t2, 3)
660
 
661
+ # Step 3: VLM Extraction (either simple or with Chain of Thought reasoning)
662
  t3 = time.time()
663
+ if reasoning_mode == "reason":
664
+ # Two-step Chain of Thought approach
665
+ print("🧠 Using Chain of Thought reasoning mode (2-step)")
666
+
667
+ # Step 3a: Reasoning phase
668
+ reasoning_output, reasoning_latency = InferenceProcessor.run_vlm_reasoning(image)
669
+ timing_breakdown['vlm_reasoning'] = round(reasoning_latency, 3)
670
+
671
+ # Step 3b: Extraction phase with context
672
+ vlm_output, extraction_latency = InferenceProcessor.run_vlm_extraction_with_context(image, reasoning_output)
673
+ timing_breakdown['vlm_extraction'] = round(extraction_latency, 3)
674
+ timing_breakdown['vlm_inference_total'] = round(reasoning_latency + extraction_latency, 3)
675
+
676
+ # Store reasoning for debugging/transparency
677
+ timing_breakdown['reasoning_output'] = reasoning_output
678
+ else:
679
+ # Single-step simple extraction (original approach)
680
+ print("⚡ Using simple mode (1-step)")
681
+ vlm_output, vlm_latency = InferenceProcessor.run_vlm_extraction(image)
682
+ timing_breakdown['vlm_inference'] = round(vlm_latency, 3)
683
 
684
  # Clean up image
685
  image.close()