github-actions[bot] committed on
Commit
d062149
·
1 Parent(s): 50fd07f

Sync from GitHub: 10945f8bcad8f91e0ef20a88f2630fa1409bb1e5

Browse files
.gitignore CHANGED
@@ -37,4 +37,6 @@ frontend/.env.local
37
  test*
38
  executable.py
39
  client_example.py
40
- Docs
 
 
 
37
  test*
38
  executable.py
39
  client_example.py
40
+ Docs
41
+
42
+ prompt.txt
app.py CHANGED
@@ -99,7 +99,8 @@ async def health_check():
99
  async def extract_invoice(
100
  file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
101
  doc_id: Optional[str] = Form(None, description="Optional document identifier"),
102
- enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing")
 
103
  ):
104
  """
105
  Extract information from invoice image
@@ -172,7 +173,7 @@ async def extract_invoice(
172
  doc_id = os.path.splitext(file.filename)[0]
173
 
174
  # Process invoice
175
- result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image)
176
 
177
  # Add total request time (includes file I/O)
178
  result['total_request_time_sec'] = round(time.time() - request_start, 2)
@@ -201,7 +202,8 @@ async def extract_invoice(
201
  @app.post("/process-invoice")
202
  async def process_invoice(
203
  file: UploadFile = File(..., description="Invoice image file"),
204
- enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing")
 
205
  ):
206
  """
207
  Process a single invoice and return extracted information
@@ -210,6 +212,7 @@ async def process_invoice(
210
  **Parameters:**
211
  - **file**: Invoice image file (required)
212
  - **enhance_image**: Apply OpenCV enhancement preprocessing (optional)
 
213
 
214
  **Returns:**
215
  - JSON with extracted_text, signature_coords, stamp_coords
@@ -241,7 +244,7 @@ async def process_invoice(
241
  doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
242
 
243
  # Process invoice
244
- result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image)
245
 
246
  # Extract fields from result
247
  fields = result.get("fields", {})
 
99
  async def extract_invoice(
100
  file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
101
  doc_id: Optional[str] = Form(None, description="Optional document identifier"),
102
+ enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing"),
103
+ reasoning_mode: Optional[str] = Form("simple", description="VLM reasoning mode: 'simple' or 'reason'")
104
  ):
105
  """
106
  Extract information from invoice image
 
173
  doc_id = os.path.splitext(file.filename)[0]
174
 
175
  # Process invoice
176
+ result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image, reasoning_mode)
177
 
178
  # Add total request time (includes file I/O)
179
  result['total_request_time_sec'] = round(time.time() - request_start, 2)
 
202
  @app.post("/process-invoice")
203
  async def process_invoice(
204
  file: UploadFile = File(..., description="Invoice image file"),
205
+ enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing"),
206
+ reasoning_mode: Optional[str] = Form("simple", description="VLM reasoning mode: 'simple' or 'reason'")
207
  ):
208
  """
209
  Process a single invoice and return extracted information
 
212
  **Parameters:**
213
  - **file**: Invoice image file (required)
214
  - **enhance_image**: Apply OpenCV enhancement preprocessing (optional)
215
+ - **reasoning_mode**: VLM reasoning mode: 'simple' for single-step, 'reason' for Chain of Thought (optional)
216
 
217
  **Returns:**
218
  - JSON with extracted_text, signature_coords, stamp_coords
 
244
  doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
245
 
246
  # Process invoice
247
+ result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image, reasoning_mode)
248
 
249
  # Extract fields from result
250
  fields = result.get("fields", {})
frontend/src/App.jsx CHANGED
@@ -18,6 +18,7 @@ function App() {
18
  const [resolutionMap, setResolutionMap] = useState({});
19
  const [resultResolutionMap, setResultResolutionMap] = useState({});
20
  const [enhancedMap, setEnhancedMap] = useState({}); // Track which images are enhanced
 
21
 
22
  const handleFilesSelected = async (files) => {
23
  setProcessing(false);
@@ -27,6 +28,7 @@ function App() {
27
  setPreviewImages([]);
28
  setResolutionMap({});
29
  setEnhancedMap({}); // Reset enhanced state
 
30
 
31
  try {
32
  // Step 1: Convert all files to images and show previews
@@ -95,8 +97,9 @@ function App() {
95
  const processData = resolutionMap[preview.key] || { dataUrl: preview.dataUrl, resolution: 100 };
96
  const blob = dataUrlToBlob(processData.dataUrl);
97
  const isEnhanced = enhancedMap[preview.key] || false;
 
98
 
99
- const result = await processSingleInvoice(blob, preview.filename, isEnhanced);
100
 
101
  const resultWithMetadata = {
102
  ...result,
@@ -143,8 +146,9 @@ function App() {
143
  // Use resolution-adjusted image from ResultCard
144
  const blob = dataUrlToBlob(adjustedDataUrl || imageDataMap[result.key]);
145
  const isEnhanced = enhancedMap[result.key] || false;
 
146
 
147
- const newResult = await processSingleInvoice(blob, result.filename, isEnhanced);
148
 
149
  const resultWithMetadata = {
150
  ...newResult,
@@ -183,6 +187,13 @@ function App() {
183
  }));
184
  };
185
 
 
 
 
 
 
 
 
186
  return (
187
  <div className="min-h-screen py-8 px-4 sm:px-6 lg:px-8">
188
  <div className="max-w-7xl mx-auto">
@@ -230,6 +241,8 @@ function App() {
230
  {previewImages.map((preview, idx) => (
231
  <ImagePreview
232
  key={preview.key}
 
 
233
  imageData={preview.dataUrl}
234
  fileName={preview.filename}
235
  onResolutionChange={(dataUrl, resolution) =>
 
18
  const [resolutionMap, setResolutionMap] = useState({});
19
  const [resultResolutionMap, setResultResolutionMap] = useState({});
20
  const [enhancedMap, setEnhancedMap] = useState({}); // Track which images are enhanced
21
+ const [reasoningMap, setReasoningMap] = useState({}); // Track which images use reasoning mode
22
 
23
  const handleFilesSelected = async (files) => {
24
  setProcessing(false);
 
28
  setPreviewImages([]);
29
  setResolutionMap({});
30
  setEnhancedMap({}); // Reset enhanced state
31
+ setReasoningMap({}); // Reset reasoning state
32
 
33
  try {
34
  // Step 1: Convert all files to images and show previews
 
97
  const processData = resolutionMap[preview.key] || { dataUrl: preview.dataUrl, resolution: 100 };
98
  const blob = dataUrlToBlob(processData.dataUrl);
99
  const isEnhanced = enhancedMap[preview.key] || false;
100
+ const reasoningMode = reasoningMap[preview.key] ? "reason" : "simple";
101
 
102
+ const result = await processSingleInvoice(blob, preview.filename, isEnhanced, reasoningMode);
103
 
104
  const resultWithMetadata = {
105
  ...result,
 
146
  // Use resolution-adjusted image from ResultCard
147
  const blob = dataUrlToBlob(adjustedDataUrl || imageDataMap[result.key]);
148
  const isEnhanced = enhancedMap[result.key] || false;
149
+ const reasoningMode = reasoningMap[result.key] ? "reason" : "simple";
150
 
151
+ const newResult = await processSingleInvoice(blob, result.filename, isEnhanced, reasoningMode);
152
 
153
  const resultWithMetadata = {
154
  ...newResult,
 
187
  }));
188
  };
189
 
190
+ const handleReasoningModeToggle = (key) => {
191
+ setReasoningMap(prev => ({
192
+ ...prev,
193
+ [key]: !prev[key]
194
+ }));
195
+ };
196
+
197
  return (
198
  <div className="min-h-screen py-8 px-4 sm:px-6 lg:px-8">
199
  <div className="max-w-7xl mx-auto">
 
241
  {previewImages.map((preview, idx) => (
242
  <ImagePreview
243
  key={preview.key}
244
+ onReasoningModeToggle={() => handleReasoningModeToggle(preview.key)}
245
+ useReasoning={reasoningMap[preview.key] || false}
246
  imageData={preview.dataUrl}
247
  fileName={preview.filename}
248
  onResolutionChange={(dataUrl, resolution) =>
frontend/src/components/ImagePreview.jsx CHANGED
@@ -1,7 +1,7 @@
1
  import React, { useState, useEffect, useRef } from 'react';
2
- import { SlidersHorizontal, Sparkles } from 'lucide-react';
3
 
4
- const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle, isEnhanced }) => {
5
  const [resolution, setResolution] = useState(100);
6
  const canvasRef = useRef(null);
7
  const [originalDimensions, setOriginalDimensions] = useState({ width: 0, height: 0 });
@@ -87,6 +87,25 @@ const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle
87
  </div>
88
  )}
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  <div className="space-y-2">
91
  <div className="flex items-center justify-between">
92
  <label className="text-sm font-medium text-gray-700 flex items-center gap-2">
 
1
  import React, { useState, useEffect, useRef } from 'react';
2
+ import { SlidersHorizontal, Sparkles, Brain } from 'lucide-react';
3
 
4
+ const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle, isEnhanced, onReasoningModeToggle, useReasoning }) => {
5
  const [resolution, setResolution] = useState(100);
6
  const canvasRef = useRef(null);
7
  const [originalDimensions, setOriginalDimensions] = useState({ width: 0, height: 0 });
 
87
  </div>
88
  )}
89
 
90
+ {/* Reasoning Mode Toggle */}
91
+ <button
92
+ onClick={() => onReasoningModeToggle && onReasoningModeToggle()}
93
+ className={`w-full py-2 px-4 rounded-lg font-medium transition-all flex items-center justify-center gap-2 ${
94
+ useReasoning
95
+ ? 'bg-blue-600 hover:bg-blue-700 text-white shadow-lg'
96
+ : 'bg-gradient-to-r from-blue-500 to-cyan-500 hover:from-blue-600 hover:to-cyan-600 text-white shadow-md'
97
+ }`}
98
+ >
99
+ <Brain className="w-4 h-4" />
100
+ {useReasoning ? 'Chain of Thought ✓' : 'Simple Mode'}
101
+ </button>
102
+
103
+ {useReasoning && (
104
+ <div className="bg-blue-50 border border-blue-200 rounded p-2 text-xs text-blue-700">
105
+ 🧠 VLM will use 2-step reasoning: first analyze document structure, then extract fields
106
+ </div>
107
+ )}
108
+
109
  <div className="space-y-2">
110
  <div className="flex items-center justify-between">
111
  <label className="text-sm font-medium text-gray-700 flex items-center gap-2">
frontend/src/utils/api.js CHANGED
@@ -8,12 +8,14 @@ const API_BASE_URL = import.meta.env.VITE_API_URL || window.location.origin;
8
  * @param {Blob} imageBlob - Image blob
9
  * @param {string} filename - Original filename
10
  * @param {boolean} enhanceImage - Whether to apply OpenCV enhancement
 
11
  * @returns {Promise<Object>} Processed result
12
  */
13
- export async function processSingleInvoice(imageBlob, filename, enhanceImage = false) {
14
  const formData = new FormData();
15
  formData.append('file', imageBlob, filename);
16
  formData.append('enhance_image', enhanceImage);
 
17
 
18
  const response = await axios.post(`${API_BASE_URL}/process-invoice`, formData, {
19
  headers: {
 
8
  * @param {Blob} imageBlob - Image blob
9
  * @param {string} filename - Original filename
10
  * @param {boolean} enhanceImage - Whether to apply OpenCV enhancement
11
+ * @param {string} reasoningMode - VLM reasoning mode: "simple" or "reason"
12
  * @returns {Promise<Object>} Processed result
13
  */
14
+ export async function processSingleInvoice(imageBlob, filename, enhanceImage = false, reasoningMode = "simple") {
15
  const formData = new FormData();
16
  formData.append('file', imageBlob, filename);
17
  formData.append('enhance_image', enhanceImage);
18
+ formData.append('reasoning_mode', reasoningMode);
19
 
20
  const response = await axios.post(`${API_BASE_URL}/process-invoice`, formData, {
21
  headers: {
inference.py CHANGED
@@ -22,6 +22,7 @@ from config import (
22
  from model_manager import model_manager
23
 
24
 
 
25
  EXTRACTION_PROMPT = """
26
  You are an expert at reading noisy, handwritten Indian invoices and quotations.
27
 
@@ -62,6 +63,161 @@ Output rules:
62
  """
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  class InferenceProcessor:
66
  """Handles VLM inference, validation, and result processing"""
67
 
@@ -184,6 +340,143 @@ class InferenceProcessor:
184
 
185
  return output_text, latency
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  @staticmethod
188
  def extract_json_from_output(text: str) -> Dict:
189
  """Extract JSON from model output"""
@@ -328,7 +621,7 @@ class InferenceProcessor:
328
  return validated, field_confidence, warnings
329
 
330
  @staticmethod
331
- def process_invoice(image_path: str, doc_id: str = None, enhance_image: bool = False) -> Dict:
332
  """
333
  Complete invoice processing pipeline
334
 
@@ -336,6 +629,7 @@ class InferenceProcessor:
336
  image_path: Path to invoice image
337
  doc_id: Document identifier (optional)
338
  enhance_image: Whether to apply OpenCV enhancement (optional)
 
339
 
340
  Returns:
341
  dict: Complete JSON output with all fields
@@ -364,10 +658,28 @@ class InferenceProcessor:
364
  signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
365
  timing_breakdown['yolo_detection'] = round(time.time() - t2, 3)
366
 
367
- # Step 3: VLM Extraction
368
  t3 = time.time()
369
- vlm_output, vlm_latency = InferenceProcessor.run_vlm_extraction(image)
370
- timing_breakdown['vlm_inference'] = round(vlm_latency, 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
  # Clean up image
373
  image.close()
 
22
  from model_manager import model_manager
23
 
24
 
25
+ # Single-step extraction prompt (original "simple" mode)
26
  EXTRACTION_PROMPT = """
27
  You are an expert at reading noisy, handwritten Indian invoices and quotations.
28
 
 
63
  """
64
 
65
 
66
+ # Two-step Chain of Thought prompts (reasoning mode)
67
+ REASONING_PROMPT = """
68
+ You are an expert at analyzing noisy, handwritten Indian invoices and quotations for tractors.
69
+
70
+ Your task is to carefully observe and describe the document structure WITHOUT extracting yet.
71
+
72
+ Analyze this tractor invoice image and provide detailed observations about:
73
+
74
+ 1. DEALER/COMPANY NAME
75
+ - Where is it located? (top header, letterhead, stamp, footer)
76
+ - What language is it written in?
77
+ - Is it printed or handwritten?
78
+ - Exact text you see (preserve original language)
79
+
80
+ 2. MODEL INFORMATION
81
+ - Where is the model mentioned? (checkbox list, handwritten field, printed table, near "Model:" label)
82
+ - Are there multiple model options shown?
83
+ - If checkboxes exist, which one is marked? (look for ✓, ✗, [X], ●, ☑, filled boxes)
84
+ - Is the model name in English or regional language?
85
+ - Exact text you see for the selected/mentioned model
86
+
87
+ 3. HORSE POWER (HP)
88
+ - Where is HP information located? (separate field, within model name, checkbox list, specifications table)
89
+ - Is HP explicitly written or implied from model code?
90
+ - If there's a checkbox list with HP options, which one is selected?
91
+ - Are there multiple HP values shown? Which one corresponds to the selected model?
92
+ - Exact HP text you see (e.g., "49 HP", "63hp", "HP-30")
93
+
94
+ 4. TOTAL AMOUNT/ASSET COST
95
+ - Where is the final total located? (bottom of page, after tax section, grand total line)
96
+ - What label is used? (Total, Grand Total, Final Amount, कुल राशि, etc.)
97
+ - Are there multiple amount fields? Which is the final one after all taxes/charges?
98
+ - Exact amount you see with any currency symbols
99
+
100
+ 5. CHECKBOX SELECTIONS (if applicable)
101
+ - Are there any checkbox lists on the page?
102
+ - What options are available in these lists?
103
+ - Which options are clearly marked/selected? (describe the selection mark)
104
+ - Which options are clearly unmarked/unselected?
105
+
106
+ 6. AMBIGUITIES OR CHALLENGES
107
+ - Is any handwriting difficult to read?
108
+ - Are any fields unclear or could have multiple interpretations?
109
+ - Are there any conflicting pieces of information?
110
+
111
+ Return ONLY valid JSON in this exact format:
112
+
113
+ {
114
+ "dealer_location": string,
115
+ "dealer_text_observed": string,
116
+ "dealer_language": string,
117
+ "model_location": string,
118
+ "model_format": string,
119
+ "model_text_observed": string,
120
+ "model_is_checkbox": boolean,
121
+ "model_selected_option": string,
122
+ "hp_location": string,
123
+ "hp_format": string,
124
+ "hp_text_observed": string,
125
+ "hp_is_checkbox": boolean,
126
+ "hp_value_observed": string,
127
+ "amount_location": string,
128
+ "amount_label": string,
129
+ "amount_text_observed": string,
130
+ "checkboxes_present": boolean,
131
+ "checkbox_details": string,
132
+ "ambiguities": string,
133
+ "overall_document_quality": string
134
+ }
135
+
136
+ Guidelines:
137
+ - Be extremely specific about locations (e.g., "top-left header", "middle section below tractor image", "bottom-right in total box")
138
+ - Preserve original language text in observations
139
+ - Describe what you see, don't interpret or extract yet
140
+ - If something is unclear, describe why
141
+ - Focus on SELECTED/MARKED options when checkboxes are present
142
+
143
+ Output rules:
144
+ - Output ONLY valid JSON
145
+ - Do NOT include markdown, explanations, or extra text
146
+ """
147
+
148
+
149
+ EXTRACTION_WITH_CONTEXT_PROMPT = """
150
+ You are an expert at extracting structured data from Indian invoices and quotations.
151
+
152
+ You have already analyzed this document. Here is your previous analysis:
153
+
154
+ CONTEXT FROM REASONING:
155
+ {reasoning_output}
156
+
157
+ Based on your previous analysis, now extract the exact field values.
158
+
159
+ Return ONLY valid JSON in this exact format:
160
+
161
+ {{
162
+ "dealer_name": string,
163
+ "model_name": string,
164
+ "horse_power": number,
165
+ "asset_cost": number
166
+ }}
167
+
168
+ Critical extraction rules:
169
+
170
+ 1. DEALER NAME
171
+ - Copy EXACTLY as it appears in the original language and spelling
172
+ - Do NOT translate from Hindi/Marathi/Kannada to English
173
+ - Do NOT correct spelling or expand abbreviations
174
+ - Include any punctuation or special characters as shown
175
+
176
+ 2. MODEL NAME
177
+ - Copy EXACTLY as it appears in the original language
178
+ - If from checkbox selection, extract ONLY the selected/marked option
179
+ - Do NOT translate or normalize
180
+ - Preserve numbers, hyphens, and spacing exactly
181
+ - Do NOT include HP value within model name
182
+
183
+ 3. HORSE POWER
184
+ - Must be a number only (integer or decimal)
185
+ - Extract from explicit HP mentions only (never infer from model codes)
186
+ - If from checkbox, use only the selected option's HP value
187
+ - Remove text like "HP", "hp", "हॉर्स पावर" - keep only the number
188
+ - If HP appears as "49 HP" → extract: 49
189
+ - If HP appears as "63.5hp" → extract: 63.5
190
+ - If multiple HP values exist, use the one for the selected model
191
+
192
+ 4. ASSET COST
193
+ - Must be a number only (integer or decimal)
194
+ - Use the FINAL total amount after all taxes and charges
195
+ - Remove currency symbols (₹, Rs, INR)
196
+ - Remove commas (e.g., "1,50,000" → 150000)
197
+ - If amount is "₹ 1,75,500.00" → extract: 175500
198
+ - Use the largest/final amount if multiple totals exist
199
+
200
+ Data validation:
201
+ - dealer_name: Must be non-empty string in original language
202
+ - model_name: Must be non-empty string in original language
203
+ - horse_power: Must be positive number (typically between 15-100 for tractors)
204
+ - asset_cost: Must be positive number (typically between 100000-3000000 for tractors)
205
+
206
+ Special handling based on your reasoning:
207
+ - If you noted checkboxes: Extract ONLY marked/selected options
208
+ - If you noted ambiguities: Make best judgment and use most likely value
209
+ - If you noted poor handwriting: Interpret characters as best as possible while preserving language
210
+ - If you noted multiple values: Use the one that matches the selected/final configuration
211
+
212
+ Output rules:
213
+ - Output ONLY valid JSON
214
+ - Do NOT include markdown code fences
215
+ - Do NOT include explanations or extra text
216
+ - Ensure all four fields are present
217
+ - Ensure numbers are actual numbers, not strings with currency/commas
218
+ """
219
+
220
+
221
  class InferenceProcessor:
222
  """Handles VLM inference, validation, and result processing"""
223
 
 
340
 
341
  return output_text, latency
342
 
343
+ @staticmethod
344
+ def run_vlm_reasoning(image: Image.Image) -> Tuple[str, float]:
345
+ """
346
+ Run VLM model for Chain of Thought reasoning phase (step 1 of 2)
347
+ Analyzes document structure and observes field locations
348
+ """
349
+ if not model_manager.is_loaded():
350
+ raise RuntimeError("Models not loaded")
351
+
352
+ model = model_manager.vlm_model
353
+ processor = model_manager.processor
354
+
355
+ messages = [
356
+ {
357
+ "role": "user",
358
+ "content": [
359
+ {"type": "image", "image": image},
360
+ {"type": "text", "text": REASONING_PROMPT}
361
+ ]
362
+ }
363
+ ]
364
+
365
+ # Apply chat template
366
+ text = processor.apply_chat_template(
367
+ messages,
368
+ tokenize=False,
369
+ add_generation_prompt=True
370
+ )
371
+
372
+ # Process vision input
373
+ image_inputs, video_inputs = process_vision_info(messages)
374
+ inputs = processor(
375
+ text=[text],
376
+ images=image_inputs,
377
+ videos=video_inputs,
378
+ padding=True,
379
+ return_tensors="pt",
380
+ )
381
+ inputs = inputs.to("cuda")
382
+
383
+ start = time.time()
384
+
385
+ # Generate (allow more tokens for detailed reasoning)
386
+ generated_ids = model.generate(**inputs, max_new_tokens=512)
387
+
388
+ latency = time.time() - start
389
+
390
+ # Decode output
391
+ generated_ids_trimmed = [
392
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
393
+ ]
394
+ output_text = processor.batch_decode(
395
+ generated_ids_trimmed,
396
+ skip_special_tokens=True,
397
+ clean_up_tokenization_spaces=False
398
+ )
399
+
400
+ output_text = output_text[0] if isinstance(output_text, list) else output_text
401
+
402
+ # Clean up GPU memory
403
+ del inputs, generated_ids, generated_ids_trimmed
404
+ if torch.cuda.is_available():
405
+ torch.cuda.empty_cache()
406
+
407
+ print(f"🧠 Reasoning phase completed in {latency:.2f}s")
408
+ return output_text, latency
409
+
410
+ @staticmethod
411
+ def run_vlm_extraction_with_context(image: Image.Image, reasoning_output: str) -> Tuple[str, float]:
412
+ """
413
+ Run VLM model for extraction phase (step 2 of 2) using reasoning context
414
+ Extracts structured fields based on previous reasoning
415
+ """
416
+ if not model_manager.is_loaded():
417
+ raise RuntimeError("Models not loaded")
418
+
419
+ model = model_manager.vlm_model
420
+ processor = model_manager.processor
421
+
422
+ # Format the extraction prompt with reasoning context
423
+ extraction_prompt = EXTRACTION_WITH_CONTEXT_PROMPT.format(reasoning_output=reasoning_output)
424
+
425
+ messages = [
426
+ {
427
+ "role": "user",
428
+ "content": [
429
+ {"type": "image", "image": image},
430
+ {"type": "text", "text": extraction_prompt}
431
+ ]
432
+ }
433
+ ]
434
+
435
+ # Apply chat template
436
+ text = processor.apply_chat_template(
437
+ messages,
438
+ tokenize=False,
439
+ add_generation_prompt=True
440
+ )
441
+
442
+ # Process vision input
443
+ image_inputs, video_inputs = process_vision_info(messages)
444
+ inputs = processor(
445
+ text=[text],
446
+ images=image_inputs,
447
+ videos=video_inputs,
448
+ padding=True,
449
+ return_tensors="pt",
450
+ )
451
+ inputs = inputs.to("cuda")
452
+
453
+ start = time.time()
454
+
455
+ # Generate
456
+ generated_ids = model.generate(**inputs, max_new_tokens=256)
457
+
458
+ latency = time.time() - start
459
+
460
+ # Decode output
461
+ generated_ids_trimmed = [
462
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
463
+ ]
464
+ output_text = processor.batch_decode(
465
+ generated_ids_trimmed,
466
+ skip_special_tokens=True,
467
+ clean_up_tokenization_spaces=False
468
+ )
469
+
470
+ output_text = output_text[0] if isinstance(output_text, list) else output_text
471
+
472
+ # Clean up GPU memory
473
+ del inputs, generated_ids, generated_ids_trimmed
474
+ if torch.cuda.is_available():
475
+ torch.cuda.empty_cache()
476
+
477
+ print(f"📝 Extraction phase completed in {latency:.2f}s")
478
+ return output_text, latency
479
+
480
  @staticmethod
481
  def extract_json_from_output(text: str) -> Dict:
482
  """Extract JSON from model output"""
 
621
  return validated, field_confidence, warnings
622
 
623
  @staticmethod
624
+ def process_invoice(image_path: str, doc_id: str = None, enhance_image: bool = False, reasoning_mode: str = "simple") -> Dict:
625
  """
626
  Complete invoice processing pipeline
627
 
 
629
  image_path: Path to invoice image
630
  doc_id: Document identifier (optional)
631
  enhance_image: Whether to apply OpenCV enhancement (optional)
632
+ reasoning_mode: "simple" for single-step extraction, "reason" for Chain of Thought (optional)
633
 
634
  Returns:
635
  dict: Complete JSON output with all fields
 
658
  signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
659
  timing_breakdown['yolo_detection'] = round(time.time() - t2, 3)
660
 
661
+ # Step 3: VLM Extraction (either simple or with Chain of Thought reasoning)
662
  t3 = time.time()
663
+ if reasoning_mode == "reason":
664
+ # Two-step Chain of Thought approach
665
+ print("🧠 Using Chain of Thought reasoning mode (2-step)")
666
+
667
+ # Step 3a: Reasoning phase
668
+ reasoning_output, reasoning_latency = InferenceProcessor.run_vlm_reasoning(image)
669
+ timing_breakdown['vlm_reasoning'] = round(reasoning_latency, 3)
670
+
671
+ # Step 3b: Extraction phase with context
672
+ vlm_output, extraction_latency = InferenceProcessor.run_vlm_extraction_with_context(image, reasoning_output)
673
+ timing_breakdown['vlm_extraction'] = round(extraction_latency, 3)
674
+ timing_breakdown['vlm_inference_total'] = round(reasoning_latency + extraction_latency, 3)
675
+
676
+ # Store reasoning for debugging/transparency
677
+ timing_breakdown['reasoning_output'] = reasoning_output
678
+ else:
679
+ # Single-step simple extraction (original approach)
680
+ print("⚡ Using simple mode (1-step)")
681
+ vlm_output, vlm_latency = InferenceProcessor.run_vlm_extraction(image)
682
+ timing_breakdown['vlm_inference'] = round(vlm_latency, 3)
683
 
684
  # Clean up image
685
  image.close()