Spaces:

quantumbit
/

invoice_extractor

Paused

App Files Files Community

github-actions[bot] committed on Feb 6

Commit

a54dc28

1 Parent(s): a736bf4

Sync from GitHub: 3bae2496b8a3786d399c36363516d096a3b7421b

Browse files

Files changed (1) hide show

inference.py +61 -123

inference.py CHANGED Viewed

@@ -63,57 +63,50 @@ Output rules:
 """
-# Two-step Chain of Thought prompts (reasoning mode) - OPTIMIZED FOR SPEED
-REASONING_PROMPT = """
-Analyze this Indian tractor invoice and share your observations about extracting these 2 fields:
-'model name' and its corresponding 'Horse Power'
-Think through each field:
 MODEL NAME:
-- How is the model presented? (checkbox list, handwritten field, printed text)
-- If checkboxes exist, which one is marked/selected?
-- What exact text do you see for the model?
-- Is it in English or regional language?
 HORSE POWER:
-- Where do you see HP mentioned?
-- Is it explicit (like "49 HP") or in a checkbox list?
-- If checkboxes, which HP value is selected?
-- What exact text shows the HP?
-- Horse power must come ONLY from explicit HP text, never from model numbers.
-- Horse power may appear as "HP", handwritten like "49 HP", "63hp", "HP-30".
-Express your observations naturally. Be specific about what you see and any uncertainties.
-"""
-EXTRACTION_WITH_CONTEXT_PROMPT = """
-Based on the image and following analysis regarding 'model':
-{reasoning_output}
-Extract these fields from image and analysis:
 {{
   "dealer_name": string,
   "model_name": string,
   "horse_power": number,
   "asset_cost": number
 }}
-Critical rules:
-- Dealer name must be copied exactly from the image in the original language and spelling.
-- Model name must be copied exactly from the image without translation.
-- HP: Number only (e.g., "49 HP" → 49). Use selected checkbox if applicable
-- ASSET COST: Final total as number (remove ₹, commas: "1,50,000" → 150000)
-- Checkboxes: Extract only marked options
-Extraction hints:
-- Asset cost is the total amount, usually the largest number on the page, the total amount after TAX, final price or final cost.
-- Dealer name is usually at the top header or company name.
-- Model name often appears near words like Model, Tractor, Variant.
-- If handwriting is unclear, make your best reasonable interpretation of the characters — but preserve language.
 Output ONLY valid JSON, no markdown.
 """
@@ -242,10 +235,10 @@ class InferenceProcessor:
         return output_text, latency
     @staticmethod
-    def run_vlm_reasoning(image: Image.Image) -> Tuple[str, float]:
         """
-        Run VLM model for Chain of Thought reasoning phase (step 1 of 2)
-        Analyzes document structure and observes field locations
         """
         if not model_manager.is_loaded():
             raise RuntimeError("Models not loaded")
@@ -258,7 +251,7 @@ class InferenceProcessor:
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
-                    {"type": "text", "text": REASONING_PROMPT}
                 ]
             }
         ]
@@ -283,8 +276,8 @@ class InferenceProcessor:
         start = time.time()
-        # Generate (reduced tokens for faster processing)
-        generated_ids = model.generate(**inputs, max_new_tokens=256)
         latency = time.time() - start
@@ -305,78 +298,29 @@ class InferenceProcessor:
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        print(f"🧠 Reasoning phase completed in {latency:.2f}s")
-        return output_text, latency
-    @staticmethod
-    def run_vlm_extraction_with_context(image: Image.Image, reasoning_output: str) -> Tuple[str, float]:
-        """
-        Run VLM model for extraction phase (step 2 of 2) using reasoning context
-        Extracts structured fields based on previous reasoning
-        """
-        if not model_manager.is_loaded():
-            raise RuntimeError("Models not loaded")
-        model = model_manager.vlm_model
-        processor = model_manager.processor
-        # Format the extraction prompt with reasoning context
-        extraction_prompt = EXTRACTION_WITH_CONTEXT_PROMPT.format(reasoning_output=reasoning_output)
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": extraction_prompt}
-                ]
-            }
-        ]
-        # Apply chat template
-        text = processor.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        # Process vision input
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to("cuda")
-        start = time.time()
-        # Generate
-        generated_ids = model.generate(**inputs, max_new_tokens=256)
-        latency = time.time() - start
-        # Decode output
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        output_text = processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )
-        output_text = output_text[0] if isinstance(output_text, list) else output_text
-        # Clean up GPU memory
-        del inputs, generated_ids, generated_ids_trimmed
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        print(f"📝 Extraction phase completed in {latency:.2f}s")
-        return output_text, latency
     @staticmethod
     def extract_json_from_output(text: str) -> Dict:
@@ -562,17 +506,11 @@ class InferenceProcessor:
         # Step 3: VLM Extraction (either simple or with Chain of Thought reasoning)
         t3 = time.time()
         if reasoning_mode == "reason":
-            # Two-step Chain of Thought approach
-            print("🧠 Using Chain of Thought reasoning mode (2-step)")
-            # Step 3a: Reasoning phase
-            reasoning_output, reasoning_latency = InferenceProcessor.run_vlm_reasoning(image)
-            timing_breakdown['vlm_reasoning'] = round(reasoning_latency, 3)
-            # Step 3b: Extraction phase with context
-            vlm_output, extraction_latency = InferenceProcessor.run_vlm_extraction_with_context(image, reasoning_output)
-            timing_breakdown['vlm_extraction'] = round(extraction_latency, 3)
-            timing_breakdown['vlm_inference_total'] = round(reasoning_latency + extraction_latency, 3)
             # Store reasoning for debugging/transparency
             timing_breakdown['reasoning_output'] = reasoning_output

 """
+# Combined Chain of Thought prompt (reasoning mode) - Single call with reasoning and extraction
+COMBINED_REASONING_EXTRACTION_PROMPT = """
+Analyze this Indian tractor invoice using Chain of Thought reasoning.
+First, share your observations about the 4 key fields:
+DEALER NAME:
+- Where do you see it? (header, letterhead, stamp)
+- What language? What exact text?
 MODEL NAME:
+- How is it presented? (checkbox/handwritten/printed)
+- If checkboxes, which is marked?
+- What exact text do you see?
 HORSE POWER:
+- Where is HP mentioned?
+- Explicit text like "49 HP" or in checkbox?
+- Which value is selected?
+- HP must come from explicit HP text only, never from model numbers
+ASSET COST:
+- Where is the final total?
+- Which amount is after all taxes?
+- What exact amount with currency?
+After reasoning, extract the fields.
+Return ONLY valid JSON:
 {{
+  "reasoning": "your observations and thoughts here",
   "dealer_name": string,
   "model_name": string,
   "horse_power": number,
   "asset_cost": number
 }}
+Rules for extraction:
+- Copy dealer/model names EXACTLY in original language, don't translate
+- HP as number only ("49 HP" → 49), use selected checkbox
+- Asset cost as number (remove ₹, commas: "1,50,000" → 150000)
+- Asset cost is the final total after TAX
+- Dealer is usually at top header
+- If handwriting unclear, make best interpretation but preserve language
 Output ONLY valid JSON, no markdown.
 """
         return output_text, latency
     @staticmethod
+    def run_vlm_reasoning_and_extraction(image: Image.Image) -> Tuple[str, str, float]:
         """
+        Run VLM model with combined Chain of Thought reasoning and extraction in single call
+        Returns: (reasoning_text, extraction_json_str, latency)
         """
         if not model_manager.is_loaded():
             raise RuntimeError("Models not loaded")
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
+                    {"type": "text", "text": COMBINED_REASONING_EXTRACTION_PROMPT}
                 ]
             }
         ]
         start = time.time()
+        # Generate with more tokens for combined reasoning + extraction
+        generated_ids = model.generate(**inputs, max_new_tokens=384)
         latency = time.time() - start
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+        # Parse the combined output to separate reasoning from extraction
+        reasoning_text = ""
+        extraction_json = output_text
+        try:
+            # Try to parse as JSON
+            parsed = json.loads(output_text.strip())
+            if "reasoning" in parsed:
+                reasoning_text = parsed["reasoning"]
+                # Remove reasoning from output to get clean extraction JSON
+                extraction_dict = {k: v for k, v in parsed.items() if k != "reasoning"}
+                extraction_json = json.dumps(extraction_dict)
+        except:
+            # If parsing fails, try to split manually
+            # Look for JSON pattern
+            json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', output_text, re.DOTALL)
+            if json_match:
+                extraction_json = json_match.group(0)
+                # Everything before JSON is reasoning
+                reasoning_text = output_text[:json_match.start()].strip()
+        print(f"🧠 Combined reasoning + extraction completed in {latency:.2f}s")
+        return reasoning_text, extraction_json, latency
     @staticmethod
     def extract_json_from_output(text: str) -> Dict:
         # Step 3: VLM Extraction (either simple or with Chain of Thought reasoning)
         t3 = time.time()
         if reasoning_mode == "reason":
+            # Combined Chain of Thought: reasoning + extraction in single call
+            print("🧠 Using Chain of Thought reasoning mode (single call)")
+            reasoning_output, vlm_output, vlm_latency = InferenceProcessor.run_vlm_reasoning_and_extraction(image)
+            timing_breakdown['vlm_inference'] = round(vlm_latency, 3)
             # Store reasoning for debugging/transparency
             timing_breakdown['reasoning_output'] = reasoning_output