github-actions[bot] committed on
Commit ·
d062149
1
Parent(s): 50fd07f
Sync from GitHub: 10945f8bcad8f91e0ef20a88f2630fa1409bb1e5
Browse files
- .gitignore +3 -1
- app.py +7 -4
- frontend/src/App.jsx +15 -2
- frontend/src/components/ImagePreview.jsx +21 -2
- frontend/src/utils/api.js +3 -1
- inference.py +316 -4
.gitignore
CHANGED
|
@@ -37,4 +37,6 @@ frontend/.env.local
|
|
| 37 |
test*
|
| 38 |
executable.py
|
| 39 |
client_example.py
|
| 40 |
-
Docs
|
|
|
|
|
|
|
|
|
| 37 |
test*
|
| 38 |
executable.py
|
| 39 |
client_example.py
|
| 40 |
+
Docs
|
| 41 |
+
|
| 42 |
+
prompt.txt
|
app.py
CHANGED
|
@@ -99,7 +99,8 @@ async def health_check():
|
|
| 99 |
async def extract_invoice(
|
| 100 |
file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
|
| 101 |
doc_id: Optional[str] = Form(None, description="Optional document identifier"),
|
| 102 |
-
enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing")
|
|
|
|
| 103 |
):
|
| 104 |
"""
|
| 105 |
Extract information from invoice image
|
|
@@ -172,7 +173,7 @@ async def extract_invoice(
|
|
| 172 |
doc_id = os.path.splitext(file.filename)[0]
|
| 173 |
|
| 174 |
# Process invoice
|
| 175 |
-
result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image)
|
| 176 |
|
| 177 |
# Add total request time (includes file I/O)
|
| 178 |
result['total_request_time_sec'] = round(time.time() - request_start, 2)
|
|
@@ -201,7 +202,8 @@ async def extract_invoice(
|
|
| 201 |
@app.post("/process-invoice")
|
| 202 |
async def process_invoice(
|
| 203 |
file: UploadFile = File(..., description="Invoice image file"),
|
| 204 |
-
enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing")
|
|
|
|
| 205 |
):
|
| 206 |
"""
|
| 207 |
Process a single invoice and return extracted information
|
|
@@ -210,6 +212,7 @@ async def process_invoice(
|
|
| 210 |
**Parameters:**
|
| 211 |
- **file**: Invoice image file (required)
|
| 212 |
- **enhance_image**: Apply OpenCV enhancement preprocessing (optional)
|
|
|
|
| 213 |
|
| 214 |
**Returns:**
|
| 215 |
- JSON with extracted_text, signature_coords, stamp_coords
|
|
@@ -241,7 +244,7 @@ async def process_invoice(
|
|
| 241 |
doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
|
| 242 |
|
| 243 |
# Process invoice
|
| 244 |
-
result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image)
|
| 245 |
|
| 246 |
# Extract fields from result
|
| 247 |
fields = result.get("fields", {})
|
|
|
|
| 99 |
async def extract_invoice(
|
| 100 |
file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
|
| 101 |
doc_id: Optional[str] = Form(None, description="Optional document identifier"),
|
| 102 |
+
enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing"),
|
| 103 |
+
reasoning_mode: Optional[str] = Form("simple", description="VLM reasoning mode: 'simple' or 'reason'")
|
| 104 |
):
|
| 105 |
"""
|
| 106 |
Extract information from invoice image
|
|
|
|
| 173 |
doc_id = os.path.splitext(file.filename)[0]
|
| 174 |
|
| 175 |
# Process invoice
|
| 176 |
+
result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image, reasoning_mode)
|
| 177 |
|
| 178 |
# Add total request time (includes file I/O)
|
| 179 |
result['total_request_time_sec'] = round(time.time() - request_start, 2)
|
|
|
|
| 202 |
@app.post("/process-invoice")
|
| 203 |
async def process_invoice(
|
| 204 |
file: UploadFile = File(..., description="Invoice image file"),
|
| 205 |
+
enhance_image: Optional[bool] = Form(False, description="Apply OpenCV enhancement preprocessing"),
|
| 206 |
+
reasoning_mode: Optional[str] = Form("simple", description="VLM reasoning mode: 'simple' or 'reason'")
|
| 207 |
):
|
| 208 |
"""
|
| 209 |
Process a single invoice and return extracted information
|
|
|
|
| 212 |
**Parameters:**
|
| 213 |
- **file**: Invoice image file (required)
|
| 214 |
- **enhance_image**: Apply OpenCV enhancement preprocessing (optional)
|
| 215 |
+
- **reasoning_mode**: VLM reasoning mode: 'simple' for single-step, 'reason' for Chain of Thought (optional)
|
| 216 |
|
| 217 |
**Returns:**
|
| 218 |
- JSON with extracted_text, signature_coords, stamp_coords
|
|
|
|
| 244 |
doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
|
| 245 |
|
| 246 |
# Process invoice
|
| 247 |
+
result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance_image, reasoning_mode)
|
| 248 |
|
| 249 |
# Extract fields from result
|
| 250 |
fields = result.get("fields", {})
|
frontend/src/App.jsx
CHANGED
|
@@ -18,6 +18,7 @@ function App() {
|
|
| 18 |
const [resolutionMap, setResolutionMap] = useState({});
|
| 19 |
const [resultResolutionMap, setResultResolutionMap] = useState({});
|
| 20 |
const [enhancedMap, setEnhancedMap] = useState({}); // Track which images are enhanced
|
|
|
|
| 21 |
|
| 22 |
const handleFilesSelected = async (files) => {
|
| 23 |
setProcessing(false);
|
|
@@ -27,6 +28,7 @@ function App() {
|
|
| 27 |
setPreviewImages([]);
|
| 28 |
setResolutionMap({});
|
| 29 |
setEnhancedMap({}); // Reset enhanced state
|
|
|
|
| 30 |
|
| 31 |
try {
|
| 32 |
// Step 1: Convert all files to images and show previews
|
|
@@ -95,8 +97,9 @@ function App() {
|
|
| 95 |
const processData = resolutionMap[preview.key] || { dataUrl: preview.dataUrl, resolution: 100 };
|
| 96 |
const blob = dataUrlToBlob(processData.dataUrl);
|
| 97 |
const isEnhanced = enhancedMap[preview.key] || false;
|
|
|
|
| 98 |
|
| 99 |
-
const result = await processSingleInvoice(blob, preview.filename, isEnhanced);
|
| 100 |
|
| 101 |
const resultWithMetadata = {
|
| 102 |
...result,
|
|
@@ -143,8 +146,9 @@ function App() {
|
|
| 143 |
// Use resolution-adjusted image from ResultCard
|
| 144 |
const blob = dataUrlToBlob(adjustedDataUrl || imageDataMap[result.key]);
|
| 145 |
const isEnhanced = enhancedMap[result.key] || false;
|
|
|
|
| 146 |
|
| 147 |
-
const newResult = await processSingleInvoice(blob, result.filename, isEnhanced);
|
| 148 |
|
| 149 |
const resultWithMetadata = {
|
| 150 |
...newResult,
|
|
@@ -183,6 +187,13 @@ function App() {
|
|
| 183 |
}));
|
| 184 |
};
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
return (
|
| 187 |
<div className="min-h-screen py-8 px-4 sm:px-6 lg:px-8">
|
| 188 |
<div className="max-w-7xl mx-auto">
|
|
@@ -230,6 +241,8 @@ function App() {
|
|
| 230 |
{previewImages.map((preview, idx) => (
|
| 231 |
<ImagePreview
|
| 232 |
key={preview.key}
|
|
|
|
|
|
|
| 233 |
imageData={preview.dataUrl}
|
| 234 |
fileName={preview.filename}
|
| 235 |
onResolutionChange={(dataUrl, resolution) =>
|
|
|
|
| 18 |
const [resolutionMap, setResolutionMap] = useState({});
|
| 19 |
const [resultResolutionMap, setResultResolutionMap] = useState({});
|
| 20 |
const [enhancedMap, setEnhancedMap] = useState({}); // Track which images are enhanced
|
| 21 |
+
const [reasoningMap, setReasoningMap] = useState({}); // Track which images use reasoning mode
|
| 22 |
|
| 23 |
const handleFilesSelected = async (files) => {
|
| 24 |
setProcessing(false);
|
|
|
|
| 28 |
setPreviewImages([]);
|
| 29 |
setResolutionMap({});
|
| 30 |
setEnhancedMap({}); // Reset enhanced state
|
| 31 |
+
setReasoningMap({}); // Reset reasoning state
|
| 32 |
|
| 33 |
try {
|
| 34 |
// Step 1: Convert all files to images and show previews
|
|
|
|
| 97 |
const processData = resolutionMap[preview.key] || { dataUrl: preview.dataUrl, resolution: 100 };
|
| 98 |
const blob = dataUrlToBlob(processData.dataUrl);
|
| 99 |
const isEnhanced = enhancedMap[preview.key] || false;
|
| 100 |
+
const reasoningMode = reasoningMap[preview.key] ? "reason" : "simple";
|
| 101 |
|
| 102 |
+
const result = await processSingleInvoice(blob, preview.filename, isEnhanced, reasoningMode);
|
| 103 |
|
| 104 |
const resultWithMetadata = {
|
| 105 |
...result,
|
|
|
|
| 146 |
// Use resolution-adjusted image from ResultCard
|
| 147 |
const blob = dataUrlToBlob(adjustedDataUrl || imageDataMap[result.key]);
|
| 148 |
const isEnhanced = enhancedMap[result.key] || false;
|
| 149 |
+
const reasoningMode = reasoningMap[result.key] ? "reason" : "simple";
|
| 150 |
|
| 151 |
+
const newResult = await processSingleInvoice(blob, result.filename, isEnhanced, reasoningMode);
|
| 152 |
|
| 153 |
const resultWithMetadata = {
|
| 154 |
...newResult,
|
|
|
|
| 187 |
}));
|
| 188 |
};
|
| 189 |
|
| 190 |
+
// Flip the reasoning-mode flag for a single preview image, identified by its
// key, while leaving every other entry in the map untouched.
const handleReasoningModeToggle = (key) => {
  setReasoningMap((current) => {
    const next = { ...current };
    // A missing entry is undefined, so the first toggle turns it on.
    next[key] = !current[key];
    return next;
  });
};
|
| 196 |
+
|
| 197 |
return (
|
| 198 |
<div className="min-h-screen py-8 px-4 sm:px-6 lg:px-8">
|
| 199 |
<div className="max-w-7xl mx-auto">
|
|
|
|
| 241 |
{previewImages.map((preview, idx) => (
|
| 242 |
<ImagePreview
|
| 243 |
key={preview.key}
|
| 244 |
+
onReasoningModeToggle={() => handleReasoningModeToggle(preview.key)}
|
| 245 |
+
useReasoning={reasoningMap[preview.key] || false}
|
| 246 |
imageData={preview.dataUrl}
|
| 247 |
fileName={preview.filename}
|
| 248 |
onResolutionChange={(dataUrl, resolution) =>
|
frontend/src/components/ImagePreview.jsx
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import React, { useState, useEffect, useRef } from 'react';
|
| 2 |
-
import { SlidersHorizontal, Sparkles } from 'lucide-react';
|
| 3 |
|
| 4 |
-
const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle, isEnhanced }) => {
|
| 5 |
const [resolution, setResolution] = useState(100);
|
| 6 |
const canvasRef = useRef(null);
|
| 7 |
const [originalDimensions, setOriginalDimensions] = useState({ width: 0, height: 0 });
|
|
@@ -87,6 +87,25 @@ const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle
|
|
| 87 |
</div>
|
| 88 |
)}
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
<div className="space-y-2">
|
| 91 |
<div className="flex items-center justify-between">
|
| 92 |
<label className="text-sm font-medium text-gray-700 flex items-center gap-2">
|
|
|
|
| 1 |
import React, { useState, useEffect, useRef } from 'react';
|
| 2 |
+
import { SlidersHorizontal, Sparkles, Brain } from 'lucide-react';
|
| 3 |
|
| 4 |
+
const ImagePreview = ({ imageData, fileName, onResolutionChange, onEnhanceToggle, isEnhanced, onReasoningModeToggle, useReasoning }) => {
|
| 5 |
const [resolution, setResolution] = useState(100);
|
| 6 |
const canvasRef = useRef(null);
|
| 7 |
const [originalDimensions, setOriginalDimensions] = useState({ width: 0, height: 0 });
|
|
|
|
| 87 |
</div>
|
| 88 |
)}
|
| 89 |
|
| 90 |
+
{/* Reasoning Mode Toggle */}
|
| 91 |
+
<button
|
| 92 |
+
onClick={() => onReasoningModeToggle && onReasoningModeToggle()}
|
| 93 |
+
className={`w-full py-2 px-4 rounded-lg font-medium transition-all flex items-center justify-center gap-2 ${
|
| 94 |
+
useReasoning
|
| 95 |
+
? 'bg-blue-600 hover:bg-blue-700 text-white shadow-lg'
|
| 96 |
+
: 'bg-gradient-to-r from-blue-500 to-cyan-500 hover:from-blue-600 hover:to-cyan-600 text-white shadow-md'
|
| 97 |
+
}`}
|
| 98 |
+
>
|
| 99 |
+
<Brain className="w-4 h-4" />
|
| 100 |
+
{useReasoning ? 'Chain of Thought ✓' : 'Simple Mode'}
|
| 101 |
+
</button>
|
| 102 |
+
|
| 103 |
+
{useReasoning && (
|
| 104 |
+
<div className="bg-blue-50 border border-blue-200 rounded p-2 text-xs text-blue-700">
|
| 105 |
+
🧠 VLM will use 2-step reasoning: first analyze document structure, then extract fields
|
| 106 |
+
</div>
|
| 107 |
+
)}
|
| 108 |
+
|
| 109 |
<div className="space-y-2">
|
| 110 |
<div className="flex items-center justify-between">
|
| 111 |
<label className="text-sm font-medium text-gray-700 flex items-center gap-2">
|
frontend/src/utils/api.js
CHANGED
|
@@ -8,12 +8,14 @@ const API_BASE_URL = import.meta.env.VITE_API_URL || window.location.origin;
|
|
| 8 |
* @param {Blob} imageBlob - Image blob
|
| 9 |
* @param {string} filename - Original filename
|
| 10 |
* @param {boolean} enhanceImage - Whether to apply OpenCV enhancement
|
|
|
|
| 11 |
* @returns {Promise<Object>} Processed result
|
| 12 |
*/
|
| 13 |
-
export async function processSingleInvoice(imageBlob, filename, enhanceImage = false) {
|
| 14 |
const formData = new FormData();
|
| 15 |
formData.append('file', imageBlob, filename);
|
| 16 |
formData.append('enhance_image', enhanceImage);
|
|
|
|
| 17 |
|
| 18 |
const response = await axios.post(`${API_BASE_URL}/process-invoice`, formData, {
|
| 19 |
headers: {
|
|
|
|
| 8 |
* @param {Blob} imageBlob - Image blob
|
| 9 |
* @param {string} filename - Original filename
|
| 10 |
* @param {boolean} enhanceImage - Whether to apply OpenCV enhancement
|
| 11 |
+
* @param {string} reasoningMode - VLM reasoning mode: "simple" or "reason"
|
| 12 |
* @returns {Promise<Object>} Processed result
|
| 13 |
*/
|
| 14 |
+
export async function processSingleInvoice(imageBlob, filename, enhanceImage = false, reasoningMode = "simple") {
|
| 15 |
const formData = new FormData();
|
| 16 |
formData.append('file', imageBlob, filename);
|
| 17 |
formData.append('enhance_image', enhanceImage);
|
| 18 |
+
formData.append('reasoning_mode', reasoningMode);
|
| 19 |
|
| 20 |
const response = await axios.post(`${API_BASE_URL}/process-invoice`, formData, {
|
| 21 |
headers: {
|
inference.py
CHANGED
|
@@ -22,6 +22,7 @@ from config import (
|
|
| 22 |
from model_manager import model_manager
|
| 23 |
|
| 24 |
|
|
|
|
| 25 |
EXTRACTION_PROMPT = """
|
| 26 |
You are an expert at reading noisy, handwritten Indian invoices and quotations.
|
| 27 |
|
|
@@ -62,6 +63,161 @@ Output rules:
|
|
| 62 |
"""
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
class InferenceProcessor:
|
| 66 |
"""Handles VLM inference, validation, and result processing"""
|
| 67 |
|
|
@@ -184,6 +340,143 @@ class InferenceProcessor:
|
|
| 184 |
|
| 185 |
return output_text, latency
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
@staticmethod
|
| 188 |
def extract_json_from_output(text: str) -> Dict:
|
| 189 |
"""Extract JSON from model output"""
|
|
@@ -328,7 +621,7 @@ class InferenceProcessor:
|
|
| 328 |
return validated, field_confidence, warnings
|
| 329 |
|
| 330 |
@staticmethod
|
| 331 |
-
def process_invoice(image_path: str, doc_id: str = None, enhance_image: bool = False) -> Dict:
|
| 332 |
"""
|
| 333 |
Complete invoice processing pipeline
|
| 334 |
|
|
@@ -336,6 +629,7 @@ class InferenceProcessor:
|
|
| 336 |
image_path: Path to invoice image
|
| 337 |
doc_id: Document identifier (optional)
|
| 338 |
enhance_image: Whether to apply OpenCV enhancement (optional)
|
|
|
|
| 339 |
|
| 340 |
Returns:
|
| 341 |
dict: Complete JSON output with all fields
|
|
@@ -364,10 +658,28 @@ class InferenceProcessor:
|
|
| 364 |
signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
|
| 365 |
timing_breakdown['yolo_detection'] = round(time.time() - t2, 3)
|
| 366 |
|
| 367 |
-
# Step 3: VLM Extraction
|
| 368 |
t3 = time.time()
|
| 369 |
-
|
| 370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
| 372 |
# Clean up image
|
| 373 |
image.close()
|
|
|
|
| 22 |
from model_manager import model_manager
|
| 23 |
|
| 24 |
|
| 25 |
+
# Single-step extraction prompt (original "simple" mode)
|
| 26 |
EXTRACTION_PROMPT = """
|
| 27 |
You are an expert at reading noisy, handwritten Indian invoices and quotations.
|
| 28 |
|
|
|
|
| 63 |
"""
|
| 64 |
|
| 65 |
|
| 66 |
+
# Two-step Chain of Thought prompts (reasoning mode)
|
| 67 |
+
REASONING_PROMPT = """
|
| 68 |
+
You are an expert at analyzing noisy, handwritten Indian invoices and quotations for tractors.
|
| 69 |
+
|
| 70 |
+
Your task is to carefully observe and describe the document structure WITHOUT extracting yet.
|
| 71 |
+
|
| 72 |
+
Analyze this tractor invoice image and provide detailed observations about:
|
| 73 |
+
|
| 74 |
+
1. DEALER/COMPANY NAME
|
| 75 |
+
- Where is it located? (top header, letterhead, stamp, footer)
|
| 76 |
+
- What language is it written in?
|
| 77 |
+
- Is it printed or handwritten?
|
| 78 |
+
- Exact text you see (preserve original language)
|
| 79 |
+
|
| 80 |
+
2. MODEL INFORMATION
|
| 81 |
+
- Where is the model mentioned? (checkbox list, handwritten field, printed table, near "Model:" label)
|
| 82 |
+
- Are there multiple model options shown?
|
| 83 |
+
- If checkboxes exist, which one is marked? (look for ✓, ✗, [X], ●, ☑, filled boxes)
|
| 84 |
+
- Is the model name in English or regional language?
|
| 85 |
+
- Exact text you see for the selected/mentioned model
|
| 86 |
+
|
| 87 |
+
3. HORSE POWER (HP)
|
| 88 |
+
- Where is HP information located? (separate field, within model name, checkbox list, specifications table)
|
| 89 |
+
- Is HP explicitly written or implied from model code?
|
| 90 |
+
- If there's a checkbox list with HP options, which one is selected?
|
| 91 |
+
- Are there multiple HP values shown? Which one corresponds to the selected model?
|
| 92 |
+
- Exact HP text you see (e.g., "49 HP", "63hp", "HP-30")
|
| 93 |
+
|
| 94 |
+
4. TOTAL AMOUNT/ASSET COST
|
| 95 |
+
- Where is the final total located? (bottom of page, after tax section, grand total line)
|
| 96 |
+
- What label is used? (Total, Grand Total, Final Amount, कुल राशि, etc.)
|
| 97 |
+
- Are there multiple amount fields? Which is the final one after all taxes/charges?
|
| 98 |
+
- Exact amount you see with any currency symbols
|
| 99 |
+
|
| 100 |
+
5. CHECKBOX SELECTIONS (if applicable)
|
| 101 |
+
- Are there any checkbox lists on the page?
|
| 102 |
+
- What options are available in these lists?
|
| 103 |
+
- Which options are clearly marked/selected? (describe the selection mark)
|
| 104 |
+
- Which options are clearly unmarked/unselected?
|
| 105 |
+
|
| 106 |
+
6. AMBIGUITIES OR CHALLENGES
|
| 107 |
+
- Is any handwriting difficult to read?
|
| 108 |
+
- Are any fields unclear or could have multiple interpretations?
|
| 109 |
+
- Are there any conflicting pieces of information?
|
| 110 |
+
|
| 111 |
+
Return ONLY valid JSON in this exact format:
|
| 112 |
+
|
| 113 |
+
{
|
| 114 |
+
"dealer_location": string,
|
| 115 |
+
"dealer_text_observed": string,
|
| 116 |
+
"dealer_language": string,
|
| 117 |
+
"model_location": string,
|
| 118 |
+
"model_format": string,
|
| 119 |
+
"model_text_observed": string,
|
| 120 |
+
"model_is_checkbox": boolean,
|
| 121 |
+
"model_selected_option": string,
|
| 122 |
+
"hp_location": string,
|
| 123 |
+
"hp_format": string,
|
| 124 |
+
"hp_text_observed": string,
|
| 125 |
+
"hp_is_checkbox": boolean,
|
| 126 |
+
"hp_value_observed": string,
|
| 127 |
+
"amount_location": string,
|
| 128 |
+
"amount_label": string,
|
| 129 |
+
"amount_text_observed": string,
|
| 130 |
+
"checkboxes_present": boolean,
|
| 131 |
+
"checkbox_details": string,
|
| 132 |
+
"ambiguities": string,
|
| 133 |
+
"overall_document_quality": string
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
Guidelines:
|
| 137 |
+
- Be extremely specific about locations (e.g., "top-left header", "middle section below tractor image", "bottom-right in total box")
|
| 138 |
+
- Preserve original language text in observations
|
| 139 |
+
- Describe what you see, don't interpret or extract yet
|
| 140 |
+
- If something is unclear, describe why
|
| 141 |
+
- Focus on SELECTED/MARKED options when checkboxes are present
|
| 142 |
+
|
| 143 |
+
Output rules:
|
| 144 |
+
- Output ONLY valid JSON
|
| 145 |
+
- Do NOT include markdown, explanations, or extra text
|
| 146 |
+
"""
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
EXTRACTION_WITH_CONTEXT_PROMPT = """
|
| 150 |
+
You are an expert at extracting structured data from Indian invoices and quotations.
|
| 151 |
+
|
| 152 |
+
You have already analyzed this document. Here is your previous analysis:
|
| 153 |
+
|
| 154 |
+
CONTEXT FROM REASONING:
|
| 155 |
+
{reasoning_output}
|
| 156 |
+
|
| 157 |
+
Based on your previous analysis, now extract the exact field values.
|
| 158 |
+
|
| 159 |
+
Return ONLY valid JSON in this exact format:
|
| 160 |
+
|
| 161 |
+
{{
|
| 162 |
+
"dealer_name": string,
|
| 163 |
+
"model_name": string,
|
| 164 |
+
"horse_power": number,
|
| 165 |
+
"asset_cost": number
|
| 166 |
+
}}
|
| 167 |
+
|
| 168 |
+
Critical extraction rules:
|
| 169 |
+
|
| 170 |
+
1. DEALER NAME
|
| 171 |
+
- Copy EXACTLY as it appears in the original language and spelling
|
| 172 |
+
- Do NOT translate from Hindi/Marathi/Kannada to English
|
| 173 |
+
- Do NOT correct spelling or expand abbreviations
|
| 174 |
+
- Include any punctuation or special characters as shown
|
| 175 |
+
|
| 176 |
+
2. MODEL NAME
|
| 177 |
+
- Copy EXACTLY as it appears in the original language
|
| 178 |
+
- If from checkbox selection, extract ONLY the selected/marked option
|
| 179 |
+
- Do NOT translate or normalize
|
| 180 |
+
- Preserve numbers, hyphens, and spacing exactly
|
| 181 |
+
- Do NOT include HP value within model name
|
| 182 |
+
|
| 183 |
+
3. HORSE POWER
|
| 184 |
+
- Must be a number only (integer or decimal)
|
| 185 |
+
- Extract from explicit HP mentions only (never infer from model codes)
|
| 186 |
+
- If from checkbox, use only the selected option's HP value
|
| 187 |
+
- Remove text like "HP", "hp", "हॉर्स पावर" - keep only the number
|
| 188 |
+
- If HP appears as "49 HP" → extract: 49
|
| 189 |
+
- If HP appears as "63.5hp" → extract: 63.5
|
| 190 |
+
- If multiple HP values exist, use the one for the selected model
|
| 191 |
+
|
| 192 |
+
4. ASSET COST
|
| 193 |
+
- Must be a number only (integer or decimal)
|
| 194 |
+
- Use the FINAL total amount after all taxes and charges
|
| 195 |
+
- Remove currency symbols (₹, Rs, INR)
|
| 196 |
+
- Remove commas (e.g., "1,50,000" → 150000)
|
| 197 |
+
- If amount is "₹ 1,75,500.00" → extract: 175500
|
| 198 |
+
- Use the largest/final amount if multiple totals exist
|
| 199 |
+
|
| 200 |
+
Data validation:
|
| 201 |
+
- dealer_name: Must be non-empty string in original language
|
| 202 |
+
- model_name: Must be non-empty string in original language
|
| 203 |
+
- horse_power: Must be positive number (typically between 15-100 for tractors)
|
| 204 |
+
- asset_cost: Must be positive number (typically between 100000-3000000 for tractors)
|
| 205 |
+
|
| 206 |
+
Special handling based on your reasoning:
|
| 207 |
+
- If you noted checkboxes: Extract ONLY marked/selected options
|
| 208 |
+
- If you noted ambiguities: Make best judgment and use most likely value
|
| 209 |
+
- If you noted poor handwriting: Interpret characters as best as possible while preserving language
|
| 210 |
+
- If you noted multiple values: Use the one that matches the selected/final configuration
|
| 211 |
+
|
| 212 |
+
Output rules:
|
| 213 |
+
- Output ONLY valid JSON
|
| 214 |
+
- Do NOT include markdown code fences
|
| 215 |
+
- Do NOT include explanations or extra text
|
| 216 |
+
- Ensure all four fields are present
|
| 217 |
+
- Ensure numbers are actual numbers, not strings with currency/commas
|
| 218 |
+
"""
|
| 219 |
+
|
| 220 |
+
|
| 221 |
class InferenceProcessor:
|
| 222 |
"""Handles VLM inference, validation, and result processing"""
|
| 223 |
|
|
|
|
| 340 |
|
| 341 |
return output_text, latency
|
| 342 |
|
| 343 |
+
@staticmethod
def run_vlm_reasoning(image: Image.Image) -> Tuple[str, float]:
    """
    Run VLM model for Chain of Thought reasoning phase (step 1 of 2)
    Analyzes document structure and observes field locations

    Sends the image together with REASONING_PROMPT to the model as a
    single user message and decodes only the newly generated tokens.

    Args:
        image: Invoice image to analyze.

    Returns:
        Tuple of (decoded model output text, latency in seconds). The
        latency covers only the ``model.generate`` call, not input
        preparation or decoding.

    Raises:
        RuntimeError: If ``model_manager.is_loaded()`` is false.
    """
    if not model_manager.is_loaded():
        raise RuntimeError("Models not loaded")

    model = model_manager.vlm_model
    processor = model_manager.processor

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": REASONING_PROMPT},
            ],
        }
    ]

    # Apply chat template
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Process vision input
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Follow the model's own device instead of hard-coding "cuda": the
    # cleanup below already allows for hosts without CUDA, so the input
    # transfer must not crash there either.
    # NOTE(review): assumes the model exposes a ``device`` attribute, as
    # Hugging Face models do; otherwise fall back to CUDA availability.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = inputs.to(target_device)

    start = time.time()

    # Generate (allow more tokens for detailed reasoning)
    generated_ids = model.generate(**inputs, max_new_tokens=512)

    latency = time.time() - start

    # Decode output: drop the prompt tokens so only the reply is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # A single prompt was submitted, so take the first decoded entry
    output_text = output_text[0] if isinstance(output_text, list) else output_text

    # Clean up GPU memory
    del inputs, generated_ids, generated_ids_trimmed
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"🧠 Reasoning phase completed in {latency:.2f}s")
    return output_text, latency
|
| 409 |
+
|
| 410 |
+
@staticmethod
def run_vlm_extraction_with_context(image: Image.Image, reasoning_output: str) -> Tuple[str, float]:
    """
    Run VLM model for extraction phase (step 2 of 2) using reasoning context
    Extracts structured fields based on previous reasoning

    The reasoning text is substituted into EXTRACTION_WITH_CONTEXT_PROMPT
    and sent to the model together with the image as a single user
    message. Only the newly generated tokens are decoded.

    Args:
        image: Invoice image to extract fields from.
        reasoning_output: Text to insert into the prompt as context; the
            pipeline passes the output of the reasoning phase here.

    Returns:
        Tuple of (decoded model output text, latency in seconds). The
        latency covers only the ``model.generate`` call, not input
        preparation or decoding.

    Raises:
        RuntimeError: If ``model_manager.is_loaded()`` is false.
    """
    if not model_manager.is_loaded():
        raise RuntimeError("Models not loaded")

    model = model_manager.vlm_model
    processor = model_manager.processor

    # Format the extraction prompt with reasoning context
    extraction_prompt = EXTRACTION_WITH_CONTEXT_PROMPT.format(reasoning_output=reasoning_output)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": extraction_prompt},
            ],
        }
    ]

    # Apply chat template
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Process vision input
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Follow the model's own device instead of hard-coding "cuda": the
    # cleanup below already allows for hosts without CUDA, so the input
    # transfer must not crash there either.
    # NOTE(review): assumes the model exposes a ``device`` attribute, as
    # Hugging Face models do; otherwise fall back to CUDA availability.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = inputs.to(target_device)

    start = time.time()

    # Generate
    generated_ids = model.generate(**inputs, max_new_tokens=256)

    latency = time.time() - start

    # Decode output: drop the prompt tokens so only the reply is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # A single prompt was submitted, so take the first decoded entry
    output_text = output_text[0] if isinstance(output_text, list) else output_text

    # Clean up GPU memory
    del inputs, generated_ids, generated_ids_trimmed
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"📝 Extraction phase completed in {latency:.2f}s")
    return output_text, latency
|
| 479 |
+
|
| 480 |
@staticmethod
|
| 481 |
def extract_json_from_output(text: str) -> Dict:
|
| 482 |
"""Extract JSON from model output"""
|
|
|
|
| 621 |
return validated, field_confidence, warnings
|
| 622 |
|
| 623 |
@staticmethod
|
| 624 |
+
def process_invoice(image_path: str, doc_id: str = None, enhance_image: bool = False, reasoning_mode: str = "simple") -> Dict:
|
| 625 |
"""
|
| 626 |
Complete invoice processing pipeline
|
| 627 |
|
|
|
|
| 629 |
image_path: Path to invoice image
|
| 630 |
doc_id: Document identifier (optional)
|
| 631 |
enhance_image: Whether to apply OpenCV enhancement (optional)
|
| 632 |
+
reasoning_mode: "simple" for single-step extraction, "reason" for Chain of Thought (optional)
|
| 633 |
|
| 634 |
Returns:
|
| 635 |
dict: Complete JSON output with all fields
|
|
|
|
| 658 |
signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
|
| 659 |
timing_breakdown['yolo_detection'] = round(time.time() - t2, 3)
|
| 660 |
|
| 661 |
+
# Step 3: VLM Extraction (either simple or with Chain of Thought reasoning)
|
| 662 |
t3 = time.time()
|
| 663 |
+
if reasoning_mode == "reason":
|
| 664 |
+
# Two-step Chain of Thought approach
|
| 665 |
+
print("🧠 Using Chain of Thought reasoning mode (2-step)")
|
| 666 |
+
|
| 667 |
+
# Step 3a: Reasoning phase
|
| 668 |
+
reasoning_output, reasoning_latency = InferenceProcessor.run_vlm_reasoning(image)
|
| 669 |
+
timing_breakdown['vlm_reasoning'] = round(reasoning_latency, 3)
|
| 670 |
+
|
| 671 |
+
# Step 3b: Extraction phase with context
|
| 672 |
+
vlm_output, extraction_latency = InferenceProcessor.run_vlm_extraction_with_context(image, reasoning_output)
|
| 673 |
+
timing_breakdown['vlm_extraction'] = round(extraction_latency, 3)
|
| 674 |
+
timing_breakdown['vlm_inference_total'] = round(reasoning_latency + extraction_latency, 3)
|
| 675 |
+
|
| 676 |
+
# Store reasoning for debugging/transparency
|
| 677 |
+
timing_breakdown['reasoning_output'] = reasoning_output
|
| 678 |
+
else:
|
| 679 |
+
# Single-step simple extraction (original approach)
|
| 680 |
+
print("⚡ Using simple mode (1-step)")
|
| 681 |
+
vlm_output, vlm_latency = InferenceProcessor.run_vlm_extraction(image)
|
| 682 |
+
timing_breakdown['vlm_inference'] = round(vlm_latency, 3)
|
| 683 |
|
| 684 |
# Clean up image
|
| 685 |
image.close()
|