github-actions[bot] committed on
Commit
1e91d4e
Β·
1 Parent(s): 1b8dcf1

Sync from GitHub: 0326ea25edafa877b6e50d9380e8b84ad62476c1

Browse files
.gitattributes CHANGED
@@ -1,35 +1,6 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.pth filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
4
  *.h5 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
5
  *.onnx filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
6
  *.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
.github/workflows/push_to_huggingface.yml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Sync this repository to a Hugging Face Space on every push to main/master.
name: Push to Hugging Face Hub

on:
  push:
    branches:
      - main
      - master

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0   # full history for a faithful mirror
          lfs: true        # pull LFS objects so model weights sync too

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Push to Hugging Face Hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_SPACE_NAME: ${{ secrets.HF_SPACE_NAME }}
        run: |
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git config --global user.name "github-actions[bot]"

          # Install git-lfs
          git lfs install

          # Clone the HF space, or initialise a fresh repo if it does not
          # exist yet. NOTE: the token is embedded in the remote URL, so it
          # ends up in .git/config of the (ephemeral) runner workspace only.
          git clone https://user:$HF_TOKEN@huggingface.co/spaces/$HF_SPACE_NAME hf_space 2>/dev/null || {
            mkdir hf_space
            cd hf_space
            # -b main: make the branch name deterministic regardless of the
            # runner's init.defaultBranch (plain `git init` may create
            # `master`, which would break the push below).
            git init -b main
            git remote add origin https://user:$HF_TOKEN@huggingface.co/spaces/$HF_SPACE_NAME
            cd ..
          }

          cd hf_space

          # Configure git LFS tracking for model-weight formats
          git lfs install
          git lfs track "*.pt"
          git lfs track "*.pth"
          git lfs track "*.bin"
          git lfs track "*.h5"
          git lfs track "*.onnx"
          git lfs track "*.safetensors"

          # Copy files from the repository (excluding .git and hf_space)
          rsync -av --exclude='.git' --exclude='hf_space' ../ .

          # Commit only when something changed. On a freshly initialised repo
          # HEAD does not exist yet; diff-index then fails and the || branch
          # creates the initial commit, which is the intended behaviour.
          git add .
          git diff-index --quiet HEAD 2>/dev/null || git commit -m "Sync from GitHub: ${{ github.sha }}"

          # Push to the Space's `main` branch whatever the local branch is
          # called (HEAD:main is robust to the clone's default branch name).
          git push origin HEAD:main --force
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.so
7
+ *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ .env
12
+ .venv
13
+ venv/
14
+ ENV/
15
+ env/
16
+ .vscode/
17
+ .idea/
18
+ *.swp
19
+ *.swo
20
+ *~
21
+ .DS_Store
22
+ sample_output/
23
+ *.log
24
+ .pytest_cache/
25
+ .coverage
26
+ htmlcov/
27
+ .mypy_cache/
28
+ .ipynb_checkpoints/
29
+
30
+ *.md
31
+ !README_HF.md
32
+ !README.md
33
+ test*
34
+ executable.py
README.md CHANGED
@@ -48,7 +48,3 @@ print(response.json())
48
  curl -X POST "https://YOUR_USERNAME-invoice-extractor.hf.space/extract" \
49
  -F "file=@invoice.png"
50
  ```
51
-
52
- ## Hardware
53
-
54
- Requires GPU: T4 minimum (8GB VRAM recommended)
 
48
  curl -X POST "https://YOUR_USERNAME-invoice-extractor.hf.space/extract" \
49
  -F "file=@invoice.png"
50
  ```
 
 
 
 
app.py CHANGED
@@ -140,14 +140,18 @@ async def extract_invoice(
140
  )
141
 
142
  # Save uploaded file to temporary location
 
 
143
  temp_file = None
144
  try:
145
  # Create temporary file
 
146
  suffix = os.path.splitext(file.filename)[1]
147
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp:
148
  temp_file = temp.name
149
  # Write uploaded file content
150
  shutil.copyfileobj(file.file, temp)
 
151
 
152
  # Use filename as doc_id if not provided
153
  if doc_id is None:
@@ -156,6 +160,10 @@ async def extract_invoice(
156
  # Process invoice
157
  result = InferenceProcessor.process_invoice(temp_file, doc_id)
158
 
 
 
 
 
159
  return JSONResponse(content=result, media_type="application/json; charset=utf-8")
160
 
161
  except Exception as e:
 
140
  )
141
 
142
  # Save uploaded file to temporary location
143
+ import time
144
+ request_start = time.time()
145
  temp_file = None
146
  try:
147
  # Create temporary file
148
+ io_start = time.time()
149
  suffix = os.path.splitext(file.filename)[1]
150
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp:
151
  temp_file = temp.name
152
  # Write uploaded file content
153
  shutil.copyfileobj(file.file, temp)
154
+ io_time = round(time.time() - io_start, 3)
155
 
156
  # Use filename as doc_id if not provided
157
  if doc_id is None:
 
160
  # Process invoice
161
  result = InferenceProcessor.process_invoice(temp_file, doc_id)
162
 
163
+ # Add total request time (includes file I/O)
164
+ result['total_request_time_sec'] = round(time.time() - request_start, 2)
165
+ result['file_io_time_sec'] = io_time
166
+
167
  return JSONResponse(content=result, media_type="application/json; charset=utf-8")
168
 
169
  except Exception as e:
client_example.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example client script for Invoice Information Extractor API
3
+ Shows how to integrate the API into your application
4
+ """
5
+
6
+ import requests
7
+ from pathlib import Path
8
+ import json
9
+ from typing import List, Dict
10
+
11
+
12
+ class InvoiceExtractorClient:
13
+ """Client for Invoice Information Extractor API"""
14
+
15
+ def __init__(self, base_url: str = "http://localhost:7860"):
16
+ """
17
+ Initialize client
18
+
19
+ Args:
20
+ base_url: Base URL of the API (default: http://localhost:7860)
21
+ """
22
+ self.base_url = base_url.rstrip('/')
23
+ self.session = requests.Session()
24
+
25
+ def health_check(self) -> Dict:
26
+ """Check API health status"""
27
+ response = self.session.get(f"{self.base_url}/health")
28
+ response.raise_for_status()
29
+ return response.json()
30
+
31
+ def extract_invoice(self, image_path: str, doc_id: str = None) -> Dict:
32
+ """
33
+ Extract information from a single invoice
34
+
35
+ Args:
36
+ image_path: Path to invoice image
37
+ doc_id: Optional document identifier
38
+
39
+ Returns:
40
+ Extraction results as dictionary
41
+ """
42
+ with open(image_path, 'rb') as f:
43
+ files = {'file': f}
44
+ data = {'doc_id': doc_id} if doc_id else {}
45
+
46
+ response = self.session.post(
47
+ f"{self.base_url}/extract",
48
+ files=files,
49
+ data=data,
50
+ timeout=60
51
+ )
52
+ response.raise_for_status()
53
+ return response.json()
54
+
55
+ def extract_batch(self, image_paths: List[str]) -> List[Dict]:
56
+ """
57
+ Extract information from multiple invoices
58
+
59
+ Args:
60
+ image_paths: List of paths to invoice images
61
+
62
+ Returns:
63
+ List of extraction results
64
+ """
65
+ files = [('files', open(path, 'rb')) for path in image_paths]
66
+
67
+ try:
68
+ response = self.session.post(
69
+ f"{self.base_url}/extract_batch",
70
+ files=files,
71
+ timeout=120
72
+ )
73
+ response.raise_for_status()
74
+ return response.json()['results']
75
+ finally:
76
+ # Close all file handles
77
+ for _, file_handle in files:
78
+ file_handle.close()
79
+
80
+
81
+ # Example usage
82
+ if __name__ == "__main__":
83
+ # Initialize client
84
+ client = InvoiceExtractorClient("http://localhost:7860")
85
+
86
+ # Check health
87
+ print("Checking API health...")
88
+ health = client.health_check()
89
+ print(f"Status: {health['status']}")
90
+ print(f"Models loaded: {health['models_loaded']}\n")
91
+
92
+ # Example 1: Single invoice extraction
93
+ print("=" * 60)
94
+ print("Example 1: Single Invoice Extraction")
95
+ print("=" * 60)
96
+
97
+ # Replace with your invoice path
98
+ invoice_path = "sample_invoice.png"
99
+
100
+ if Path(invoice_path).exists():
101
+ try:
102
+ result = client.extract_invoice(invoice_path, doc_id="demo_001")
103
+
104
+ print(f"\nπŸ“„ Document ID: {result['doc_id']}")
105
+ print(f"βœ… Confidence: {result['confidence']}")
106
+ print(f"⏱️ Processing Time: {result['processing_time_sec']}s")
107
+ print(f"πŸ’° Cost Estimate: ${result['cost_estimate_usd']}")
108
+
109
+ print("\nπŸ“‹ Extracted Fields:")
110
+ fields = result['fields']
111
+ print(f" Dealer Name: {fields['dealer_name']}")
112
+ print(f" Model Name: {fields['model_name']}")
113
+ print(f" Horse Power: {fields['horse_power']} HP")
114
+ print(f" Asset Cost: β‚Ή{fields['asset_cost']:,}")
115
+ print(f" Signature: {'βœ“ Detected' if fields['signature']['present'] else 'βœ— Not found'}")
116
+ print(f" Stamp: {'βœ“ Detected' if fields['stamp']['present'] else 'βœ— Not found'}")
117
+
118
+ if result.get('warnings'):
119
+ print(f"\n⚠️ Warnings: {', '.join(result['warnings'])}")
120
+
121
+ except requests.exceptions.RequestException as e:
122
+ print(f"❌ Error: {e}")
123
+ else:
124
+ print(f"⚠️ Sample invoice not found at: {invoice_path}")
125
+ print(" Please provide a valid invoice image path.")
126
+
127
+ # Example 2: Batch processing
128
+ print("\n" + "=" * 60)
129
+ print("Example 2: Batch Invoice Processing")
130
+ print("=" * 60)
131
+
132
+ # Replace with your invoice paths
133
+ batch_paths = ["invoice_001.png", "invoice_002.png"]
134
+
135
+ existing_paths = [p for p in batch_paths if Path(p).exists()]
136
+
137
+ if existing_paths:
138
+ try:
139
+ results = client.extract_batch(existing_paths)
140
+
141
+ print(f"\nπŸ“¦ Processed {len(results)} invoices")
142
+
143
+ for i, result in enumerate(results, 1):
144
+ if 'error' in result:
145
+ print(f"\n {i}. ❌ {result.get('filename', 'Unknown')}: {result['error']}")
146
+ else:
147
+ print(f"\n {i}. βœ… {result['doc_id']}")
148
+ print(f" Confidence: {result['confidence']}")
149
+ print(f" Dealer: {result['fields']['dealer_name']}")
150
+ print(f" Cost: β‚Ή{result['fields']['asset_cost']:,}")
151
+
152
+ except requests.exceptions.RequestException as e:
153
+ print(f"❌ Error: {e}")
154
+ else:
155
+ print("⚠️ No valid invoice images found for batch processing")
156
+
157
+ # Example 3: Save results to JSON
158
+ print("\n" + "=" * 60)
159
+ print("Example 3: Save Results to JSON")
160
+ print("=" * 60)
161
+
162
+ if Path(invoice_path).exists():
163
+ try:
164
+ result = client.extract_invoice(invoice_path)
165
+
166
+ output_file = "extraction_result.json"
167
+ with open(output_file, 'w', encoding='utf-8') as f:
168
+ json.dump(result, f, indent=2, ensure_ascii=False)
169
+
170
+ print(f"\nβœ… Results saved to: {output_file}")
171
+
172
+ except Exception as e:
173
+ print(f"❌ Error: {e}")
174
+
175
+ print("\n" + "=" * 60)
176
+ print("Examples complete!")
177
+ print("=" * 60)
inference.py CHANGED
@@ -296,6 +296,7 @@ class InferenceProcessor:
296
  dict: Complete JSON output with all fields
297
  """
298
  total_start = time.time()
 
299
 
300
  # Generate doc_id if not provided
301
  if doc_id is None:
@@ -303,23 +304,33 @@ class InferenceProcessor:
303
  doc_id = os.path.splitext(os.path.basename(image_path))[0]
304
 
305
  # Step 1: Preprocess image
 
306
  image = InferenceProcessor.preprocess_image(image_path)
 
307
 
308
  # Step 2: YOLO Detection
 
309
  signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
 
310
 
311
  # Step 3: VLM Extraction
 
312
  vlm_output, vlm_latency = InferenceProcessor.run_vlm_extraction(image)
 
313
 
314
  # Clean up image
315
  image.close()
316
  del image
317
 
318
  # Step 4: Parse JSON
 
319
  raw_json = InferenceProcessor.extract_json_from_output(vlm_output)
 
320
 
321
  # Step 5: Validate and fix
 
322
  validated_fields, field_confidence, warnings = InferenceProcessor.validate_prediction(raw_json)
 
323
 
324
  # Add signature and stamp
325
  validated_fields["signature"] = signature_info
@@ -344,6 +355,7 @@ class InferenceProcessor:
344
  "fields": validated_fields,
345
  "confidence": overall_confidence,
346
  "processing_time_sec": round(total_time, 2),
 
347
  "cost_estimate_usd": round(cost_estimate, 6),
348
  "warnings": warnings if warnings else None
349
  }
 
296
  dict: Complete JSON output with all fields
297
  """
298
  total_start = time.time()
299
+ timing_breakdown = {}
300
 
301
  # Generate doc_id if not provided
302
  if doc_id is None:
 
304
  doc_id = os.path.splitext(os.path.basename(image_path))[0]
305
 
306
  # Step 1: Preprocess image
307
+ t1 = time.time()
308
  image = InferenceProcessor.preprocess_image(image_path)
309
+ timing_breakdown['image_preprocessing'] = round(time.time() - t1, 3)
310
 
311
  # Step 2: YOLO Detection
312
+ t2 = time.time()
313
  signature_info, stamp_info, signature_conf, stamp_conf = model_manager.detect_sign_stamp(image_path)
314
+ timing_breakdown['yolo_detection'] = round(time.time() - t2, 3)
315
 
316
  # Step 3: VLM Extraction
317
+ t3 = time.time()
318
  vlm_output, vlm_latency = InferenceProcessor.run_vlm_extraction(image)
319
+ timing_breakdown['vlm_inference'] = round(vlm_latency, 3)
320
 
321
  # Clean up image
322
  image.close()
323
  del image
324
 
325
  # Step 4: Parse JSON
326
+ t4 = time.time()
327
  raw_json = InferenceProcessor.extract_json_from_output(vlm_output)
328
+ timing_breakdown['json_parsing'] = round(time.time() - t4, 3)
329
 
330
  # Step 5: Validate and fix
331
+ t5 = time.time()
332
  validated_fields, field_confidence, warnings = InferenceProcessor.validate_prediction(raw_json)
333
+ timing_breakdown['validation'] = round(time.time() - t5, 3)
334
 
335
  # Add signature and stamp
336
  validated_fields["signature"] = signature_info
 
355
  "fields": validated_fields,
356
  "confidence": overall_confidence,
357
  "processing_time_sec": round(total_time, 2),
358
+ "timing_breakdown": timing_breakdown,
359
  "cost_estimate_usd": round(cost_estimate, 6),
360
  "warnings": warnings if warnings else None
361
  }
model_manager.py CHANGED
@@ -1,145 +1,203 @@
1
- """
2
- Model Manager - Handles loading and caching of YOLO and VLM models
3
- """
4
-
5
- import torch
6
- from transformers import (
7
- Qwen2_5_VLForConditionalGeneration,
8
- AutoProcessor,
9
- BitsAndBytesConfig
10
- )
11
- from ultralytics import YOLO
12
- import os
13
- from typing import Tuple
14
-
15
- from config import (
16
- YOLO_MODEL_PATH,
17
- VLM_MODEL_ID,
18
- QUANTIZATION_CONFIG,
19
- YOLO_CONFIDENCE_THRESHOLD
20
- )
21
-
22
-
23
- class ModelManager:
24
- """Singleton class to manage model loading and inference"""
25
-
26
- _instance = None
27
- _initialized = False
28
-
29
- def __new__(cls):
30
- if cls._instance is None:
31
- cls._instance = super(ModelManager, cls).__new__(cls)
32
- return cls._instance
33
-
34
- def __init__(self):
35
- if not ModelManager._initialized:
36
- self.yolo_model = None
37
- self.vlm_model = None
38
- self.processor = None
39
- ModelManager._initialized = True
40
-
41
- def load_models(self):
42
- """Load both YOLO and VLM models into memory"""
43
- print("πŸš€ Starting model loading...")
44
-
45
- # Load YOLO model
46
- self.yolo_model = self._load_yolo_model()
47
-
48
- # Load VLM model
49
- self.vlm_model, self.processor = self._load_vlm_model()
50
-
51
- print("βœ… All models loaded successfully!")
52
-
53
- def _load_yolo_model(self) -> YOLO:
54
- """Load trained YOLO model for signature and stamp detection"""
55
- if not os.path.exists(YOLO_MODEL_PATH):
56
- raise FileNotFoundError(
57
- f"YOLO model not found at {YOLO_MODEL_PATH}. "
58
- "Please ensure best.pt is in utils/models/"
59
- )
60
-
61
- yolo_model = YOLO(str(YOLO_MODEL_PATH))
62
- print(f"βœ… YOLO model loaded from {YOLO_MODEL_PATH}")
63
- return yolo_model
64
-
65
- def _load_vlm_model(self) -> Tuple:
66
- """
67
- Load Qwen2.5-VL model with 4-bit quantization
68
- Downloads from Hugging Face on first run
69
- """
70
- print(f"πŸ“₯ Loading VLM model: {VLM_MODEL_ID}")
71
- print(" (This will download ~4GB on first run)")
72
-
73
- # Configure 4-bit quantization
74
- bnb_config = BitsAndBytesConfig(
75
- load_in_4bit=QUANTIZATION_CONFIG["load_in_4bit"],
76
- bnb_4bit_quant_type=QUANTIZATION_CONFIG["bnb_4bit_quant_type"],
77
- bnb_4bit_compute_dtype=getattr(torch, QUANTIZATION_CONFIG["bnb_4bit_compute_dtype"]),
78
- bnb_4bit_use_double_quant=QUANTIZATION_CONFIG["bnb_4bit_use_double_quant"]
79
- )
80
-
81
- # Load processor
82
- processor = AutoProcessor.from_pretrained(
83
- VLM_MODEL_ID,
84
- trust_remote_code=True
85
- )
86
-
87
- # Load model with quantization
88
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
89
- VLM_MODEL_ID,
90
- quantization_config=bnb_config,
91
- device_map="auto",
92
- torch_dtype=torch.bfloat16,
93
- trust_remote_code=True
94
- )
95
-
96
- model.eval()
97
- print(f"βœ… Qwen2.5-VL model loaded successfully")
98
-
99
- return model, processor
100
-
101
- def detect_sign_stamp(self, image_path: str):
102
- """
103
- Detect signature and stamp in the image using YOLO
104
-
105
- Returns:
106
- tuple: (signature_info, stamp_info, signature_conf, stamp_conf)
107
- """
108
- if self.yolo_model is None:
109
- raise RuntimeError("YOLO model not loaded. Call load_models() first.")
110
-
111
- results = self.yolo_model(image_path, verbose=False)[0]
112
-
113
- signature_info = {"present": False, "bbox": None}
114
- stamp_info = {"present": False, "bbox": None}
115
- signature_conf = 0.0
116
- stamp_conf = 0.0
117
-
118
- if results.boxes is not None:
119
- for box in results.boxes:
120
- cls_id = int(box.cls[0])
121
- conf = float(box.conf[0])
122
-
123
- if conf > YOLO_CONFIDENCE_THRESHOLD:
124
- bbox = box.xyxy[0].cpu().numpy().tolist()
125
- bbox = [int(coord) for coord in bbox]
126
-
127
- # Class 0: signature, Class 1: stamp
128
- if cls_id == 0 and conf > signature_conf:
129
- signature_info = {"present": True, "bbox": bbox}
130
- signature_conf = conf
131
- elif cls_id == 1 and conf > stamp_conf:
132
- stamp_info = {"present": True, "bbox": bbox}
133
- stamp_conf = conf
134
-
135
- return signature_info, stamp_info, signature_conf, stamp_conf
136
-
137
- def is_loaded(self) -> bool:
138
- """Check if models are loaded"""
139
- return (self.yolo_model is not None and
140
- self.vlm_model is not None and
141
- self.processor is not None)
142
-
143
-
144
- # Global model manager instance
145
- model_manager = ModelManager()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model Manager - Handles loading and caching of YOLO and VLM models
3
+ """
4
+
5
+ import torch
6
+ from transformers import (
7
+ Qwen2_5_VLForConditionalGeneration,
8
+ AutoProcessor,
9
+ BitsAndBytesConfig
10
+ )
11
+ from ultralytics import YOLO
12
+ import os
13
+ from typing import Tuple
14
+
15
+ from config import (
16
+ YOLO_MODEL_PATH,
17
+ VLM_MODEL_ID,
18
+ QUANTIZATION_CONFIG,
19
+ YOLO_CONFIDENCE_THRESHOLD
20
+ )
21
+
22
+
23
+ class ModelManager:
24
+ """Singleton class to manage model loading and inference"""
25
+
26
+ _instance = None
27
+ _initialized = False
28
+
29
+ def __new__(cls):
30
+ if cls._instance is None:
31
+ cls._instance = super(ModelManager, cls).__new__(cls)
32
+ return cls._instance
33
+
34
+ def __init__(self):
35
+ if not ModelManager._initialized:
36
+ self.yolo_model = None
37
+ self.vlm_model = None
38
+ self.processor = None
39
+ ModelManager._initialized = True
40
+
41
+ def load_models(self):
42
+ """Load both YOLO and VLM models into memory"""
43
+ print("πŸš€ Starting model loading...")
44
+
45
+ # Load YOLO model
46
+ self.yolo_model = self._load_yolo_model()
47
+
48
+ # Load VLM model
49
+ self.vlm_model, self.processor = self._load_vlm_model()
50
+
51
+ # Warm up models to initialize CUDA context
52
+ self._warmup_models()
53
+
54
+ print("βœ… All models loaded successfully!")
55
+
56
+ def _load_yolo_model(self) -> YOLO:
57
+ """Load trained YOLO model for signature and stamp detection"""
58
+ if not os.path.exists(YOLO_MODEL_PATH):
59
+ raise FileNotFoundError(
60
+ f"YOLO model not found at {YOLO_MODEL_PATH}. "
61
+ "Please ensure best.pt is in utils/models/"
62
+ )
63
+
64
+ yolo_model = YOLO(str(YOLO_MODEL_PATH))
65
+ print(f"βœ… YOLO model loaded from {YOLO_MODEL_PATH}")
66
+ return yolo_model
67
+
68
+ def _load_vlm_model(self) -> Tuple:
69
+ """
70
+ Load Qwen2.5-VL model with 4-bit quantization
71
+ Downloads from Hugging Face on first run
72
+ """
73
+ print(f"πŸ“₯ Loading VLM model: {VLM_MODEL_ID}")
74
+ print(" (This will download ~4GB on first run)")
75
+
76
+ # Configure 4-bit quantization
77
+ bnb_config = BitsAndBytesConfig(
78
+ load_in_4bit=QUANTIZATION_CONFIG["load_in_4bit"],
79
+ bnb_4bit_quant_type=QUANTIZATION_CONFIG["bnb_4bit_quant_type"],
80
+ bnb_4bit_compute_dtype=getattr(torch, QUANTIZATION_CONFIG["bnb_4bit_compute_dtype"]),
81
+ bnb_4bit_use_double_quant=QUANTIZATION_CONFIG["bnb_4bit_use_double_quant"]
82
+ )
83
+
84
+ # Load processor
85
+ processor = AutoProcessor.from_pretrained(
86
+ VLM_MODEL_ID,
87
+ trust_remote_code=True
88
+ )
89
+
90
+ # Load model with quantization
91
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
92
+ VLM_MODEL_ID,
93
+ quantization_config=bnb_config,
94
+ device_map="auto",
95
+ torch_dtype=torch.bfloat16,
96
+ trust_remote_code=True
97
+ )
98
+
99
+ model.eval()
100
+ print(f"βœ… Qwen2.5-VL model loaded successfully")
101
+
102
+ return model, processor
103
+
104
+ def _warmup_models(self):
105
+ """Warm up models with a dummy inference to initialize CUDA context"""
106
+ print("πŸ”₯ Warming up models (initializing CUDA context)...")
107
+ import time
108
+ from PIL import Image
109
+ import numpy as np
110
+
111
+ warmup_start = time.time()
112
+
113
+ # Create a small dummy image
114
+ dummy_image = Image.fromarray(np.ones((100, 100, 3), dtype=np.uint8) * 255)
115
+
116
+ try:
117
+ # Warm up VLM
118
+ messages = [
119
+ {
120
+ "role": "user",
121
+ "content": [
122
+ {"type": "image", "image": dummy_image},
123
+ {"type": "text", "text": "warm up"}
124
+ ]
125
+ }
126
+ ]
127
+
128
+ from qwen_vl_utils import process_vision_info
129
+ text = self.processor.apply_chat_template(
130
+ messages,
131
+ tokenize=False,
132
+ add_generation_prompt=True
133
+ )
134
+ image_inputs, video_inputs = process_vision_info(messages)
135
+ inputs = self.processor(
136
+ text=[text],
137
+ images=image_inputs,
138
+ videos=video_inputs,
139
+ padding=True,
140
+ return_tensors="pt",
141
+ )
142
+ inputs = inputs.to("cuda")
143
+
144
+ # Run a quick inference
145
+ with torch.no_grad():
146
+ _ = self.vlm_model.generate(**inputs, max_new_tokens=5)
147
+
148
+ # Clean up
149
+ del inputs
150
+ if torch.cuda.is_available():
151
+ torch.cuda.empty_cache()
152
+
153
+ warmup_time = time.time() - warmup_start
154
+ print(f"βœ… Models warmed up in {warmup_time:.2f}s (CUDA context initialized)")
155
+
156
+ except Exception as e:
157
+ print(f"⚠️ Warmup failed (non-critical): {e}")
158
+
159
+ def detect_sign_stamp(self, image_path: str):
160
+ """
161
+ Detect signature and stamp in the image using YOLO
162
+
163
+ Returns:
164
+ tuple: (signature_info, stamp_info, signature_conf, stamp_conf)
165
+ """
166
+ if self.yolo_model is None:
167
+ raise RuntimeError("YOLO model not loaded. Call load_models() first.")
168
+
169
+ results = self.yolo_model(image_path, verbose=False)[0]
170
+
171
+ signature_info = {"present": False, "bbox": None}
172
+ stamp_info = {"present": False, "bbox": None}
173
+ signature_conf = 0.0
174
+ stamp_conf = 0.0
175
+
176
+ if results.boxes is not None:
177
+ for box in results.boxes:
178
+ cls_id = int(box.cls[0])
179
+ conf = float(box.conf[0])
180
+
181
+ if conf > YOLO_CONFIDENCE_THRESHOLD:
182
+ bbox = box.xyxy[0].cpu().numpy().tolist()
183
+ bbox = [int(coord) for coord in bbox]
184
+
185
+ # Class 0: signature, Class 1: stamp
186
+ if cls_id == 0 and conf > signature_conf:
187
+ signature_info = {"present": True, "bbox": bbox}
188
+ signature_conf = conf
189
+ elif cls_id == 1 and conf > stamp_conf:
190
+ stamp_info = {"present": True, "bbox": bbox}
191
+ stamp_conf = conf
192
+
193
+ return signature_info, stamp_info, signature_conf, stamp_conf
194
+
195
+ def is_loaded(self) -> bool:
196
+ """Check if models are loaded"""
197
+ return (self.yolo_model is not None and
198
+ self.vlm_model is not None and
199
+ self.processor is not None)
200
+
201
+
202
+ # Global model manager instance
203
+ model_manager = ModelManager()
requirements.txt CHANGED
@@ -1,12 +1,12 @@
1
- torch
2
- transformers
3
- ultralytics
4
- pillow
5
- accelerate
6
- bitsandbytes
7
- opencv-python
8
- pyyaml
9
- qwen-vl-utils[decord]
10
- fastapi
11
- uvicorn[standard]
12
  python-multipart
 
1
+ torch
2
+ transformers
3
+ ultralytics
4
+ pillow
5
+ accelerate
6
+ bitsandbytes
7
+ opencv-python
8
+ pyyaml
9
+ qwen-vl-utils[decord]
10
+ fastapi
11
+ uvicorn[standard]
12
  python-multipart