solomonodum committed on
Commit
bbc8b36
·
verified ·
1 Parent(s): 5e48f93

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. batch_api_2.py +381 -0
  2. nanonets_ocr_2.py +125 -0
  3. single_inferencing_2.py +310 -0
batch_api_2.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import base64
5
+ import asyncio
6
+ import concurrent.futures
7
+ from typing import Dict, Optional, List, Union
8
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from pydantic import BaseModel
11
+ import uvicorn
12
+ from PIL import Image
13
+ import io
14
+ from contextlib import asynccontextmanager
15
+ from prometheus_fastapi_instrumentator import Instrumentator
16
+
17
+ # Add the current directory to the path so we can import the llama_inferencing module
18
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
19
+
20
+ from single_inferencing_2 import SingleImageInference
21
+ from utils.prompt_utils import create_query, parse_label, create_query_updated
22
+ from utils.image_utils import encode_pil_image_to_base64
23
+
24
# --- GLOBAL VARS (Constants, not the inferencer itself) ---
# Directory where per-request inference logs are written.
LOG_DIR = os.getenv("LOG_DIR", "inference_logs")
# GPU index handed to the segmentation model inside SingleImageInference.
SEGMENTATION_DEVICE_ID = int(os.getenv("SEGMENTATION_DEVICE_ID", "7"))
# Env flag ("true"/"false", case-insensitive) intended to control bbox detection.
ENABLE_BBOX_DETECTION = os.getenv("ENABLE_BBOX_DETECTION", "False").lower() == "true"
# Set by the __main__ block (from --vllm-url) before uvicorn starts this app.
VLLM_SERVER_URL: Optional[str] = None
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "10"))  # Maximum batch size
MAX_CONCURRENT_WORKERS = int(os.getenv("MAX_CONCURRENT_WORKERS", "4"))  # Concurrent processing limit
31
+
32
# --- Lifespan Context Manager ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Handle startup and shutdown events for the FastAPI application.

    Startup: validates VLLM_SERVER_URL, then builds this worker's
    SingleImageInference instance and the thread pool used for batch
    fan-out. On any failure both resources are set to None so endpoints
    can report 500 instead of crashing the worker.
    Shutdown: drains the thread pool and closes the inferencer if it
    exposes a close() method.
    """
    global VLLM_SERVER_URL

    if VLLM_SERVER_URL is None:
        print("ERROR: VLLM_SERVER_URL was not set before lifespan start. Exiting.", flush=True)
        sys.exit(1)

    print(f"Lifespan: Initializing inferencer for this worker with VLLM URL: {VLLM_SERVER_URL}", flush=True)
    try:
        # FIX: honor the ENABLE_BBOX_DETECTION env flag. It was parsed at module
        # level but never used — this call hard-coded enable_bbox_detection=True.
        # Deployments that relied on the implicit True must now set
        # ENABLE_BBOX_DETECTION=true explicitly.
        app.state.inferencer = SingleImageInference(
            server_url=VLLM_SERVER_URL,
            log_dir=LOG_DIR,
            segmentation_device_id=SEGMENTATION_DEVICE_ID,
            enable_bbox_detection=ENABLE_BBOX_DETECTION
        )

        # Thread pool that process_batch_chunk() uses to run blocking
        # process_single_item() calls concurrently.
        app.state.thread_pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=MAX_CONCURRENT_WORKERS
        )

        print("Lifespan: Inferencer and thread pool successfully initialized.", flush=True)
    except Exception as e:
        print(f"Lifespan ERROR: Failed to initialize Inferencer: {e}", flush=True)
        app.state.inferencer = None
        app.state.thread_pool = None
    yield

    # Shutdown cleanup
    print("Lifespan: Application shutdown. Performing cleanup.", flush=True)
    if hasattr(app.state, 'thread_pool') and app.state.thread_pool:
        app.state.thread_pool.shutdown(wait=True)
    # hasattr(None, 'close') is False, so this is safe when init failed.
    if hasattr(app.state.inferencer, 'close'):
        app.state.inferencer.close()
72
+
73
# Initialize FastAPI app with lifespan
app = FastAPI(
    title="Llama Inferencing API with Batch Processing",
    description="API for running inference on images using Llama model - supports both single and batch processing",
    lifespan=lifespan
)

# Add CORS middleware.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# fully open — tighten before exposing this service outside a trusted network.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Instrument all routes and expose Prometheus metrics at /metrics.
Instrumentator().instrument(app).expose(app)
90
+
91
# --- BaseModel Definitions ---
class InferenceRequest(BaseModel):
    """Payload for /infer/ — a list of item dicts; only the first entry is used."""
    # Each item carries workorder/image metadata plus a base64-encoded "image".
    data: List[Dict[str, Union[str, float]]]
94
+
95
class BatchInferenceRequest(BaseModel):
    """Payload for /infer/batch/ — all entries are processed."""
    # Same item shape as InferenceRequest.data.
    data: List[Dict[str, Union[str, float]]]
    batch_size: Optional[int] = None  # Optional batch size override (defaults to MAX_BATCH_SIZE)
98
+
99
class InferenceResponse(BaseModel):
    """Envelope returned by /infer/."""
    body: Dict   # {"data": [formatted result]} — empty list on failure
    meta: Dict   # reserved; currently always {}
    error: str   # empty string on success, error message otherwise
103
+
104
class BatchInferenceResponse(BaseModel):
    """Envelope returned by /infer/batch/."""
    body: Dict        # {"data": [formatted results for successful items]}
    meta: Dict        # processing metadata
    error: str        # summary such as "N items failed", empty when all succeeded
    batch_info: Dict  # Additional batch processing info (counts, chunking)
109
+
110
def process_single_item(inferencer, item: Dict, temp_dir: str = "/tmp") -> Dict:
    """
    Process a single inference item - extracted for reuse in batch processing.

    Args:
        inferencer: Initialized SingleImageInference instance.
        item: Dict with required keys workorder_id, image_id, doc_type,
            business_type, workorder_type and a base64-encoded "image";
            optional keys task_name, format_name, temperature.
        temp_dir: Directory for the transient image file the query builder needs.

    Returns:
        {"success": bool, "result": formatted dict or None, "error": str or None}
    """
    temp_image_path = None
    try:
        # Extract fields from the item (a missing key is reported as a failure).
        workorder_id = item["workorder_id"]
        image_id = item["image_id"]
        doc_type = item["doc_type"]
        business_type = item["business_type"]
        workorder_type = item["workorder_type"]
        image_base64 = item["image"]

        # Decode the base64 image into a PIL image.
        image_content = base64.b64decode(image_base64)
        pil_image = Image.open(io.BytesIO(image_content))

        # Persist a temporary copy; create_query_updated works from a file path.
        temp_image_path = os.path.join(temp_dir, f"{image_id}_{workorder_id}.jpg")
        pil_image.save(temp_image_path)

        # Build the model query for this document type.
        query = create_query_updated(
            temp_image_path,
            doc_type.lower(),
            [item.get("task_name", "default")],
            [item.get("format_name", "reasoning_specrec")]
        )[0]

        query["image"] = pil_image
        query["doc_type"] = doc_type.upper()

        print(f"Processing WORKORDERID: {workorder_id}, DOCTYPE: {query['doc_type']}", flush=True)

        # Run inference using the initialized inferencer.
        inference_result = inferencer.run_inference(query, item.get("temperature", 0.1))

        # Parse the (possibly markdown-fenced) JSON response from the model.
        try:
            json_str = inference_result["response"].strip("`json\n")
            raw_response = json.loads(json_str)
        except Exception as e:
            print(f"Failed to parse model response: {e}. Raw response: {inference_result.get('response')}", flush=True)
            raw_response = {
                "reasoning": "Failed to parse model response",
                "evaluation_result": "UNKNOWN"
            }

        evaluation_result = raw_response.get("evaluation_result", "UNCERTAIN")

        # Normalize model_decision and map it onto a review-queue color.
        if evaluation_result == "VALID":
            model_decision = "VALID_INSTALL"
            review_queue = "GREEN"
        elif evaluation_result == "INVALID":
            model_decision = "INVALID_INSTALL"
            review_queue = "RED"
        else:
            model_decision = "UNCERTAIN"
            review_queue = "YELLOW"

        # Extract embedding from raw_response if available.
        embedding = raw_response.get("embedding")

        formatted_result = {
            "workorder_id": workorder_id,
            "image_id": image_id,
            "doc_type": doc_type,
            "business_type": business_type,
            "workorder_type": workorder_type,
            "confidence_threshold": 0,
            "model_output": {
                "model_decision_reason": raw_response.get("reasoning", ""),
                "model_decision": model_decision,
                "recommendation": raw_response.get("recommendations", ""),
                # "serial_id": raw_response.get("serial_id", ""),
                # NOTE(review): serial_id is a hard-coded placeholder — the
                # model-derived value (commented line above) is intentionally
                # bypassed; confirm before production use.
                "serial_id": "12345",
                "power_meter_reading": raw_response.get("power_meter_reading", ""),
                "review_queue": review_queue,
                "confidence_score": 0,
            }
        }

        # Add embedding to response if available.
        if embedding is not None:
            formatted_result["embedding"] = embedding

        return {"success": True, "result": formatted_result, "error": None}

    except Exception as e:
        print(f"Error processing item {item.get('workorder_id', 'unknown')}: {e}", flush=True)
        return {"success": False, "result": None, "error": str(e)}
    finally:
        # FIX: cleanup consolidated into finally so the temp file is removed on
        # every exit path (it was previously duplicated and could leak if the
        # success-path remove itself raised).
        if temp_image_path and os.path.exists(temp_image_path):
            try:
                os.remove(temp_image_path)
            except OSError:
                pass  # best-effort cleanup; never mask the real result
210
+
211
async def process_batch_chunk(inferencer, chunk: List[Dict], executor) -> List[Dict]:
    """
    Process a chunk of items concurrently using the given thread pool.

    Args:
        inferencer: Initialized SingleImageInference instance.
        chunk: Items (see process_single_item) to run in parallel.
        executor: concurrent.futures.Executor for the blocking calls.

    Returns:
        Per-item result dicts in the same order as `chunk`.
    """
    # FIX: use get_running_loop() — get_event_loop() is deprecated inside a
    # coroutine (Python 3.10+) and a loop is guaranteed to be running here.
    loop = asyncio.get_running_loop()
    futures = [
        loop.run_in_executor(executor, process_single_item, inferencer, item)
        for item in chunk
    ]
    return await asyncio.gather(*futures)
221
+
222
@app.post("/infer/", response_model=InferenceResponse)
async def run_inference(request: InferenceRequest):
    """
    Run inference on a single image and return the results.

    Only the first entry of request.data is processed; use /infer/batch/
    for multiple items.
    """
    if app.state.inferencer is None:
        raise HTTPException(status_code=500, detail="Inferencer not initialized or failed to load.")

    # FIX: explicit empty-payload guard instead of letting request.data[0]
    # raise IndexError and surface as an opaque "list index out of range".
    if not request.data:
        return {
            "body": {"data": []},
            "meta": {},
            "error": "Request contained no items."
        }

    try:
        item = request.data[0]
        result = process_single_item(app.state.inferencer, item)

        if result["success"]:
            return {
                "body": {"data": [result["result"]]},
                "meta": {},
                "error": ""
            }
        else:
            return {
                "body": {"data": []},
                "meta": {},
                "error": result["error"]
            }
    except Exception as e:
        print(f"API - Error during inference: {e}", flush=True)
        return {
            "body": {"data": []},
            "meta": {},
            "error": str(e)
        }
253
+
254
@app.post("/infer/batch/", response_model=BatchInferenceResponse)
async def run_batch_inference(request: BatchInferenceRequest):
    """
    Run inference on multiple images in batches with concurrent processing.

    Items are split into chunks of `batch_size` (default MAX_BATCH_SIZE).
    Chunks are processed sequentially; items within a chunk run concurrently
    on the shared thread pool. Per-item failures are counted, not fatal.
    """
    if app.state.inferencer is None:
        raise HTTPException(status_code=500, detail="Inferencer not initialized or failed to load.")

    if app.state.thread_pool is None:
        raise HTTPException(status_code=500, detail="Thread pool not initialized.")

    try:
        batch_size = request.batch_size or MAX_BATCH_SIZE
        data = request.data

        # Validate batch size (allow up to 5x max batch size).
        if len(data) > MAX_BATCH_SIZE * 5:
            raise HTTPException(
                status_code=400,
                detail=f"Batch too large. Maximum allowed: {MAX_BATCH_SIZE * 5}, received: {len(data)}"
            )

        print(f"Processing batch of {len(data)} items with batch_size={batch_size}", flush=True)

        # Split data into chunks.
        chunks = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]

        all_results = []
        successful_count = 0
        failed_count = 0

        # Process chunks sequentially to avoid overwhelming the system.
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i + 1}/{len(chunks)} with {len(chunk)} items", flush=True)

            chunk_results = await process_batch_chunk(
                app.state.inferencer,
                chunk,
                app.state.thread_pool
            )

            # Collect results and count successes/failures.
            for result in chunk_results:
                if result["success"]:
                    all_results.append(result["result"])
                    successful_count += 1
                else:
                    failed_count += 1
                    print(f"Failed to process item: {result['error']}", flush=True)

        batch_info = {
            "total_items": len(data),
            "successful_items": successful_count,
            "failed_items": failed_count,
            "batch_size_used": batch_size,
            "total_chunks": len(chunks)
        }

        return {
            "body": {"data": all_results},
            "meta": {"processing_time": "completed"},
            "error": f"{failed_count} items failed" if failed_count > 0 else "",
            "batch_info": batch_info
        }

    except HTTPException:
        # FIX: re-raise HTTP errors. The 400 raised above for oversized batches
        # was previously swallowed by the generic handler below and returned as
        # a 200 response whose "error" field held the exception text.
        raise
    except Exception as e:
        print(f"API - Error during batch inference: {e}", flush=True)
        return {
            "body": {"data": []},
            "meta": {},
            "error": str(e),
            "batch_info": {"total_items": len(request.data), "successful_items": 0, "failed_items": len(request.data)}
        }
327
+
328
@app.get("/health")
async def health_check():
    """
    Health check endpoint.

    Responds 503 while either worker resource (inferencer / thread pool)
    is unavailable; otherwise reports the batch-processing limits.
    """
    inferencer_ready = app.state.inferencer is not None
    if not inferencer_ready:
        raise HTTPException(status_code=503, detail="Inferencer not initialized or failed to load")

    pool_ready = app.state.thread_pool is not None
    if not pool_ready:
        raise HTTPException(status_code=503, detail="Thread pool not initialized")

    return {
        "status": "healthy",
        "max_batch_size": MAX_BATCH_SIZE,
        "max_concurrent_workers": MAX_CONCURRENT_WORKERS,
    }
344
+
345
@app.get("/")
async def root():
    """Root endpoint for basic health check and endpoint discovery."""
    endpoint_map = {
        "single_inference": "/infer/",
        "batch_inference": "/infer/batch/",
        "health": "/health",
    }
    return {
        "status": "API is running",
        "service": "Llama Inferencing API with Batch Processing",
        "endpoints": endpoint_map,
    }
359
+
360
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=8877, help="API port")
    parser.add_argument("--vllm-url", type=str, default="http://localhost:8000/v1", help="VLLM server URL")
    parser.add_argument("--max-batch-size", type=int, default=10, help="Maximum batch size")
    parser.add_argument("--max-workers", type=int, default=4, help="Maximum concurrent workers")

    args = parser.parse_args()

    # Store configuration globally. This works because uvicorn.run() below
    # serves the app in *this* process, so lifespan() reads the updated
    # globals. NOTE(review): if the app is ever launched via the uvicorn CLI
    # (separate import), these assignments never run and VLLM_SERVER_URL
    # stays None — lifespan() will exit the worker.
    VLLM_SERVER_URL = args.vllm_url
    MAX_BATCH_SIZE = args.max_batch_size
    MAX_CONCURRENT_WORKERS = args.max_workers

    print(f"Starting API server on port {args.port}", flush=True)
    print(f"VLLM URL: {args.vllm_url}", flush=True)
    print(f"Max batch size: {MAX_BATCH_SIZE}", flush=True)
    print(f"Max concurrent workers: {MAX_CONCURRENT_WORKERS}", flush=True)

    uvicorn.run(app, host="0.0.0.0", port=args.port, reload=False)
nanonets_ocr_2.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from transformers import AutoProcessor, AutoModelForVision2Seq, pipeline
4
+ from PIL import Image
5
+ import threading
6
+ from typing import List, Dict, Tuple
7
+
8
# Global pipeline instance (lazy singleton shared by all callers).
_pipeline_instance = None
# Resolved device string ("cuda:<id>" or "cpu"); set during initialization.
_device = None
# Guards one-time pipeline construction in initialize_nanonets_model().
_lock = threading.Lock()
12
+
13
def download_and_save_model(model_name="nanonets/Nanonets-OCR-s", local_dir="/app/models/nanonets-ocr"):
    """Download the OCR processor and model, persisting both under local_dir."""
    os.makedirs(local_dir, exist_ok=True)
    print(f"Downloading model to: {local_dir}")

    # Fetch both artifacts with trust_remote_code, then write them side by side.
    artifacts = (
        AutoProcessor.from_pretrained(model_name, trust_remote_code=True),
        AutoModelForVision2Seq.from_pretrained(model_name, trust_remote_code=True),
    )
    for artifact in artifacts:
        artifact.save_pretrained(local_dir)

    print(f"Model saved to: {local_dir}")
    return local_dir
28
+
29
def load_model(model_path, device_id=0):
    """
    Load the OCR pipeline from a local path, downloading the model first
    if no valid checkpoint is present.

    Args:
        model_path: Local directory expected to contain config.json.
        device_id: Device index forwarded to the transformers pipeline.

    Returns:
        An "image-text-to-text" transformers pipeline.
    """
    # A usable local checkpoint is identified by config.json at its root.
    if os.path.exists(model_path) and os.path.exists(os.path.join(model_path, "config.json")):
        print(f"Loading model from: {model_path} on device {device_id}")
    else:
        print("Local model not found, downloading...")
        download_and_save_model(local_dir=model_path)

    # FIX: single construction point — the identical pipeline(...) call was
    # previously duplicated in both branches.
    return pipeline("image-text-to-text", model=model_path, device=device_id, trust_remote_code=True)
39
+
40
def initialize_nanonets_model(device_id=0):
    """
    Thread-safe lazy initialization of the shared nanonets OCR pipeline.

    The first caller builds the pipeline under the module lock; subsequent
    callers get the cached instance.
    """
    global _pipeline_instance, _device

    with _lock:
        if _pipeline_instance is not None:
            return _pipeline_instance

        checkpoint_dir = "/app/models/nanonets-ocr"
        _device = "cpu" if not torch.cuda.is_available() else f"cuda:{device_id}"

        print(f"Loading Nanonets OCR model on {_device} (device_id: {device_id})")
        _pipeline_instance = load_model(checkpoint_dir, device_id)
        print("Nanonets OCR model initialized successfully")

        return _pipeline_instance
56
+
57
def extract_single_serial_number(pil_image: Image.Image) -> str:
    """
    Run the OCR pipeline on one image and return the serial-number string.

    Raises:
        RuntimeError: if initialize_nanonets_model() has not been called.
    """
    global _pipeline_instance

    if _pipeline_instance is None:
        raise RuntimeError("Model not initialized. Call initialize_nanonets_model() first.")

    prompt = "Identify the serial number that starts with IN. Strictly return ONLY the alphanumeric serial number string and nothing else."
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    try:
        pipeline_output = _pipeline_instance(chat)
        # The pipeline echoes the chat transcript; the final message holds
        # the model's reply.
        reply = pipeline_output[0]['generated_text'][-1]['content']
        return reply.strip() if reply else ""
    except Exception as e:
        print(f"Error extracting serial number: {e}")
        return ""
85
+
86
def extract_serial_numbers_batch(images_and_indices: List[Tuple[Image.Image, int]]) -> Dict[int, str]:
    """
    Extract serial numbers from multiple images in batch.

    Returns a dict mapping each original index to its serial number,
    with "" for images whose extraction failed.
    """
    if not images_and_indices:
        return {}

    print(f"Processing batch of {len(images_and_indices)} S2P_MFIELD images for serial extraction")

    extracted: Dict[int, str] = {}
    for image, original_index in images_and_indices:
        try:
            extracted[original_index] = extract_single_serial_number(image)
        except Exception as e:
            print(f"Error processing image at index {original_index}: {e}")
            extracted[original_index] = ""

    return extracted
108
+
109
def extract_serial_number(pil_image: Image.Image) -> str:
    """
    Single-image extraction entry point, kept for backward compatibility.

    Lazily initializes the shared pipeline on first use, then delegates
    to extract_single_serial_number().
    """
    global _pipeline_instance

    if _pipeline_instance is None:
        # Mirrors the old code path: build the model on demand.
        initialize_nanonets_model()

    return extract_single_serial_number(pil_image)
121
+
122
def cleanup_gpu_cache():
    """Return cached GPU memory to the allocator when CUDA is present; no-op otherwise."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
single_inferencing_2.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ import time
6
+ from typing import Dict, List, Optional, Tuple, Union
7
+ import yaml
8
+ sys.path.append(os.getcwd())
9
+ import re
10
+ import pandas as pd
11
+ import base64
12
+
13
+ from openai import OpenAI
14
+ from PIL import Image
15
+ from sklearn.metrics import classification_report
16
+ from tqdm import tqdm
17
+ from utils.image_utils_yolo import YOLOProcessor
18
+ from utils.image_utils_bbox import updated_add_bbox
19
+ from utils.image_utils import add_bbox, encode_pil_image_to_base64
20
+ from utils.prompt_utils import create_query, parse_label
21
+ from embedding_service import embedding_service
22
+
23
+
24
def optimize_image_for_tokens(image, max_size=768):
    """
    Simple image optimization to reduce token consumption.

    Downscales the image so its longest side is at most max_size, keeping
    the aspect ratio; images already small enough are returned unchanged.

    Args:
        image: PIL Image object
        max_size: Maximum dimension (768 = good balance of quality vs tokens)

    Returns:
        Optimized PIL Image (the original object when no resize is needed).
    """
    original_size = image.size

    # Only resize if image is larger than max_size.
    if max(image.size) > max_size:
        # Scale both sides by the same ratio to preserve aspect ratio.
        ratio = max_size / max(image.size)
        new_size = (
            int(image.size[0] * ratio),
            int(image.size[1] * ratio)
        )

        # Resize with high-quality resampling.
        image = image.resize(new_size, Image.Resampling.LANCZOS)

        # FIX: report the actual reduction. Token cost scales with pixel area,
        # so reduction = 1 - (new/old)^2. The previous formula, (old/new)^2*100,
        # printed impossible values (e.g. "400% token reduction" for a 2x
        # downscale).
        reduction_pct = int((1 - (max_size / max(original_size)) ** 2) * 100)
        print(f"Image optimized: {original_size} → {new_size} (estimated {reduction_pct}% token reduction)", flush=True)

    return image
52
+
53
+
54
class SingleImageInference:
    """
    Single-image inference against a vLLM OpenAI-compatible server.

    Responsibilities: decoding/optimizing the input image, optional YOLO
    bounding-box overlay, CLIP embedding extraction, prompt dispatch to the
    chat-completions endpoint, and response post-processing (question
    mapping and reasoning formatting).
    """

    def __init__(
        self,
        server_url: str,
        segmentation_device_id: Optional[int] = None,
        log_dir: str = "inference_logs",
        enable_bbox_detection: bool = True,
    ):
        self.log_dir = log_dir
        self.enable_bbox_detection = enable_bbox_detection
        self.segmentation_device_id = segmentation_device_id

        # initialise client (vLLM ignores the API key, any value works)
        self.client = OpenAI(base_url=server_url, api_key="EMPTY")

        # Ensure log directory exists
        os.makedirs(log_dir, exist_ok=True)

        # Cache the available models to validate model existence
        self.available_models = self._get_available_models()

        # Load question mappings
        self.question_mappings = self._load_question_mappings()

        # Load document type groups
        self.doctype_groups = self._load_doctype_groups()

        # Load document - bbox class mapping
        self.doctype_detection_mapping = self._load_detection_class_mapping()

        self.yolo_processor = YOLOProcessor(device="cuda:0")

        # Initialize CLIP model for embeddings (best-effort; inference still
        # works without embeddings).
        try:
            embedding_service.load_model()
            print("✅ CLIP model loaded successfully for embeddings", flush=True)
        except Exception as e:
            print(f"⚠️ Warning: Failed to load CLIP model: {e}", flush=True)

    def _get_available_models(self) -> List[str]:
        """Fetch model ids from the server, retrying twice with linear backoff."""
        for attempt in range(3):
            try:
                response = self.client.models.list()
                return [model.id for model in response.data]
            except Exception as e:
                if attempt < 2:  # Don't wait after the last attempt
                    wait_time = (attempt + 1) * 2  # 2s, then 4s
                    print(f"Attempt {attempt + 1} failed. Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"Warning: Could not fetch available models: {str(e)}")
                    return []

    def _load_question_mappings(self) -> Dict[str, Dict[str, str]]:
        """Load question mappings from YAML file; {} on any failure."""
        try:
            mapping_file = "/app/meta-jv-reasoning/Teleco-Pilot-Use-Case/framework/utils/question_mappings.yaml"
            with open(mapping_file, 'r') as f:
                return yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading question mappings: {str(e)}")
            return {}

    def _load_doctype_groups(self) -> Dict[str, List[str]]:
        """Load document type groups from YAML file; {} on any failure."""
        try:
            groups_file = "/app/meta-jv-reasoning/Teleco-Pilot-Use-Case/framework/utils/doctype_groups.yaml"
            with open(groups_file, 'r') as f:
                return yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading document type groups: {str(e)}")
            return {}

    def _get_parent_doctype(self, doc_type: str) -> str:
        """Return the parent document type, or the input itself if unmapped."""
        # Ensure doc_type is uppercase for consistency
        doc_type = doc_type.upper()

        # Search through the groups to find the parent
        for parent_type, child_types in self.doctype_groups.items():
            if doc_type in child_types:
                return parent_type

        # If no parent found, return the original type
        return doc_type

    def _load_detection_class_mapping(self) -> Dict[str, List[List[str]]]:
        """Load and cache detection class mapping from YAML file; {} on failure."""
        try:
            mapping_file = "/app/meta-jv-reasoning/Teleco-Pilot-Use-Case/framework/utils/doctype_bbox_detection_mapping.yaml"
            with open(mapping_file, "r") as f:
                return yaml.safe_load(f)
        except Exception as e:
            print(f"Error loading detection class mapping: {str(e)}")
            return {}

    def _get_detection_classes_for_doctype(self, parent_doctype: str) -> Optional[List[str]]:
        """Return the flattened list of detection classes for a parent doctype, or None."""
        parent_doctype = parent_doctype.upper()
        class_groups = self.doctype_detection_mapping.get(parent_doctype)

        if not class_groups:
            return None

        # Flatten [[a, b], [c]] -> [a, b, c]
        return [cls for group in class_groups for cls in group]

    def _map_question(self, question: str, doc_type: str = "default") -> str:
        """Map complex questions to simpler versions based on document type."""
        # Ensure doc_type is uppercase for consistency
        doc_type = doc_type.upper()

        # Map the document type to its parent type
        parent_doc_type = self._get_parent_doctype(doc_type)

        # Get mappings for the specific document type, fallback to default if not found
        mappings = self.question_mappings.get(parent_doc_type, self.question_mappings.get("default", {}))
        print(f"Mapping question for doc_type: {doc_type} (parent: {parent_doc_type})")

        # Clean up the question by removing any numbering prefix
        clean_question = re.sub(r'^\d+\.\s*', '', question)

        # Try to find an exact match in the mappings
        for complex_q, simple_q in mappings.items():
            # Remove numbering from complex question for comparison
            clean_complex_q = re.sub(r'^\d+\.\s*', '', complex_q)
            if clean_question.startswith(clean_complex_q):
                return simple_q

        # If no match found, return the original question
        return question

    def _extract_reasoning(self, raw_response: Dict, doc_type: str = "default") -> str:
        """Extract and format reasoning from raw response into a single string."""
        # Map the document type to its parent type
        parent_doc_type = self._get_parent_doctype(doc_type)

        reasoning = raw_response.get("reasoning", [])
        if isinstance(reasoning, list):
            formatted_reasoning = []
            for i, item in enumerate(reasoning, 1):
                if ": " in item:
                    question, answer = item.split(": ", 1)
                    mapped_question = self._map_question(question, parent_doc_type)
                    formatted_reasoning.append(f"{i}. {mapped_question}: {answer}")
                else:
                    formatted_reasoning.append(f"{i}. {item}")
            return "\n".join(formatted_reasoning)
        return str(reasoning)

    def run_inference(self, query, temperature: float) -> Dict[str, str]:
        """
        Run inference on a single image using the vLLM server.

        Args:
            query: Dict with "image" (PIL Image or base64 string), "doc_type",
                "task_instruction" and "format_instruction".
            temperature: Sampling temperature for the completion.

        Returns:
            {"response": JSON string, "label": classification, "success": bool}

        Raises:
            ValueError: if the image payload cannot be decoded.
        """
        doc_type = query.get("doc_type", "default").upper()
        parent_doc_type = self._get_parent_doctype(doc_type)
        print(f"<-- parent_doc_type : {parent_doc_type} -->")

        try:
            if isinstance(query["image"], Image.Image):
                image = query["image"].convert("RGB")
            else:
                image_data = base64.b64decode(query["image"])
                # FIX: BytesIO was referenced here without ever being imported
                # in this module, so the base64 path raised NameError; import
                # it locally.
                from io import BytesIO
                image = Image.open(BytesIO(image_data)).convert("RGB")
        except Exception as e:
            print(f"Error decoding base64 image: {str(e)}")
            raise ValueError("Invalid image format in query")

        # Extract embedding from original image before any bbox processing
        try:
            embedding = embedding_service.extract_embedding(image)
            print("✅ Embedding extracted successfully", flush=True)
        except Exception as e:
            print(f"⚠️ Warning: Failed to extract embedding: {e}", flush=True)
            embedding = None

        image = optimize_image_for_tokens(image, max_size=768)
        # Optional bbox detection overlay (best-effort).
        if self.enable_bbox_detection:
            try:
                detection_classes = self._get_detection_classes_for_doctype(parent_doc_type)
                print(f"<< DETECTION CLASS : {detection_classes} >>", flush=True)

                if detection_classes:
                    image_with_boxes = self.yolo_processor.process_bbox(
                        image, desired_classes=detection_classes
                    )
                    if image_with_boxes is not None:
                        image = image_with_boxes
                else:
                    print(f"No detection class mapping found for parent doctype: {parent_doc_type}", flush=True)
            except Exception as e:
                print(f"Error applying bounding boxes: {str(e)}", flush=True)

        # Build the multimodal chat request.
        image_b64 = encode_pil_image_to_base64(image)
        instruction = f"{query['task_instruction']}\n\n{query['format_instruction']}"

        # Refresh the cached model list if empty (server may have come up late).
        if not self.available_models:
            self.available_models = self._get_available_models()
            if not self.available_models:
                raise ValueError(
                    "No models available on the server. Please ensure the vLLM server is running and accessible."
                )
        model_id = self.available_models[0]

        response = self.client.chat.completions.create(
            model=model_id,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": instruction},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
                        },
                    ],
                }
            ],
            max_tokens=512,
            temperature=temperature,
            top_p=0.95,
        )

        raw_response = response.choices[0].message.content

        try:
            classification_label = parse_label(raw_response)
            success = True
        except AttributeError:
            classification_label = "UNKNOWN"
            success = False

        # Strip a possible ```json fence, re-format reasoning, attach embedding.
        try:
            json_str = raw_response.strip("`json\n")
            raw_response_dict = json.loads(json_str)
            formatted_reasoning = self._extract_reasoning(raw_response_dict, doc_type)
            raw_response_dict["reasoning"] = formatted_reasoning

            # Add embedding to response if available
            if embedding is not None:
                raw_response_dict["embedding"] = embedding

            raw_response = json.dumps(raw_response_dict)
        except Exception as e:
            print(f"Error formatting reasoning: {str(e)}")
            response_dict = {
                "reasoning": str(raw_response),
                "evaluation_result": classification_label
            }
            if embedding is not None:
                response_dict["embedding"] = embedding
            raw_response = json.dumps(response_dict)

        return {
            "response": raw_response,
            "label": classification_label,
            "success": success,
        }