Soumik Bose committed on
Commit
80e7d10
·
1 Parent(s): 8f0d05b
Dockerfile CHANGED
@@ -4,37 +4,55 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
4
  PYTHONUNBUFFERED=1 \
5
  PORT=7860 \
6
  HF_HOME=/app/cache \
 
7
  PATH="/home/user/.local/bin:${PATH}"
8
 
9
  WORKDIR /app
10
 
11
- # Install build dependencies
12
  RUN apt-get update && apt-get install -y \
13
  build-essential \
14
  cmake \
15
  curl \
16
  git \
 
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
- # Create user
20
- RUN useradd -m -u 1000 user
21
- RUN mkdir -p /app/cache /app/models && chown -R user:user /app
 
22
 
23
- # Install pip as root
24
  RUN pip install --no-cache-dir --upgrade pip
25
 
 
26
  USER user
27
 
28
- # Build and install llama-cpp-python with proper flags
29
- RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_NATIVE=OFF" \
30
- pip install --no-cache-dir llama-cpp-python==0.3.2
31
 
32
- # Install other dependencies
33
  COPY --chown=user:user requirements.txt .
34
- RUN pip install --no-cache-dir -r requirements.txt
35
 
36
- # Copy app
 
37
  COPY --chown=user:user main.py .
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  EXPOSE 7860
40
- CMD ["bash", "-c", "while true; do curl -s https://xce009-ai-chat-api.hf.space/ping > /dev/null || true; sleep 300; done & python -m uvicorn main:app --host 0.0.0.0 --port 7860"]
 
 
 
4
  PYTHONUNBUFFERED=1 \
5
  PORT=7860 \
6
  HF_HOME=/app/cache \
7
+ CPU_THREADS=2 \
8
  PATH="/home/user/.local/bin:${PATH}"
9
 
10
  WORKDIR /app
11
 
12
+ # Install system dependencies
13
  RUN apt-get update && apt-get install -y \
14
  build-essential \
15
  cmake \
16
  curl \
17
  git \
18
+ libgomp1 \
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
+ # Create non-root user
22
+ RUN useradd -m -u 1000 user && \
23
+ mkdir -p /app/cache /app/models && \
24
+ chown -R user:user /app
25
 
26
+ # Upgrade pip as root
27
  RUN pip install --no-cache-dir --upgrade pip
28
 
29
+ # Switch to non-root user
30
  USER user
31
 
32
+ # Install llama-cpp-python with optimized build flags
33
+ RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_NATIVE=OFF -DGGML_AVX2=ON" \
34
+ pip install --no-cache-dir --user llama-cpp-python==0.3.2
35
 
36
+ # Copy requirements and install dependencies
37
  COPY --chown=user:user requirements.txt .
38
+ RUN pip install --no-cache-dir --user -r requirements.txt
39
 
40
+ # Copy application structure
41
+ COPY --chown=user:user config.py .
42
  COPY --chown=user:user main.py .
43
+ COPY --chown=user:user models/ ./models/
44
+ COPY --chown=user:user services/ ./services/
45
+ COPY --chown=user:user routers/ ./routers/
46
+ COPY --chown=user:user utils/ ./utils/
47
+
48
+ # Create __init__.py files if they don't exist
49
+ RUN touch models/__init__.py services/__init__.py routers/__init__.py utils/__init__.py
50
+
51
+ # Health check
52
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
53
+ CMD curl -f http://localhost:7860/ping || exit 1
54
 
55
  EXPOSE 7860
56
+
57
+ # Production startup with keep-alive and graceful shutdown
58
+ CMD ["bash", "-c", "while true; do curl -s https://xce009-ai-chat-api.hf.space/ping > /dev/null 2>&1 || true; sleep 300; done & exec python -m uvicorn main:app --host 0.0.0.0 --port 7860 --log-level info"]
config.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+
4
class Config:
    """Centralized configuration for the SmolLM API.

    All values are class attributes; the module-level ``config`` instance is
    provided for convenient importing. Server, cache, and CPU settings can be
    overridden through environment variables.
    """

    # Server Configuration
    PORT: int = int(os.getenv("PORT", "7860"))
    HOST: str = "0.0.0.0"

    # Cache Configuration (where hf_hub_download stores model files)
    HF_HOME: str = os.getenv("HF_HOME", "/app/cache")

    # CPU Configuration (llama-cpp inference thread count)
    N_THREADS: int = int(os.getenv("CPU_THREADS", "2"))

    # Text Model Configuration
    TEXT_MODEL_REPO: str = "bartowski/SmolLM2-1.7B-Instruct-GGUF"
    TEXT_MODEL_FILE: str = "SmolLM2-1.7B-Instruct-Q4_K_M.gguf"
    TEXT_MODEL_CTX: int = 2048
    TEXT_MODEL_BATCH: int = 512

    # Vision Model Configuration
    VISION_MODEL_REPO: str = "ggml-org/SmolVLM-500M-Instruct-GGUF"
    VISION_MODEL_FILE: str = "smolvlm-500m-instruct-q8_0.gguf"
    VISION_MMPROJ_FILE: str = "mmproj-smolvlm-500m-instruct-f16.gguf"
    VISION_MODEL_CTX: int = 2048
    VISION_MODEL_BATCH: int = 512

    # Default Generation Parameters
    DEFAULT_TEMPERATURE: float = 0.6
    DEFAULT_MAX_TOKENS: int = 512

    # File Upload Configuration
    MAX_FILE_SIZE: int = 10 * 1024 * 1024  # 10MB
    # frozenset instead of a mutable class-level set: shared state cannot be
    # mutated accidentally; membership tests and ', '.join() work unchanged.
    ALLOWED_IMAGE_EXTENSIONS: frozenset = frozenset(
        {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
    )

config = Config()
main.py CHANGED
@@ -1,280 +1,119 @@
1
- import os
2
  import logging
3
- import json
4
  from contextlib import asynccontextmanager
5
- from typing import List, Optional, Any
 
 
 
6
 
7
- from fastapi import FastAPI, HTTPException
8
- from fastapi.responses import JSONResponse, StreamingResponse
9
- from pydantic import BaseModel
10
- from llama_cpp import Llama
11
- from huggingface_hub import hf_hub_download
12
 
13
- # --- 1. Logging Setup ---
14
  logging.basicConfig(
15
  level=logging.INFO,
16
  format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
17
  )
18
- logger = logging.getLogger("SmolLM-API")
19
 
20
- # --- 2. Helper Functions (Previously in json_service.py) ---
21
-
22
- def find_balanced_closing_index(text: str, start_index: int) -> int:
23
- """
24
- Finds the matching closing bracket for the bracket at start_index.
25
- Ignores brackets inside strings and comments.
26
- """
27
- start_char = text[start_index]
28
- end_char = '}' if start_char == '{' else ']'
29
 
30
- depth = 0
31
- in_double_quote = False
32
- in_single_quote = False
33
- in_backtick = False
34
- in_line_comment = False
35
- in_block_comment = False
36
- is_escaped = False
37
-
38
- length = len(text)
39
- i = start_index
40
-
41
- while i < length:
42
- char = text[i]
43
- next_char = text[i+1] if i + 1 < length else ''
44
 
45
- # Handle Escaping
46
- if is_escaped:
47
- is_escaped = False
48
- i += 1
49
- continue
50
- if char == '\\' and not in_line_comment and not in_block_comment:
51
- is_escaped = True
52
- i += 1
53
- continue
54
-
55
- # Handle Comments
56
- if in_line_comment:
57
- if char == '\n': in_line_comment = False
58
- i += 1
59
- continue
60
- if in_block_comment:
61
- if char == '*' and next_char == '/':
62
- in_block_comment = False
63
- i += 2
64
- continue
65
- i += 1
66
- continue
67
-
68
- # Check comment starts
69
- if not in_double_quote and not in_single_quote and not in_backtick:
70
- if char == '/' and next_char == '/':
71
- in_line_comment = True
72
- i += 2
73
- continue
74
- if char == '/' and next_char == '*':
75
- in_block_comment = True
76
- i += 2
77
- continue
78
-
79
- # Handle Strings
80
- if in_double_quote:
81
- if char == '"': in_double_quote = False
82
- i += 1
83
- continue
84
- if in_single_quote:
85
- if char == "'": in_single_quote = False
86
- i += 1
87
- continue
88
- if in_backtick:
89
- if char == '`': in_backtick = False
90
- i += 1
91
- continue
92
-
93
- if char == '"':
94
- in_double_quote = True
95
- i += 1
96
- continue
97
- if char == "'":
98
- in_single_quote = True
99
- i += 1
100
- continue
101
- if char == '`':
102
- in_backtick = True
103
- i += 1
104
- continue
105
-
106
- # Handle Bracket Counting
107
- if char == start_char:
108
- depth += 1
109
- elif char == end_char:
110
- depth -= 1
111
- if depth == 0:
112
- return i # Found matching close
113
 
114
- i += 1
115
-
116
- return -1
117
-
118
- def extract_json_from_content(content: str) -> List[Any]:
119
- """
120
- Scans text for JSON objects/arrays using state machine logic.
121
- """
122
- if not content or not isinstance(content, str):
123
- return []
124
-
125
- found_blocks = []
126
- cursor = 0
127
- length = len(content)
128
-
129
- while cursor < length:
130
- if content[cursor] not in ['{', '[']:
131
- cursor += 1
132
- continue
133
-
134
- end_index = find_balanced_closing_index(content, cursor)
135
-
136
- if end_index != -1:
137
- raw_candidate = content[cursor : end_index + 1]
138
- try:
139
- parsed = json.loads(raw_candidate)
140
- found_blocks.append(parsed)
141
- cursor = end_index + 1
142
- continue
143
- except json.JSONDecodeError:
144
- pass
145
 
146
- cursor += 1
147
-
148
- return found_blocks
149
-
150
- # --- 3. Model Configuration ---
151
- REPO_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
152
- FILENAME = "smollm2-1.7b-instruct-q4_k_m.gguf"
153
- N_THREADS = int(os.getenv("CPU_THREADS", "2"))
154
-
155
- llm_model: Optional[Llama] = None
156
-
157
- # --- 4. Lifecycle Manager ---
158
- @asynccontextmanager
159
- async def lifespan(app: FastAPI):
160
- global llm_model
161
- logger.info("--- STARTING SMOLLM2 API ---")
162
- try:
163
- logger.info(f"Downloading {FILENAME}...")
164
- model_path = hf_hub_download(
165
- repo_id=REPO_ID,
166
- filename=FILENAME,
167
- cache_dir=os.getenv("HF_HOME", "/app/cache")
168
- )
169
- logger.info(f"Initializing Engine (Threads: {N_THREADS})...")
170
- llm_model = Llama(
171
- model_path=model_path,
172
- n_ctx=2048,
173
- n_threads=N_THREADS,
174
- n_batch=512,
175
- verbose=False
176
- )
177
- logger.info("SmolLM2 Loaded.")
178
  except Exception as e:
179
- logger.critical(f"Startup Failed: {e}")
180
- raise e
 
181
  yield
182
- if llm_model:
183
- del llm_model
184
- logger.info("Model unloaded.")
185
-
186
- app = FastAPI(title="SmolLM2 API", version="2.1", lifespan=lifespan)
187
-
188
- # --- 5. Data Models ---
189
- class Message(BaseModel):
190
- role: str
191
- content: str
 
 
 
 
192
 
193
- class ChatRequest(BaseModel):
194
- messages: List[Message]
195
- temperature: Optional[float] = 0.6
196
- max_tokens: Optional[int] = 512
197
- stream: Optional[bool] = False
198
- returnJson: Optional[bool] = False
 
 
199
 
200
- # --- 6. Endpoints ---
 
 
201
 
202
  @app.get("/")
203
  async def root():
204
- return {"message": "Welcome to the SmolLM2 API! Use /v1/chat/completions to interact."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  @app.get("/ping")
207
  async def ping():
208
- if llm_model: return {"status": "pong", "ready": True}
209
- return JSONResponse(status_code=503, content={"status": "loading"})
210
-
211
- @app.post("/v1/chat/completions")
212
- async def chat(request: ChatRequest):
213
- if not llm_model:
214
- raise HTTPException(status_code=503, detail="Model loading...")
215
-
216
- # --- VALIDATION: Check for conflicting parameters ---
217
- if request.stream and request.returnJson:
218
- raise HTTPException(
219
- status_code=400,
220
- detail="Conflict: 'stream' and 'returnJson' cannot both be True. Streaming prevents JSON extraction."
221
- )
222
-
223
- # Prepare messages
224
- messages_payload = [m.model_dump() for m in request.messages]
225
 
226
- # --- LOGIC FOR returnJson ---
227
- if request.returnJson:
228
- logger.info("Format Mode: JSON Extraction Active")
229
-
230
- system_prompt = {
231
- "role": "system",
232
- "content": (
233
- "You are a strict JSON generator. "
234
- "Convert the user's input into a valid JSON Array of Objects. "
235
- "Output strictly in markdown code blocks like ```json ... ```. "
236
- "Do not add conversational filler."
237
- )
238
- }
239
- messages_payload.insert(0, system_prompt)
240
-
241
- if messages_payload and messages_payload[-1]['role'] == 'user':
242
- messages_payload[-1]['content'] += "\n\nReturn structured JSON of this content..."
243
-
244
- logger.info(f"Processing request: {len(messages_payload)} msgs | Stream: {request.stream}")
245
-
246
- try:
247
- # Generate Response
248
- response_data = llm_model.create_chat_completion(
249
- messages=messages_payload,
250
- temperature=request.temperature,
251
- max_tokens=request.max_tokens,
252
- stream=request.stream
253
  )
254
-
255
- # --- STREAMING RESPONSE LOGIC ---
256
- if request.stream:
257
- def iter_response():
258
- for chunk in response_data:
259
- yield f"data: {json.dumps(chunk)}\n\n"
260
- yield "data: [DONE]\n\n"
261
-
262
- return StreamingResponse(iter_response(), media_type="text/event-stream")
263
-
264
- # --- STANDARD / JSON RESPONSE LOGIC ---
265
-
266
- if not request.returnJson:
267
- return response_data
268
-
269
- # Custom JSON Extraction Logic
270
- content_text = response_data['choices'][0]['message']['content']
271
- extracted_data = extract_json_from_content(content_text)
272
-
273
- return JSONResponse(content={
274
- "status": "success",
275
- "data": extracted_data
276
- })
277
-
278
- except Exception as e:
279
- logger.error(f"Error: {e}")
280
- raise HTTPException(status_code=500, detail=str(e))
 
 
1
  import logging
 
2
  from contextlib import asynccontextmanager
3
+ from datetime import datetime
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from fastapi.responses import JSONResponse
7
 
8
+ from config import config
9
+ from services.text_service import text_service
10
+ from services.vision_service import vision_service
11
+ from routers import text_router, vision_router
 
12
 
13
+ # Logging Setup
14
  logging.basicConfig(
15
  level=logging.INFO,
16
  format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
17
  )
18
+ logger = logging.getLogger("main")
19
 
20
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifecycle manager.

    Loads both model services before the server starts accepting requests
    and unloads them on shutdown. If either service fails to initialize,
    the exception is re-raised and the application does not start.
    """
    logger.info("=" * 60)
    logger.info("STARTING SMOLLM2 MULTIMODAL API")
    logger.info("=" * 60)

    try:
        # Initialize text service (downloads/loads the GGUF text model).
        logger.info("Initializing Text Service...")
        await text_service.initialize()

        # Initialize vision service (model weights + multimodal projector).
        logger.info("Initializing Vision Service...")
        await vision_service.initialize()

        logger.info("=" * 60)
        logger.info("✓ All services initialized successfully")
        logger.info("=" * 60)
    except Exception as e:
        # Fail fast: a half-initialized API should not serve traffic.
        logger.critical(f"Startup failed: {e}")
        raise

    yield  # --- application serves requests here ---

    # Cleanup: release model memory on shutdown.
    logger.info("Shutting down services...")
    await text_service.cleanup()
    await vision_service.cleanup()
    logger.info("Shutdown complete")
51
+
52
# Create the FastAPI application; `lifespan` handles model load/unload.
app = FastAPI(
    title="SmolLM2 Multimodal API",
    version="3.0",
    description="Production-ready API for SmolLM2 text and vision models",
    lifespan=lifespan
)

# CORS: fully open (any origin/method/header).
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True for credentialed requests — confirm credentials
# are actually needed, otherwise drop allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount feature routers (/v1/text/* and /v1/vision/*).
app.include_router(text_router.router)
app.include_router(vision_router.router)
72
 
73
@app.get("/")
async def root():
    """Root endpoint: returns the API name, version, and key endpoint paths."""
    endpoint_map = {
        "text": "/v1/text/chat/completions",
        "vision": "/v1/vision/analyze",
        "health": "/health",
    }
    return {
        "name": "SmolLM2 Multimodal API",
        "version": "3.0",
        "endpoints": endpoint_map,
        "docs": "/docs",
    }
86
+
87
@app.get("/health")
async def health_check():
    """Comprehensive health check.

    Reports per-service readiness plus a timezone-aware UTC timestamp.
    Always returns 200; use /ping for a probe that returns 503 until ready.
    """
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # datetime; emit an explicit UTC-aware ISO timestamp instead.
    from datetime import timezone

    return {
        "status": "healthy",
        "services": {
            "text": text_service.is_ready(),
            "vision": vision_service.is_ready()
        },
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
98
 
99
@app.get("/ping")
async def ping():
    """Liveness probe: 200 "pong" once both models are loaded, 503 before."""
    if text_service.is_ready() and vision_service.is_ready():
        return {"status": "pong", "ready": True}

    return JSONResponse(
        status_code=503,
        content={"status": "initializing", "ready": False},
    )
111
+
112
# Allow running directly (`python main.py`); in the container the server is
# launched via `python -m uvicorn main:app ...` instead, so this path is for
# local development only.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "main:app",
        host=config.HOST,
        port=config.PORT,
        log_level="info"
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/schemas.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Any
2
+ from pydantic import BaseModel, Field
3
+
4
class Message(BaseModel):
    """A single chat message in the OpenAI-style role/content format."""
    role: str = Field(..., description="Role of the message sender (user/assistant/system)")
    content: str = Field(..., description="Content of the message")

class ChatRequest(BaseModel):
    """Request body for the text chat-completions endpoint."""
    messages: List[Message] = Field(..., description="List of messages in the conversation")
    temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
    max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
    stream: Optional[bool] = Field(False, description="Enable streaming response")
    # NOTE(review): camelCase field kept as-is — it is part of the public API.
    returnJson: Optional[bool] = Field(False, description="Extract and return JSON from response")

class VisionRequest(BaseModel):
    """Parameters for an image-analysis request (image itself is uploaded as a file)."""
    prompt: str = Field(..., description="Text prompt/question about the image")
    temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
    max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")

class ErrorResponse(BaseModel):
    """Generic error payload: short error name plus optional detail."""
    error: str
    detail: Optional[str] = None

class HealthResponse(BaseModel):
    """Health-check payload: overall status plus per-model readiness flags."""
    status: str
    text_model: bool
    vision_model: bool
    timestamp: str
requirements.txt CHANGED
@@ -1,4 +1,7 @@
1
  fastapi>=0.115.0
2
  uvicorn>=0.30.0
3
  pydantic>=2.8.0
4
- huggingface-hub>=0.24.0
 
 
 
 
1
  fastapi>=0.115.0
2
  uvicorn>=0.30.0
3
  pydantic>=2.8.0
4
+ huggingface-hub>=0.24.0
5
+ llama-cpp-python==0.3.2
6
+ python-multipart>=0.0.9
7
+ Pillow>=10.0.0
routers/text_router.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from fastapi.responses import StreamingResponse, JSONResponse
3
+ import logging
4
+
5
+ from models.schemas import ChatRequest, ErrorResponse
6
+ from services.text_service import text_service
7
+
8
+ logger = logging.getLogger("text-router")
9
+
10
+ router = APIRouter(prefix="/v1/text", tags=["Text Generation"])
11
+
12
@router.post("/chat/completions")
async def create_chat_completion(request: ChatRequest):
    """
    Create a chat completion using the text model.

    Supports:
    - Standard completions
    - Streaming responses (server-sent events)
    - JSON extraction mode (``returnJson``)

    Raises:
        HTTPException 503: text model not loaded yet
        HTTPException 400: invalid parameter combination (service ValueError)
        HTTPException 500: unexpected generation failure
    """
    if not text_service.is_ready():
        raise HTTPException(status_code=503, detail="Text model not ready")

    try:
        # Pydantic models -> plain dicts expected by the service layer.
        messages = [msg.model_dump() for msg in request.messages]

        result = await text_service.generate_completion(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream,
            return_json=request.returnJson
        )

        # Streaming mode: result is an async iterator of SSE "data:" lines.
        if request.stream:
            return StreamingResponse(result, media_type="text/event-stream")

        return JSONResponse(content=result)

    except ValueError as e:
        # Service-level validation errors map to 400 Bad Request.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Chat completion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
46
+
47
@router.get("/health")
async def text_health():
    """Report whether the text model has finished loading."""
    ready = text_service.is_ready()
    return {
        "status": "healthy" if ready else "initializing",
        "model_ready": ready,
    }
routers/vision_router.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, File, UploadFile, Form
2
+ from fastapi.responses import JSONResponse
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from models.schemas import VisionRequest, ErrorResponse
7
+ from services.vision_service import vision_service
8
+ from config import config
9
+
10
+ logger = logging.getLogger("vision-router")
11
+
12
+ router = APIRouter(prefix="/v1/vision", tags=["Vision AI"])
13
+
14
@router.post("/analyze")
async def analyze_image(
    image: UploadFile = File(..., description="Image file to analyze"),
    prompt: str = Form(..., description="Question or prompt about the image"),
    temperature: float = Form(0.6, ge=0.0, le=2.0),
    max_tokens: int = Form(512, ge=1, le=4096)
):
    """
    Analyze an image with a text prompt.

    Accepts:
    - Image file (JPEG, PNG, GIF, WebP, BMP)
    - Text prompt/question
    - Optional generation parameters

    Raises:
        HTTPException 503: vision model not loaded yet
        HTTPException 400: missing/invalid file type or file too large
        HTTPException 500: unexpected analysis failure
    """
    if not vision_service.is_ready():
        raise HTTPException(status_code=503, detail="Vision model not ready")

    # Validate file extension. UploadFile.filename may be None on some
    # multipart uploads; treat a missing name as an invalid type instead of
    # crashing with TypeError inside Path(None).
    file_ext = Path(image.filename or "").suffix.lower()
    if file_ext not in config.ALLOWED_IMAGE_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid file type. Allowed: {', '.join(config.ALLOWED_IMAGE_EXTENSIONS)}"
        )

    try:
        # Read the whole upload into memory (bounded by the size check below).
        image_data = await image.read()

        # Enforce the upload size limit.
        if len(image_data) > config.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Max size: {config.MAX_FILE_SIZE / 1024 / 1024}MB"
            )

        # Delegate the actual inference to the vision service.
        result = await vision_service.analyze_image(
            image_data=image_data,
            prompt=prompt,
            temperature=temperature,
            max_tokens=max_tokens
        )

        return JSONResponse(content=result)

    except HTTPException:
        # Re-raise our own 4xx responses untouched instead of wrapping as 500.
        raise
    except Exception as e:
        logger.error(f"Image analysis error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
66
+
67
@router.get("/health")
async def vision_health():
    """Report whether the vision model has finished loading."""
    ready = vision_service.is_ready()
    return {
        "status": "healthy" if ready else "initializing",
        "model_ready": ready,
    }
services/text_service.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, Dict, Any, List, AsyncIterator
3
+ from llama_cpp import Llama
4
+ from huggingface_hub import hf_hub_download
5
+ import json
6
+
7
+ from config import config
8
+ from utils.json_extractor import extract_json_from_content
9
+
10
+ logger = logging.getLogger("text-service")
11
+
12
class TextService:
    """Service wrapping the GGUF text model (llama-cpp) behind an async API.

    The model is loaded lazily via :meth:`initialize`; all entry points check
    :meth:`is_ready` first and raise rather than serving with no model.
    """

    def __init__(self):
        # Populated by initialize(); None means "not ready".
        self.model: Optional[Llama] = None

    async def initialize(self) -> None:
        """Download (if needed) and load the text model.

        Raises:
            Exception: whatever hf_hub_download / Llama raise on failure
                (logged, then re-raised so startup aborts).
        """
        try:
            logger.info(f"Downloading text model: {config.TEXT_MODEL_FILE}...")
            model_path = hf_hub_download(
                repo_id=config.TEXT_MODEL_REPO,
                filename=config.TEXT_MODEL_FILE,
                cache_dir=config.HF_HOME
            )

            logger.info(f"Loading text model (Threads: {config.N_THREADS})...")
            self.model = Llama(
                model_path=model_path,
                n_ctx=config.TEXT_MODEL_CTX,
                n_threads=config.N_THREADS,
                n_batch=config.TEXT_MODEL_BATCH,
                verbose=False
            )
            logger.info("✓ Text model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to initialize text model: {e}")
            raise

    def is_ready(self) -> bool:
        """Return True once the model has been loaded."""
        return self.model is not None

    async def generate_completion(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.6,
        max_tokens: int = 512,
        stream: bool = False,
        return_json: bool = False
    ) -> Any:
        """
        Generate text completion.

        Args:
            messages: List of message dictionaries with 'role' and 'content'
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            return_json: Whether to extract JSON from response

        Returns:
            Generated completion (dict), an SSE async iterator when
            ``stream`` is True, or a {"status", "data", "raw_content"}
            dict when ``return_json`` is True.

        Raises:
            RuntimeError: if called before initialize() completed.
            ValueError: if both ``stream`` and ``return_json`` are requested.
        """
        if not self.is_ready():
            raise RuntimeError("Text model not initialized")

        # Validate conflicting parameters: streaming chunks cannot be
        # post-processed for JSON extraction.
        if stream and return_json:
            raise ValueError("Cannot use both 'stream' and 'return_json' simultaneously")

        # Prepare messages for JSON extraction mode.
        if return_json:
            system_prompt = {
                "role": "system",
                "content": (
                    "You are a strict JSON generator. "
                    "Convert the user's input into valid JSON format. "
                    "Output strictly in markdown code blocks like ```json ... ```. "
                    "Do not add conversational filler."
                )
            }
            messages = [system_prompt] + messages

            if messages[-1]['role'] == 'user':
                # Copy the last message before appending the instruction so the
                # caller's dict is never mutated (the += previously aliased it).
                last = dict(messages[-1])
                last['content'] += "\n\nReturn structured JSON of this content."
                messages[-1] = last

        logger.info(f"Generating completion: {len(messages)} messages | Stream: {stream}")

        try:
            response = self.model.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                stream=stream
            )

            # Handle streaming response.
            if stream:
                return self._create_stream_iterator(response)

            # Handle JSON extraction.
            if return_json:
                content_text = response['choices'][0]['message']['content']
                extracted_data = extract_json_from_content(content_text)
                return {
                    "status": "success",
                    "data": extracted_data,
                    "raw_content": content_text
                }

            return response

        except Exception as e:
            logger.error(f"Error generating completion: {e}")
            raise

    async def _create_stream_iterator(self, response_stream) -> AsyncIterator[str]:
        """Wrap llama-cpp's chunk generator as SSE 'data:' lines.

        NOTE(review): the underlying generator is synchronous, so each chunk
        blocks the event loop while the model computes — acceptable for a
        single-worker deployment, but confirm before adding concurrency.
        """
        for chunk in response_stream:
            yield f"data: {json.dumps(chunk)}\n\n"
        yield "data: [DONE]\n\n"

    async def cleanup(self) -> None:
        """Release model resources; safe to call when nothing is loaded."""
        if self.model:
            del self.model
            self.model = None
            logger.info("Text model unloaded")

# Global instance shared by routers and main.py.
text_service = TextService()
services/vision_service.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import base64
3
+ import io
4
+ from typing import Optional, Dict, Any
5
+ from pathlib import Path
6
+ from llama_cpp import Llama
7
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
8
+ from huggingface_hub import hf_hub_download
9
+ from PIL import Image
10
+
11
+ from config import config
12
+
13
+ logger = logging.getLogger("vision-service")
14
+
15
class VisionService:
    """Service for vision-language model interactions.

    Wraps a llama-cpp model plus a LLaVA-style chat handler (loaded from the
    multimodal projector file) so images can be analyzed with text prompts.
    """

    def __init__(self):
        # Both are populated by initialize(); None means "not ready".
        self.model: Optional[Llama] = None
        self.chat_handler: Optional[Llava15ChatHandler] = None

    async def initialize(self) -> None:
        """Download (if needed) and load the vision model and its projector.

        Raises:
            Exception: whatever hf_hub_download / Llava15ChatHandler / Llama
                raise on failure (logged, then re-raised).
        """
        try:
            logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
            model_path = hf_hub_download(
                repo_id=config.VISION_MODEL_REPO,
                filename=config.VISION_MODEL_FILE,
                cache_dir=config.HF_HOME
            )

            logger.info(f"Downloading vision projector: {config.VISION_MMPROJ_FILE}...")
            mmproj_path = hf_hub_download(
                repo_id=config.VISION_MODEL_REPO,
                filename=config.VISION_MMPROJ_FILE,
                cache_dir=config.HF_HOME
            )

            logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")

            # Initialize chat handler with multimodal projection weights.
            self.chat_handler = Llava15ChatHandler(
                clip_model_path=mmproj_path,
                verbose=False
            )

            # NOTE(review): logits_all=True presumably required by the chat
            # handler path — confirm; it increases memory use per context.
            self.model = Llama(
                model_path=model_path,
                chat_handler=self.chat_handler,
                n_ctx=config.VISION_MODEL_CTX,
                n_threads=config.N_THREADS,
                n_batch=config.VISION_MODEL_BATCH,
                logits_all=True,
                verbose=False
            )
            logger.info("✓ Vision model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to initialize vision model: {e}")
            raise

    def is_ready(self) -> bool:
        """Return True once both the model and the chat handler are loaded."""
        return self.model is not None and self.chat_handler is not None

    async def analyze_image(
        self,
        image_data: bytes,
        prompt: str,
        temperature: float = 0.6,
        max_tokens: int = 512
    ) -> Dict[str, Any]:
        """
        Analyze an image with a text prompt.

        Args:
            image_data: Raw image bytes
            prompt: Text question/prompt about the image
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Dict with status, image_info (size/format/mode), the prompt,
            the model's text response, and the usage stats if present.

        Raises:
            RuntimeError: if called before initialize() completed.
            Exception: propagated from PIL or the model (logged first).
        """
        if not self.is_ready():
            raise RuntimeError("Vision model not initialized")

        try:
            # Convert image bytes to base64 for embedding in a data URI.
            image_b64 = base64.b64encode(image_data).decode('utf-8')

            # Validate the payload by letting PIL parse the header
            # (raises on non-image data).
            image = Image.open(io.BytesIO(image_data))
            logger.info(f"Processing image: {image.size} | Format: {image.format}")

            # OpenAI-style multimodal message: image part first, then the text.
            # NOTE(review): the data URI always declares image/jpeg even for
            # PNG/GIF/WebP uploads — confirm the handler ignores the MIME type.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
                        {"type": "text", "text": prompt}
                    ]
                }
            ]

            logger.info(f"Analyzing image with prompt: {prompt[:50]}...")

            response = self.model.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens
            )

            return {
                "status": "success",
                "image_info": {
                    "size": list(image.size),
                    "format": image.format,
                    "mode": image.mode
                },
                "prompt": prompt,
                "response": response['choices'][0]['message']['content'],
                "usage": response.get('usage', {})
            }

        except Exception as e:
            logger.error(f"Error analyzing image: {e}")
            raise

    async def cleanup(self) -> None:
        """Release model and handler resources; safe to call when unloaded."""
        if self.model:
            del self.model
            self.model = None
        if self.chat_handler:
            del self.chat_handler
            self.chat_handler = None
        logger.info("Vision model unloaded")

# Global instance shared by routers and main.py.
vision_service = VisionService()
utils/json_extractor.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from typing import List, Any
4
+
5
+ logger = logging.getLogger("json-extractor")
6
+
7
def find_balanced_closing_index(text: str, start_index: int) -> int:
    """
    Finds the matching closing bracket for the bracket at start_index.
    Ignores brackets inside strings and comments.

    Args:
        text: The text to scan.
        start_index: Index of an opening '{' or '[' in ``text``.

    Returns:
        Index of the matching '}' / ']', or -1 if the text ends before
        the bracket is balanced.
    """
    start_char = text[start_index]
    # Anything that is not '{' is assumed to be '[' (callers only pass these).
    end_char = '}' if start_char == '{' else ']'

    # Scanner state: bracket depth plus which string/comment context we are in.
    depth = 0
    in_double_quote = False
    in_single_quote = False
    in_backtick = False
    in_line_comment = False
    in_block_comment = False
    is_escaped = False

    length = len(text)
    i = start_index

    while i < length:
        char = text[i]
        next_char = text[i+1] if i + 1 < length else ''

        # Handle Escaping: the character right after a backslash never
        # changes scanner state (e.g. \" inside a string).
        if is_escaped:
            is_escaped = False
            i += 1
            continue
        if char == '\\' and not in_line_comment and not in_block_comment:
            is_escaped = True
            i += 1
            continue

        # Handle Comments (JS-style // and /* */, tolerated in model output)
        if in_line_comment:
            if char == '\n': in_line_comment = False
            i += 1
            continue
        if in_block_comment:
            if char == '*' and next_char == '/':
                in_block_comment = False
                i += 2
                continue
            i += 1
            continue

        # Check comment starts (only meaningful outside any string)
        if not in_double_quote and not in_single_quote and not in_backtick:
            if char == '/' and next_char == '/':
                in_line_comment = True
                i += 2
                continue
            if char == '/' and next_char == '*':
                in_block_comment = True
                i += 2
                continue

        # Handle Strings: inside a string only its own closing quote matters.
        if in_double_quote:
            if char == '"': in_double_quote = False
            i += 1
            continue
        if in_single_quote:
            if char == "'": in_single_quote = False
            i += 1
            continue
        if in_backtick:
            if char == '`': in_backtick = False
            i += 1
            continue

        # String openers.
        if char == '"':
            in_double_quote = True
            i += 1
            continue
        if char == "'":
            in_single_quote = True
            i += 1
            continue
        if char == '`':
            in_backtick = True
            i += 1
            continue

        # Handle Bracket Counting (only reached in plain, uncommented,
        # unquoted context).
        if char == start_char:
            depth += 1
        elif char == end_char:
            depth -= 1
            if depth == 0:
                return i

        i += 1

    return -1
102
+
103
def extract_json_from_content(content: str) -> List[Any]:
    """
    Scans text for JSON objects/arrays using state machine logic.

    Returns every parseable top-level JSON object or array found in
    ``content``, in order of appearance; non-string or empty input
    yields an empty list.
    """
    if not content or not isinstance(content, str):
        return []

    results: List[Any] = []
    pos = 0
    text_len = len(content)

    while pos < text_len:
        # Only '{' or '[' can open a JSON block; skip everything else.
        if content[pos] != '{' and content[pos] != '[':
            pos += 1
            continue

        close = find_balanced_closing_index(content, pos)
        if close == -1:
            # Unbalanced opener: step past it and keep scanning.
            pos += 1
            continue

        candidate = content[pos:close + 1]
        try:
            results.append(json.loads(candidate))
        except json.JSONDecodeError:
            # Balanced but not valid JSON — advance one char and retry.
            pos += 1
        else:
            pos = close + 1

    return results