Spaces:
Running
Running
Soumik Bose committed on
Commit ·
cde2f6e
1
Parent(s): 8de8d71
go
Browse files- models/schemas.py +1 -0
- routers/vision_router.py +5 -7
- services/text_service.py +1 -2
- services/vision_service.py +37 -18
models/schemas.py
CHANGED
|
@@ -16,6 +16,7 @@ class VisionRequest(BaseModel):
|
|
| 16 |
prompt: str = Field(..., description="Text prompt/question about the image")
|
| 17 |
temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
|
| 18 |
max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
|
|
|
|
| 19 |
|
| 20 |
class ErrorResponse(BaseModel):
|
| 21 |
error: str
|
|
|
|
| 16 |
prompt: str = Field(..., description="Text prompt/question about the image")
|
| 17 |
temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
|
| 18 |
max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
|
| 19 |
+
return_json: Optional[bool] = Field(False, description="Extract and return JSON from response")
|
| 20 |
|
| 21 |
class ErrorResponse(BaseModel):
|
| 22 |
error: str
|
routers/vision_router.py
CHANGED
|
@@ -16,15 +16,12 @@ async def analyze_image(
|
|
| 16 |
image: UploadFile = File(..., description="Image file to analyze"),
|
| 17 |
prompt: str = Form(..., description="Question or prompt about the image"),
|
| 18 |
temperature: float = Form(0.6, ge=0.0, le=2.0),
|
| 19 |
-
max_tokens: int = Form(512, ge=1, le=4096)
|
|
|
|
|
|
|
| 20 |
):
|
| 21 |
"""
|
| 22 |
Analyze an image with a text prompt
|
| 23 |
-
|
| 24 |
-
Accepts:
|
| 25 |
-
- Image file (JPEG, PNG, GIF, WebP, BMP)
|
| 26 |
-
- Text prompt/question
|
| 27 |
-
- Optional generation parameters
|
| 28 |
"""
|
| 29 |
if not vision_service.is_ready():
|
| 30 |
raise HTTPException(status_code=503, detail="Vision model not ready")
|
|
@@ -53,7 +50,8 @@ async def analyze_image(
|
|
| 53 |
image_data=image_data,
|
| 54 |
prompt=prompt,
|
| 55 |
temperature=temperature,
|
| 56 |
-
max_tokens=max_tokens
|
|
|
|
| 57 |
)
|
| 58 |
|
| 59 |
return JSONResponse(content=result)
|
|
|
|
| 16 |
image: UploadFile = File(..., description="Image file to analyze"),
|
| 17 |
prompt: str = Form(..., description="Question or prompt about the image"),
|
| 18 |
temperature: float = Form(0.6, ge=0.0, le=2.0),
|
| 19 |
+
max_tokens: int = Form(512, ge=1, le=4096),
|
| 20 |
+
# ADDED PARAMETER
|
| 21 |
+
return_json: bool = Form(False, description="Ensure output is valid JSON")
|
| 22 |
):
|
| 23 |
"""
|
| 24 |
Analyze an image with a text prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
if not vision_service.is_ready():
|
| 27 |
raise HTTPException(status_code=503, detail="Vision model not ready")
|
|
|
|
| 50 |
image_data=image_data,
|
| 51 |
prompt=prompt,
|
| 52 |
temperature=temperature,
|
| 53 |
+
max_tokens=max_tokens,
|
| 54 |
+
return_json=return_json
|
| 55 |
)
|
| 56 |
|
| 57 |
return JSONResponse(content=result)
|
services/text_service.py
CHANGED
|
@@ -107,8 +107,7 @@ class TextService:
|
|
| 107 |
extracted_data = extract_json_from_content(content_text)
|
| 108 |
return {
|
| 109 |
"status": "success",
|
| 110 |
-
"data": extracted_data
|
| 111 |
-
"raw_content": content_text
|
| 112 |
}
|
| 113 |
|
| 114 |
return response
|
|
|
|
| 107 |
extracted_data = extract_json_from_content(content_text)
|
| 108 |
return {
|
| 109 |
"status": "success",
|
| 110 |
+
"data": extracted_data
|
|
|
|
| 111 |
}
|
| 112 |
|
| 113 |
return response
|
services/vision_service.py
CHANGED
|
@@ -2,13 +2,14 @@ import logging
|
|
| 2 |
import base64
|
| 3 |
import io
|
| 4 |
from typing import Optional, Dict, Any
|
| 5 |
-
from pathlib import Path
|
| 6 |
from llama_cpp import Llama
|
| 7 |
from llama_cpp.llama_chat_format import Llava15ChatHandler
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
from PIL import Image
|
| 10 |
|
| 11 |
from config import config
|
|
|
|
|
|
|
| 12 |
|
| 13 |
logger = logging.getLogger("vision-service")
|
| 14 |
|
|
@@ -20,7 +21,7 @@ class VisionService:
|
|
| 20 |
self.chat_handler: Optional[Llava15ChatHandler] = None
|
| 21 |
|
| 22 |
async def initialize(self) -> None:
|
| 23 |
-
|
| 24 |
try:
|
| 25 |
logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
|
| 26 |
model_path = hf_hub_download(
|
|
@@ -38,7 +39,6 @@ class VisionService:
|
|
| 38 |
|
| 39 |
logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")
|
| 40 |
|
| 41 |
-
# Initialize chat handler with multimodal projection
|
| 42 |
self.chat_handler = Llava15ChatHandler(
|
| 43 |
clip_model_path=mmproj_path,
|
| 44 |
verbose=False
|
|
@@ -60,27 +60,19 @@ class VisionService:
|
|
| 60 |
raise
|
| 61 |
|
| 62 |
def is_ready(self) -> bool:
|
| 63 |
-
"""Check if the model is loaded and ready"""
|
| 64 |
return self.model is not None and self.chat_handler is not None
|
| 65 |
|
|
|
|
| 66 |
async def analyze_image(
|
| 67 |
self,
|
| 68 |
image_data: bytes,
|
| 69 |
prompt: str,
|
| 70 |
temperature: float = 0.6,
|
| 71 |
-
max_tokens: int = 512
|
|
|
|
| 72 |
) -> Dict[str, Any]:
|
| 73 |
"""
|
| 74 |
Analyze an image with a text prompt
|
| 75 |
-
|
| 76 |
-
Args:
|
| 77 |
-
image_data: Raw image bytes
|
| 78 |
-
prompt: Text question/prompt about the image
|
| 79 |
-
temperature: Sampling temperature
|
| 80 |
-
max_tokens: Maximum tokens to generate
|
| 81 |
-
|
| 82 |
-
Returns:
|
| 83 |
-
Analysis result dictionary
|
| 84 |
"""
|
| 85 |
if not self.is_ready():
|
| 86 |
raise RuntimeError("Vision model not initialized")
|
|
@@ -93,25 +85,53 @@ class VisionService:
|
|
| 93 |
image = Image.open(io.BytesIO(image_data))
|
| 94 |
logger.info(f"Processing image: {image.size} | Format: {image.format}")
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
# Create vision message format
|
| 97 |
messages = [
|
| 98 |
{
|
| 99 |
"role": "user",
|
| 100 |
"content": [
|
| 101 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
|
| 102 |
-
{"type": "text", "text":
|
| 103 |
]
|
| 104 |
}
|
| 105 |
]
|
| 106 |
|
| 107 |
-
logger.info(f"Analyzing image with prompt: {prompt[:50]}...")
|
| 108 |
|
| 109 |
response = self.model.create_chat_completion(
|
| 110 |
messages=messages,
|
| 111 |
temperature=temperature,
|
| 112 |
max_tokens=max_tokens
|
| 113 |
)
|
|
|
|
|
|
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
return {
|
| 116 |
"status": "success",
|
| 117 |
"image_info": {
|
|
@@ -120,7 +140,7 @@ class VisionService:
|
|
| 120 |
"mode": image.mode
|
| 121 |
},
|
| 122 |
"prompt": prompt,
|
| 123 |
-
"response":
|
| 124 |
"usage": response.get('usage', {})
|
| 125 |
}
|
| 126 |
|
|
@@ -129,7 +149,6 @@ class VisionService:
|
|
| 129 |
raise
|
| 130 |
|
| 131 |
async def cleanup(self) -> None:
|
| 132 |
-
"""Cleanup resources"""
|
| 133 |
if self.model:
|
| 134 |
del self.model
|
| 135 |
self.model = None
|
|
|
|
| 2 |
import base64
|
| 3 |
import io
|
| 4 |
from typing import Optional, Dict, Any
|
|
|
|
| 5 |
from llama_cpp import Llama
|
| 6 |
from llama_cpp.llama_chat_format import Llava15ChatHandler
|
| 7 |
from huggingface_hub import hf_hub_download
|
| 8 |
from PIL import Image
|
| 9 |
|
| 10 |
from config import config
|
| 11 |
+
# ADD THIS IMPORT
|
| 12 |
+
from utils.json_extractor import extract_json_from_content
|
| 13 |
|
| 14 |
logger = logging.getLogger("vision-service")
|
| 15 |
|
|
|
|
| 21 |
self.chat_handler: Optional[Llava15ChatHandler] = None
|
| 22 |
|
| 23 |
async def initialize(self) -> None:
|
| 24 |
+
# ... (Same as your original code) ...
|
| 25 |
try:
|
| 26 |
logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
|
| 27 |
model_path = hf_hub_download(
|
|
|
|
| 39 |
|
| 40 |
logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")
|
| 41 |
|
|
|
|
| 42 |
self.chat_handler = Llava15ChatHandler(
|
| 43 |
clip_model_path=mmproj_path,
|
| 44 |
verbose=False
|
|
|
|
| 60 |
raise
|
| 61 |
|
| 62 |
def is_ready(self) -> bool:
|
|
|
|
| 63 |
return self.model is not None and self.chat_handler is not None
|
| 64 |
|
| 65 |
+
# UPDATED METHOD
|
| 66 |
async def analyze_image(
|
| 67 |
self,
|
| 68 |
image_data: bytes,
|
| 69 |
prompt: str,
|
| 70 |
temperature: float = 0.6,
|
| 71 |
+
max_tokens: int = 512,
|
| 72 |
+
return_json: bool = False # Added parameter
|
| 73 |
) -> Dict[str, Any]:
|
| 74 |
"""
|
| 75 |
Analyze an image with a text prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
"""
|
| 77 |
if not self.is_ready():
|
| 78 |
raise RuntimeError("Vision model not initialized")
|
|
|
|
| 85 |
image = Image.open(io.BytesIO(image_data))
|
| 86 |
logger.info(f"Processing image: {image.size} | Format: {image.format}")
|
| 87 |
|
| 88 |
+
# Modify prompt if return_json is requested
|
| 89 |
+
# Note: For LLaVA/Vision models, it is often safer to append the system instruction
|
| 90 |
+
# to the user text rather than a separate system role message.
|
| 91 |
+
final_prompt = prompt
|
| 92 |
+
if return_json:
|
| 93 |
+
final_prompt += (
|
| 94 |
+
"\n\nYou are a strict JSON generator. "
|
| 95 |
+
"Convert the output into valid JSON format. "
|
| 96 |
+
"Output strictly in markdown code blocks like ```json ... ```. "
|
| 97 |
+
"Do not add conversational filler."
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
# Create vision message format
|
| 101 |
messages = [
|
| 102 |
{
|
| 103 |
"role": "user",
|
| 104 |
"content": [
|
| 105 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
|
| 106 |
+
{"type": "text", "text": final_prompt}
|
| 107 |
]
|
| 108 |
}
|
| 109 |
]
|
| 110 |
|
| 111 |
+
logger.info(f"Analyzing image with prompt: {prompt[:50]}... | JSON: {return_json}")
|
| 112 |
|
| 113 |
response = self.model.create_chat_completion(
|
| 114 |
messages=messages,
|
| 115 |
temperature=temperature,
|
| 116 |
max_tokens=max_tokens
|
| 117 |
)
|
| 118 |
+
|
| 119 |
+
content_text = response['choices'][0]['message']['content']
|
| 120 |
|
| 121 |
+
# Logic for return_json
|
| 122 |
+
if return_json:
|
| 123 |
+
extracted_data = extract_json_from_content(content_text)
|
| 124 |
+
return {
|
| 125 |
+
"status": "success",
|
| 126 |
+
"data": extracted_data,
|
| 127 |
+
"image_info": {
|
| 128 |
+
"size": list(image.size),
|
| 129 |
+
"format": image.format
|
| 130 |
+
},
|
| 131 |
+
"usage": response.get('usage', {})
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
# Standard return
|
| 135 |
return {
|
| 136 |
"status": "success",
|
| 137 |
"image_info": {
|
|
|
|
| 140 |
"mode": image.mode
|
| 141 |
},
|
| 142 |
"prompt": prompt,
|
| 143 |
+
"response": content_text,
|
| 144 |
"usage": response.get('usage', {})
|
| 145 |
}
|
| 146 |
|
|
|
|
| 149 |
raise
|
| 150 |
|
| 151 |
async def cleanup(self) -> None:
|
|
|
|
| 152 |
if self.model:
|
| 153 |
del self.model
|
| 154 |
self.model = None
|