Spaces:
Running
Running
Soumik Bose committed on
Commit ·
cde2f6e
1
Parent(s): 8de8d71
go
Browse files- models/schemas.py +1 -0
- routers/vision_router.py +5 -7
- services/text_service.py +1 -2
- services/vision_service.py +37 -18
models/schemas.py
CHANGED
|
@@ -16,6 +16,7 @@ class VisionRequest(BaseModel):
|
|
| 16 |
prompt: str = Field(..., description="Text prompt/question about the image")
|
| 17 |
temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
|
| 18 |
max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
|
|
|
|
| 19 |
|
| 20 |
class ErrorResponse(BaseModel):
|
| 21 |
error: str
|
|
|
|
| 16 |
prompt: str = Field(..., description="Text prompt/question about the image")
|
| 17 |
temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
|
| 18 |
max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
|
| 19 |
+
return_json: Optional[bool] = Field(False, description="Extract and return JSON from response")
|
| 20 |
|
| 21 |
class ErrorResponse(BaseModel):
|
| 22 |
error: str
|
routers/vision_router.py
CHANGED
|
@@ -16,15 +16,12 @@ async def analyze_image(
|
|
| 16 |
image: UploadFile = File(..., description="Image file to analyze"),
|
| 17 |
prompt: str = Form(..., description="Question or prompt about the image"),
|
| 18 |
temperature: float = Form(0.6, ge=0.0, le=2.0),
|
| 19 |
-
max_tokens: int = Form(512, ge=1, le=4096)
|
|
|
|
|
|
|
| 20 |
):
|
| 21 |
"""
|
| 22 |
Analyze an image with a text prompt
|
| 23 |
-
|
| 24 |
-
Accepts:
|
| 25 |
-
- Image file (JPEG, PNG, GIF, WebP, BMP)
|
| 26 |
-
- Text prompt/question
|
| 27 |
-
- Optional generation parameters
|
| 28 |
"""
|
| 29 |
if not vision_service.is_ready():
|
| 30 |
raise HTTPException(status_code=503, detail="Vision model not ready")
|
|
@@ -53,7 +50,8 @@ async def analyze_image(
|
|
| 53 |
image_data=image_data,
|
| 54 |
prompt=prompt,
|
| 55 |
temperature=temperature,
|
| 56 |
-
max_tokens=max_tokens
|
|
|
|
| 57 |
)
|
| 58 |
|
| 59 |
return JSONResponse(content=result)
|
|
|
|
| 16 |
image: UploadFile = File(..., description="Image file to analyze"),
|
| 17 |
prompt: str = Form(..., description="Question or prompt about the image"),
|
| 18 |
temperature: float = Form(0.6, ge=0.0, le=2.0),
|
| 19 |
+
max_tokens: int = Form(512, ge=1, le=4096),
|
| 20 |
+
# ADDED PARAMETER
|
| 21 |
+
return_json: bool = Form(False, description="Ensure output is valid JSON")
|
| 22 |
):
|
| 23 |
"""
|
| 24 |
Analyze an image with a text prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
if not vision_service.is_ready():
|
| 27 |
raise HTTPException(status_code=503, detail="Vision model not ready")
|
|
|
|
| 50 |
image_data=image_data,
|
| 51 |
prompt=prompt,
|
| 52 |
temperature=temperature,
|
| 53 |
+
max_tokens=max_tokens,
|
| 54 |
+
return_json=return_json
|
| 55 |
)
|
| 56 |
|
| 57 |
return JSONResponse(content=result)
|
services/text_service.py
CHANGED
|
@@ -107,8 +107,7 @@ class TextService:
|
|
| 107 |
extracted_data = extract_json_from_content(content_text)
|
| 108 |
return {
|
| 109 |
"status": "success",
|
| 110 |
-
"data": extracted_data
|
| 111 |
-
"raw_content": content_text
|
| 112 |
}
|
| 113 |
|
| 114 |
return response
|
|
|
|
| 107 |
extracted_data = extract_json_from_content(content_text)
|
| 108 |
return {
|
| 109 |
"status": "success",
|
| 110 |
+
"data": extracted_data
|
|
|
|
| 111 |
}
|
| 112 |
|
| 113 |
return response
|
services/vision_service.py
CHANGED
|
@@ -2,13 +2,14 @@ import logging
|
|
| 2 |
import base64
|
| 3 |
import io
|
| 4 |
from typing import Optional, Dict, Any
|
| 5 |
-
from pathlib import Path
|
| 6 |
from llama_cpp import Llama
|
| 7 |
from llama_cpp.llama_chat_format import Llava15ChatHandler
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
from PIL import Image
|
| 10 |
|
| 11 |
from config import config
|
|
|
|
|
|
|
| 12 |
|
| 13 |
logger = logging.getLogger("vision-service")
|
| 14 |
|
|
@@ -20,7 +21,7 @@ class VisionService:
|
|
| 20 |
self.chat_handler: Optional[Llava15ChatHandler] = None
|
| 21 |
|
| 22 |
async def initialize(self) -> None:
|
| 23 |
-
|
| 24 |
try:
|
| 25 |
logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
|
| 26 |
model_path = hf_hub_download(
|
|
@@ -38,7 +39,6 @@ class VisionService:
|
|
| 38 |
|
| 39 |
logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")
|
| 40 |
|
| 41 |
-
# Initialize chat handler with multimodal projection
|
| 42 |
self.chat_handler = Llava15ChatHandler(
|
| 43 |
clip_model_path=mmproj_path,
|
| 44 |
verbose=False
|
|
@@ -60,27 +60,19 @@ class VisionService:
|
|
| 60 |
raise
|
| 61 |
|
| 62 |
def is_ready(self) -> bool:
|
| 63 |
-
"""Check if the model is loaded and ready"""
|
| 64 |
return self.model is not None and self.chat_handler is not None
|
| 65 |
|
|
|
|
| 66 |
async def analyze_image(
|
| 67 |
self,
|
| 68 |
image_data: bytes,
|
| 69 |
prompt: str,
|
| 70 |
temperature: float = 0.6,
|
| 71 |
-
max_tokens: int = 512
|
|
|
|
| 72 |
) -> Dict[str, Any]:
|
| 73 |
"""
|
| 74 |
Analyze an image with a text prompt
|
| 75 |
-
|
| 76 |
-
Args:
|
| 77 |
-
image_data: Raw image bytes
|
| 78 |
-
prompt: Text question/prompt about the image
|
| 79 |
-
temperature: Sampling temperature
|
| 80 |
-
max_tokens: Maximum tokens to generate
|
| 81 |
-
|
| 82 |
-
Returns:
|
| 83 |
-
Analysis result dictionary
|
| 84 |
"""
|
| 85 |
if not self.is_ready():
|
| 86 |
raise RuntimeError("Vision model not initialized")
|
|
@@ -93,25 +85,53 @@ class VisionService:
|
|
| 93 |
image = Image.open(io.BytesIO(image_data))
|
| 94 |
logger.info(f"Processing image: {image.size} | Format: {image.format}")
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
# Create vision message format
|
| 97 |
messages = [
|
| 98 |
{
|
| 99 |
"role": "user",
|
| 100 |
"content": [
|
| 101 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
|
| 102 |
-
{"type": "text", "text":
|
| 103 |
]
|
| 104 |
}
|
| 105 |
]
|
| 106 |
|
| 107 |
-
logger.info(f"Analyzing image with prompt: {prompt[:50]}...")
|
| 108 |
|
| 109 |
response = self.model.create_chat_completion(
|
| 110 |
messages=messages,
|
| 111 |
temperature=temperature,
|
| 112 |
max_tokens=max_tokens
|
| 113 |
)
|
|
|
|
|
|
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
return {
|
| 116 |
"status": "success",
|
| 117 |
"image_info": {
|
|
@@ -120,7 +140,7 @@ class VisionService:
|
|
| 120 |
"mode": image.mode
|
| 121 |
},
|
| 122 |
"prompt": prompt,
|
| 123 |
-
"response":
|
| 124 |
"usage": response.get('usage', {})
|
| 125 |
}
|
| 126 |
|
|
@@ -129,7 +149,6 @@ class VisionService:
|
|
| 129 |
raise
|
| 130 |
|
| 131 |
async def cleanup(self) -> None:
|
| 132 |
-
"""Cleanup resources"""
|
| 133 |
if self.model:
|
| 134 |
del self.model
|
| 135 |
self.model = None
|
|
|
|
| 2 |
import base64
|
| 3 |
import io
|
| 4 |
from typing import Optional, Dict, Any
|
|
|
|
| 5 |
from llama_cpp import Llama
|
| 6 |
from llama_cpp.llama_chat_format import Llava15ChatHandler
|
| 7 |
from huggingface_hub import hf_hub_download
|
| 8 |
from PIL import Image
|
| 9 |
|
| 10 |
from config import config
|
| 11 |
+
# ADD THIS IMPORT
|
| 12 |
+
from utils.json_extractor import extract_json_from_content
|
| 13 |
|
| 14 |
logger = logging.getLogger("vision-service")
|
| 15 |
|
|
|
|
| 21 |
self.chat_handler: Optional[Llava15ChatHandler] = None
|
| 22 |
|
| 23 |
async def initialize(self) -> None:
|
| 24 |
+
# ... (Same as your original code) ...
|
| 25 |
try:
|
| 26 |
logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
|
| 27 |
model_path = hf_hub_download(
|
|
|
|
| 39 |
|
| 40 |
logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")
|
| 41 |
|
|
|
|
| 42 |
self.chat_handler = Llava15ChatHandler(
|
| 43 |
clip_model_path=mmproj_path,
|
| 44 |
verbose=False
|
|
|
|
| 60 |
raise
|
| 61 |
|
| 62 |
def is_ready(self) -> bool:
|
|
|
|
| 63 |
return self.model is not None and self.chat_handler is not None
|
| 64 |
|
| 65 |
+
# UPDATED METHOD
|
| 66 |
async def analyze_image(
|
| 67 |
self,
|
| 68 |
image_data: bytes,
|
| 69 |
prompt: str,
|
| 70 |
temperature: float = 0.6,
|
| 71 |
+
max_tokens: int = 512,
|
| 72 |
+
return_json: bool = False # Added parameter
|
| 73 |
) -> Dict[str, Any]:
|
| 74 |
"""
|
| 75 |
Analyze an image with a text prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
"""
|
| 77 |
if not self.is_ready():
|
| 78 |
raise RuntimeError("Vision model not initialized")
|
|
|
|
| 85 |
image = Image.open(io.BytesIO(image_data))
|
| 86 |
logger.info(f"Processing image: {image.size} | Format: {image.format}")
|
| 87 |
|
| 88 |
+
# Modify prompt if return_json is requested
|
| 89 |
+
# Note: For LLaVA/Vision models, it is often safer to append the system instruction
|
| 90 |
+
# to the user text rather than a separate system role message.
|
| 91 |
+
final_prompt = prompt
|
| 92 |
+
if return_json:
|
| 93 |
+
final_prompt += (
|
| 94 |
+
"\n\nYou are a strict JSON generator. "
|
| 95 |
+
"Convert the output into valid JSON format. "
|
| 96 |
+
"Output strictly in markdown code blocks like ```json ... ```. "
|
| 97 |
+
"Do not add conversational filler."
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
# Create vision message format
|
| 101 |
messages = [
|
| 102 |
{
|
| 103 |
"role": "user",
|
| 104 |
"content": [
|
| 105 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
|
| 106 |
+
{"type": "text", "text": final_prompt}
|
| 107 |
]
|
| 108 |
}
|
| 109 |
]
|
| 110 |
|
| 111 |
+
logger.info(f"Analyzing image with prompt: {prompt[:50]}... | JSON: {return_json}")
|
| 112 |
|
| 113 |
response = self.model.create_chat_completion(
|
| 114 |
messages=messages,
|
| 115 |
temperature=temperature,
|
| 116 |
max_tokens=max_tokens
|
| 117 |
)
|
| 118 |
+
|
| 119 |
+
content_text = response['choices'][0]['message']['content']
|
| 120 |
|
| 121 |
+
# Logic for return_json
|
| 122 |
+
if return_json:
|
| 123 |
+
extracted_data = extract_json_from_content(content_text)
|
| 124 |
+
return {
|
| 125 |
+
"status": "success",
|
| 126 |
+
"data": extracted_data,
|
| 127 |
+
"image_info": {
|
| 128 |
+
"size": list(image.size),
|
| 129 |
+
"format": image.format
|
| 130 |
+
},
|
| 131 |
+
"usage": response.get('usage', {})
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
# Standard return
|
| 135 |
return {
|
| 136 |
"status": "success",
|
| 137 |
"image_info": {
|
|
|
|
| 140 |
"mode": image.mode
|
| 141 |
},
|
| 142 |
"prompt": prompt,
|
| 143 |
+
"response": content_text,
|
| 144 |
"usage": response.get('usage', {})
|
| 145 |
}
|
| 146 |
|
|
|
|
| 149 |
raise
|
| 150 |
|
| 151 |
async def cleanup(self) -> None:
|
|
|
|
| 152 |
if self.model:
|
| 153 |
del self.model
|
| 154 |
self.model = None
|