Soumik Bose committed on
Commit
cde2f6e
·
1 Parent(s): 8de8d71
models/schemas.py CHANGED
@@ -16,6 +16,7 @@ class VisionRequest(BaseModel):
16
  prompt: str = Field(..., description="Text prompt/question about the image")
17
  temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
18
  max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
 
19
 
20
  class ErrorResponse(BaseModel):
21
  error: str
 
16
  prompt: str = Field(..., description="Text prompt/question about the image")
17
  temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
18
  max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
19
+ return_json: Optional[bool] = Field(False, description="Extract and return JSON from response")
20
 
21
  class ErrorResponse(BaseModel):
22
  error: str
routers/vision_router.py CHANGED
@@ -16,15 +16,12 @@ async def analyze_image(
16
  image: UploadFile = File(..., description="Image file to analyze"),
17
  prompt: str = Form(..., description="Question or prompt about the image"),
18
  temperature: float = Form(0.6, ge=0.0, le=2.0),
19
- max_tokens: int = Form(512, ge=1, le=4096)
 
 
20
  ):
21
  """
22
  Analyze an image with a text prompt
23
-
24
- Accepts:
25
- - Image file (JPEG, PNG, GIF, WebP, BMP)
26
- - Text prompt/question
27
- - Optional generation parameters
28
  """
29
  if not vision_service.is_ready():
30
  raise HTTPException(status_code=503, detail="Vision model not ready")
@@ -53,7 +50,8 @@ async def analyze_image(
53
  image_data=image_data,
54
  prompt=prompt,
55
  temperature=temperature,
56
- max_tokens=max_tokens
 
57
  )
58
 
59
  return JSONResponse(content=result)
 
16
  image: UploadFile = File(..., description="Image file to analyze"),
17
  prompt: str = Form(..., description="Question or prompt about the image"),
18
  temperature: float = Form(0.6, ge=0.0, le=2.0),
19
+ max_tokens: int = Form(512, ge=1, le=4096),
20
+ # ADDED PARAMETER
21
+ return_json: bool = Form(False, description="Ensure output is valid JSON")
22
  ):
23
  """
24
  Analyze an image with a text prompt
 
 
 
 
 
25
  """
26
  if not vision_service.is_ready():
27
  raise HTTPException(status_code=503, detail="Vision model not ready")
 
50
  image_data=image_data,
51
  prompt=prompt,
52
  temperature=temperature,
53
+ max_tokens=max_tokens,
54
+ return_json=return_json
55
  )
56
 
57
  return JSONResponse(content=result)
services/text_service.py CHANGED
@@ -107,8 +107,7 @@ class TextService:
107
  extracted_data = extract_json_from_content(content_text)
108
  return {
109
  "status": "success",
110
- "data": extracted_data,
111
- "raw_content": content_text
112
  }
113
 
114
  return response
 
107
  extracted_data = extract_json_from_content(content_text)
108
  return {
109
  "status": "success",
110
+ "data": extracted_data
 
111
  }
112
 
113
  return response
services/vision_service.py CHANGED
@@ -2,13 +2,14 @@ import logging
2
  import base64
3
  import io
4
  from typing import Optional, Dict, Any
5
- from pathlib import Path
6
  from llama_cpp import Llama
7
  from llama_cpp.llama_chat_format import Llava15ChatHandler
8
  from huggingface_hub import hf_hub_download
9
  from PIL import Image
10
 
11
  from config import config
 
 
12
 
13
  logger = logging.getLogger("vision-service")
14
 
@@ -20,7 +21,7 @@ class VisionService:
20
  self.chat_handler: Optional[Llava15ChatHandler] = None
21
 
22
  async def initialize(self) -> None:
23
- """Initialize the vision model"""
24
  try:
25
  logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
26
  model_path = hf_hub_download(
@@ -38,7 +39,6 @@ class VisionService:
38
 
39
  logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")
40
 
41
- # Initialize chat handler with multimodal projection
42
  self.chat_handler = Llava15ChatHandler(
43
  clip_model_path=mmproj_path,
44
  verbose=False
@@ -60,27 +60,19 @@ class VisionService:
60
  raise
61
 
62
  def is_ready(self) -> bool:
63
- """Check if the model is loaded and ready"""
64
  return self.model is not None and self.chat_handler is not None
65
 
 
66
  async def analyze_image(
67
  self,
68
  image_data: bytes,
69
  prompt: str,
70
  temperature: float = 0.6,
71
- max_tokens: int = 512
 
72
  ) -> Dict[str, Any]:
73
  """
74
  Analyze an image with a text prompt
75
-
76
- Args:
77
- image_data: Raw image bytes
78
- prompt: Text question/prompt about the image
79
- temperature: Sampling temperature
80
- max_tokens: Maximum tokens to generate
81
-
82
- Returns:
83
- Analysis result dictionary
84
  """
85
  if not self.is_ready():
86
  raise RuntimeError("Vision model not initialized")
@@ -93,25 +85,53 @@ class VisionService:
93
  image = Image.open(io.BytesIO(image_data))
94
  logger.info(f"Processing image: {image.size} | Format: {image.format}")
95
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # Create vision message format
97
  messages = [
98
  {
99
  "role": "user",
100
  "content": [
101
  {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
102
- {"type": "text", "text": prompt}
103
  ]
104
  }
105
  ]
106
 
107
- logger.info(f"Analyzing image with prompt: {prompt[:50]}...")
108
 
109
  response = self.model.create_chat_completion(
110
  messages=messages,
111
  temperature=temperature,
112
  max_tokens=max_tokens
113
  )
 
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  return {
116
  "status": "success",
117
  "image_info": {
@@ -120,7 +140,7 @@ class VisionService:
120
  "mode": image.mode
121
  },
122
  "prompt": prompt,
123
- "response": response['choices'][0]['message']['content'],
124
  "usage": response.get('usage', {})
125
  }
126
 
@@ -129,7 +149,6 @@ class VisionService:
129
  raise
130
 
131
  async def cleanup(self) -> None:
132
- """Cleanup resources"""
133
  if self.model:
134
  del self.model
135
  self.model = None
 
2
  import base64
3
  import io
4
  from typing import Optional, Dict, Any
 
5
  from llama_cpp import Llama
6
  from llama_cpp.llama_chat_format import Llava15ChatHandler
7
  from huggingface_hub import hf_hub_download
8
  from PIL import Image
9
 
10
  from config import config
11
+ # ADD THIS IMPORT
12
+ from utils.json_extractor import extract_json_from_content
13
 
14
  logger = logging.getLogger("vision-service")
15
 
 
21
  self.chat_handler: Optional[Llava15ChatHandler] = None
22
 
23
  async def initialize(self) -> None:
24
+ # ... (Same as your original code) ...
25
  try:
26
  logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
27
  model_path = hf_hub_download(
 
39
 
40
  logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")
41
 
 
42
  self.chat_handler = Llava15ChatHandler(
43
  clip_model_path=mmproj_path,
44
  verbose=False
 
60
  raise
61
 
62
  def is_ready(self) -> bool:
 
63
  return self.model is not None and self.chat_handler is not None
64
 
65
+ # UPDATED METHOD
66
  async def analyze_image(
67
  self,
68
  image_data: bytes,
69
  prompt: str,
70
  temperature: float = 0.6,
71
+ max_tokens: int = 512,
72
+ return_json: bool = False # Added parameter
73
  ) -> Dict[str, Any]:
74
  """
75
  Analyze an image with a text prompt
 
 
 
 
 
 
 
 
 
76
  """
77
  if not self.is_ready():
78
  raise RuntimeError("Vision model not initialized")
 
85
  image = Image.open(io.BytesIO(image_data))
86
  logger.info(f"Processing image: {image.size} | Format: {image.format}")
87
 
88
+ # Modify prompt if return_json is requested
89
+ # Note: For LLaVA/Vision models, it is often safer to append the system instruction
90
+ # to the user text rather than a separate system role message.
91
+ final_prompt = prompt
92
+ if return_json:
93
+ final_prompt += (
94
+ "\n\nYou are a strict JSON generator. "
95
+ "Convert the output into valid JSON format. "
96
+ "Output strictly in markdown code blocks like ```json ... ```. "
97
+ "Do not add conversational filler."
98
+ )
99
+
100
  # Create vision message format
101
  messages = [
102
  {
103
  "role": "user",
104
  "content": [
105
  {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
106
+ {"type": "text", "text": final_prompt}
107
  ]
108
  }
109
  ]
110
 
111
+ logger.info(f"Analyzing image with prompt: {prompt[:50]}... | JSON: {return_json}")
112
 
113
  response = self.model.create_chat_completion(
114
  messages=messages,
115
  temperature=temperature,
116
  max_tokens=max_tokens
117
  )
118
+
119
+ content_text = response['choices'][0]['message']['content']
120
 
121
+ # Logic for return_json
122
+ if return_json:
123
+ extracted_data = extract_json_from_content(content_text)
124
+ return {
125
+ "status": "success",
126
+ "data": extracted_data,
127
+ "image_info": {
128
+ "size": list(image.size),
129
+ "format": image.format
130
+ },
131
+ "usage": response.get('usage', {})
132
+ }
133
+
134
+ # Standard return
135
  return {
136
  "status": "success",
137
  "image_info": {
 
140
  "mode": image.mode
141
  },
142
  "prompt": prompt,
143
+ "response": content_text,
144
  "usage": response.get('usage', {})
145
  }
146
 
 
149
  raise
150
 
151
  async def cleanup(self) -> None:
 
152
  if self.model:
153
  del self.model
154
  self.model = None