Soumik Bose committed on
Commit
80e7d10
·
1 Parent(s): 8f0d05b
Dockerfile CHANGED
@@ -4,37 +4,55 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
4
  PYTHONUNBUFFERED=1 \
5
  PORT=7860 \
6
  HF_HOME=/app/cache \
 
7
  PATH="/home/user/.local/bin:${PATH}"
8
 
9
  WORKDIR /app
10
 
11
- # Install build dependencies
12
  RUN apt-get update && apt-get install -y \
13
  build-essential \
14
  cmake \
15
  curl \
16
  git \
 
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
- # Create user
20
- RUN useradd -m -u 1000 user
21
- RUN mkdir -p /app/cache /app/models && chown -R user:user /app
 
22
 
23
- # Install pip as root
24
  RUN pip install --no-cache-dir --upgrade pip
25
 
 
26
  USER user
27
 
28
- # Build and install llama-cpp-python with proper flags
29
- RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_NATIVE=OFF" \
30
- pip install --no-cache-dir llama-cpp-python==0.3.2
31
 
32
- # Install other dependencies
33
  COPY --chown=user:user requirements.txt .
34
- RUN pip install --no-cache-dir -r requirements.txt
35
 
36
- # Copy app
 
37
  COPY --chown=user:user main.py .
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  EXPOSE 7860
40
- CMD ["bash", "-c", "while true; do curl -s https://xce009-ai-chat-api.hf.space/ping > /dev/null || true; sleep 300; done & python -m uvicorn main:app --host 0.0.0.0 --port 7860"]
 
 
 
4
  PYTHONUNBUFFERED=1 \
5
  PORT=7860 \
6
  HF_HOME=/app/cache \
7
+ CPU_THREADS=2 \
8
  PATH="/home/user/.local/bin:${PATH}"
9
 
10
  WORKDIR /app
11
 
12
+ # Install system dependencies
13
  RUN apt-get update && apt-get install -y \
14
  build-essential \
15
  cmake \
16
  curl \
17
  git \
18
+ libgomp1 \
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
+ # Create non-root user
22
+ RUN useradd -m -u 1000 user && \
23
+ mkdir -p /app/cache /app/models && \
24
+ chown -R user:user /app
25
 
26
+ # Upgrade pip as root
27
  RUN pip install --no-cache-dir --upgrade pip
28
 
29
+ # Switch to non-root user
30
  USER user
31
 
32
+ # Install llama-cpp-python with optimized build flags
33
+ RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_NATIVE=OFF -DGGML_AVX2=ON" \
34
+ pip install --no-cache-dir --user llama-cpp-python==0.3.2
35
 
36
+ # Copy requirements and install dependencies
37
  COPY --chown=user:user requirements.txt .
38
+ RUN pip install --no-cache-dir --user -r requirements.txt
39
 
40
+ # Copy application structure
41
+ COPY --chown=user:user config.py .
42
  COPY --chown=user:user main.py .
43
+ COPY --chown=user:user models/ ./models/
44
+ COPY --chown=user:user services/ ./services/
45
+ COPY --chown=user:user routers/ ./routers/
46
+ COPY --chown=user:user utils/ ./utils/
47
+
48
+ # Create __init__.py files if they don't exist
49
+ RUN touch models/__init__.py services/__init__.py routers/__init__.py utils/__init__.py
50
+
51
+ # Health check
52
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
53
+ CMD curl -f http://localhost:7860/ping || exit 1
54
 
55
  EXPOSE 7860
56
+
57
+ # Production startup with keep-alive and graceful shutdown
58
+ CMD ["bash", "-c", "while true; do curl -s https://xce009-ai-chat-api.hf.space/ping > /dev/null 2>&1 || true; sleep 300; done & exec python -m uvicorn main:app --host 0.0.0.0 --port 7860 --log-level info"]
config.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+
4
class Config:
    """Centralized configuration for the SmolLM API.

    All values are class attributes; the module-level ``config`` instance is
    provided for convenient importing. Server, cache, and CPU settings can be
    overridden through environment variables.
    """

    # Server Configuration
    PORT: int = int(os.getenv("PORT", "7860"))
    HOST: str = "0.0.0.0"

    # Cache Configuration (where hf_hub_download stores model files)
    HF_HOME: str = os.getenv("HF_HOME", "/app/cache")

    # CPU Configuration (llama-cpp inference thread count)
    N_THREADS: int = int(os.getenv("CPU_THREADS", "2"))

    # Text Model Configuration
    TEXT_MODEL_REPO: str = "bartowski/SmolLM2-1.7B-Instruct-GGUF"
    TEXT_MODEL_FILE: str = "SmolLM2-1.7B-Instruct-Q4_K_M.gguf"
    TEXT_MODEL_CTX: int = 2048
    TEXT_MODEL_BATCH: int = 512

    # Vision Model Configuration
    VISION_MODEL_REPO: str = "ggml-org/SmolVLM-500M-Instruct-GGUF"
    VISION_MODEL_FILE: str = "smolvlm-500m-instruct-q8_0.gguf"
    VISION_MMPROJ_FILE: str = "mmproj-smolvlm-500m-instruct-f16.gguf"
    VISION_MODEL_CTX: int = 2048
    VISION_MODEL_BATCH: int = 512

    # Default Generation Parameters
    DEFAULT_TEMPERATURE: float = 0.6
    DEFAULT_MAX_TOKENS: int = 512

    # File Upload Configuration
    MAX_FILE_SIZE: int = 10 * 1024 * 1024  # 10MB
    # frozenset instead of a mutable class-level set: shared state cannot be
    # mutated accidentally; membership tests and ', '.join() work unchanged.
    ALLOWED_IMAGE_EXTENSIONS: frozenset = frozenset(
        {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
    )

config = Config()
main.py CHANGED
@@ -1,280 +1,119 @@
1
- import os
2
  import logging
3
- import json
4
  from contextlib import asynccontextmanager
5
- from typing import List, Optional, Any
 
 
 
6
 
7
- from fastapi import FastAPI, HTTPException
8
- from fastapi.responses import JSONResponse, StreamingResponse
9
- from pydantic import BaseModel
10
- from llama_cpp import Llama
11
- from huggingface_hub import hf_hub_download
12
 
13
- # --- 1. Logging Setup ---
14
  logging.basicConfig(
15
  level=logging.INFO,
16
  format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
17
  )
18
- logger = logging.getLogger("SmolLM-API")
19
 
20
- # --- 2. Helper Functions (Previously in json_service.py) ---
21
-
22
- def find_balanced_closing_index(text: str, start_index: int) -> int:
23
- """
24
- Finds the matching closing bracket for the bracket at start_index.
25
- Ignores brackets inside strings and comments.
26
- """
27
- start_char = text[start_index]
28
- end_char = '}' if start_char == '{' else ']'
29
 
30
- depth = 0
31
- in_double_quote = False
32
- in_single_quote = False
33
- in_backtick = False
34
- in_line_comment = False
35
- in_block_comment = False
36
- is_escaped = False
37
-
38
- length = len(text)
39
- i = start_index
40
-
41
- while i < length:
42
- char = text[i]
43
- next_char = text[i+1] if i + 1 < length else ''
44
 
45
- # Handle Escaping
46
- if is_escaped:
47
- is_escaped = False
48
- i += 1
49
- continue
50
- if char == '\\' and not in_line_comment and not in_block_comment:
51
- is_escaped = True
52
- i += 1
53
- continue
54
-
55
- # Handle Comments
56
- if in_line_comment:
57
- if char == '\n': in_line_comment = False
58
- i += 1
59
- continue
60
- if in_block_comment:
61
- if char == '*' and next_char == '/':
62
- in_block_comment = False
63
- i += 2
64
- continue
65
- i += 1
66
- continue
67
-
68
- # Check comment starts
69
- if not in_double_quote and not in_single_quote and not in_backtick:
70
- if char == '/' and next_char == '/':
71
- in_line_comment = True
72
- i += 2
73
- continue
74
- if char == '/' and next_char == '*':
75
- in_block_comment = True
76
- i += 2
77
- continue
78
-
79
- # Handle Strings
80
- if in_double_quote:
81
- if char == '"': in_double_quote = False
82
- i += 1
83
- continue
84
- if in_single_quote:
85
- if char == "'": in_single_quote = False
86
- i += 1
87
- continue
88
- if in_backtick:
89
- if char == '`': in_backtick = False
90
- i += 1
91
- continue
92
-
93
- if char == '"':
94
- in_double_quote = True
95
- i += 1
96
- continue
97
- if char == "'":
98
- in_single_quote = True
99
- i += 1
100
- continue
101
- if char == '`':
102
- in_backtick = True
103
- i += 1
104
- continue
105
-
106
- # Handle Bracket Counting
107
- if char == start_char:
108
- depth += 1
109
- elif char == end_char:
110
- depth -= 1
111
- if depth == 0:
112
- return i # Found matching close
113
 
114
- i += 1
115
-
116
- return -1
117
-
118
- def extract_json_from_content(content: str) -> List[Any]:
119
- """
120
- Scans text for JSON objects/arrays using state machine logic.
121
- """
122
- if not content or not isinstance(content, str):
123
- return []
124
-
125
- found_blocks = []
126
- cursor = 0
127
- length = len(content)
128
-
129
- while cursor < length:
130
- if content[cursor] not in ['{', '[']:
131
- cursor += 1
132
- continue
133
-
134
- end_index = find_balanced_closing_index(content, cursor)
135
-
136
- if end_index != -1:
137
- raw_candidate = content[cursor : end_index + 1]
138
- try:
139
- parsed = json.loads(raw_candidate)
140
- found_blocks.append(parsed)
141
- cursor = end_index + 1
142
- continue
143
- except json.JSONDecodeError:
144
- pass
145
 
146
- cursor += 1
147
-
148
- return found_blocks
149
-
150
- # --- 3. Model Configuration ---
151
- REPO_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
152
- FILENAME = "smollm2-1.7b-instruct-q4_k_m.gguf"
153
- N_THREADS = int(os.getenv("CPU_THREADS", "2"))
154
-
155
- llm_model: Optional[Llama] = None
156
-
157
- # --- 4. Lifecycle Manager ---
158
- @asynccontextmanager
159
- async def lifespan(app: FastAPI):
160
- global llm_model
161
- logger.info("--- STARTING SMOLLM2 API ---")
162
- try:
163
- logger.info(f"Downloading {FILENAME}...")
164
- model_path = hf_hub_download(
165
- repo_id=REPO_ID,
166
- filename=FILENAME,
167
- cache_dir=os.getenv("HF_HOME", "/app/cache")
168
- )
169
- logger.info(f"Initializing Engine (Threads: {N_THREADS})...")
170
- llm_model = Llama(
171
- model_path=model_path,
172
- n_ctx=2048,
173
- n_threads=N_THREADS,
174
- n_batch=512,
175
- verbose=False
176
- )
177
- logger.info("SmolLM2 Loaded.")
178
  except Exception as e:
179
- logger.critical(f"Startup Failed: {e}")
180
- raise e
 
181
  yield
182
- if llm_model:
183
- del llm_model
184
- logger.info("Model unloaded.")
185
-
186
- app = FastAPI(title="SmolLM2 API", version="2.1", lifespan=lifespan)
187
-
188
- # --- 5. Data Models ---
189
- class Message(BaseModel):
190
- role: str
191
- content: str
 
 
 
 
192
 
193
- class ChatRequest(BaseModel):
194
- messages: List[Message]
195
- temperature: Optional[float] = 0.6
196
- max_tokens: Optional[int] = 512
197
- stream: Optional[bool] = False
198
- returnJson: Optional[bool] = False
 
 
199
 
200
- # --- 6. Endpoints ---
 
 
201
 
202
  @app.get("/")
203
  async def root():
204
- return {"message": "Welcome to the SmolLM2 API! Use /v1/chat/completions to interact."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  @app.get("/ping")
207
  async def ping():
208
- if llm_model: return {"status": "pong", "ready": True}
209
- return JSONResponse(status_code=503, content={"status": "loading"})
210
-
211
- @app.post("/v1/chat/completions")
212
- async def chat(request: ChatRequest):
213
- if not llm_model:
214
- raise HTTPException(status_code=503, detail="Model loading...")
215
-
216
- # --- VALIDATION: Check for conflicting parameters ---
217
- if request.stream and request.returnJson:
218
- raise HTTPException(
219
- status_code=400,
220
- detail="Conflict: 'stream' and 'returnJson' cannot both be True. Streaming prevents JSON extraction."
221
- )
222
-
223
- # Prepare messages
224
- messages_payload = [m.model_dump() for m in request.messages]
225
 
226
- # --- LOGIC FOR returnJson ---
227
- if request.returnJson:
228
- logger.info("Format Mode: JSON Extraction Active")
229
-
230
- system_prompt = {
231
- "role": "system",
232
- "content": (
233
- "You are a strict JSON generator. "
234
- "Convert the user's input into a valid JSON Array of Objects. "
235
- "Output strictly in markdown code blocks like ```json ... ```. "
236
- "Do not add conversational filler."
237
- )
238
- }
239
- messages_payload.insert(0, system_prompt)
240
-
241
- if messages_payload and messages_payload[-1]['role'] == 'user':
242
- messages_payload[-1]['content'] += "\n\nReturn structured JSON of this content..."
243
-
244
- logger.info(f"Processing request: {len(messages_payload)} msgs | Stream: {request.stream}")
245
-
246
- try:
247
- # Generate Response
248
- response_data = llm_model.create_chat_completion(
249
- messages=messages_payload,
250
- temperature=request.temperature,
251
- max_tokens=request.max_tokens,
252
- stream=request.stream
253
  )
254
-
255
- # --- STREAMING RESPONSE LOGIC ---
256
- if request.stream:
257
- def iter_response():
258
- for chunk in response_data:
259
- yield f"data: {json.dumps(chunk)}\n\n"
260
- yield "data: [DONE]\n\n"
261
-
262
- return StreamingResponse(iter_response(), media_type="text/event-stream")
263
-
264
- # --- STANDARD / JSON RESPONSE LOGIC ---
265
-
266
- if not request.returnJson:
267
- return response_data
268
-
269
- # Custom JSON Extraction Logic
270
- content_text = response_data['choices'][0]['message']['content']
271
- extracted_data = extract_json_from_content(content_text)
272
-
273
- return JSONResponse(content={
274
- "status": "success",
275
- "data": extracted_data
276
- })
277
-
278
- except Exception as e:
279
- logger.error(f"Error: {e}")
280
- raise HTTPException(status_code=500, detail=str(e))
 
 
1
  import logging
 
2
  from contextlib import asynccontextmanager
3
+ from datetime import datetime
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from fastapi.responses import JSONResponse
7
 
8
+ from config import config
9
+ from services.text_service import text_service
10
+ from services.vision_service import vision_service
11
+ from routers import text_router, vision_router
 
12
 
13
+ # Logging Setup
14
  logging.basicConfig(
15
  level=logging.INFO,
16
  format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
17
  )
18
+ logger = logging.getLogger("main")
19
 
20
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifecycle manager.

    Loads both model services before the server starts accepting requests
    and unloads them on shutdown. If either service fails to initialize,
    the exception is re-raised and the application does not start.
    """
    logger.info("=" * 60)
    logger.info("STARTING SMOLLM2 MULTIMODAL API")
    logger.info("=" * 60)

    try:
        # Initialize text service (downloads/loads the GGUF text model).
        logger.info("Initializing Text Service...")
        await text_service.initialize()

        # Initialize vision service (model weights + multimodal projector).
        logger.info("Initializing Vision Service...")
        await vision_service.initialize()

        logger.info("=" * 60)
        logger.info("✓ All services initialized successfully")
        logger.info("=" * 60)
    except Exception as e:
        # Fail fast: a half-initialized API should not serve traffic.
        logger.critical(f"Startup failed: {e}")
        raise

    yield  # --- application serves requests here ---

    # Cleanup: release model memory on shutdown.
    logger.info("Shutting down services...")
    await text_service.cleanup()
    await vision_service.cleanup()
    logger.info("Shutdown complete")
51
+
52
# Create the FastAPI application; `lifespan` handles model load/unload.
app = FastAPI(
    title="SmolLM2 Multimodal API",
    version="3.0",
    description="Production-ready API for SmolLM2 text and vision models",
    lifespan=lifespan
)

# CORS: fully open (any origin/method/header).
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True for credentialed requests — confirm credentials
# are actually needed, otherwise drop allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount feature routers (/v1/text/* and /v1/vision/*).
app.include_router(text_router.router)
app.include_router(vision_router.router)
72
 
73
@app.get("/")
async def root():
    """Root endpoint: returns the API name, version, and key endpoint paths."""
    endpoint_map = {
        "text": "/v1/text/chat/completions",
        "vision": "/v1/vision/analyze",
        "health": "/health",
    }
    return {
        "name": "SmolLM2 Multimodal API",
        "version": "3.0",
        "endpoints": endpoint_map,
        "docs": "/docs",
    }
86
+
87
@app.get("/health")
async def health_check():
    """Comprehensive health check.

    Reports per-service readiness plus a timezone-aware UTC timestamp.
    Always returns 200; use /ping for a probe that returns 503 until ready.
    """
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # datetime; emit an explicit UTC-aware ISO timestamp instead.
    from datetime import timezone

    return {
        "status": "healthy",
        "services": {
            "text": text_service.is_ready(),
            "vision": vision_service.is_ready()
        },
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
98
 
99
@app.get("/ping")
async def ping():
    """Liveness probe: 200 "pong" once both models are loaded, 503 before."""
    if text_service.is_ready() and vision_service.is_ready():
        return {"status": "pong", "ready": True}

    return JSONResponse(
        status_code=503,
        content={"status": "initializing", "ready": False},
    )
111
+
112
# Allow running directly (`python main.py`); in the container the server is
# launched via `python -m uvicorn main:app ...` instead, so this path is for
# local development only.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "main:app",
        host=config.HOST,
        port=config.PORT,
        log_level="info"
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/schemas.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Any
2
+ from pydantic import BaseModel, Field
3
+
4
class Message(BaseModel):
    """A single chat message in the OpenAI-style role/content format."""
    role: str = Field(..., description="Role of the message sender (user/assistant/system)")
    content: str = Field(..., description="Content of the message")

class ChatRequest(BaseModel):
    """Request body for the text chat-completions endpoint."""
    messages: List[Message] = Field(..., description="List of messages in the conversation")
    temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
    max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
    stream: Optional[bool] = Field(False, description="Enable streaming response")
    # NOTE(review): camelCase field kept as-is — it is part of the public API.
    returnJson: Optional[bool] = Field(False, description="Extract and return JSON from response")

class VisionRequest(BaseModel):
    """Parameters for an image-analysis request (image itself is uploaded as a file)."""
    prompt: str = Field(..., description="Text prompt/question about the image")
    temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
    max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")

class ErrorResponse(BaseModel):
    """Generic error payload: short error name plus optional detail."""
    error: str
    detail: Optional[str] = None

class HealthResponse(BaseModel):
    """Health-check payload: overall status plus per-model readiness flags."""
    status: str
    text_model: bool
    vision_model: bool
    timestamp: str
requirements.txt CHANGED
@@ -1,4 +1,7 @@
1
  fastapi>=0.115.0
2
  uvicorn>=0.30.0
3
  pydantic>=2.8.0
4
- huggingface-hub>=0.24.0
 
 
 
 
1
  fastapi>=0.115.0
2
  uvicorn>=0.30.0
3
  pydantic>=2.8.0
4
+ huggingface-hub>=0.24.0
5
+ llama-cpp-python==0.3.2
6
+ python-multipart>=0.0.9
7
+ Pillow>=10.0.0
routers/text_router.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from fastapi.responses import StreamingResponse, JSONResponse
3
+ import logging
4
+
5
+ from models.schemas import ChatRequest, ErrorResponse
6
+ from services.text_service import text_service
7
+
8
+ logger = logging.getLogger("text-router")
9
+
10
+ router = APIRouter(prefix="/v1/text", tags=["Text Generation"])
11
+
12
@router.post("/chat/completions")
async def create_chat_completion(request: ChatRequest):
    """
    Create a chat completion using the text model.

    Supports:
    - Standard completions
    - Streaming responses (server-sent events)
    - JSON extraction mode (``returnJson``)

    Raises:
        HTTPException 503: text model not loaded yet
        HTTPException 400: invalid parameter combination (service ValueError)
        HTTPException 500: unexpected generation failure
    """
    if not text_service.is_ready():
        raise HTTPException(status_code=503, detail="Text model not ready")

    try:
        # Pydantic models -> plain dicts expected by the service layer.
        messages = [msg.model_dump() for msg in request.messages]

        result = await text_service.generate_completion(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream,
            return_json=request.returnJson
        )

        # Streaming mode: result is an async iterator of SSE "data:" lines.
        if request.stream:
            return StreamingResponse(result, media_type="text/event-stream")

        return JSONResponse(content=result)

    except ValueError as e:
        # Service-level validation errors map to 400 Bad Request.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Chat completion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
46
+
47
@router.get("/health")
async def text_health():
    """Report whether the text model has finished loading."""
    ready = text_service.is_ready()
    return {
        "status": "healthy" if ready else "initializing",
        "model_ready": ready,
    }
routers/vision_router.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, File, UploadFile, Form
2
+ from fastapi.responses import JSONResponse
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from models.schemas import VisionRequest, ErrorResponse
7
+ from services.vision_service import vision_service
8
+ from config import config
9
+
10
+ logger = logging.getLogger("vision-router")
11
+
12
+ router = APIRouter(prefix="/v1/vision", tags=["Vision AI"])
13
+
14
@router.post("/analyze")
async def analyze_image(
    image: UploadFile = File(..., description="Image file to analyze"),
    prompt: str = Form(..., description="Question or prompt about the image"),
    temperature: float = Form(0.6, ge=0.0, le=2.0),
    max_tokens: int = Form(512, ge=1, le=4096)
):
    """
    Analyze an image with a text prompt.

    Accepts:
    - Image file (JPEG, PNG, GIF, WebP, BMP)
    - Text prompt/question
    - Optional generation parameters

    Raises:
        HTTPException 503: vision model not loaded yet
        HTTPException 400: missing/invalid file type or file too large
        HTTPException 500: unexpected analysis failure
    """
    if not vision_service.is_ready():
        raise HTTPException(status_code=503, detail="Vision model not ready")

    # Validate file extension. UploadFile.filename may be None on some
    # multipart uploads; treat a missing name as an invalid type instead of
    # crashing with TypeError inside Path(None).
    file_ext = Path(image.filename or "").suffix.lower()
    if file_ext not in config.ALLOWED_IMAGE_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid file type. Allowed: {', '.join(config.ALLOWED_IMAGE_EXTENSIONS)}"
        )

    try:
        # Read the whole upload into memory (bounded by the size check below).
        image_data = await image.read()

        # Enforce the upload size limit.
        if len(image_data) > config.MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Max size: {config.MAX_FILE_SIZE / 1024 / 1024}MB"
            )

        # Delegate the actual inference to the vision service.
        result = await vision_service.analyze_image(
            image_data=image_data,
            prompt=prompt,
            temperature=temperature,
            max_tokens=max_tokens
        )

        return JSONResponse(content=result)

    except HTTPException:
        # Re-raise our own 4xx responses untouched instead of wrapping as 500.
        raise
    except Exception as e:
        logger.error(f"Image analysis error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
66
+
67
@router.get("/health")
async def vision_health():
    """Report whether the vision model has finished loading."""
    ready = vision_service.is_ready()
    return {
        "status": "healthy" if ready else "initializing",
        "model_ready": ready,
    }
services/text_service.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, Dict, Any, List, AsyncIterator
3
+ from llama_cpp import Llama
4
+ from huggingface_hub import hf_hub_download
5
+ import json
6
+
7
+ from config import config
8
+ from utils.json_extractor import extract_json_from_content
9
+
10
+ logger = logging.getLogger("text-service")
11
+
12
class TextService:
    """Service wrapping the GGUF text model (llama-cpp) behind an async API.

    The model is loaded lazily via :meth:`initialize`; all entry points check
    :meth:`is_ready` first and raise rather than serving with no model.
    """

    def __init__(self):
        # Populated by initialize(); None means "not ready".
        self.model: Optional[Llama] = None

    async def initialize(self) -> None:
        """Download (if needed) and load the text model.

        Raises:
            Exception: whatever hf_hub_download / Llama raise on failure
                (logged, then re-raised so startup aborts).
        """
        try:
            logger.info(f"Downloading text model: {config.TEXT_MODEL_FILE}...")
            model_path = hf_hub_download(
                repo_id=config.TEXT_MODEL_REPO,
                filename=config.TEXT_MODEL_FILE,
                cache_dir=config.HF_HOME
            )

            logger.info(f"Loading text model (Threads: {config.N_THREADS})...")
            self.model = Llama(
                model_path=model_path,
                n_ctx=config.TEXT_MODEL_CTX,
                n_threads=config.N_THREADS,
                n_batch=config.TEXT_MODEL_BATCH,
                verbose=False
            )
            logger.info("✓ Text model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to initialize text model: {e}")
            raise

    def is_ready(self) -> bool:
        """Return True once the model has been loaded."""
        return self.model is not None

    async def generate_completion(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.6,
        max_tokens: int = 512,
        stream: bool = False,
        return_json: bool = False
    ) -> Any:
        """
        Generate text completion.

        Args:
            messages: List of message dictionaries with 'role' and 'content'
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            return_json: Whether to extract JSON from response

        Returns:
            Generated completion (dict), an SSE async iterator when
            ``stream`` is True, or a {"status", "data", "raw_content"}
            dict when ``return_json`` is True.

        Raises:
            RuntimeError: if called before initialize() completed.
            ValueError: if both ``stream`` and ``return_json`` are requested.
        """
        if not self.is_ready():
            raise RuntimeError("Text model not initialized")

        # Validate conflicting parameters: streaming chunks cannot be
        # post-processed for JSON extraction.
        if stream and return_json:
            raise ValueError("Cannot use both 'stream' and 'return_json' simultaneously")

        # Prepare messages for JSON extraction mode.
        if return_json:
            system_prompt = {
                "role": "system",
                "content": (
                    "You are a strict JSON generator. "
                    "Convert the user's input into valid JSON format. "
                    "Output strictly in markdown code blocks like ```json ... ```. "
                    "Do not add conversational filler."
                )
            }
            messages = [system_prompt] + messages

            if messages[-1]['role'] == 'user':
                # Copy the last message before appending the instruction so the
                # caller's dict is never mutated (the += previously aliased it).
                last = dict(messages[-1])
                last['content'] += "\n\nReturn structured JSON of this content."
                messages[-1] = last

        logger.info(f"Generating completion: {len(messages)} messages | Stream: {stream}")

        try:
            response = self.model.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                stream=stream
            )

            # Handle streaming response.
            if stream:
                return self._create_stream_iterator(response)

            # Handle JSON extraction.
            if return_json:
                content_text = response['choices'][0]['message']['content']
                extracted_data = extract_json_from_content(content_text)
                return {
                    "status": "success",
                    "data": extracted_data,
                    "raw_content": content_text
                }

            return response

        except Exception as e:
            logger.error(f"Error generating completion: {e}")
            raise

    async def _create_stream_iterator(self, response_stream) -> AsyncIterator[str]:
        """Wrap llama-cpp's chunk generator as SSE 'data:' lines.

        NOTE(review): the underlying generator is synchronous, so each chunk
        blocks the event loop while the model computes — acceptable for a
        single-worker deployment, but confirm before adding concurrency.
        """
        for chunk in response_stream:
            yield f"data: {json.dumps(chunk)}\n\n"
        yield "data: [DONE]\n\n"

    async def cleanup(self) -> None:
        """Release model resources; safe to call when nothing is loaded."""
        if self.model:
            del self.model
            self.model = None
            logger.info("Text model unloaded")

# Global instance shared by routers and main.py.
text_service = TextService()
services/vision_service.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import base64
3
+ import io
4
+ from typing import Optional, Dict, Any
5
+ from pathlib import Path
6
+ from llama_cpp import Llama
7
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
8
+ from huggingface_hub import hf_hub_download
9
+ from PIL import Image
10
+
11
+ from config import config
12
+
13
+ logger = logging.getLogger("vision-service")
14
+
15
class VisionService:
    """Service for vision-language model interactions.

    Wraps a llama-cpp model plus a LLaVA-style chat handler (loaded from the
    multimodal projector file) so images can be analyzed with text prompts.
    """

    def __init__(self):
        # Both are populated by initialize(); None means "not ready".
        self.model: Optional[Llama] = None
        self.chat_handler: Optional[Llava15ChatHandler] = None

    async def initialize(self) -> None:
        """Download (if needed) and load the vision model and its projector.

        Raises:
            Exception: whatever hf_hub_download / Llava15ChatHandler / Llama
                raise on failure (logged, then re-raised).
        """
        try:
            logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
            model_path = hf_hub_download(
                repo_id=config.VISION_MODEL_REPO,
                filename=config.VISION_MODEL_FILE,
                cache_dir=config.HF_HOME
            )

            logger.info(f"Downloading vision projector: {config.VISION_MMPROJ_FILE}...")
            mmproj_path = hf_hub_download(
                repo_id=config.VISION_MODEL_REPO,
                filename=config.VISION_MMPROJ_FILE,
                cache_dir=config.HF_HOME
            )

            logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")

            # Initialize chat handler with multimodal projection weights.
            self.chat_handler = Llava15ChatHandler(
                clip_model_path=mmproj_path,
                verbose=False
            )

            # NOTE(review): logits_all=True presumably required by the chat
            # handler path — confirm; it increases memory use per context.
            self.model = Llama(
                model_path=model_path,
                chat_handler=self.chat_handler,
                n_ctx=config.VISION_MODEL_CTX,
                n_threads=config.N_THREADS,
                n_batch=config.VISION_MODEL_BATCH,
                logits_all=True,
                verbose=False
            )
            logger.info("✓ Vision model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to initialize vision model: {e}")
            raise

    def is_ready(self) -> bool:
        """Return True once both the model and the chat handler are loaded."""
        return self.model is not None and self.chat_handler is not None

    async def analyze_image(
        self,
        image_data: bytes,
        prompt: str,
        temperature: float = 0.6,
        max_tokens: int = 512
    ) -> Dict[str, Any]:
        """
        Analyze an image with a text prompt.

        Args:
            image_data: Raw image bytes
            prompt: Text question/prompt about the image
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Dict with status, image_info (size/format/mode), the prompt,
            the model's text response, and the usage stats if present.

        Raises:
            RuntimeError: if called before initialize() completed.
            Exception: propagated from PIL or the model (logged first).
        """
        if not self.is_ready():
            raise RuntimeError("Vision model not initialized")

        try:
            # Convert image bytes to base64 for embedding in a data URI.
            image_b64 = base64.b64encode(image_data).decode('utf-8')

            # Validate the payload by letting PIL parse the header
            # (raises on non-image data).
            image = Image.open(io.BytesIO(image_data))
            logger.info(f"Processing image: {image.size} | Format: {image.format}")

            # OpenAI-style multimodal message: image part first, then the text.
            # NOTE(review): the data URI always declares image/jpeg even for
            # PNG/GIF/WebP uploads — confirm the handler ignores the MIME type.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
                        {"type": "text", "text": prompt}
                    ]
                }
            ]

            logger.info(f"Analyzing image with prompt: {prompt[:50]}...")

            response = self.model.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens
            )

            return {
                "status": "success",
                "image_info": {
                    "size": list(image.size),
                    "format": image.format,
                    "mode": image.mode
                },
                "prompt": prompt,
                "response": response['choices'][0]['message']['content'],
                "usage": response.get('usage', {})
            }

        except Exception as e:
            logger.error(f"Error analyzing image: {e}")
            raise

    async def cleanup(self) -> None:
        """Release model and handler resources; safe to call when unloaded."""
        if self.model:
            del self.model
            self.model = None
        if self.chat_handler:
            del self.chat_handler
            self.chat_handler = None
        logger.info("Vision model unloaded")

# Global instance shared by routers and main.py.
vision_service = VisionService()
utils/json_extractor.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from typing import List, Any
4
+
5
+ logger = logging.getLogger("json-extractor")
6
+
7
def find_balanced_closing_index(text: str, start_index: int) -> int:
    """
    Finds the matching closing bracket for the bracket at start_index.
    Ignores brackets inside strings and comments.

    Args:
        text: The text to scan.
        start_index: Index of an opening '{' or '[' in ``text``.

    Returns:
        Index of the matching '}' / ']', or -1 if the text ends before
        the bracket is balanced.
    """
    start_char = text[start_index]
    # Anything that is not '{' is assumed to be '[' (callers only pass these).
    end_char = '}' if start_char == '{' else ']'

    # Scanner state: bracket depth plus which string/comment context we are in.
    depth = 0
    in_double_quote = False
    in_single_quote = False
    in_backtick = False
    in_line_comment = False
    in_block_comment = False
    is_escaped = False

    length = len(text)
    i = start_index

    while i < length:
        char = text[i]
        next_char = text[i+1] if i + 1 < length else ''

        # Handle Escaping: the character right after a backslash never
        # changes scanner state (e.g. \" inside a string).
        if is_escaped:
            is_escaped = False
            i += 1
            continue
        if char == '\\' and not in_line_comment and not in_block_comment:
            is_escaped = True
            i += 1
            continue

        # Handle Comments (JS-style // and /* */, tolerated in model output)
        if in_line_comment:
            if char == '\n': in_line_comment = False
            i += 1
            continue
        if in_block_comment:
            if char == '*' and next_char == '/':
                in_block_comment = False
                i += 2
                continue
            i += 1
            continue

        # Check comment starts (only meaningful outside any string)
        if not in_double_quote and not in_single_quote and not in_backtick:
            if char == '/' and next_char == '/':
                in_line_comment = True
                i += 2
                continue
            if char == '/' and next_char == '*':
                in_block_comment = True
                i += 2
                continue

        # Handle Strings: inside a string only its own closing quote matters.
        if in_double_quote:
            if char == '"': in_double_quote = False
            i += 1
            continue
        if in_single_quote:
            if char == "'": in_single_quote = False
            i += 1
            continue
        if in_backtick:
            if char == '`': in_backtick = False
            i += 1
            continue

        # String openers.
        if char == '"':
            in_double_quote = True
            i += 1
            continue
        if char == "'":
            in_single_quote = True
            i += 1
            continue
        if char == '`':
            in_backtick = True
            i += 1
            continue

        # Handle Bracket Counting (only reached in plain, uncommented,
        # unquoted context).
        if char == start_char:
            depth += 1
        elif char == end_char:
            depth -= 1
            if depth == 0:
                return i

        i += 1

    return -1
102
+
103
def extract_json_from_content(content: str) -> List[Any]:
    """
    Scans text for JSON objects/arrays using state machine logic.

    Returns every parseable top-level JSON object or array found in
    ``content``, in order of appearance; non-string or empty input
    yields an empty list.
    """
    if not content or not isinstance(content, str):
        return []

    results: List[Any] = []
    pos = 0
    text_len = len(content)

    while pos < text_len:
        # Only '{' or '[' can open a JSON block; skip everything else.
        if content[pos] != '{' and content[pos] != '[':
            pos += 1
            continue

        close = find_balanced_closing_index(content, pos)
        if close == -1:
            # Unbalanced opener: step past it and keep scanning.
            pos += 1
            continue

        candidate = content[pos:close + 1]
        try:
            results.append(json.loads(candidate))
        except json.JSONDecodeError:
            # Balanced but not valid JSON — advance one char and retry.
            pos += 1
        else:
            pos = close + 1

    return results