Soumik Bose committed on
Commit
95db209
·
1 Parent(s): 63d026b
Dockerfile CHANGED
@@ -4,55 +4,37 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
4
  PYTHONUNBUFFERED=1 \
5
  PORT=7860 \
6
  HF_HOME=/app/cache \
7
- CPU_THREADS=2 \
8
  PATH="/home/user/.local/bin:${PATH}"
9
 
10
  WORKDIR /app
11
 
12
- # Install system dependencies
13
  RUN apt-get update && apt-get install -y \
14
  build-essential \
15
  cmake \
16
  curl \
17
  git \
18
- libgomp1 \
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
- # Create non-root user
22
- RUN useradd -m -u 1000 user && \
23
- mkdir -p /app/cache /app/models && \
24
- chown -R user:user /app
25
 
26
- # Upgrade pip as root
27
  RUN pip install --no-cache-dir --upgrade pip
28
 
29
- # Switch to non-root user
30
  USER user
31
 
32
- # Install llama-cpp-python with optimized build flags
33
- RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_NATIVE=OFF -DGGML_AVX2=ON" \
34
- pip install --no-cache-dir --user llama-cpp-python==0.3.16
35
 
36
- # Copy requirements and install dependencies
37
  COPY --chown=user:user requirements.txt .
38
- RUN pip install --no-cache-dir --user -r requirements.txt
39
 
40
- # Copy application structure
41
- COPY --chown=user:user config.py .
42
  COPY --chown=user:user main.py .
43
- COPY --chown=user:user models/ ./models/
44
- COPY --chown=user:user services/ ./services/
45
- COPY --chown=user:user routers/ ./routers/
46
- COPY --chown=user:user utils/ ./utils/
47
-
48
- # Create __init__.py files if they don't exist
49
- RUN touch models/__init__.py services/__init__.py routers/__init__.py utils/__init__.py
50
-
51
- # Health check
52
- HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
53
- CMD curl -f http://localhost:7860/ping || exit 1
54
 
55
  EXPOSE 7860
56
-
57
- # Production startup with keep-alive and graceful shutdown
58
- CMD ["bash", "-c", "while true; do curl -s https://xce009-ai-chat-api.hf.space/ping > /dev/null 2>&1 || true; sleep 300; done & exec python -m uvicorn main:app --host 0.0.0.0 --port 7860 --log-level info"]
 
4
  PYTHONUNBUFFERED=1 \
5
  PORT=7860 \
6
  HF_HOME=/app/cache \
 
7
  PATH="/home/user/.local/bin:${PATH}"
8
 
9
  WORKDIR /app
10
 
11
+ # Install build dependencies
12
  RUN apt-get update && apt-get install -y \
13
  build-essential \
14
  cmake \
15
  curl \
16
  git \
 
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
+ # Create user
20
+ RUN useradd -m -u 1000 user
21
+ RUN mkdir -p /app/cache /app/models && chown -R user:user /app
 
22
 
23
+ # Upgrade pip as root
24
  RUN pip install --no-cache-dir --upgrade pip
25
 
 
26
  USER user
27
 
28
+ # Build and install llama-cpp-python with proper flags
29
+ RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_NATIVE=OFF" \
30
+ pip install --no-cache-dir llama-cpp-python==0.3.2
31
 
32
+ # Install other dependencies
33
  COPY --chown=user:user requirements.txt .
34
+ RUN pip install --no-cache-dir -r requirements.txt
35
 
36
+ # Copy app
 
37
  COPY --chown=user:user main.py .
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  EXPOSE 7860
40
+ CMD ["bash", "-c", "while true; do curl -s https://xce009-ai-chat-api.hf.space/ping > /dev/null || true; sleep 300; done & python -m uvicorn main:app --host 0.0.0.0 --port 7860"]
 
 
config.py DELETED
@@ -1,38 +0,0 @@
1
- import os
2
- from typing import Optional
3
-
4
- class Config:
5
- """Centralized configuration for the SmolLM API"""
6
-
7
- # Server Configuration
8
- PORT: int = int(os.getenv("PORT", "7860"))
9
- HOST: str = "0.0.0.0"
10
-
11
- # Cache Configuration
12
- HF_HOME: str = os.getenv("HF_HOME", "/app/cache")
13
-
14
- # CPU Configuration
15
- N_THREADS: int = int(os.getenv("CPU_THREADS", "2"))
16
-
17
- # Text Model Configuration
18
- TEXT_MODEL_REPO: str = "bartowski/SmolLM2-1.7B-Instruct-GGUF"
19
- TEXT_MODEL_FILE: str = "SmolLM2-1.7B-Instruct-Q4_K_M.gguf"
20
- TEXT_MODEL_CTX: int = 2048
21
- TEXT_MODEL_BATCH: int = 512
22
-
23
- # Vision Model Configuration
24
- VISION_MODEL_REPO: str = "ggml-org/SmolVLM-500M-Instruct-GGUF"
25
- VISION_MODEL_FILE: str = "SmolVLM-500M-Instruct-Q8_0.gguf"
26
- VISION_MMPROJ_FILE: str = "mmproj-SmolVLM-500M-Instruct-f16.gguf"
27
- VISION_MODEL_CTX: int = 2048
28
- VISION_MODEL_BATCH: int = 512
29
-
30
- # Default Generation Parameters
31
- DEFAULT_TEMPERATURE: float = 0.6
32
- DEFAULT_MAX_TOKENS: int = 512
33
-
34
- # File Upload Configuration
35
- MAX_FILE_SIZE: int = 10 * 1024 * 1024 # 10MB
36
- ALLOWED_IMAGE_EXTENSIONS: set = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
37
-
38
- config = Config()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -1,119 +1,280 @@
 
1
  import logging
 
2
  from contextlib import asynccontextmanager
3
- from datetime import datetime
4
- from fastapi import FastAPI
5
- from fastapi.middleware.cors import CORSMiddleware
6
- from fastapi.responses import JSONResponse
7
 
8
- from config import config
9
- from services.text_service import text_service
10
- from services.vision_service import vision_service
11
- from routers import text_router, vision_router
 
12
 
13
- # Logging Setup
14
  logging.basicConfig(
15
  level=logging.INFO,
16
  format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
17
  )
18
- logger = logging.getLogger("main")
19
 
20
- @asynccontextmanager
21
- async def lifespan(app: FastAPI):
22
- """Application lifecycle manager"""
23
- logger.info("=" * 60)
24
- logger.info("STARTING SMOLLM2 MULTIMODAL API")
25
- logger.info("=" * 60)
 
 
 
26
 
27
- try:
28
- # Initialize text service
29
- logger.info("Initializing Text Service...")
30
- await text_service.initialize()
 
 
 
 
 
 
 
 
 
 
31
 
32
- # Initialize vision service
33
- logger.info("Initializing Vision Service...")
34
- await vision_service.initialize()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- logger.info("=" * 60)
37
- logger.info("✓ All services initialized successfully")
38
- logger.info("=" * 60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  except Exception as e:
41
- logger.critical(f"Startup failed: {e}")
42
- raise
43
-
44
  yield
45
-
46
- # Cleanup
47
- logger.info("Shutting down services...")
48
- await text_service.cleanup()
49
- await vision_service.cleanup()
50
- logger.info("Shutdown complete")
51
-
52
- # Create FastAPI application
53
- app = FastAPI(
54
- title="SmolLM2 Multimodal API",
55
- version="3.0",
56
- description="Production-ready API for SmolLM2 text and vision models",
57
- lifespan=lifespan
58
- )
59
 
60
- # Add CORS middleware
61
- app.add_middleware(
62
- CORSMiddleware,
63
- allow_origins=["*"],
64
- allow_credentials=True,
65
- allow_methods=["*"],
66
- allow_headers=["*"],
67
- )
 
 
 
 
 
68
 
69
- # Include routers
70
- app.include_router(text_router.router)
71
- app.include_router(vision_router.router)
72
 
73
  @app.get("/")
74
  async def root():
75
- """Root endpoint with API information"""
76
- return {
77
- "name": "SmolLM2 Multimodal API",
78
- "version": "3.0",
79
- "endpoints": {
80
- "text": "/v1/text/chat/completions",
81
- "vision": "/v1/vision/analyze",
82
- "health": "/health"
83
- },
84
- "docs": "/docs"
85
- }
86
-
87
- @app.get("/health")
88
- async def health_check():
89
- """Comprehensive health check"""
90
- return {
91
- "status": "healthy",
92
- "services": {
93
- "text": text_service.is_ready(),
94
- "vision": vision_service.is_ready()
95
- },
96
- "timestamp": datetime.utcnow().isoformat()
97
- }
98
 
99
  @app.get("/ping")
100
  async def ping():
101
- """Simple ping endpoint"""
102
- all_ready = text_service.is_ready() and vision_service.is_ready()
103
-
104
- if not all_ready:
105
- return JSONResponse(
106
- status_code=503,
107
- content={"status": "initializing", "ready": False}
 
 
 
 
 
 
108
  )
 
 
 
109
 
110
- return {"status": "pong", "ready": True}
111
-
112
- if __name__ == "__main__":
113
- import uvicorn
114
- uvicorn.run(
115
- "main:app",
116
- host=config.HOST,
117
- port=config.PORT,
118
- log_level="info"
119
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import logging
3
+ import json
4
  from contextlib import asynccontextmanager
5
+ from typing import List, Optional, Any
 
 
 
6
 
7
+ from fastapi import FastAPI, HTTPException
8
+ from fastapi.responses import JSONResponse, StreamingResponse
9
+ from pydantic import BaseModel
10
+ from llama_cpp import Llama
11
+ from huggingface_hub import hf_hub_download
12
 
13
+ # --- 1. Logging Setup ---
14
  logging.basicConfig(
15
  level=logging.INFO,
16
  format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
17
  )
18
+ logger = logging.getLogger("SmolLM-API")
19
 
20
+ # --- 2. Helper Functions (Previously in utils/json_extractor.py) ---
21
+
22
def find_balanced_closing_index(text: str, start_index: int) -> int:
    """Return the index of the bracket that closes the one at ``start_index``.

    The scan is tolerant of JavaScript-flavoured input: brackets that appear
    inside double-quoted, single-quoted or backtick strings, or inside
    ``//`` line comments and ``/* */`` block comments, are not counted.
    Only brackets of the same kind as the opener are tracked.

    Args:
        text: The text to scan.
        start_index: Position of the opening ``{`` or ``[`` in ``text``.

    Returns:
        The index of the matching closing bracket, or ``-1`` when the
        bracket is never closed, ``start_index`` is out of range, or the
        character at ``start_index`` is not an opening bracket.
    """
    length = len(text)
    if not 0 <= start_index < length:
        return -1

    closers = {'{': '}', '[': ']'}
    start_char = text[start_index]
    end_char = closers.get(start_char)
    if end_char is None:
        # Not an opening bracket: there is nothing to balance.
        return -1

    depth = 0
    quote = ''  # active string delimiter, '' while outside any string
    in_line_comment = False
    in_block_comment = False

    i = start_index
    while i < length:
        char = text[i]
        next_char = text[i + 1] if i + 1 < length else ''

        # Comments swallow everything (including backslashes) until they end.
        if in_line_comment:
            if char == '\n':
                in_line_comment = False
            i += 1
            continue
        if in_block_comment:
            if char == '*' and next_char == '/':
                in_block_comment = False
                i += 2
            else:
                i += 1
            continue

        # A backslash outside a comment escapes the following character,
        # so skip both in one step.
        if char == '\\':
            i += 2
            continue

        # Inside a string only the matching delimiter is significant.
        if quote:
            if char == quote:
                quote = ''
            i += 1
            continue

        # Comment openers are only recognised outside strings.
        if char == '/' and next_char == '/':
            in_line_comment = True
            i += 2
            continue
        if char == '/' and next_char == '*':
            in_block_comment = True
            i += 2
            continue

        if char in ('"', "'", '`'):
            quote = char
            i += 1
            continue

        if char == start_char:
            depth += 1
        elif char == end_char:
            depth -= 1
            if depth == 0:
                return i  # Found matching close

        i += 1

    return -1
117
+
118
def extract_json_from_content(content: str) -> List[Any]:
    """Collect every parseable JSON object or array embedded in ``content``.

    The text is walked left to right. At each ``{`` or ``[`` the balanced
    closing bracket is located with ``find_balanced_closing_index`` and the
    enclosed slice is handed to ``json.loads``. Slices that parse are
    collected and skipped over; slices that do not parse are abandoned and
    scanning resumes one character further on.

    Returns:
        The parsed values in order of appearance; an empty list when
        ``content`` is empty or not a string.
    """
    if not isinstance(content, str) or not content:
        return []

    results = []
    total = len(content)
    pos = 0

    while pos < total:
        resume_at = pos + 1  # default: advance by a single character

        if content[pos] in '{[':
            close_at = find_balanced_closing_index(content, pos)
            if close_at != -1:
                try:
                    value = json.loads(content[pos:close_at + 1])
                except json.JSONDecodeError:
                    pass
                else:
                    results.append(value)
                    resume_at = close_at + 1  # jump past the consumed block

        pos = resume_at

    return results
149
+
150
+ # --- 3. Model Configuration ---
151
+ REPO_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
152
+ FILENAME = "smollm2-1.7b-instruct-q4_k_m.gguf"
153
+ N_THREADS = int(os.getenv("CPU_THREADS", "2"))
154
+
155
+ llm_model: Optional[Llama] = None
156
+
157
+ # --- 4. Lifecycle Manager ---
158
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the GGUF model before serving and release it on shutdown.

    On startup the model file is fetched with ``hf_hub_download`` (cached
    under ``HF_HOME``, default ``/app/cache``) and loaded into the
    module-level ``llm_model``. Any failure is logged and re-raised so the
    application does not start without a model.

    On shutdown ``llm_model`` is rebound to ``None`` rather than deleted:
    ``del`` on a global removes the name itself, which would turn every
    later ``if llm_model`` check into a ``NameError``.
    """
    global llm_model
    logger.info("--- STARTING SMOLLM2 API ---")
    try:
        logger.info(f"Downloading {FILENAME}...")
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=os.getenv("HF_HOME", "/app/cache")
        )
        logger.info(f"Initializing Engine (Threads: {N_THREADS})...")
        llm_model = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_threads=N_THREADS,
            n_batch=512,
            verbose=False
        )
        logger.info("SmolLM2 Loaded.")
    except Exception as e:
        logger.critical(f"Startup Failed: {e}")
        raise  # bare raise keeps the original traceback

    yield

    if llm_model:
        # Drop the reference but keep the global name bound.
        llm_model = None
        logger.info("Model unloaded.")
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ app = FastAPI(title="SmolLM2 API", version="2.1", lifespan=lifespan)
187
+
188
+ # --- 5. Data Models ---
189
# One turn of a chat conversation.
class Message(BaseModel):
    role: str  # speaker of the turn; the chat endpoint compares it against 'user'
    content: str  # text of the turn
192
+
193
# Request body of the chat completion endpoint.
class ChatRequest(BaseModel):
    messages: List[Message]  # conversation turns, in order
    temperature: Optional[float] = 0.6  # sampling temperature handed to the model
    max_tokens: Optional[int] = 512  # generation limit handed to the model
    stream: Optional[bool] = False  # True -> reply is sent as a text/event-stream
    returnJson: Optional[bool] = False  # True -> reply carries JSON extracted from the model output
199
 
200
+ # --- 6. Endpoints ---
 
 
201
 
202
@app.get("/")
async def root():
    # Landing route: a static greeting that points callers at the chat endpoint.
    greeting = "Welcome to the SmolLM2 API! Use /v1/chat/completions to interact."
    return {"message": greeting}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
@app.get("/ping")
async def ping():
    # Readiness probe: 503 until the model object exists, then a pong payload.
    if not llm_model:
        return JSONResponse(status_code=503, content={"status": "loading"})
    return {"status": "pong", "ready": True}
210
+
211
@app.post("/v1/chat/completions")
async def chat(request: ChatRequest):
    """Run a chat completion against the loaded model.

    Three reply shapes are produced:

    * ``stream=True``: a ``text/event-stream`` of ``data: <json>`` chunks
      terminated by ``data: [DONE]``.
    * ``returnJson=True``: a system prompt and a trailing instruction are
      added to the conversation, and the reply is
      ``{"status": "success", "data": [...]}`` holding whatever JSON values
      could be extracted from the model output.
    * otherwise: the completion dict returned by the model, unchanged.

    Raises:
        HTTPException: 503 while the model is not loaded, 400 when
            ``stream`` and ``returnJson`` are both set, 500 when generation
            or extraction fails.
    """
    if not llm_model:
        raise HTTPException(status_code=503, detail="Model loading...")

    # --- VALIDATION: Check for conflicting parameters ---
    if request.stream and request.returnJson:
        raise HTTPException(
            status_code=400,
            detail="Conflict: 'stream' and 'returnJson' cannot both be True. Streaming prevents JSON extraction."
        )

    # model_dump() yields fresh dicts, so the edits below never touch the request.
    messages_payload = [m.model_dump() for m in request.messages]

    # --- LOGIC FOR returnJson ---
    if request.returnJson:
        logger.info("Format Mode: JSON Extraction Active")

        system_prompt = {
            "role": "system",
            "content": (
                "You are a strict JSON generator. "
                "Convert the user's input into a valid JSON Array of Objects. "
                "Output strictly in markdown code blocks like ```json ... ```. "
                "Do not add conversational filler."
            )
        }
        messages_payload.insert(0, system_prompt)

        # Reinforce the instruction on the final turn when it is the user's.
        if messages_payload and messages_payload[-1]['role'] == 'user':
            messages_payload[-1]['content'] += "\n\nReturn structured JSON of this content..."

    logger.info(f"Processing request: {len(messages_payload)} msgs | Stream: {request.stream}")

    try:
        response_data = llm_model.create_chat_completion(
            messages=messages_payload,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream
        )

        # --- STREAMING RESPONSE LOGIC ---
        if request.stream:
            def iter_response():
                for chunk in response_data:
                    yield f"data: {json.dumps(chunk)}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(iter_response(), media_type="text/event-stream")

        # --- STANDARD / JSON RESPONSE LOGIC ---
        if not request.returnJson:
            return response_data

        # Custom JSON Extraction Logic
        content_text = response_data['choices'][0]['message']['content']
        extracted_data = extract_json_from_content(content_text)

        return JSONResponse(content={
            "status": "success",
            "data": extracted_data
        })

    except Exception as e:
        # logger.exception records the traceback; chaining keeps the cause
        # attached to the HTTP error for debugging.
        logger.exception("Error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
models/schemas.py DELETED
@@ -1,28 +0,0 @@
1
- from typing import List, Optional, Any
2
- from pydantic import BaseModel, Field
3
-
4
- class Message(BaseModel):
5
- role: str = Field(..., description="Role of the message sender (user/assistant/system)")
6
- content: str = Field(..., description="Content of the message")
7
-
8
- class ChatRequest(BaseModel):
9
- messages: List[Message] = Field(..., description="List of messages in the conversation")
10
- temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
11
- max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
12
- stream: Optional[bool] = Field(False, description="Enable streaming response")
13
- returnJson: Optional[bool] = Field(False, description="Extract and return JSON from response")
14
-
15
- class VisionRequest(BaseModel):
16
- prompt: str = Field(..., description="Text prompt/question about the image")
17
- temperature: Optional[float] = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
18
- max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
19
-
20
- class ErrorResponse(BaseModel):
21
- error: str
22
- detail: Optional[str] = None
23
-
24
- class HealthResponse(BaseModel):
25
- status: str
26
- text_model: bool
27
- vision_model: bool
28
- timestamp: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
routers/text_router.py DELETED
@@ -1,53 +0,0 @@
1
- from fastapi import APIRouter, HTTPException
2
- from fastapi.responses import StreamingResponse, JSONResponse
3
- import logging
4
-
5
- from models.schemas import ChatRequest, ErrorResponse
6
- from services.text_service import text_service
7
-
8
- logger = logging.getLogger("text-router")
9
-
10
- router = APIRouter(prefix="/v1/text", tags=["Text Generation"])
11
-
12
- @router.post("/chat/completions")
13
- async def create_chat_completion(request: ChatRequest):
14
- """
15
- Create a chat completion using the text model
16
-
17
- Supports:
18
- - Standard completions
19
- - Streaming responses
20
- - JSON extraction mode
21
- """
22
- if not text_service.is_ready():
23
- raise HTTPException(status_code=503, detail="Text model not ready")
24
-
25
- try:
26
- messages = [msg.model_dump() for msg in request.messages]
27
-
28
- result = await text_service.generate_completion(
29
- messages=messages,
30
- temperature=request.temperature,
31
- max_tokens=request.max_tokens,
32
- stream=request.stream,
33
- return_json=request.returnJson
34
- )
35
-
36
- if request.stream:
37
- return StreamingResponse(result, media_type="text/event-stream")
38
-
39
- return JSONResponse(content=result)
40
-
41
- except ValueError as e:
42
- raise HTTPException(status_code=400, detail=str(e))
43
- except Exception as e:
44
- logger.error(f"Chat completion error: {e}")
45
- raise HTTPException(status_code=500, detail=str(e))
46
-
47
- @router.get("/health")
48
- async def text_health():
49
- """Check text model health status"""
50
- return {
51
- "status": "healthy" if text_service.is_ready() else "initializing",
52
- "model_ready": text_service.is_ready()
53
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
routers/vision_router.py DELETED
@@ -1,73 +0,0 @@
1
- from fastapi import APIRouter, HTTPException, File, UploadFile, Form
2
- from fastapi.responses import JSONResponse
3
- import logging
4
- from pathlib import Path
5
-
6
- from models.schemas import VisionRequest, ErrorResponse
7
- from services.vision_service import vision_service
8
- from config import config
9
-
10
- logger = logging.getLogger("vision-router")
11
-
12
- router = APIRouter(prefix="/v1/vision", tags=["Vision AI"])
13
-
14
- @router.post("/analyze")
15
- async def analyze_image(
16
- image: UploadFile = File(..., description="Image file to analyze"),
17
- prompt: str = Form(..., description="Question or prompt about the image"),
18
- temperature: float = Form(0.6, ge=0.0, le=2.0),
19
- max_tokens: int = Form(512, ge=1, le=4096)
20
- ):
21
- """
22
- Analyze an image with a text prompt
23
-
24
- Accepts:
25
- - Image file (JPEG, PNG, GIF, WebP, BMP)
26
- - Text prompt/question
27
- - Optional generation parameters
28
- """
29
- if not vision_service.is_ready():
30
- raise HTTPException(status_code=503, detail="Vision model not ready")
31
-
32
- # Validate file extension
33
- file_ext = Path(image.filename).suffix.lower()
34
- if file_ext not in config.ALLOWED_IMAGE_EXTENSIONS:
35
- raise HTTPException(
36
- status_code=400,
37
- detail=f"Invalid file type. Allowed: {', '.join(config.ALLOWED_IMAGE_EXTENSIONS)}"
38
- )
39
-
40
- try:
41
- # Read image data
42
- image_data = await image.read()
43
-
44
- # Check file size
45
- if len(image_data) > config.MAX_FILE_SIZE:
46
- raise HTTPException(
47
- status_code=400,
48
- detail=f"File too large. Max size: {config.MAX_FILE_SIZE / 1024 / 1024}MB"
49
- )
50
-
51
- # Analyze image
52
- result = await vision_service.analyze_image(
53
- image_data=image_data,
54
- prompt=prompt,
55
- temperature=temperature,
56
- max_tokens=max_tokens
57
- )
58
-
59
- return JSONResponse(content=result)
60
-
61
- except HTTPException:
62
- raise
63
- except Exception as e:
64
- logger.error(f"Image analysis error: {e}")
65
- raise HTTPException(status_code=500, detail=str(e))
66
-
67
- @router.get("/health")
68
- async def vision_health():
69
- """Check vision model health status"""
70
- return {
71
- "status": "healthy" if vision_service.is_ready() else "initializing",
72
- "model_ready": vision_service.is_ready()
73
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
services/text_service.py DELETED
@@ -1,134 +0,0 @@
1
- import logging
2
- from typing import Optional, Dict, Any, List, AsyncIterator
3
- from llama_cpp import Llama
4
- from huggingface_hub import hf_hub_download
5
- import json
6
-
7
- from config import config
8
- from utils.json_extractor import extract_json_from_content
9
-
10
- logger = logging.getLogger("text-service")
11
-
12
- class TextService:
13
- """Service for text-based language model interactions"""
14
-
15
- def __init__(self):
16
- self.model: Optional[Llama] = None
17
-
18
- async def initialize(self) -> None:
19
- """Initialize the text model"""
20
- try:
21
- logger.info(f"Downloading text model: {config.TEXT_MODEL_FILE}...")
22
- model_path = hf_hub_download(
23
- repo_id=config.TEXT_MODEL_REPO,
24
- filename=config.TEXT_MODEL_FILE,
25
- cache_dir=config.HF_HOME
26
- )
27
-
28
- logger.info(f"Loading text model (Threads: {config.N_THREADS})...")
29
- self.model = Llama(
30
- model_path=model_path,
31
- n_ctx=config.TEXT_MODEL_CTX,
32
- n_threads=config.N_THREADS,
33
- n_batch=config.TEXT_MODEL_BATCH,
34
- verbose=False
35
- )
36
- logger.info("✓ Text model loaded successfully")
37
-
38
- except Exception as e:
39
- logger.error(f"Failed to initialize text model: {e}")
40
- raise
41
-
42
- def is_ready(self) -> bool:
43
- """Check if the model is loaded and ready"""
44
- return self.model is not None
45
-
46
- async def generate_completion(
47
- self,
48
- messages: List[Dict[str, str]],
49
- temperature: float = 0.6,
50
- max_tokens: int = 512,
51
- stream: bool = False,
52
- return_json: bool = False
53
- ) -> Any:
54
- """
55
- Generate text completion
56
-
57
- Args:
58
- messages: List of message dictionaries with 'role' and 'content'
59
- temperature: Sampling temperature
60
- max_tokens: Maximum tokens to generate
61
- stream: Whether to stream the response
62
- return_json: Whether to extract JSON from response
63
-
64
- Returns:
65
- Generated completion (dict or stream)
66
- """
67
- if not self.is_ready():
68
- raise RuntimeError("Text model not initialized")
69
-
70
- # Validate conflicting parameters
71
- if stream and return_json:
72
- raise ValueError("Cannot use both 'stream' and 'return_json' simultaneously")
73
-
74
- # Prepare messages for JSON extraction mode
75
- if return_json:
76
- system_prompt = {
77
- "role": "system",
78
- "content": (
79
- "You are a strict JSON generator. "
80
- "Convert the user's input into valid JSON format. "
81
- "Output strictly in markdown code blocks like ```json ... ```. "
82
- "Do not add conversational filler."
83
- )
84
- }
85
- messages = [system_prompt] + messages
86
-
87
- if messages[-1]['role'] == 'user':
88
- messages[-1]['content'] += "\n\nReturn structured JSON of this content."
89
-
90
- logger.info(f"Generating completion: {len(messages)} messages | Stream: {stream}")
91
-
92
- try:
93
- response = self.model.create_chat_completion(
94
- messages=messages,
95
- temperature=temperature,
96
- max_tokens=max_tokens,
97
- stream=stream
98
- )
99
-
100
- # Handle streaming response
101
- if stream:
102
- return self._create_stream_iterator(response)
103
-
104
- # Handle JSON extraction
105
- if return_json:
106
- content_text = response['choices'][0]['message']['content']
107
- extracted_data = extract_json_from_content(content_text)
108
- return {
109
- "status": "success",
110
- "data": extracted_data,
111
- "raw_content": content_text
112
- }
113
-
114
- return response
115
-
116
- except Exception as e:
117
- logger.error(f"Error generating completion: {e}")
118
- raise
119
-
120
- async def _create_stream_iterator(self, response_stream) -> AsyncIterator[str]:
121
- """Create an async iterator for streaming responses"""
122
- for chunk in response_stream:
123
- yield f"data: {json.dumps(chunk)}\n\n"
124
- yield "data: [DONE]\n\n"
125
-
126
- async def cleanup(self) -> None:
127
- """Cleanup resources"""
128
- if self.model:
129
- del self.model
130
- self.model = None
131
- logger.info("Text model unloaded")
132
-
133
- # Global instance
134
- text_service = TextService()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
services/vision_service.py DELETED
@@ -1,144 +0,0 @@
1
- import logging
2
- import base64
3
- import io
4
- from typing import Optional, Dict, Any
5
- from llama_cpp import Llama
6
- from llama_cpp.llama_chat_format import Llava15ChatHandler
7
- from huggingface_hub import hf_hub_download
8
- from PIL import Image
9
-
10
- from config import config
11
-
12
- logger = logging.getLogger("vision-service")
13
-
14
- class VisionService:
15
- """Service for vision-language model interactions"""
16
-
17
- def __init__(self):
18
- self.model: Optional[Llama] = None
19
- self.chat_handler: Optional[Llava15ChatHandler] = None
20
-
21
- async def initialize(self) -> None:
22
- """Initialize the vision model"""
23
- try:
24
- logger.info(f"Downloading vision model: {config.VISION_MODEL_FILE}...")
25
- model_path = hf_hub_download(
26
- repo_id=config.VISION_MODEL_REPO,
27
- filename=config.VISION_MODEL_FILE,
28
- cache_dir=config.HF_HOME
29
- )
30
-
31
- logger.info(f"Downloading vision projector: {config.VISION_MMPROJ_FILE}...")
32
- mmproj_path = hf_hub_download(
33
- repo_id=config.VISION_MODEL_REPO,
34
- filename=config.VISION_MMPROJ_FILE,
35
- cache_dir=config.HF_HOME
36
- )
37
-
38
- logger.info(f"Loading vision model (Threads: {config.N_THREADS})...")
39
-
40
- # NOTE: Llava15ChatHandler is the standard Python wrapper for loading
41
- # external projectors (mmproj files), even for newer architectures like SmolVLM
42
- self.chat_handler = Llava15ChatHandler(
43
- clip_model_path=mmproj_path,
44
- verbose=False
45
- )
46
-
47
- self.model = Llama(
48
- model_path=model_path,
49
- chat_handler=self.chat_handler,
50
- n_ctx=config.VISION_MODEL_CTX,
51
- n_threads=config.N_THREADS,
52
- n_batch=config.VISION_MODEL_BATCH,
53
- logits_all=True,
54
- verbose=False,
55
- n_gpu_layers=0 # Explicitly set to 0 to ensure CPU usage and prevent driver crashes
56
- )
57
- logger.info("✓ Vision model loaded successfully")
58
-
59
- except Exception as e:
60
- logger.error(f"Failed to initialize vision model: {e}")
61
- # Ensure cleanup if initialization fails halfway
62
- await self.cleanup()
63
- raise
64
-
65
- def is_ready(self) -> bool:
66
- """Check if the model is loaded and ready"""
67
- return self.model is not None and self.chat_handler is not None
68
-
69
- async def analyze_image(
70
- self,
71
- image_data: bytes,
72
- prompt: str,
73
- temperature: float = 0.6,
74
- max_tokens: int = 512
75
- ) -> Dict[str, Any]:
76
- """
77
- Analyze an image with a text prompt
78
- """
79
- if not self.is_ready():
80
- raise RuntimeError("Vision model not initialized")
81
-
82
- try:
83
- # Convert image bytes to base64 data URI
84
- image_b64 = base64.b64encode(image_data).decode('utf-8')
85
-
86
- # Validate image
87
- image = Image.open(io.BytesIO(image_data))
88
- # logger.info(f"Processing image: {image.size} | Format: {image.format}")
89
-
90
- # Create vision message format
91
- messages = [
92
- {
93
- "role": "user",
94
- "content": [
95
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
96
- {"type": "text", "text": prompt}
97
- ]
98
- }
99
- ]
100
-
101
- logger.info(f"Analyzing image... Prompt: {prompt[:50]}")
102
-
103
- response = self.model.create_chat_completion(
104
- messages=messages,
105
- temperature=temperature,
106
- max_tokens=max_tokens
107
- )
108
-
109
- return {
110
- "status": "success",
111
- "image_info": {
112
- "size": list(image.size),
113
- "format": image.format,
114
- "mode": image.mode
115
- },
116
- "prompt": prompt,
117
- "response": response['choices'][0]['message']['content'],
118
- "usage": response.get('usage', {})
119
- }
120
-
121
- except Exception as e:
122
- logger.error(f"Error analyzing image: {e}")
123
- raise
124
-
125
- async def cleanup(self) -> None:
126
- """Cleanup resources"""
127
- if self.model:
128
- try:
129
- del self.model
130
- except:
131
- pass
132
- self.model = None
133
-
134
- if self.chat_handler:
135
- try:
136
- del self.chat_handler
137
- except:
138
- pass
139
- self.chat_handler = None
140
-
141
- logger.info("Vision model unloaded")
142
-
143
- # Global instance
144
- vision_service = VisionService()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/json_extractor.py DELETED
@@ -1,133 +0,0 @@
1
- import json
2
- import logging
3
- from typing import List, Any
4
-
5
- logger = logging.getLogger("json-extractor")
6
-
7
- def find_balanced_closing_index(text: str, start_index: int) -> int:
8
- """
9
- Finds the matching closing bracket for the bracket at start_index.
10
- Ignores brackets inside strings and comments.
11
- """
12
- start_char = text[start_index]
13
- end_char = '}' if start_char == '{' else ']'
14
-
15
- depth = 0
16
- in_double_quote = False
17
- in_single_quote = False
18
- in_backtick = False
19
- in_line_comment = False
20
- in_block_comment = False
21
- is_escaped = False
22
-
23
- length = len(text)
24
- i = start_index
25
-
26
- while i < length:
27
- char = text[i]
28
- next_char = text[i+1] if i + 1 < length else ''
29
-
30
- # Handle Escaping
31
- if is_escaped:
32
- is_escaped = False
33
- i += 1
34
- continue
35
- if char == '\\' and not in_line_comment and not in_block_comment:
36
- is_escaped = True
37
- i += 1
38
- continue
39
-
40
- # Handle Comments
41
- if in_line_comment:
42
- if char == '\n': in_line_comment = False
43
- i += 1
44
- continue
45
- if in_block_comment:
46
- if char == '*' and next_char == '/':
47
- in_block_comment = False
48
- i += 2
49
- continue
50
- i += 1
51
- continue
52
-
53
- # Check comment starts
54
- if not in_double_quote and not in_single_quote and not in_backtick:
55
- if char == '/' and next_char == '/':
56
- in_line_comment = True
57
- i += 2
58
- continue
59
- if char == '/' and next_char == '*':
60
- in_block_comment = True
61
- i += 2
62
- continue
63
-
64
- # Handle Strings
65
- if in_double_quote:
66
- if char == '"': in_double_quote = False
67
- i += 1
68
- continue
69
- if in_single_quote:
70
- if char == "'": in_single_quote = False
71
- i += 1
72
- continue
73
- if in_backtick:
74
- if char == '`': in_backtick = False
75
- i += 1
76
- continue
77
-
78
- if char == '"':
79
- in_double_quote = True
80
- i += 1
81
- continue
82
- if char == "'":
83
- in_single_quote = True
84
- i += 1
85
- continue
86
- if char == '`':
87
- in_backtick = True
88
- i += 1
89
- continue
90
-
91
- # Handle Bracket Counting
92
- if char == start_char:
93
- depth += 1
94
- elif char == end_char:
95
- depth -= 1
96
- if depth == 0:
97
- return i
98
-
99
- i += 1
100
-
101
- return -1
102
-
103
- def extract_json_from_content(content: str) -> List[Any]:
104
- """
105
- Scans text for JSON objects/arrays using state machine logic.
106
- """
107
- if not content or not isinstance(content, str):
108
- return []
109
-
110
- found_blocks = []
111
- cursor = 0
112
- length = len(content)
113
-
114
- while cursor < length:
115
- if content[cursor] not in ['{', '[']:
116
- cursor += 1
117
- continue
118
-
119
- end_index = find_balanced_closing_index(content, cursor)
120
-
121
- if end_index != -1:
122
- raw_candidate = content[cursor : end_index + 1]
123
- try:
124
- parsed = json.loads(raw_candidate)
125
- found_blocks.append(parsed)
126
- cursor = end_index + 1
127
- continue
128
- except json.JSONDecodeError:
129
- pass
130
-
131
- cursor += 1
132
-
133
- return found_blocks