WORKWITHSHAFISK commited on
Commit
6536728
·
verified ·
1 Parent(s): 7f27d90

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitignore +56 -0
  2. Dockerfile +48 -0
  3. main.py +400 -0
  4. requirements.txt +25 -0
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .gitignore for Space B
2
+
3
+ # Python
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+ *.so
8
+ .Python
9
+ build/
10
+ develop-eggs/
11
+ dist/
12
+ downloads/
13
+ eggs/
14
+ .eggs/
15
+ lib/
16
+ lib64/
17
+ parts/
18
+ sdist/
19
+ var/
20
+ wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+
25
+ # Virtual environments
26
+ venv/
27
+ ENV/
28
+ env/
29
+ .venv
30
+
31
+ # Models cache
32
+ models/
33
+ *.gguf
34
+ *.bin
35
+ *.safetensors
36
+
37
+ # IDE
38
+ .vscode/
39
+ .idea/
40
+ *.swp
41
+ *.swo
42
+ *~
43
+
44
+ # OS
45
+ .DS_Store
46
+ Thumbs.db
47
+
48
+ # Logs
49
+ *.log
50
+
51
+ # Environment variables
52
+ .env
53
+ .env.local
54
+
55
+ # HuggingFace cache
56
+ .cache/
Dockerfile ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# syntax=docker/dockerfile:1
FROM python:3.11-slim

# Install build dependencies for llama-cpp-python.
# --no-install-recommends keeps the image slim.
RUN apt-get update && apt-get install -y --no-install-recommends \
    cmake \
    g++ \
    gcc \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Set environment variables for CPU optimization:
# GGML_BLAS enables BLAS acceleration;
# OpenBLAS speeds up matrix operations (2-3x faster).
ENV CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
ENV FORCE_CMAKE=1

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
# llama-cpp-python will compile from source with CPU optimizations
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY main.py .

# Create cache directory for models
RUN mkdir -p /app/models

# Expose port 7860 (HuggingFace Space default)
EXPOSE 7860

# Set environment variables
ENV HOST=0.0.0.0
ENV PORT=7860

# Health check for HuggingFace monitoring.
# raise_for_status() makes the probe fail on any non-2xx response;
# a bare GET would exit 0 (and report "healthy") even on HTTP 500.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:7860/health', timeout=5).raise_for_status()"

# Run the FastAPI application with Uvicorn
# workers=1 ensures single process (important for model memory management)
# log-level=info provides detailed logging for debugging
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--log-level", "info"]
main.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Space B (The Factory) - AI Inference Microservice
3
+
4
+ This service handles heavy AI workloads offloaded from Space A:
5
+ - Llama-3 text summarization (GGUF quantized for CPU)
6
+ - GLiNER named entity recognition
7
+ - Edge-TTS audio generation
8
+
9
+ Optimized for: 2 vCPU, 16GB RAM, HuggingFace Free Tier
10
+ """
11
+
12
+ import asyncio
13
+ import logging
14
+ import os
15
+ import time
16
+ from contextlib import asynccontextmanager
17
+ from typing import List, Optional
18
+
19
+ import edge_tts
20
+ from fastapi import FastAPI, HTTPException
21
+ from fastapi.responses import StreamingResponse
22
+ from gliner import GLiNER
23
+ from huggingface_hub import hf_hub_download
24
+ from llama_cpp import Llama
25
+ from pydantic import BaseModel, Field
26
+
27
# Setup logging
# Configured once at import time so every module logger inherits the format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Global model instances (loaded at startup)
# Populated by load_models() during the FastAPI lifespan startup;
# each endpoint checks for None and returns 503 while loading is incomplete.
llama_model: Optional[Llama] = None
gliner_model: Optional[GLiNER] = None
# Process start timestamp; /health reports uptime relative to this.
startup_time = time.time()
38
+
39
+
40
+ # ============================================================================
41
+ # Pydantic Models (Request/Response Schemas)
42
+ # ============================================================================
43
+
44
class SummarizeRequest(BaseModel):
    """Request body for POST /summarize."""
    text: str = Field(..., description="Text to summarize", min_length=10)
    max_tokens: int = Field(150, description="Maximum summary length", ge=50, le=500)
    temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
48
+
49
+
50
class SummarizeResponse(BaseModel):
    """Response body for POST /summarize."""
    summary: str  # generated summary text
    model: str  # identifier of the model that produced the summary
    inference_time_ms: int  # wall-clock inference time in milliseconds
54
+
55
+
56
class ExtractRequest(BaseModel):
    """Request body for POST /extract."""
    text: str = Field(..., description="Text for entity extraction", min_length=5)
    # Default label set used when the caller omits `labels`.
    labels: List[str] = Field(
        ["Person", "Organization", "Location"],
        description="Entity types to extract"
    )
    threshold: float = Field(0.5, description="Confidence threshold", ge=0.0, le=1.0)
63
+
64
+
65
class Entity(BaseModel):
    """A single named entity found by GLiNER."""
    text: str  # surface form as it appears in the input text
    label: str  # entity type, one of the requested labels
    score: float  # confidence score (rounded to 3 decimals by the endpoint)
69
+
70
+
71
class ExtractResponse(BaseModel):
    """Response body for POST /extract."""
    entities: List[Entity]  # entities above the requested threshold
    model: str  # identifier of the NER model used
    inference_time_ms: int  # wall-clock inference time in milliseconds
75
+
76
+
77
class AudioRequest(BaseModel):
    """Request body for POST /audio (Edge-TTS speech synthesis)."""
    text: str = Field(..., description="Text to convert to speech", min_length=1)
    voice: str = Field(
        "en-US-ChristopherNeural",
        description="Edge-TTS voice name"
    )
    # Rate/volume use Edge-TTS percentage-string syntax, e.g. "+10%".
    rate: str = Field("+0%", description="Speech rate (-50% to +100%)")
    volume: str = Field("+0%", description="Volume (-50% to +50%)")
85
+
86
+
87
class HealthResponse(BaseModel):
    """Response body for GET /health."""
    status: str  # always "healthy" when the endpoint responds
    models_loaded: bool  # True only when BOTH models are in memory
    uptime_seconds: int  # seconds since process start
    llama_loaded: bool
    gliner_loaded: bool
93
+
94
+
95
+ # ============================================================================
96
+ # Model Loading (Startup Event)
97
+ # ============================================================================
98
+
99
async def load_models():
    """
    Load all AI models into memory at startup.

    This is critical for performance - models are loaded ONCE and reused
    for all requests. Loading on every request would be 100x slower.

    Raises:
        Exception: re-raises any download/load failure so startup aborts
        instead of serving requests with missing models.
    """
    global llama_model, gliner_model

    logger.info("=" * 80)
    logger.info("🏭 [SPACE B] Starting model loading...")
    logger.info("=" * 80)

    # -------------------------------------------------------------------------
    # 1. Download and load Llama-3 GGUF model
    # -------------------------------------------------------------------------
    try:
        logger.info("📥 Downloading Llama-3-8B-Instruct (Q4_K_M quantized)...")

        # Download from HuggingFace Hub (cached under /app/models across runs)
        model_path = hf_hub_download(
            repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
            filename="Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",
            cache_dir="/app/models"
        )

        logger.info("✅ Model downloaded to: %s", model_path)
        logger.info("🔧 Loading Llama-3 into memory...")

        # Load with CPU optimizations
        llama_model = Llama(
            model_path=model_path,
            n_ctx=2048,      # Context window (tokens)
            n_threads=2,     # Use both vCPUs
            n_batch=512,     # Batch size for prompt processing
            verbose=False    # Suppress llama.cpp logs
        )

        logger.info("✅ Llama-3 loaded successfully!")
        logger.info("   📊 Model size: ~4.5GB RAM")
        logger.info("   🔢 Context length: 2048 tokens")

    except Exception as e:
        logger.error("❌ Failed to load Llama-3: %s", e)
        raise

    # -------------------------------------------------------------------------
    # 2. Load GLiNER model
    # -------------------------------------------------------------------------
    try:
        logger.info("📥 Loading GLiNER (small-v2.1) for NER...")

        gliner_model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")

        logger.info("✅ GLiNER loaded successfully!")
        logger.info("   📊 Model size: ~200MB RAM")
        logger.info("   🎯 Zero-shot NER ready")

    except Exception as e:
        logger.error("❌ Failed to load GLiNER: %s", e)
        raise

    logger.info("")
    logger.info("=" * 80)
    logger.info("🎉 [SPACE B] All models loaded successfully!")
    logger.info("=" * 80)
165
+
166
+
167
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager.

    Loads models at startup and cleans up at shutdown. FastAPI does not
    serve requests until the code before `yield` completes, so endpoints
    never observe half-loaded models.
    """
    # Startup: Load models (blocks until both models are in memory)
    await load_models()

    yield  # Application runs here

    # Shutdown: Cleanup (if needed)
    logger.info("๐Ÿ‘‹ [SPACE B] Shutting down...")
181
+
182
+
183
+ # ============================================================================
184
+ # FastAPI Application
185
+ # ============================================================================
186
+
187
# Single FastAPI application instance. `lifespan` wires model loading into
# startup, so uvicorn only accepts traffic once models are ready.
app = FastAPI(
    title="Space B - The Factory",
    description="AI Inference Microservice for Segmento Pulse",
    version="1.0.0",
    lifespan=lifespan
)
193
+
194
+
195
+ # ============================================================================
196
+ # Endpoints
197
+ # ============================================================================
198
+
199
@app.get("/", tags=["Info"])
async def root():
    """Root endpoint with service info"""
    endpoint_map = {
        "summarize": "/summarize (POST)",
        "extract": "/extract (POST)",
        "audio": "/audio (POST)",
        "health": "/health (GET)",
    }
    return {
        "service": "Space B - The Factory",
        "description": "AI inference microservice for heavy workloads",
        "version": "1.0.0",
        "endpoints": endpoint_map,
    }
213
+
214
+
215
@app.get("/health", response_model=HealthResponse, tags=["Health"])
async def health_check():
    """
    Health check endpoint.

    CRITICAL: This must respond quickly (<1s) for HuggingFace monitoring.
    Do NOT perform heavy operations here.
    """
    # Snapshot readiness of each model once, then derive the combined flag.
    llama_ready = llama_model is not None
    gliner_ready = gliner_model is not None

    return HealthResponse(
        status="healthy",
        models_loaded=llama_ready and gliner_ready,
        uptime_seconds=int(time.time() - startup_time),
        llama_loaded=llama_ready,
        gliner_loaded=gliner_ready,
    )
232
+
233
+
234
@app.post("/summarize", response_model=SummarizeResponse, tags=["AI"])
async def summarize_text(request: SummarizeRequest):
    """
    Generate text summary using Llama-3.

    Uses quantized GGUF model for CPU-optimized inference.
    Typical inference time: 5-10 seconds on 2 vCPU.

    Raises:
        HTTPException: 503 if the model is not loaded, 500 on inference errors.
    """
    if llama_model is None:
        raise HTTPException(status_code=503, detail="Llama model not loaded")

    start_time = time.time()

    try:
        # Construct prompt (Llama-3-Instruct format). The article is truncated
        # to 2000 characters so the prompt fits inside the 2048-token context.
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a professional news summarizer. Create concise, accurate summaries.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize the following article in 2-3 sentences:

{request.text[:2000]}

Summary:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

        logger.info("🔮 Generating summary (max_tokens=%d)...", request.max_tokens)

        # Run inference in the default thread pool (llama.cpp is synchronous).
        # get_running_loop() replaces get_event_loop(), which is deprecated
        # inside coroutines since Python 3.10.
        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(
            None,  # Use default thread pool
            lambda: llama_model(
                prompt,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                stop=["<|eot_id|>", "\n\n"],
                echo=False
            )
        )

        # Extract generated text
        summary = output['choices'][0]['text'].strip()

        inference_time = int((time.time() - start_time) * 1000)

        logger.info("✅ Summary generated in %dms", inference_time)

        return SummarizeResponse(
            summary=summary,
            model="Llama-3-8B-Instruct-Q4_K_M",
            inference_time_ms=inference_time
        )

    except Exception as e:
        logger.error("❌ Summarization error: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
292
+
293
+
294
@app.post("/extract", response_model=ExtractResponse, tags=["AI"])
async def extract_entities(request: ExtractRequest):
    """
    Extract named entities using GLiNER.

    Zero-shot NER - can extract any entity type without training.
    Typical inference time: 50-200ms on CPU.

    Raises:
        HTTPException: 503 if the model is not loaded, 500 on inference errors.
    """
    if gliner_model is None:
        raise HTTPException(status_code=503, detail="GLiNER model not loaded")

    start_time = time.time()

    try:
        logger.info("🔍 Extracting entities: %s", request.labels)

        # Run GLiNER inference in the default thread pool (it is synchronous).
        # get_running_loop() replaces get_event_loop(), which is deprecated
        # inside coroutines since Python 3.10.
        loop = asyncio.get_running_loop()
        raw_entities = await loop.run_in_executor(
            None,
            lambda: gliner_model.predict_entities(
                request.text,
                request.labels,
                threshold=request.threshold
            )
        )

        # Convert the raw dicts into the typed response schema
        entities = [
            Entity(
                text=entity['text'],
                label=entity['label'],
                score=round(entity['score'], 3)
            )
            for entity in raw_entities
        ]

        inference_time = int((time.time() - start_time) * 1000)

        logger.info("✅ Extracted %d entities in %dms", len(entities), inference_time)

        return ExtractResponse(
            entities=entities,
            model="GLiNER-small-v2.1",
            inference_time_ms=inference_time
        )

    except Exception as e:
        logger.error("❌ Entity extraction error: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
344
+
345
+
346
@app.post("/audio", tags=["Audio"])
async def generate_audio(request: AudioRequest):
    """
    Generate speech audio using Edge-TTS.

    Uses Microsoft's cloud API (zero local resources).
    Returns MP3 audio stream.

    Raises:
        HTTPException: 500 if the TTS communicator cannot be created.
    """
    try:
        logger.info("🔊 Generating audio with voice: %s", request.voice)

        # Create TTS communicator
        communicate = edge_tts.Communicate(
            text=request.text,
            voice=request.voice,
            rate=request.rate,
            volume=request.volume
        )

        # Stream audio chunks as they arrive from the Edge-TTS service.
        # NOTE(review): errors raised inside this generator happen AFTER the
        # response has started streaming, so the except clause below cannot
        # turn them into an HTTP 500 — confirm this trade-off is intended.
        async def audio_generator():
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    yield chunk["data"]

        logger.info("✅ Audio generation started")

        return StreamingResponse(
            audio_generator(),
            media_type="audio/mpeg",
            headers={
                # Plain string literal: the filename is constant, so the
                # original f-prefix had no placeholders and was removed.
                "Content-Disposition": "attachment; filename=audio.mp3"
            }
        )

    except Exception as e:
        logger.error("❌ Audio generation error: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
384
+
385
+
386
+ # ============================================================================
387
+ # Application Entry Point
388
+ # ============================================================================
389
+
390
if __name__ == "__main__":
    import uvicorn

    # Run server directly. This path is for local development only; in the
    # container, the Dockerfile CMD launches uvicorn with the same settings.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=7860,
        workers=1,
        log_level="info"
    )
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web Framework
2
+ fastapi==0.115.5
3
+ uvicorn[standard]==0.32.1
4
+ pydantic==2.10.3
5
+ python-multipart==0.0.6
6
+
7
+ # HTTP Client (for model downloads and health checks)
8
+ httpx==0.26.0
9
+ requests==2.31.0
10
+
11
+ # Llama-cpp-python - CPU-optimized LLM inference
12
+ # Will be compiled with CMAKE_ARGS from Dockerfile
13
+ llama-cpp-python==0.2.90
14
+
15
+ # GLiNER - Fast CPU-based NER
16
+ gliner==0.2.19
17
+
18
+ # Edge-TTS - Cloud-based TTS (zero local resources)
19
+ edge-tts==6.1.15
20
+
21
+ # HuggingFace Hub - Model downloads
22
+ huggingface-hub==0.26.5
23
+
24
+ # Logging and utilities
25
+ python-dotenv==1.0.0