WORKWITHSHAFISK commited on
Commit
6536728
·
verified ·
1 Parent(s): 7f27d90

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitignore +56 -0
  2. Dockerfile +48 -0
  3. main.py +400 -0
  4. requirements.txt +25 -0
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .gitignore for Space B
2
+
3
+ # Python
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+ *.so
8
+ .Python
9
+ build/
10
+ develop-eggs/
11
+ dist/
12
+ downloads/
13
+ eggs/
14
+ .eggs/
15
+ lib/
16
+ lib64/
17
+ parts/
18
+ sdist/
19
+ var/
20
+ wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+
25
+ # Virtual environments
26
+ venv/
27
+ ENV/
28
+ env/
29
+ .venv
30
+
31
+ # Models cache
32
+ models/
33
+ *.gguf
34
+ *.bin
35
+ *.safetensors
36
+
37
+ # IDE
38
+ .vscode/
39
+ .idea/
40
+ *.swp
41
+ *.swo
42
+ *~
43
+
44
+ # OS
45
+ .DS_Store
46
+ Thumbs.db
47
+
48
+ # Logs
49
+ *.log
50
+
51
+ # Environment variables
52
+ .env
53
+ .env.local
54
+
55
+ # HuggingFace cache
56
+ .cache/
Dockerfile ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# syntax=docker/dockerfile:1
FROM python:3.11-slim

# Install build dependencies for llama-cpp-python.
# --no-install-recommends keeps the image slim.
RUN apt-get update && apt-get install -y --no-install-recommends \
    cmake \
    g++ \
    gcc \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Set environment variables for CPU optimization:
# GGML_BLAS enables BLAS acceleration;
# OpenBLAS speeds up matrix operations (2-3x faster).
ENV CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
ENV FORCE_CMAKE=1

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
# llama-cpp-python will compile from source with CPU optimizations
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY main.py .

# Create cache directory for models
RUN mkdir -p /app/models

# Expose port 7860 (HuggingFace Space default)
EXPOSE 7860

# Set environment variables
ENV HOST=0.0.0.0
ENV PORT=7860

# Health check for HuggingFace monitoring.
# raise_for_status() makes the probe fail on any non-2xx response;
# a bare GET would exit 0 (and report "healthy") even on HTTP 500.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:7860/health', timeout=5).raise_for_status()"

# Run the FastAPI application with Uvicorn
# workers=1 ensures single process (important for model memory management)
# log-level=info provides detailed logging for debugging
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--log-level", "info"]
main.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Space B (The Factory) - AI Inference Microservice
3
+
4
+ This service handles heavy AI workloads offloaded from Space A:
5
+ - Llama-3 text summarization (GGUF quantized for CPU)
6
+ - GLiNER named entity recognition
7
+ - Edge-TTS audio generation
8
+
9
+ Optimized for: 2 vCPU, 16GB RAM, HuggingFace Free Tier
10
+ """
11
+
12
+ import asyncio
13
+ import logging
14
+ import os
15
+ import time
16
+ from contextlib import asynccontextmanager
17
+ from typing import List, Optional
18
+
19
+ import edge_tts
20
+ from fastapi import FastAPI, HTTPException
21
+ from fastapi.responses import StreamingResponse
22
+ from gliner import GLiNER
23
+ from huggingface_hub import hf_hub_download
24
+ from llama_cpp import Llama
25
+ from pydantic import BaseModel, Field
26
+
27
# Setup logging
# Configured once at import time so every module logger inherits the format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Global model instances (loaded at startup)
# Populated by load_models() during the FastAPI lifespan startup;
# each endpoint checks for None and returns 503 while loading is incomplete.
llama_model: Optional[Llama] = None
gliner_model: Optional[GLiNER] = None
# Process start timestamp; /health reports uptime relative to this.
startup_time = time.time()
38
+
39
+
40
+ # ============================================================================
41
+ # Pydantic Models (Request/Response Schemas)
42
+ # ============================================================================
43
+
44
class SummarizeRequest(BaseModel):
    """Request body for POST /summarize."""
    text: str = Field(..., description="Text to summarize", min_length=10)
    max_tokens: int = Field(150, description="Maximum summary length", ge=50, le=500)
    temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
48
+
49
+
50
class SummarizeResponse(BaseModel):
    """Response body for POST /summarize."""
    summary: str  # generated summary text
    model: str  # identifier of the model that produced the summary
    inference_time_ms: int  # wall-clock inference time in milliseconds
54
+
55
+
56
class ExtractRequest(BaseModel):
    """Request body for POST /extract."""
    text: str = Field(..., description="Text for entity extraction", min_length=5)
    # Default label set used when the caller omits `labels`.
    labels: List[str] = Field(
        ["Person", "Organization", "Location"],
        description="Entity types to extract"
    )
    threshold: float = Field(0.5, description="Confidence threshold", ge=0.0, le=1.0)
63
+
64
+
65
class Entity(BaseModel):
    """A single named entity found by GLiNER."""
    text: str  # surface form as it appears in the input text
    label: str  # entity type, one of the requested labels
    score: float  # confidence score (rounded to 3 decimals by the endpoint)
69
+
70
+
71
class ExtractResponse(BaseModel):
    """Response body for POST /extract."""
    entities: List[Entity]  # entities above the requested threshold
    model: str  # identifier of the NER model used
    inference_time_ms: int  # wall-clock inference time in milliseconds
75
+
76
+
77
class AudioRequest(BaseModel):
    """Request body for POST /audio (Edge-TTS speech synthesis)."""
    text: str = Field(..., description="Text to convert to speech", min_length=1)
    voice: str = Field(
        "en-US-ChristopherNeural",
        description="Edge-TTS voice name"
    )
    # Rate/volume use Edge-TTS percentage-string syntax, e.g. "+10%".
    rate: str = Field("+0%", description="Speech rate (-50% to +100%)")
    volume: str = Field("+0%", description="Volume (-50% to +50%)")
85
+
86
+
87
class HealthResponse(BaseModel):
    """Response body for GET /health."""
    status: str  # always "healthy" when the endpoint responds
    models_loaded: bool  # True only when BOTH models are in memory
    uptime_seconds: int  # seconds since process start
    llama_loaded: bool
    gliner_loaded: bool
93
+
94
+
95
+ # ============================================================================
96
+ # Model Loading (Startup Event)
97
+ # ============================================================================
98
+
99
async def load_models():
    """
    Load all AI models into memory at startup.

    This is critical for performance - models are loaded ONCE and reused
    for all requests. Loading on every request would be 100x slower.

    Raises:
        Exception: re-raises any download/load failure so startup aborts
        instead of serving requests with missing models.
    """
    global llama_model, gliner_model

    logger.info("=" * 80)
    logger.info("🏭 [SPACE B] Starting model loading...")
    logger.info("=" * 80)

    # -------------------------------------------------------------------------
    # 1. Download and load Llama-3 GGUF model
    # -------------------------------------------------------------------------
    try:
        logger.info("📥 Downloading Llama-3-8B-Instruct (Q4_K_M quantized)...")

        # Download from HuggingFace Hub (cached under /app/models across runs)
        model_path = hf_hub_download(
            repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
            filename="Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",
            cache_dir="/app/models"
        )

        logger.info("✅ Model downloaded to: %s", model_path)
        logger.info("🔧 Loading Llama-3 into memory...")

        # Load with CPU optimizations
        llama_model = Llama(
            model_path=model_path,
            n_ctx=2048,      # Context window (tokens)
            n_threads=2,     # Use both vCPUs
            n_batch=512,     # Batch size for prompt processing
            verbose=False    # Suppress llama.cpp logs
        )

        logger.info("✅ Llama-3 loaded successfully!")
        logger.info("   📊 Model size: ~4.5GB RAM")
        logger.info("   🔢 Context length: 2048 tokens")

    except Exception as e:
        logger.error("❌ Failed to load Llama-3: %s", e)
        raise

    # -------------------------------------------------------------------------
    # 2. Load GLiNER model
    # -------------------------------------------------------------------------
    try:
        logger.info("📥 Loading GLiNER (small-v2.1) for NER...")

        gliner_model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")

        logger.info("✅ GLiNER loaded successfully!")
        logger.info("   📊 Model size: ~200MB RAM")
        logger.info("   🎯 Zero-shot NER ready")

    except Exception as e:
        logger.error("❌ Failed to load GLiNER: %s", e)
        raise

    logger.info("")
    logger.info("=" * 80)
    logger.info("🎉 [SPACE B] All models loaded successfully!")
    logger.info("=" * 80)
165
+
166
+
167
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager.

    Loads models at startup and cleans up at shutdown. FastAPI does not
    serve requests until the code before `yield` completes, so endpoints
    never observe half-loaded models.
    """
    # Startup: Load models (blocks until both models are in memory)
    await load_models()

    yield  # Application runs here

    # Shutdown: Cleanup (if needed)
    logger.info("๐Ÿ‘‹ [SPACE B] Shutting down...")
181
+
182
+
183
+ # ============================================================================
184
+ # FastAPI Application
185
+ # ============================================================================
186
+
187
# Single FastAPI application instance. `lifespan` wires model loading into
# startup, so uvicorn only accepts traffic once models are ready.
app = FastAPI(
    title="Space B - The Factory",
    description="AI Inference Microservice for Segmento Pulse",
    version="1.0.0",
    lifespan=lifespan
)
193
+
194
+
195
+ # ============================================================================
196
+ # Endpoints
197
+ # ============================================================================
198
+
199
@app.get("/", tags=["Info"])
async def root():
    """Root endpoint with service info"""
    endpoint_map = {
        "summarize": "/summarize (POST)",
        "extract": "/extract (POST)",
        "audio": "/audio (POST)",
        "health": "/health (GET)",
    }
    return {
        "service": "Space B - The Factory",
        "description": "AI inference microservice for heavy workloads",
        "version": "1.0.0",
        "endpoints": endpoint_map,
    }
213
+
214
+
215
@app.get("/health", response_model=HealthResponse, tags=["Health"])
async def health_check():
    """
    Health check endpoint.

    CRITICAL: This must respond quickly (<1s) for HuggingFace monitoring.
    Do NOT perform heavy operations here.
    """
    # Snapshot readiness of each model once, then derive the combined flag.
    llama_ready = llama_model is not None
    gliner_ready = gliner_model is not None

    return HealthResponse(
        status="healthy",
        models_loaded=llama_ready and gliner_ready,
        uptime_seconds=int(time.time() - startup_time),
        llama_loaded=llama_ready,
        gliner_loaded=gliner_ready,
    )
232
+
233
+
234
@app.post("/summarize", response_model=SummarizeResponse, tags=["AI"])
async def summarize_text(request: SummarizeRequest):
    """
    Generate text summary using Llama-3.

    Uses quantized GGUF model for CPU-optimized inference.
    Typical inference time: 5-10 seconds on 2 vCPU.

    Raises:
        HTTPException: 503 if the model is not loaded, 500 on inference errors.
    """
    if llama_model is None:
        raise HTTPException(status_code=503, detail="Llama model not loaded")

    start_time = time.time()

    try:
        # Construct prompt (Llama-3-Instruct format). The article is truncated
        # to 2000 characters so the prompt fits inside the 2048-token context.
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a professional news summarizer. Create concise, accurate summaries.<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize the following article in 2-3 sentences:

{request.text[:2000]}

Summary:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

        logger.info("🔮 Generating summary (max_tokens=%d)...", request.max_tokens)

        # Run inference in the default thread pool (llama.cpp is synchronous).
        # get_running_loop() replaces get_event_loop(), which is deprecated
        # inside coroutines since Python 3.10.
        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(
            None,  # Use default thread pool
            lambda: llama_model(
                prompt,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                stop=["<|eot_id|>", "\n\n"],
                echo=False
            )
        )

        # Extract generated text
        summary = output['choices'][0]['text'].strip()

        inference_time = int((time.time() - start_time) * 1000)

        logger.info("✅ Summary generated in %dms", inference_time)

        return SummarizeResponse(
            summary=summary,
            model="Llama-3-8B-Instruct-Q4_K_M",
            inference_time_ms=inference_time
        )

    except Exception as e:
        logger.error("❌ Summarization error: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
292
+
293
+
294
@app.post("/extract", response_model=ExtractResponse, tags=["AI"])
async def extract_entities(request: ExtractRequest):
    """
    Extract named entities using GLiNER.

    Zero-shot NER - can extract any entity type without training.
    Typical inference time: 50-200ms on CPU.

    Raises:
        HTTPException: 503 if the model is not loaded, 500 on inference errors.
    """
    if gliner_model is None:
        raise HTTPException(status_code=503, detail="GLiNER model not loaded")

    start_time = time.time()

    try:
        logger.info("🔍 Extracting entities: %s", request.labels)

        # Run GLiNER inference in the default thread pool (it is synchronous).
        # get_running_loop() replaces get_event_loop(), which is deprecated
        # inside coroutines since Python 3.10.
        loop = asyncio.get_running_loop()
        raw_entities = await loop.run_in_executor(
            None,
            lambda: gliner_model.predict_entities(
                request.text,
                request.labels,
                threshold=request.threshold
            )
        )

        # Convert the raw dicts into the typed response schema
        entities = [
            Entity(
                text=entity['text'],
                label=entity['label'],
                score=round(entity['score'], 3)
            )
            for entity in raw_entities
        ]

        inference_time = int((time.time() - start_time) * 1000)

        logger.info("✅ Extracted %d entities in %dms", len(entities), inference_time)

        return ExtractResponse(
            entities=entities,
            model="GLiNER-small-v2.1",
            inference_time_ms=inference_time
        )

    except Exception as e:
        logger.error("❌ Entity extraction error: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
344
+
345
+
346
@app.post("/audio", tags=["Audio"])
async def generate_audio(request: AudioRequest):
    """
    Generate speech audio using Edge-TTS.

    Uses Microsoft's cloud API (zero local resources).
    Returns MP3 audio stream.

    Raises:
        HTTPException: 500 if the TTS communicator cannot be created.
    """
    try:
        logger.info("🔊 Generating audio with voice: %s", request.voice)

        # Create TTS communicator
        communicate = edge_tts.Communicate(
            text=request.text,
            voice=request.voice,
            rate=request.rate,
            volume=request.volume
        )

        # Stream audio chunks as they arrive from the Edge-TTS service.
        # NOTE(review): errors raised inside this generator happen AFTER the
        # response has started streaming, so the except clause below cannot
        # turn them into an HTTP 500 — confirm this trade-off is intended.
        async def audio_generator():
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    yield chunk["data"]

        logger.info("✅ Audio generation started")

        return StreamingResponse(
            audio_generator(),
            media_type="audio/mpeg",
            headers={
                # Plain string literal: the filename is constant, so the
                # original f-prefix had no placeholders and was removed.
                "Content-Disposition": "attachment; filename=audio.mp3"
            }
        )

    except Exception as e:
        logger.error("❌ Audio generation error: %s", e)
        raise HTTPException(status_code=500, detail=str(e))
384
+
385
+
386
+ # ============================================================================
387
+ # Application Entry Point
388
+ # ============================================================================
389
+
390
if __name__ == "__main__":
    import uvicorn

    # Run server directly. This path is for local development only; in the
    # container, the Dockerfile CMD launches uvicorn with the same settings.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=7860,
        workers=1,
        log_level="info"
    )
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web Framework
2
+ fastapi==0.115.5
3
+ uvicorn[standard]==0.32.1
4
+ pydantic==2.10.3
5
+ python-multipart==0.0.6
6
+
7
+ # HTTP Client (for model downloads and health checks)
8
+ httpx==0.26.0
9
+ requests==2.31.0
10
+
11
+ # Llama-cpp-python - CPU-optimized LLM inference
12
+ # Will be compiled with CMAKE_ARGS from Dockerfile
13
+ llama-cpp-python==0.2.90
14
+
15
+ # GLiNER - Fast CPU-based NER
16
+ gliner==0.2.19
17
+
18
+ # Edge-TTS - Cloud-based TTS (zero local resources)
19
+ edge-tts==6.1.15
20
+
21
+ # HuggingFace Hub - Model downloads
22
+ huggingface-hub==0.26.5
23
+
24
+ # Logging and utilities
25
+ python-dotenv==1.0.0