lamionx committed on
Commit
8a464c3
·
1 Parent(s): 6a0df05

Deploy AI API with FastAPI + Mistral-7B: streaming, authentication, multi-language support, caching

Files changed (4)
  1. Dockerfile +77 -0
  2. README.md +45 -8
  3. app.py +734 -0
  4. requirements.txt +18 -0
Dockerfile ADDED
@@ -0,0 +1,77 @@
+ # ============================================================================
+ # AI API Dockerfile - Optimized for Hugging Face Spaces
+ # ============================================================================
+ # This Dockerfile is configured to run on Hugging Face Spaces with:
+ # - Port 7860 (HF Spaces default)
+ # - Non-root user for security
+ # - CUDA support for GPU acceleration
+ # - 4-bit quantization for efficient memory usage
+ # ============================================================================
+
+ FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
+
+ # Set environment variables
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PIP_DISABLE_PIP_VERSION_CHECK=1 \
+     CUDA_HOME=/usr/local/cuda \
+     PATH=/usr/local/cuda/bin:${PATH} \
+     LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     python3.10 \
+     python3-pip \
+     python3-dev \
+     build-essential \
+     git \
+     wget \
+     curl \
+     ca-certificates \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create non-root user for security
+ RUN useradd -m -u 1000 -s /bin/bash appuser
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements first (for better caching)
+ COPY requirements.txt .
+
+ # Upgrade pip and install Python dependencies
+ RUN pip3 install --upgrade pip setuptools wheel && \
+     pip3 install --no-cache-dir -r requirements.txt
+
+ # Copy application code (the entrypoint added in this commit is app.py)
+ COPY app.py .
+
+ # Create necessary directories
+ RUN mkdir -p /app/logs && \
+     chown -R appuser:appuser /app
+
+ # Switch to non-root user
+ USER appuser
+
+ # Set default environment variables (can be overridden)
+ ENV MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2" \
+     API_KEY="your-secret-api-key-here" \
+     MAX_LENGTH="2048" \
+     TEMPERATURE="0.7" \
+     TOP_P="0.95" \
+     CACHE_SIZE="100" \
+     PORT="7860" \
+     HOST="0.0.0.0"
+
+ # Expose port 7860 (Hugging Face Spaces default)
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD curl -f http://localhost:7860/health || exit 1
+
+ # Run the application
+ CMD ["python3", "app.py"]
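
For local testing, the image can be built and run roughly as below. This is a sketch: the image tag is a placeholder, and a GPU plus the NVIDIA container runtime are assumed, since the app loads the model with 4-bit CUDA quantization.

```shell
# Build the image (run from the repo root, next to Dockerfile and app.py)
docker build -t ai-api-mistral .

# Run with GPU access, overriding the default API key
docker run --gpus all -p 7860:7860 \
  -e API_KEY="change-me" \
  ai-api-mistral

# Once the model has loaded, check liveness:
curl http://localhost:7860/health
```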
README.md CHANGED
@@ -1,13 +1,50 @@
  ---
- title: Laminou
- emoji: 🏢
- colorFrom: gray
- colorTo: blue
- sdk: gradio
- sdk_version: 6.3.0
- app_file: app.py
+ title: AI API - Mistral 7B
+ emoji: 🤖
+ colorFrom: indigo
+ colorTo: purple
+ sdk: docker
+ app_port: 7860
  pinned: false
  license: apache-2.0
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # AI API - Mistral 7B
+
+ Production-ready AI API powered by Mistral-7B-Instruct-v0.2 with:
+
+ - ⚡ FastAPI with streaming responses
+ - 🔐 API Key authentication
+ - 🌍 Multi-language support (8 languages)
+ - 💾 Smart response caching
+ - 🎨 Built-in web interface
+ - 🚀 Optimized 4-bit quantization
+
+ ## Usage
+
+ ### Web Interface
+
+ Open the Space URL in your browser to access the interactive chat interface.
+
+ ### API Endpoint
+
+ ```bash
+ curl -X POST "https://lamionx-laminou.hf.space/api/chat" \
+   -H "X-API-Key: your-secret-api-key-here" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "prompt": "Explain quantum computing",
+     "language": "en",
+     "stream": false
+   }'
+ ```
+
+ ### Configuration
+
+ Set your API key as a Space secret:
+ - Go to Settings → Repository secrets
+ - Add: `API_KEY` = `your-secret-api-key-here`
+
+ ## Documentation
+
+ For full documentation, visit the [GitHub repository](https://github.com/your-repo).
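
The curl example above can also be consumed from Python. The sketch below assumes the streaming endpoint emits SSE frames of the form `data: {"token": "..."}` terminated by `data: [DONE]` (the format served by app.py in this commit); the base URL and API key are placeholders, and `stream_chat` needs the third-party `requests` package.

```python
"""Minimal Python client sketch for the /api/chat streaming endpoint."""
import json


def parse_sse_line(line: str):
    """Return the token carried by one `data:` SSE line, or None."""
    if not line.startswith("data: "):
        return None
    payload = line[len("data: "):].strip()
    if payload == "[DONE]":
        return None
    try:
        return json.loads(payload).get("token")
    except json.JSONDecodeError:
        return None


def stream_chat(prompt, api_key, base_url="https://lamionx-laminou.hf.space"):
    """Yield response tokens as they arrive (needs `pip install requests`)."""
    import requests  # third-party; imported lazily so the parser works without it
    resp = requests.post(
        f"{base_url}/api/chat",
        headers={"X-API-Key": api_key, "Content-Type": "application/json"},
        json={"prompt": prompt, "language": "en", "stream": True},
        stream=True,
    )
    resp.raise_for_status()
    for raw in resp.iter_lines(decode_unicode=True):
        token = parse_sse_line(raw or "")
        if token is not None:
            yield token
```

Usage would look like `for tok in stream_chat("Hello", "your-secret-api-key-here"): print(tok, end="")`.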
app.py ADDED
@@ -0,0 +1,734 @@
+ """
+ AI API Server - FastAPI + Mistral-7B
+ Production-ready API with streaming, authentication, and caching
+ """
+
+ import os
+ import logging
+ import asyncio
+ from typing import Optional, Dict, Any, AsyncGenerator
+ from datetime import datetime
+ from functools import lru_cache
+ import hashlib
+
+ from fastapi import FastAPI, HTTPException, Header, Request, status
+ from fastapi.responses import StreamingResponse, HTMLResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, Field, validator
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+ import uvicorn
+
+ # ============================================================================
+ # LOGGING CONFIGURATION
+ # ============================================================================
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler("api.log")
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ # ============================================================================
+ # CONFIGURATION
+ # ============================================================================
+
+ class Config:
+     """Application configuration"""
+     MODEL_NAME = os.getenv("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.2")
+     API_KEY = os.getenv("API_KEY", "your-secret-api-key-here")
+     MAX_LENGTH = int(os.getenv("MAX_LENGTH", "2048"))
+     TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
+     TOP_P = float(os.getenv("TOP_P", "0.95"))
+     CACHE_SIZE = int(os.getenv("CACHE_SIZE", "100"))
+     PORT = int(os.getenv("PORT", "7860"))
+     HOST = os.getenv("HOST", "0.0.0.0")
+
+     # Quantization config for 4-bit loading (optimized for free hardware)
+     QUANTIZATION_CONFIG = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_compute_dtype=torch.float16,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4"
+     )
+
+ config = Config()
+
+ # ============================================================================
+ # PYDANTIC MODELS
+ # ============================================================================
+
+ class ChatRequest(BaseModel):
+     """Request model for chat endpoint"""
+     prompt: str = Field(..., min_length=1, max_length=4000, description="User prompt")
+     language: str = Field(default="en", description="Response language (en, pt, es)")
+     temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
+     max_tokens: Optional[int] = Field(default=None, ge=1, le=4096)
+     stream: bool = Field(default=True, description="Enable streaming response")
+
+     @validator("language")
+     def validate_language(cls, v):
+         allowed = ["en", "pt", "es", "fr", "de", "it", "ja", "zh"]
+         if v not in allowed:
+             raise ValueError(f"Language must be one of {allowed}")
+         return v
+
+ class ChatResponse(BaseModel):
+     """Response model for chat endpoint"""
+     response: str
+     language: str
+     model: str
+     timestamp: str
+     cached: bool = False
+
+ class HealthResponse(BaseModel):
+     """Health check response"""
+     status: str
+     model_loaded: bool
+     timestamp: str
+
+ # ============================================================================
+ # SYSTEM PROMPTS (MULTI-LANGUAGE)
+ # ============================================================================
+
+ SYSTEM_PROMPTS = {
+     "en": "You are a helpful, respectful and honest AI assistant. Always answer as helpfully as possible, while being safe. If you don't know the answer, say so instead of making up information.",
+     "pt": "Você é um assistente de IA útil, respeitoso e honesto. Sempre responda da forma mais útil possível, mantendo a segurança. Se não souber a resposta, diga isso ao invés de inventar informações.",
+     "es": "Eres un asistente de IA útil, respetuoso y honesto. Siempre responde de la manera más útil posible, manteniendo la seguridad. Si no sabes la respuesta, dilo en lugar de inventar información.",
+     "fr": "Vous êtes un assistant IA utile, respectueux et honnête. Répondez toujours de la manière la plus utile possible, tout en restant sûr. Si vous ne connaissez pas la réponse, dites-le au lieu d'inventer des informations.",
+     "de": "Sie sind ein hilfreicher, respektvoller und ehrlicher KI-Assistent. Antworten Sie immer so hilfreich wie möglich und bleiben Sie dabei sicher. Wenn Sie die Antwort nicht wissen, sagen Sie es, anstatt Informationen zu erfinden.",
+     "it": "Sei un assistente AI utile, rispettoso e onesto. Rispondi sempre nel modo più utile possibile, mantenendo la sicurezza. Se non conosci la risposta, dillo invece di inventare informazioni.",
+     "ja": "あなたは親切で、礼儀正しく、正直なAIアシスタントです。常に安全を保ちながら、できるだけ役立つように答えてください。答えがわからない場合は、情報を作り上げるのではなく、そう言ってください。",
+     "zh": "你是一个乐于助人、尊重他人且诚实的AI助手。在保持安全的同时,始终尽可能有帮助地回答。如果你不知道答案,请说出来,而不是编造信息。"
+ }
+
+ # ============================================================================
+ # SIMPLE CACHE IMPLEMENTATION
+ # ============================================================================
+
+ class ResponseCache:
+     """Simple in-memory cache for responses"""
+
+     def __init__(self, max_size: int = 100):
+         self.cache: Dict[str, tuple[str, datetime]] = {}
+         self.max_size = max_size
+         logger.info(f"Initialized cache with max size: {max_size}")
+
+     def _generate_key(self, prompt: str, language: str, temperature: float) -> str:
+         """Generate cache key from parameters"""
+         content = f"{prompt}:{language}:{temperature}"
+         return hashlib.md5(content.encode()).hexdigest()
+
+     def get(self, prompt: str, language: str, temperature: float) -> Optional[str]:
+         """Retrieve cached response"""
+         key = self._generate_key(prompt, language, temperature)
+         if key in self.cache:
+             response, timestamp = self.cache[key]
+             logger.info(f"Cache HIT for key: {key[:8]}...")
+             return response
+         logger.info(f"Cache MISS for key: {key[:8]}...")
+         return None
+
+     def set(self, prompt: str, language: str, temperature: float, response: str):
+         """Store response in cache"""
+         if len(self.cache) >= self.max_size:
+             # Remove oldest entry
+             oldest_key = min(self.cache.keys(), key=lambda k: self.cache[k][1])
+             del self.cache[oldest_key]
+             logger.info(f"Cache full, removed oldest entry: {oldest_key[:8]}...")
+
+         key = self._generate_key(prompt, language, temperature)
+         self.cache[key] = (response, datetime.now())
+         logger.info(f"Cached response for key: {key[:8]}...")
+
+ # ============================================================================
149
+ # MODEL LOADING
150
+ # ============================================================================
151
+
152
+ class ModelManager:
153
+ """Manages model loading and inference"""
154
+
155
+ def __init__(self):
156
+ self.model = None
157
+ self.tokenizer = None
158
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
159
+ logger.info(f"Device: {self.device}")
160
+
161
+ async def load_model(self):
162
+ """Load model with quantization"""
163
+ try:
164
+ logger.info(f"Loading model: {config.MODEL_NAME}")
165
+
166
+ # Load tokenizer
167
+ self.tokenizer = AutoTokenizer.from_pretrained(
168
+ config.MODEL_NAME,
169
+ trust_remote_code=True
170
+ )
171
+
172
+ # Load model with 4-bit quantization
173
+ self.model = AutoModelForCausalLM.from_pretrained(
174
+ config.MODEL_NAME,
175
+ quantization_config=config.QUANTIZATION_CONFIG,
176
+ device_map="auto",
177
+ trust_remote_code=True,
178
+ low_cpu_mem_usage=True
179
+ )
180
+
181
+ logger.info("Model loaded successfully!")
182
+ return True
183
+
184
+ except Exception as e:
185
+ logger.error(f"Failed to load model: {str(e)}", exc_info=True)
186
+ return False
187
+
188
+ def format_prompt(self, prompt: str, language: str) -> str:
189
+ """Format prompt with system message"""
190
+ system_prompt = SYSTEM_PROMPTS.get(language, SYSTEM_PROMPTS["en"])
191
+ return f"<s>[INST] {system_prompt}\n\nUser: {prompt} [/INST]"
192
+
+     async def generate_stream(
+         self,
+         prompt: str,
+         language: str,
+         temperature: float,
+         max_tokens: int
+     ) -> AsyncGenerator[str, None]:
+         """Generate response with streaming"""
+         # Local imports keep this method self-contained
+         from threading import Thread
+         from transformers import TextIteratorStreamer
+
+         try:
+             formatted_prompt = self.format_prompt(prompt, language)
+
+             # Tokenize input
+             inputs = self.tokenizer(
+                 formatted_prompt,
+                 return_tensors="pt",
+                 truncation=True,
+                 max_length=config.MAX_LENGTH
+             ).to(self.device)
+
+             # TextIteratorStreamer yields decoded text as generate() runs,
+             # instead of re-running a full generate() pass per token
+             streamer = TextIteratorStreamer(
+                 self.tokenizer,
+                 skip_prompt=True,
+                 skip_special_tokens=True
+             )
+             generation_kwargs = dict(
+                 **inputs,
+                 max_new_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=config.TOP_P,
+                 do_sample=True,
+                 pad_token_id=self.tokenizer.eos_token_id,
+                 streamer=streamer
+             )
+
+             # generate() blocks, so run it in a background thread and pull
+             # decoded text off the streamer without blocking the event loop
+             thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+             thread.start()
+
+             iterator = iter(streamer)
+             while True:
+                 token = await asyncio.to_thread(next, iterator, None)
+                 if token is None:
+                     break
+                 yield token
+
+             thread.join()
+
+         except Exception as e:
+             logger.error(f"Generation error: {str(e)}", exc_info=True)
+             yield f"\n\n[Error: {str(e)}]"
+
+     async def generate(
+         self,
+         prompt: str,
+         language: str,
+         temperature: float,
+         max_tokens: int
+     ) -> str:
+         """Generate complete response (non-streaming)"""
+         try:
+             formatted_prompt = self.format_prompt(prompt, language)
+
+             inputs = self.tokenizer(
+                 formatted_prompt,
+                 return_tensors="pt",
+                 truncation=True,
+                 max_length=config.MAX_LENGTH
+             ).to(self.device)
+
+             with torch.no_grad():
+                 outputs = self.model.generate(
+                     **inputs,
+                     max_new_tokens=max_tokens,
+                     temperature=temperature,
+                     top_p=config.TOP_P,
+                     do_sample=True,
+                     pad_token_id=self.tokenizer.eos_token_id
+                 )
+
+             response = self.tokenizer.decode(
+                 outputs[0][inputs["input_ids"].shape[1]:],
+                 skip_special_tokens=True
+             )
+
+             return response.strip()
+
+         except Exception as e:
+             logger.error(f"Generation error: {str(e)}", exc_info=True)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Generation failed: {str(e)}"
+             )
+
+ # ============================================================================
+ # FASTAPI APPLICATION
+ # ============================================================================
+
+ app = FastAPI(
+     title="AI API - Mistral 7B",
+     description="Production-ready AI API with streaming, authentication, and caching",
+     version="1.0.0",
+     docs_url="/docs",
+     redoc_url="/redoc"
+ )
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Global instances
+ model_manager = ModelManager()
+ cache = ResponseCache(max_size=config.CACHE_SIZE)
+
+ # ============================================================================
+ # AUTHENTICATION
+ # ============================================================================
+
+ async def verify_api_key(x_api_key: str = Header(..., alias="X-API-Key")):
+     """Verify API key from header"""
+     if x_api_key != config.API_KEY:
+         logger.warning(f"Invalid API key attempt: {x_api_key[:8]}...")
+         raise HTTPException(
+             status_code=status.HTTP_401_UNAUTHORIZED,
+             detail="Invalid API key"
+         )
+     return x_api_key
+
+ # ============================================================================
+ # STARTUP/SHUTDOWN EVENTS
+ # ============================================================================
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Load model on startup"""
+     logger.info("Starting AI API server...")
+     success = await model_manager.load_model()
+     if not success:
+         logger.error("Failed to load model, server may not function correctly")
+
+ @app.on_event("shutdown")
+ async def shutdown_event():
+     """Cleanup on shutdown"""
+     logger.info("Shutting down AI API server...")
+     # Clear cache
+     cache.cache.clear()
+
+ # ============================================================================
+ # ROUTES
+ # ============================================================================
+
+ @app.get("/", response_class=HTMLResponse)
+ async def root():
+     """Serve frontend HTML"""
+     html_content = """
+     <!DOCTYPE html>
+     <html lang="en">
+     <head>
+         <meta charset="UTF-8">
+         <meta name="viewport" content="width=device-width, initial-scale=1.0">
+         <title>AI API - Mistral 7B</title>
+         <script src="https://cdn.tailwindcss.com"></script>
+         <style>
+             @keyframes fadeIn {
+                 from { opacity: 0; transform: translateY(10px); }
+                 to { opacity: 1; transform: translateY(0); }
+             }
+             .message {
+                 animation: fadeIn 0.3s ease-out;
+             }
+             .typing-indicator {
+                 display: inline-block;
+             }
+             .typing-indicator span {
+                 display: inline-block;
+                 width: 8px;
+                 height: 8px;
+                 border-radius: 50%;
+                 background-color: #6366F1;
+                 margin: 0 2px;
+                 animation: typing 1.4s infinite;
+             }
+             .typing-indicator span:nth-child(2) {
+                 animation-delay: 0.2s;
+             }
+             .typing-indicator span:nth-child(3) {
+                 animation-delay: 0.4s;
+             }
+             @keyframes typing {
+                 0%, 60%, 100% { transform: translateY(0); }
+                 30% { transform: translateY(-10px); }
+             }
+         </style>
+     </head>
+     <body class="bg-gradient-to-br from-slate-50 to-slate-100 min-h-screen">
+         <div class="container mx-auto px-4 py-8 max-w-4xl">
+             <!-- Header -->
+             <div class="bg-white rounded-2xl shadow-lg p-6 mb-6">
+                 <h1 class="text-3xl font-bold text-slate-800 mb-2">🤖 AI API - Mistral 7B</h1>
+                 <p class="text-slate-600">Production-ready AI API with streaming responses</p>
+             </div>
+
+             <!-- API Key Section -->
+             <div class="bg-white rounded-2xl shadow-lg p-6 mb-6">
+                 <label class="block text-sm font-semibold text-slate-700 mb-2">API Key</label>
+                 <input
+                     type="password"
+                     id="apiKey"
+                     placeholder="Enter your API key"
+                     class="w-full px-4 py-3 border border-slate-300 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:border-transparent outline-none"
+                 />
+             </div>
+
+             <!-- Chat Interface -->
+             <div class="bg-white rounded-2xl shadow-lg p-6 mb-6">
+                 <div id="messages" class="space-y-4 mb-6 max-h-96 overflow-y-auto">
+                     <div class="text-center text-slate-400 py-8">
+                         Start a conversation by typing a message below
+                     </div>
+                 </div>
+
+                 <!-- Input Area -->
+                 <div class="flex gap-3">
+                     <select id="language" class="px-4 py-3 border border-slate-300 rounded-lg focus:ring-2 focus:ring-indigo-500 outline-none">
+                         <option value="en">English</option>
+                         <option value="pt">Português</option>
+                         <option value="es">Español</option>
+                         <option value="fr">Français</option>
+                         <option value="de">Deutsch</option>
+                         <option value="it">Italiano</option>
+                         <option value="ja">日本語</option>
+                         <option value="zh">中文</option>
+                     </select>
+                     <input
+                         type="text"
+                         id="prompt"
+                         placeholder="Type your message..."
+                         class="flex-1 px-4 py-3 border border-slate-300 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:border-transparent outline-none"
+                     />
+                     <button
+                         onclick="sendMessage()"
+                         id="sendBtn"
+                         class="px-6 py-3 bg-indigo-600 text-white font-semibold rounded-lg hover:bg-indigo-700 transition-colors disabled:bg-slate-300 disabled:cursor-not-allowed"
+                     >
+                         Send
+                     </button>
+                 </div>
+             </div>
+
+             <!-- Documentation -->
+             <div class="bg-white rounded-2xl shadow-lg p-6">
+                 <h2 class="text-xl font-bold text-slate-800 mb-4">📚 API Documentation</h2>
+                 <div class="space-y-4">
+                     <div>
+                         <h3 class="font-semibold text-slate-700 mb-2">Endpoint</h3>
+                         <code class="block bg-slate-100 p-3 rounded-lg text-sm">POST /api/chat</code>
+                     </div>
+                     <div>
+                         <h3 class="font-semibold text-slate-700 mb-2">Headers</h3>
+                         <code class="block bg-slate-100 p-3 rounded-lg text-sm">X-API-Key: your-api-key</code>
+                     </div>
+                     <div>
+                         <h3 class="font-semibold text-slate-700 mb-2">Example (curl)</h3>
+                         <pre class="bg-slate-100 p-3 rounded-lg text-sm overflow-x-auto"><code>curl -X POST "http://localhost:7860/api/chat" \\
+     -H "X-API-Key: your-secret-api-key-here" \\
+     -H "Content-Type: application/json" \\
+     -d '{
+       "prompt": "Explain quantum computing",
+       "language": "en",
+       "stream": false
+     }'</code></pre>
+                     </div>
+                 </div>
+             </div>
+         </div>
+
+         <script>
+             const messagesDiv = document.getElementById('messages');
+             const promptInput = document.getElementById('prompt');
+             const apiKeyInput = document.getElementById('apiKey');
+             const languageSelect = document.getElementById('language');
+             const sendBtn = document.getElementById('sendBtn');
+
+             // Load API key from localStorage
+             const savedApiKey = localStorage.getItem('apiKey');
+             if (savedApiKey) {
+                 apiKeyInput.value = savedApiKey;
+             }
+
+             // Save API key on change
+             apiKeyInput.addEventListener('change', () => {
+                 localStorage.setItem('apiKey', apiKeyInput.value);
+             });
+
+             // Send on Enter
+             promptInput.addEventListener('keypress', (e) => {
+                 if (e.key === 'Enter' && !e.shiftKey) {
+                     e.preventDefault();
+                     sendMessage();
+                 }
+             });
+
+             function addMessage(content, isUser = false) {
+                 if (messagesDiv.children[0]?.textContent.includes('Start a conversation')) {
+                     messagesDiv.innerHTML = '';
+                 }
+
+                 const messageDiv = document.createElement('div');
+                 messageDiv.className = `message flex ${isUser ? 'justify-end' : 'justify-start'}`;
+
+                 const bubble = document.createElement('div');
+                 bubble.className = `max-w-[70%] px-4 py-3 rounded-2xl ${
+                     isUser
+                         ? 'bg-indigo-600 text-white'
+                         : 'bg-slate-100 text-slate-800'
+                 }`;
+                 bubble.textContent = content;
+
+                 messageDiv.appendChild(bubble);
+                 messagesDiv.appendChild(messageDiv);
+                 messagesDiv.scrollTop = messagesDiv.scrollHeight;
+
+                 return bubble;
+             }
+
+             function addTypingIndicator() {
+                 const messageDiv = document.createElement('div');
+                 messageDiv.className = 'message flex justify-start';
+                 messageDiv.id = 'typing-indicator';
+
+                 const bubble = document.createElement('div');
+                 bubble.className = 'max-w-[70%] px-4 py-3 rounded-2xl bg-slate-100';
+                 bubble.innerHTML = '<div class="typing-indicator"><span></span><span></span><span></span></div>';
+
+                 messageDiv.appendChild(bubble);
+                 messagesDiv.appendChild(messageDiv);
+                 messagesDiv.scrollTop = messagesDiv.scrollHeight;
+             }
+
+             function removeTypingIndicator() {
+                 const indicator = document.getElementById('typing-indicator');
+                 if (indicator) {
+                     indicator.remove();
+                 }
+             }
+
+             async function sendMessage() {
+                 const prompt = promptInput.value.trim();
+                 const apiKey = apiKeyInput.value.trim();
+                 const language = languageSelect.value;
+
+                 if (!prompt) return;
+                 if (!apiKey) {
+                     alert('Please enter your API key');
+                     return;
+                 }
+
+                 // Add user message
+                 addMessage(prompt, true);
+                 promptInput.value = '';
+
+                 // Disable send button
+                 sendBtn.disabled = true;
+                 addTypingIndicator();
+
+                 try {
+                     const response = await fetch('/api/chat', {
+                         method: 'POST',
+                         headers: {
+                             'Content-Type': 'application/json',
+                             'X-API-Key': apiKey
+                         },
+                         body: JSON.stringify({
+                             prompt: prompt,
+                             language: language,
+                             stream: true
+                         })
+                     });
+
+                     if (!response.ok) {
+                         throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+                     }
+
+                     removeTypingIndicator();
+                     const bubble = addMessage('', false);
+
+                     // Read stream
+                     const reader = response.body.getReader();
+                     const decoder = new TextDecoder();
+                     let fullResponse = '';
+
+                     while (true) {
+                         const { done, value } = await reader.read();
+                         if (done) break;
+
+                         const chunk = decoder.decode(value);
+                         const lines = chunk.split('\\n');
+
+                         for (const line of lines) {
+                             if (line.startsWith('data: ')) {
+                                 const data = line.slice(6);
+                                 if (data === '[DONE]') break;
+
+                                 try {
+                                     const json = JSON.parse(data);
+                                     if (json.token) {
+                                         fullResponse += json.token;
+                                         bubble.textContent = fullResponse;
+                                         messagesDiv.scrollTop = messagesDiv.scrollHeight;
+                                     }
+                                 } catch (e) {
+                                     console.error('Parse error:', e);
+                                 }
+                             }
+                         }
+                     }
+
+                 } catch (error) {
+                     removeTypingIndicator();
+                     addMessage(`Error: ${error.message}`, false);
+                 } finally {
+                     sendBtn.disabled = false;
+                     promptInput.focus();
+                 }
+             }
+         </script>
+     </body>
+     </html>
+     """
+     return HTMLResponse(content=html_content)
+
+ @app.get("/health", response_model=HealthResponse)
+ async def health_check():
+     """Health check endpoint"""
+     return HealthResponse(
+         status="healthy",
+         model_loaded=model_manager.model is not None,
+         timestamp=datetime.now().isoformat()
+     )
+
+ @app.post("/api/chat")
+ async def chat(
+     request: ChatRequest,
+     api_key: str = Header(..., alias="X-API-Key")
+ ):
+     """
+     Chat endpoint with streaming support
+
+     Requires X-API-Key header for authentication
+     """
+     # Verify API key
+     await verify_api_key(api_key)
+
+     # Check if model is loaded
+     if model_manager.model is None:
+         raise HTTPException(
+             status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+             detail="Model not loaded yet, please try again later"
+         )
+
+     # Get parameters (explicit None checks so temperature=0.0 is honored)
+     temperature = request.temperature if request.temperature is not None else config.TEMPERATURE
+     max_tokens = request.max_tokens if request.max_tokens is not None else 512
+
+     try:
+         # Check cache for non-streaming requests
+         if not request.stream:
+             cached_response = cache.get(request.prompt, request.language, temperature)
+             if cached_response:
+                 return ChatResponse(
+                     response=cached_response,
+                     language=request.language,
+                     model=config.MODEL_NAME,
+                     timestamp=datetime.now().isoformat(),
+                     cached=True
+                 )
+
+         # Streaming response
+         if request.stream:
+             async def generate():
+                 import json  # local import keeps the SSE framing self-contained
+                 full_response = ""
+                 async for token in model_manager.generate_stream(
+                     request.prompt,
+                     request.language,
+                     temperature,
+                     max_tokens
+                 ):
+                     full_response += token
+                     # Emit valid JSON so the frontend's JSON.parse succeeds;
+                     # json.dumps also escapes quotes and newlines in the token
+                     yield f"data: {json.dumps({'token': token})}\n\n"
+
+                 # Cache complete response
+                 cache.set(request.prompt, request.language, temperature, full_response)
+                 yield "data: [DONE]\n\n"
+
+             return StreamingResponse(
+                 generate(),
+                 media_type="text/event-stream"
+             )
+
+         # Non-streaming response
+         else:
+             response = await model_manager.generate(
+                 request.prompt,
+                 request.language,
+                 temperature,
+                 max_tokens
+             )
+
+             # Cache response
+             cache.set(request.prompt, request.language, temperature, response)
+
+             return ChatResponse(
+                 response=response,
+                 language=request.language,
+                 model=config.MODEL_NAME,
+                 timestamp=datetime.now().isoformat(),
+                 cached=False
+             )
+
+     except Exception as e:
+         logger.error(f"Chat endpoint error: {str(e)}", exc_info=True)
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=str(e)
+         )
+
+ # ============================================================================
+ # MAIN
+ # ============================================================================
+
+ if __name__ == "__main__":
+     logger.info(f"Starting server on {config.HOST}:{config.PORT}")
+     uvicorn.run(
+         app,
+         host=config.HOST,
+         port=config.PORT,
+         log_level="info"
+     )
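
The `ResponseCache` above bounds memory by evicting the oldest entry when full. That policy can be exercised standalone; the following is a minimal sketch mirroring the class in app.py, with the logging trimmed:

```python
import hashlib
from datetime import datetime
from typing import Dict, Optional, Tuple


class ResponseCache:
    """Size-bounded cache keyed on (prompt, language, temperature);
    evicts the oldest entry once max_size is reached."""

    def __init__(self, max_size: int = 100):
        self.cache: Dict[str, Tuple[str, datetime]] = {}
        self.max_size = max_size

    def _key(self, prompt: str, language: str, temperature: float) -> str:
        # MD5 is used as a compact cache key, not for security
        return hashlib.md5(f"{prompt}:{language}:{temperature}".encode()).hexdigest()

    def get(self, prompt: str, language: str, temperature: float) -> Optional[str]:
        entry = self.cache.get(self._key(prompt, language, temperature))
        return entry[0] if entry else None

    def set(self, prompt: str, language: str, temperature: float, response: str):
        if len(self.cache) >= self.max_size:
            # Evict the entry with the earliest insertion timestamp
            oldest = min(self.cache, key=lambda k: self.cache[k][1])
            del self.cache[oldest]
        self.cache[self._key(prompt, language, temperature)] = (response, datetime.now())


cache = ResponseCache(max_size=2)
cache.set("a", "en", 0.7, "A")
cache.set("b", "en", 0.7, "B")
cache.set("c", "en", 0.7, "C")  # cache full: the oldest entry ("a") is evicted
```

Note this is insertion-time eviction (FIFO), not true LRU: `get` does not refresh an entry's timestamp.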
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ # FastAPI and Server
+ fastapi==0.115.0
+ uvicorn[standard]==0.32.0
+ python-multipart==0.0.12
+ pydantic==2.10.0
+ pydantic-settings==2.6.0
+
+ # AI/ML Libraries
+ torch==2.5.0
+ transformers==4.46.0
+ accelerate==1.1.0
+ bitsandbytes==0.44.0
+ sentencepiece==0.2.0
+ protobuf==5.28.0
+
+ # Utilities
+ python-dotenv==1.0.1
+ aiofiles==24.1.0