Soumik Bose committed on
Commit
257c70f
·
1 Parent(s): d5c9ae8
Files changed (2) hide show
  1. Dockerfile +7 -15
  2. main.py +58 -55
Dockerfile CHANGED
@@ -1,46 +1,38 @@
1
- # Use Python 3.11 Slim as requested
2
  FROM python:3.11-slim
3
 
4
  # Set environment variables
5
  ENV PYTHONDONTWRITEBYTECODE=1 \
6
  PYTHONUNBUFFERED=1 \
7
- # Hugging Face Spaces specific port
8
  PORT=7860 \
9
- # Configure cache to be writable by non-root user
10
  HF_HOME=/app/cache \
11
  TRANSFORMERS_CACHE=/app/cache
12
 
13
  WORKDIR /app
14
 
15
- # 1. Install system tools (needed for compiling llama-cpp if wheels miss)
16
  RUN apt-get update && apt-get install -y \
17
  build-essential \
18
  curl \
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
- # 2. Create a non-root user "user" with ID 1000 (Required for HF Spaces)
22
  RUN useradd -m -u 1000 user
23
 
24
- # 3. Create necessary directories with correct permissions
25
- # We need a cache folder that the user can write to when downloading the model
26
  RUN mkdir -p /app/cache && \
27
  mkdir -p /app/models && \
28
  chown -R user:user /app
29
 
30
- # 4. Switch to the non-root user
31
  USER user
32
 
33
- # 5. Install Python dependencies
34
- # We copy requirements first to leverage Docker layer caching
35
  COPY --chown=user:user requirements.txt .
36
  RUN pip install --no-cache-dir -r requirements.txt
37
 
38
- # 6. Copy the application code
39
  COPY --chown=user:user main.py .
40
 
41
- # 7. Expose the port
42
  EXPOSE 7860
43
-
44
- # 8. Run the application
45
- # host 0.0.0.0 is required for Docker networking
46
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
# Base image: slim Python 3.11 keeps the final image small.
FROM python:3.11-slim

# Runtime environment:
#  - no .pyc files, unbuffered stdout/stderr (clean container logs)
#  - PORT 7860 is the port Hugging Face Spaces expects
#  - HF_HOME / TRANSFORMERS_CACHE point model downloads at a user-writable dir
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860 \
    HF_HOME=/app/cache \
    TRANSFORMERS_CACHE=/app/cache

WORKDIR /app

# 1. Install compilers (Required for llama-cpp build)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# 2. Create non-root user (Security requirement for HF Spaces)
RUN useradd -m -u 1000 user

# 3. Create writable cache directories
RUN mkdir -p /app/cache && \
    mkdir -p /app/models && \
    chown -R user:user /app

# 4. Switch user
USER user

# 5. Install dependencies
COPY --chown=user:user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# 6. Copy application code
COPY --chown=user:user main.py .

# 7. Launch
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py CHANGED
@@ -1,75 +1,85 @@
1
  import os
2
  import logging
 
3
  from contextlib import asynccontextmanager
4
- from fastapi import FastAPI, HTTPException, Request
5
- from pydantic import BaseModel, Field
6
  from typing import List, Optional
 
 
 
 
7
  from llama_cpp import Llama
8
  from huggingface_hub import hf_hub_download
9
 
10
- # --- 1. Logger Setup (Production Standard) ---
11
  logging.basicConfig(
12
  level=logging.INFO,
13
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
14
  )
15
- logger = logging.getLogger("qwen-api")
 
 
 
 
 
16
 
17
- # --- 2. Global State ---
18
- model_instance: Optional[Llama] = None
19
 
20
- # Settings
21
- REPO_ID = "Qwen/Qwen2.5-1.5B-Instruct-GGUF"
22
- FILENAME = "qwen2.5-1.5b-instruct-q4_k_m.gguf"
23
- N_THREADS = int(os.getenv("CPU_THREADS", "2")) # Default to 2 for your hardware
24
 
25
- # --- 3. Lifespan (Startup/Shutdown Logic) ---
26
  @asynccontextmanager
27
  async def lifespan(app: FastAPI):
28
- global model_instance
29
- logger.info("STARTUP: Initializing Application...")
30
 
31
  try:
32
- # Download model using huggingface_hub (More robust than curl)
33
- logger.info(f"Downloading model {REPO_ID} -> {FILENAME}...")
34
  model_path = hf_hub_download(
35
  repo_id=REPO_ID,
36
  filename=FILENAME,
37
- local_dir="./models"
38
  )
39
- logger.info(f"Model downloaded to: {model_path}")
40
 
41
- # Load Model
42
- logger.info("Loading Llama model into memory...")
43
- model_instance = Llama(
 
 
44
  model_path=model_path,
45
- n_ctx=4096,
46
- n_threads=N_THREADS,
47
- n_batch=512,
48
- verbose=False
49
  )
50
- logger.info("STARTUP: Model loaded successfully!")
 
 
51
 
52
  except Exception as e:
53
- logger.error(f"CRITICAL: Failed to load model: {e}")
54
  raise e
55
-
56
- yield
57
 
58
- # Shutdown logic (if needed)
59
- logger.info("SHUTDOWN: Cleaning up resources...")
60
- model_instance = None
 
61
 
62
- # --- 4. FastAPI App Definition ---
63
- app = FastAPI(title="Qwen Production API", version="1.0.0", lifespan=lifespan)
64
 
65
  # --- 5. Data Models ---
66
  class Message(BaseModel):
67
  role: str
68
  content: str
69
 
70
- class ChatCompletionRequest(BaseModel):
71
  messages: List[Message]
72
- temperature: Optional[float] = 0.7
73
  max_tokens: Optional[int] = 512
74
  stream: Optional[bool] = False
75
 
@@ -77,36 +87,29 @@ class ChatCompletionRequest(BaseModel):
77
 
78
  @app.get("/")
79
  async def root():
80
- """Root endpoint to verify api is reachable"""
81
- logger.info("Health check on root / accessed")
82
- return {
83
- "message": "Qwen 2.5 (1.5B) CPU Inference API is Running",
84
- "docs_url": "/docs"
85
- }
86
 
87
  @app.get("/ping")
88
  async def ping():
89
- """Simple health check for monitoring tools"""
90
- return "pong"
 
91
 
92
  @app.post("/v1/chat/completions")
93
- async def chat_completions(request: ChatCompletionRequest):
94
- """OpenAI-compatible chat completion endpoint"""
95
- if not model_instance:
96
- logger.error("Request received but model not loaded")
97
- raise HTTPException(status_code=503, detail="Model is not ready yet")
98
 
99
- logger.info(f"Generating completion. Temp: {request.temperature}, MaxTokens: {request.max_tokens}")
100
 
101
  try:
102
- # llama-cpp-python handles the chat formatting automatically
103
- response = model_instance.create_chat_completion(
104
  messages=[m.model_dump() for m in request.messages],
105
  temperature=request.temperature,
106
  max_tokens=request.max_tokens,
107
  stream=request.stream
108
  )
109
- return response
110
  except Exception as e:
111
- logger.error(f"Inference Error: {str(e)}")
112
- raise HTTPException(status_code=500, detail="Internal inference error")
 
1
  import os
2
  import logging
3
+ import time
4
  from contextlib import asynccontextmanager
 
 
5
  from typing import List, Optional
6
+
7
+ from fastapi import FastAPI, HTTPException
8
+ from fastapi.responses import JSONResponse
9
+ from pydantic import BaseModel
10
  from llama_cpp import Llama
11
  from huggingface_hub import hf_hub_download
12
 
13
# --- 1. Logging Setup ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("SmolLM-API")

# --- 2. Model Configuration ---
# GGUF build of SmolLM2 1.7B (Instruct variant), 4-bit K-quant file.
REPO_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
FILENAME = "smollm2-1.7b-instruct-q4_k_m.gguf"

# CPU threads used for inference; overridable via the CPU_THREADS env var.
N_THREADS = int(os.getenv("CPU_THREADS", "2"))

# Global handle to the loaded model; populated by the lifespan hook at startup.
llm_model: Optional[Llama] = None
29
 
30
# --- 3. Lifecycle Manager ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download and load the GGUF model at startup; release it at shutdown.

    Runs once around the FastAPI application's lifetime: everything before
    ``yield`` is startup, everything after is shutdown.

    Raises:
        Exception: re-raised when the download or model load fails, so the
            server refuses to start with a broken model.
    """
    global llm_model
    logger.info("--- STARTING SMOLLM2 API ---")

    try:
        # Step A: Download (cached under HF_HOME, so container restarts are fast).
        logger.info(f"Downloading {FILENAME} from Hugging Face...")
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=os.getenv("HF_HOME", "/app/cache")
        )
        logger.info(f"Download complete: {model_path}")

        # Step B: Load into RAM
        logger.info(f"Initializing Engine (Threads: {N_THREADS})...")
        start_time = time.time()

        llm_model = Llama(
            model_path=model_path,
            n_ctx=2048,  # 2048 is standard for SmolLM
            n_threads=N_THREADS,
            n_batch=512,
            verbose=False
        )

        duration = time.time() - start_time
        logger.info(f"SmolLM2 Loaded in {duration:.2f} seconds.")

    except Exception as e:
        logger.critical(f"Startup Failed: {e}")
        # Bare `raise` keeps the original traceback; `raise e` would re-anchor it here.
        raise

    yield

    # Cleanup: rebind the global to None instead of `del`-ing it, so any
    # late access to llm_model sees None rather than raising NameError.
    if llm_model:
        llm_model = None
        logger.info("Model unloaded.")
71
 
72
# --- 4. FastAPI App ---
# Application instance; `lifespan` wires model load/unload into startup/shutdown.
app = FastAPI(title="SmolLM2 API", version="2.0", lifespan=lifespan)
74
 
75
# --- 5. Data Models ---
class Message(BaseModel):
    """One chat turn: an OpenAI-style role/content pair."""
    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body accepted by /v1/chat/completions."""
    messages: List[Message]
    temperature: Optional[float] = 0.6
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False
85
 
 
87
 
88
@app.get("/")
async def root():
    """Liveness endpoint: reports the service is up and which model it serves."""
    return {"status": "Running", "model": "SmolLM2-1.7B-Instruct"}
 
 
 
 
 
91
 
92
@app.get("/ping")
async def ping():
    """Readiness probe: 200 once the model is loaded, 503 while it is loading."""
    if not llm_model:
        # Model not in memory yet -> tell monitors we are not ready.
        return JSONResponse(status_code=503, content={"status": "loading"})
    return {"status": "pong", "ready": True}
97
 
98
@app.post("/v1/chat/completions")
async def chat(request: ChatRequest):
    """OpenAI-compatible chat completion endpoint backed by the local GGUF model."""
    if not llm_model:
        # Startup (download + load) has not finished yet.
        raise HTTPException(status_code=503, detail="Model loading...")

    logger.info(f"Processing request: {len(request.messages)} msgs")

    try:
        # llama-cpp-python handles the chat template automatically
        # NOTE(review): with stream=True this call returns an iterator rather
        # than a dict — confirm clients handle that (no StreamingResponse here).
        result = llm_model.create_chat_completion(
            messages=[m.model_dump() for m in request.messages],
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream
        )
        return result
    except Exception as e:
        logger.error(f"Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))