Soumik Bose committed on
Commit
d5c9ae8
·
0 Parent(s):
Files changed (5) hide show
  1. .gitignore +0 -0
  2. Dockerfile +46 -0
  3. README.md +9 -0
  4. main.py +112 -0
  5. requirements.txt +5 -0
.gitignore ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use Python 3.11 Slim as requested
FROM python:3.11-slim

# Set environment variables
#   PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED: standard container hygiene
#   PORT=7860: Hugging Face Spaces specific port
#   HF_HOME / TRANSFORMERS_CACHE: writable cache for the non-root user
#   PATH: pip installs run as the non-root user below, so packages go to
#         /home/user/.local — BUG FIX: without /home/user/.local/bin on PATH,
#         the `uvicorn` console script in CMD is not found and the container
#         fails to start.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860 \
    HF_HOME=/app/cache \
    TRANSFORMERS_CACHE=/app/cache \
    PATH=/home/user/.local/bin:$PATH

WORKDIR /app

# 1. Install system tools (needed for compiling llama-cpp if wheels miss)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# 2. Create a non-root user "user" with ID 1000 (Required for HF Spaces)
RUN useradd -m -u 1000 user

# 3. Create necessary directories with correct permissions
#    We need a cache folder that the user can write to when downloading the model
RUN mkdir -p /app/cache && \
    mkdir -p /app/models && \
    chown -R user:user /app

# 4. Switch to the non-root user
USER user

# 5. Install Python dependencies
#    We copy requirements first to leverage Docker layer caching
COPY --chown=user:user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# 6. Copy the application code
COPY --chown=user:user main.py .

# 7. Expose the port
EXPOSE 7860

# 8. Run the application
#    host 0.0.0.0 is required for Docker networking
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: My AI-Chat API
3
+ emoji: 🤥
4
+ colorFrom: red
5
+ colorTo: pink
6
+ sdk: docker
7
+ app_file: main.py
8
+ pinned: false
9
+ ---
main.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Standard library
import logging
import os
from contextlib import asynccontextmanager
from typing import List, Optional

# Third party
from fastapi import FastAPI, HTTPException, Request
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel, Field

# --- 1. Logger Setup (Production Standard) ---
# One named logger for the whole service, with a conventional timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("qwen-api")

# --- 2. Global State ---
# Populated exactly once by the lifespan handler at startup; None until then,
# which endpoints use to report "not ready".
model_instance: Optional[Llama] = None

# Settings: which GGUF checkpoint to pull from the Hub, and how many CPU
# threads to give llama.cpp (overridable via the CPU_THREADS env var).
REPO_ID = "Qwen/Qwen2.5-1.5B-Instruct-GGUF"
FILENAME = "qwen2.5-1.5b-instruct-q4_k_m.gguf"
N_THREADS = int(os.getenv("CPU_THREADS", "2"))  # Default to 2 for your hardware
# --- 3. Lifespan (Startup/Shutdown Logic) ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download and load the GGUF model at startup; release it at shutdown.

    Runs once per server process. On any failure the exception is re-raised
    so the process fails fast instead of serving a half-initialized API.
    """
    global model_instance
    logger.info("STARTUP: Initializing Application...")

    try:
        # Download model using huggingface_hub (More robust than curl)
        logger.info(f"Downloading model {REPO_ID} -> {FILENAME}...")
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            local_dir="./models",
        )
        logger.info(f"Model downloaded to: {model_path}")

        # Load Model
        logger.info("Loading Llama model into memory...")
        model_instance = Llama(
            model_path=model_path,
            n_ctx=4096,           # context window size
            n_threads=N_THREADS,  # CPU threads, tunable via CPU_THREADS env var
            n_batch=512,
            verbose=False,
        )
        logger.info("STARTUP: Model loaded successfully!")

    except Exception:
        # logger.exception records the full traceback (logger.error with an
        # f-string did not), and a bare `raise` is the idiomatic re-raise.
        logger.exception("CRITICAL: Failed to load model")
        raise

    yield

    # Shutdown logic (if needed)
    logger.info("SHUTDOWN: Cleaning up resources...")
    model_instance = None
# --- 4. FastAPI App Definition ---
# The lifespan handler above owns model download/load/teardown.
app = FastAPI(title="Qwen Production API", version="1.0.0", lifespan=lifespan)


# --- 5. Data Models ---
class Message(BaseModel):
    """One chat turn in the OpenAI message format (role + content)."""

    role: str  # passed through to the model's chat template as-is
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body for the OpenAI-compatible /v1/chat/completions endpoint."""

    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False
# --- 6. Endpoints ---

@app.get("/")
async def root():
    """Root endpoint to verify api is reachable"""
    logger.info("Health check on root / accessed")
    payload = {
        "message": "Qwen 2.5 (1.5B) CPU Inference API is Running",
        "docs_url": "/docs",
    }
    return payload
@app.get("/ping")
async def ping():
    """Simple health check for monitoring tools"""
    # Deliberately does no model work, so monitors never block on inference.
    return "pong"
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completion endpoint.

    Returns the completion dict produced by llama-cpp-python.
    Raises:
        HTTPException 503 — model not loaded yet (startup still running/failed).
        HTTPException 400 — stream=True requested (not supported; see below).
        HTTPException 500 — inference failed.
    """
    if not model_instance:
        logger.error("Request received but model not loaded")
        raise HTTPException(status_code=503, detail="Model is not ready yet")

    # BUG FIX: passing stream=True made create_chat_completion return a
    # generator, which FastAPI cannot JSON-serialize — the response was
    # broken. Reject it explicitly until SSE via StreamingResponse is added.
    if request.stream:
        raise HTTPException(status_code=400, detail="Streaming is not supported")

    logger.info(f"Generating completion. Temp: {request.temperature}, MaxTokens: {request.max_tokens}")

    try:
        # llama-cpp-python handles the chat formatting automatically
        response = model_instance.create_chat_completion(
            messages=[m.model_dump() for m in request.messages],
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=False,  # guarded above; always a plain dict here
        )
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the chain.
        logger.exception("Inference Error")
        raise HTTPException(status_code=500, detail="Internal inference error") from e
    return response
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi>=0.115.0
2
+ uvicorn>=0.30.0
3
+ pydantic>=2.8.0
4
+ llama-cpp-python>=0.2.90
5
+ huggingface-hub>=0.24.0