Dmitry Beresnev committed
Commit dde400a · 1 Parent(s): 8837f11

fix dockerfile and app module

Files changed (2)
  1. Dockerfile +22 -14
  2. app.py +167 -47
Dockerfile CHANGED
@@ -9,16 +9,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libcurl4-openssl-dev \
     && rm -rf /var/lib/apt/lists/*
 
-# Clone and build llama.cpp
+# Clone and build llama.cpp with optimizations for speed
 WORKDIR /build
 # Cache bust to force fresh build
-ARG CACHEBUST=1
+ARG CACHEBUST=2
 RUN git clone https://github.com/ggerganov/llama.cpp.git && \
     cd llama.cpp && \
     cmake -B build -DCMAKE_BUILD_TYPE=Release \
-        -DGGML_NATIVE=OFF \
-        -DGGML_AVX2=OFF \
-        -DGGML_OPENMP=OFF && \
+        -DGGML_NATIVE=ON \
+        -DGGML_AVX2=ON \
+        -DGGML_FMA=ON \
+        -DGGML_F16C=ON && \
     cmake --build build --config Release --target llama-server -j$(nproc) && \
     echo "=== Binary dependencies ===" && \
     ldd build/bin/llama-server || true
@@ -41,26 +42,33 @@ COPY --from=builder /build/llama.cpp/build/bin/*.so.* /usr/local/lib/
 # Update library cache
 RUN ldconfig
 
+# Install Python and FastAPI dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+RUN pip3 install --no-cache-dir fastapi uvicorn requests pydantic --break-system-packages
+
 # Create non-root user
 RUN useradd -m -u 1000 user && \
     mkdir -p /home/user/.cache/llama.cpp && \
     chown -R user:user /home/user
 
+# Copy application code
+COPY --chown=user:user app.py /home/user/app.py
+
 USER user
 WORKDIR /home/user
 
 # Set environment variables
 ENV HOME=/home/user \
     LLAMA_CACHE=/home/user/.cache/llama.cpp \
-    PATH=/home/user/.local/bin:$PATH
+    PATH=/home/user/.local/bin:$PATH \
+    PYTHONUNBUFFERED=1
 
 EXPOSE 7860
 
-# Start llama-server with HuggingFace model
-# Using DeepSeek LLM 7B Chat - general purpose model
-CMD ["llama-server", \
-    "-hf", "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf", \
-    "--host", "0.0.0.0", \
-    "--port", "7860", \
-    "-c", "2048", \
-    "--metrics"]
+# Start FastAPI app (which manages llama-server internally)
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
app.py CHANGED
@@ -1,59 +1,179 @@
-from fastapi import FastAPI
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import subprocess
+import signal
 import os
-
-# GGUF model configuration
-REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
-FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
+import requests
+import time
+from typing import Optional
 
 app = FastAPI()
 
-# Download and cache the GGUF model
-print(f"Downloading {FILENAME} from {REPO_ID}...")
-model_path = hf_hub_download(
-    repo_id=REPO_ID,
-    filename=FILENAME,
-    cache_dir=os.getenv("HF_HOME", "./models")
-)
-print(f"Model downloaded to: {model_path}")
-
-# Load the model with llama-cpp-python
-print("Loading model into memory...")
-llm = Llama(
-    model_path=model_path,
-    n_ctx=2048,  # Context window
-    n_threads=4,  # CPU threads
-    n_gpu_layers=0,  # Use CPU only (set >0 if GPU available)
-    verbose=False
-)
-print("Model loaded successfully!")
+# Predefined list of available models
+AVAILABLE_MODELS = {
+    # === Financial & Summarization Models (Recommended) ===
+    "qwen-2.5-7b": "bartowski/Qwen2.5-7B-Instruct-GGUF:Qwen2.5-7B-Instruct-Q4_K_M.gguf",  # Best for financial + multilingual
+    "kimi-k2-9b": "bartowski/k2-chat-GGUF:k2-chat-Q4_K_M.gguf",  # Kimi K2 - long context, good reasoning
+    "yi-1.5-9b": "bartowski/Yi-1.5-9B-Chat-GGUF:Yi-1.5-9B-Chat-Q4_K_M.gguf",  # Excellent for finance
+    "llama-3.1-8b": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",  # Great reasoning
+    "mistral-7b": "TheBloke/Mistral-7B-Instruct-v0.3-GGUF:mistral-7b-instruct-v0.3.Q4_K_M.gguf",  # Reliable summarization
 
+    # === Coding Models ===
+    "deepseek-coder": "TheBloke/deepseek-coder-6.7B-instruct-GGUF:deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
 
-@app.post("/v1/chat/completions")
-def chat(req: dict):
-    messages = req.get("messages", [])
-    max_tokens = req.get("max_tokens", 256)
-    temperature = req.get("temperature", 0.7)
-
-    # Use llama-cpp-python's built-in chat completion
-    response = llm.create_chat_completion(
-        messages=messages,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        stop=["</s>", "User:", "###"]
+    # === General Purpose ===
+    "deepseek-chat": "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
+    "llama-3.2-3b": "bartowski/Llama-3.2-3B-Instruct-GGUF:Llama-3.2-3B-Instruct-Q4_K_M.gguf",  # Fast & lightweight
+}
+
+# Global state
+current_model = "deepseek-chat"  # Default model
+llama_process: Optional[subprocess.Popen] = None
+LLAMA_SERVER_PORT = 8080
+LLAMA_SERVER_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
+
+
+class ModelSwitchRequest(BaseModel):
+    model_name: str
+
+
+class ChatCompletionRequest(BaseModel):
+    messages: list[dict]
+    max_tokens: int = 256
+    temperature: float = 0.7
+
+
+def start_llama_server(model_id: str) -> subprocess.Popen:
+    """Start llama-server with specified model (optimized for speed)."""
+    cmd = [
+        "llama-server",
+        "-hf", model_id,
+        "--host", "0.0.0.0",
+        "--port", str(LLAMA_SERVER_PORT),
+        "-c", "2048",  # Context size
+        "-t", "4",  # CPU threads (adjust based on cores)
+        "-ngl", "0",  # GPU layers (0 for CPU-only)
+        "--cont-batching",  # Enable continuous batching for speed
+        "-b", "512",  # Batch size
+    ]
+
+    print(f"Starting llama-server with model: {model_id}")
+    process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        preexec_fn=os.setsid if os.name != 'nt' else None
     )
 
+    # Wait for server to be ready
+    max_retries = 60
+    for i in range(max_retries):
+        try:
+            response = requests.get(f"{LLAMA_SERVER_URL}/health", timeout=1)
+            if response.status_code == 200:
+                print(f"llama-server ready after {i+1} seconds")
+                return process
+        except:
+            time.sleep(1)
+
+    raise RuntimeError("llama-server failed to start")
+
+
+def stop_llama_server():
+    """Stop the running llama-server."""
+    global llama_process
+    if llama_process:
+        print("Stopping llama-server...")
+        try:
+            if os.name != 'nt':
+                os.killpg(os.getpgid(llama_process.pid), signal.SIGTERM)
+            else:
+                llama_process.terminate()
+            llama_process.wait(timeout=10)
+        except:
+            if os.name != 'nt':
+                os.killpg(os.getpgid(llama_process.pid), signal.SIGKILL)
+            else:
+                llama_process.kill()
+        llama_process = None
+        time.sleep(2)  # Give it time to fully shut down
+
+
+@app.on_event("startup")
+async def startup_event():
+    """Start with default model."""
+    global llama_process
+    model_id = AVAILABLE_MODELS[current_model]
+    llama_process = start_llama_server(model_id)
+
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Clean shutdown."""
+    stop_llama_server()
+
+
+@app.get("/")
+async def root():
     return {
-        "choices": [{
-            "message": {
-                "role": "assistant",
-                "content": response["choices"][0]["message"]["content"]
-            }
-        }]
+        "status": "DeepSeek API with dynamic model switching",
+        "current_model": current_model,
+        "available_models": list(AVAILABLE_MODELS.keys())
     }
 
 
-@app.get("/")
-def root():
-    return {"status": "DeepSeek API is online (GGUF)"}
+@app.get("/models")
+async def list_models():
+    """List all available models."""
+    return {
+        "current_model": current_model,
+        "available_models": list(AVAILABLE_MODELS.keys())
+    }
+
+
+@app.post("/switch-model")
+async def switch_model(request: ModelSwitchRequest):
+    """Switch to a different model."""
+    global current_model, llama_process
+
+    if request.model_name not in AVAILABLE_MODELS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Model '{request.model_name}' not found. Available: {list(AVAILABLE_MODELS.keys())}"
+        )
+
+    if request.model_name == current_model:
+        return {"message": f"Already using model: {current_model}"}
+
+    # Stop current server
+    stop_llama_server()
+
+    # Start with new model
+    model_id = AVAILABLE_MODELS[request.model_name]
+    llama_process = start_llama_server(model_id)
+    current_model = request.model_name
+
+    return {
+        "message": f"Switched to model: {current_model}",
+        "model": current_model
+    }
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    """OpenAI-compatible chat completions endpoint."""
+    try:
+        # Forward to llama-server
+        response = requests.post(
+            f"{LLAMA_SERVER_URL}/v1/chat/completions",
+            json={
+                "messages": request.messages,
+                "max_tokens": request.max_tokens,
+                "temperature": request.temperature,
+            },
+            timeout=300
+        )
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
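
For reference, the new endpoints can be exercised from a client roughly as follows. This is a sketch, not part of the commit: the base URL and the chosen model key are assumptions, but the routes and payload fields match the handlers defined above, and the response shape follows llama-server's OpenAI-compatible completion format.

import requests

BASE_URL = "http://localhost:7860"  # assumed; use the Space URL when deployed

# List the predefined models and see which one is currently loaded.
print(requests.get(f"{BASE_URL}/models", timeout=10).json())

# Switch the backing llama-server to another predefined model.
# This blocks while the old process is stopped and the new one warms up.
resp = requests.post(f"{BASE_URL}/switch-model", json={"model_name": "llama-3.2-3b"}, timeout=600)
print(resp.json())

# Send an OpenAI-style chat completion request; app.py forwards it to llama-server.
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Summarize the Q3 revenue trend in one sentence."}],
        "max_tokens": 128,
        "temperature": 0.3,
    },
    timeout=300,
)
print(resp.json()["choices"][0]["message"]["content"])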