from fastapi import FastAPI
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# GGUF model configuration
REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"

app = FastAPI()

# Download and cache the GGUF model
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    cache_dir=os.getenv("HF_HOME", "./models")
)
print(f"Model downloaded to: {model_path}")

# Load the model with llama-cpp-python
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # Context window
    n_threads=4,  # CPU threads
    n_gpu_layers=0,  # Use CPU only (set >0 if GPU available)
    verbose=False
)
print("Model loaded successfully!")


@app.post("/v1/chat/completions")
def chat(req: dict):
    messages = req.get("messages", [])
    max_tokens = req.get("max_tokens", 256)
    temperature = req.get("temperature", 0.7)

    # Use llama-cpp-python's built-in chat completion
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=["</s>", "User:", "###"]
    )

    return {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": response["choices"][0]["message"]["content"]
            }
        }]
    }


@app.get("/")
def root():
    return {"status": "DeepSeek API is online (GGUF)"}