AGI / app.py
from fastapi import FastAPI
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# GGUF model configuration
REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"

app = FastAPI()

# Download and cache the GGUF model
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    cache_dir=os.getenv("HF_HOME", "./models")
)
print(f"Model downloaded to: {model_path}")

# Load the model with llama-cpp-python
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # Context window
    n_threads=4,      # CPU threads
    n_gpu_layers=0,   # Use CPU only (set >0 if GPU available)
    verbose=False
)
print("Model loaded successfully!")


@app.post("/v1/chat/completions")
def chat(req: dict):
    messages = req.get("messages", [])
    max_tokens = req.get("max_tokens", 256)
    temperature = req.get("temperature", 0.7)

    # Use llama-cpp-python's built-in chat completion
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=["</s>", "User:", "###"]
    )

    return {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": response["choices"][0]["message"]["content"]
            }
        }]
    }
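# Example request body for the endpoint above (illustrative values, not part of
# the original file):
#   {
#     "messages": [{"role": "user", "content": "Write a quicksort in Python"}],
#     "max_tokens": 256,
#     "temperature": 0.7
#   }
# The response mirrors the OpenAI chat-completions shape, with the generated text
# available at choices[0].message.content.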
@app.get("/")
def root():
    return {"status": "DeepSeek API is online (GGUF)"}
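# A minimal sketch of how to run and query this service locally, assuming uvicorn
# is installed and port 7860 is free (both are assumptions, not stated in this file):
#   uvicorn app:app --host 0.0.0.0 --port 7860
#   curl -X POST http://localhost:7860/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Hello"}]}'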