```python
from fastapi import FastAPI
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# GGUF model configuration
REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"

app = FastAPI()

# Download and cache the GGUF model
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    cache_dir=os.getenv("HF_HOME", "./models"),
)
print(f"Model downloaded to: {model_path}")

# Load the model with llama-cpp-python
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # Context window
    n_threads=4,      # CPU threads
    n_gpu_layers=0,   # CPU only (set >0 to offload layers if a GPU is available)
    verbose=False,
)
print("Model loaded successfully!")

@app.post("/chat")
def chat(req: dict):
    messages = req.get("messages", [])
    max_tokens = req.get("max_tokens", 256)
    temperature = req.get("temperature", 0.7)

    # Use llama-cpp-python's built-in chat completion
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=["</s>", "User:", "###"],
    )
    return {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": response["choices"][0]["message"]["content"],
            }
        }]
    }

@app.get("/")
def root():
    return {"status": "DeepSeek API is online (GGUF)"}
```