Spaces:
Running
Running
| import os | |
| import requests | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from llama_cpp import Llama | |
| from pydantic import BaseModel | |
| import uvicorn | |
# Where the quantized GGUF weights are fetched from, and where they are
# cached on local disk.
MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

os.makedirs(MODEL_DIR, exist_ok=True)

if os.path.exists(MODEL_PATH):
    print("Model already exists. Skipping download.")
else:
    print(f"Downloading model from {MODEL_URL}...")
    # Stream into a temporary ".part" file and rename it only once the whole
    # body has been written. Writing straight to MODEL_PATH would leave a
    # truncated file behind after an interrupted download, and the
    # os.path.exists() check above would then accept it on the next start.
    partial_path = MODEL_PATH + ".part"
    # timeout=(connect, read) keeps a stalled connection from hanging startup
    # forever; the context manager releases the connection when done.
    with requests.get(MODEL_URL, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
        try:
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            os.replace(partial_path, MODEL_PATH)
        finally:
            # After a successful os.replace() the partial file is gone and
            # this is a no-op; after a failure it removes the leftover.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("Model downloaded successfully!")
# The ASGI application object that uvicorn serves.
app = FastAPI(title="DeepSeek-R1 OpenAI-Compatible API")

# CORS Configuration: any origin, any HTTP method and any request header
# is allowed.
_cors_options = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)
# Load the GGUF model once, at import time. The resulting module-level
# ``llm`` object is what chat_completion calls to generate text.
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        verbose=False,
    )
except Exception as e:
    # Chain the original error so the traceback shows the real cause.
    raise RuntimeError(f"Failed to load model: {e}") from e
else:
    print("Model loaded successfully!")
class ChatCompletionRequest(BaseModel):
    """Request body accepted by the chat completion handler.

    ``messages`` is the only required field; every other field falls back
    to the default declared below.
    """

    # Name of the model the client is asking for.
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"

    # Conversation history, one dict per message.
    messages: list[dict]

    # Sampling parameters.
    max_tokens: int = 128
    temperature: float = 0.7
    top_p: float = 0.9

    # Whether the client asked for a streamed response.
    stream: bool = False
class ChatCompletionResponse(BaseModel):
    """Response body returned by the chat completion handler.

    ``choices`` and ``usage`` must be supplied by the caller. ``id`` and
    ``created`` default to fixed placeholder constants, so a caller that
    needs a unique id or a real timestamp has to pass them explicitly.
    """

    # Metadata; all four fields have static defaults.
    id: str = "chatcmpl-12345"
    object: str = "chat.completion"
    created: int = 1693161600
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"

    # Generated completions, one dict per choice.
    choices: list[dict]

    # Usage counters for the request.
    usage: dict
# Register the handler on the app; without a route decorator the function is
# never reachable over HTTP. The path follows the OpenAI API convention.
@app.post("/v1/chat/completions")
async def chat_completion(request: ChatCompletionRequest):
    """Generate one assistant reply for the given conversation.

    The messages are flattened into a plain ``role: content`` transcript
    ending in ``assistant:``, which is passed to the module-level ``llm``.

    Args:
        request: Conversation plus sampling parameters.

    Returns:
        A ChatCompletionResponse with a single choice and the token usage
        reported by the model.

    Raises:
        HTTPException: 400 when a message lacks ``role`` or ``content``;
            500 when generation or response construction fails.
    """
    import time
    import uuid

    # Validate before the try block: the broad ``except Exception`` below
    # would otherwise turn this client error into a 500.
    for msg in request.messages:
        if "role" not in msg or "content" not in msg:
            raise HTTPException(
                status_code=400,
                detail="Each message must contain 'role' and 'content' keys.",
            )

    try:
        prompt = "\n".join(
            f"{msg['role']}: {msg['content']}" for msg in request.messages
        )
        prompt += "\nassistant:"

        # NOTE: this is a synchronous call inside an async handler, so the
        # event loop is blocked while the model generates.
        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=["</s>"],
        )

        choice = response["choices"][0]
        # Token counts come from the model; len() of the strings would count
        # characters, not tokens.
        usage = response["usage"]

        return ChatCompletionResponse(
            # Fresh id and timestamp per response instead of the static
            # placeholder defaults on the response model.
            id=f"chatcmpl-{uuid.uuid4().hex}",
            created=int(time.time()),
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": choice["text"].strip(),
                },
                # Report the real reason (e.g. "length" when max_tokens was
                # hit) rather than always claiming "stop".
                "finish_reason": choice.get("finish_reason") or "stop",
            }],
            usage={
                "prompt_tokens": usage["prompt_tokens"],
                "completion_tokens": usage["completion_tokens"],
                "total_tokens": usage["total_tokens"],
            },
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# Register the probe on the app; without a route decorator the function is
# never reachable over HTTP.
@app.get("/health")
def health_check():
    """Return a static payload indicating the service is up."""
    return {"status": "healthy"}
if __name__ == "__main__":
    # Serve the app on all network interfaces, port 7860, when this file is
    # run directly as a script.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)