import os
import time
import uuid

import requests
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel, Field

MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

os.makedirs(MODEL_DIR, exist_ok=True)

# Download the GGUF weights on first start; later starts reuse the cached file
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model from {MODEL_URL}...")
    response = requests.get(MODEL_URL, stream=True, timeout=60)
    if response.status_code == 200:
        with open(MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Model downloaded successfully!")
    else:
        raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
else:
    print("Model already exists. Skipping download.")

app = FastAPI(
    title="DeepSeek-R1 OpenAI-Compatible API",
    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
    version="1.0.0"
)

# Allow browser clients from any origin to call the API
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model once at startup; n_gpu_layers=0 keeps inference on the CPU
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        verbose=False
    )
    print("Model loaded successfully!")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}") from e

@app.get("/", response_class=HTMLResponse)
async def root():
    # SPACE_HOST on Hugging Face Spaces is a bare hostname, so prepend the scheme
    space_host = os.environ.get("SPACE_HOST")
    base_url = f"https://{space_host}" if space_host else "http://localhost:7860"
    return f"""
    <html>
    <head>
        <title>DeepSeek-R1 OpenAI API</title>
        <style>
            body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
            .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
            a {{ color: #007bff; text-decoration: none; }}
            code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
        </style>
    </head>
    <body>
        <h1>DeepSeek-R1 OpenAI-Compatible API</h1>

        <div class="warning">
            <h3>⚠️ Important Notice</h3>
            <p>For private use, please duplicate this Space:<br>
            1. Click your profile picture in the top-right<br>
            2. Select "Duplicate Space"<br>
            3. Set visibility to Private</p>
        </div>

        <h2>API Documentation</h2>
        <ul>
            <li><a href="/docs">Interactive Swagger Documentation</a></li>
            <li><a href="/redoc">ReDoc Documentation</a></li>
        </ul>

        <h2>API Endpoints</h2>
        <h3>Chat Completion</h3>
        <p><code>POST /v1/chat/completions</code></p>
        <p>Parameters:</p>
        <ul>
            <li><strong>messages</strong>: List of message objects</li>
            <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
            <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
            <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
        </ul>

        <h2>Example Request</h2>
        <pre>
curl -X POST "{base_url}/v1/chat/completions" \\
  -H "Content-Type: application/json" \\
  -d '{{
    "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
    "max_tokens": 150
  }}'
        </pre>
    </body>
    </html>
    """

# OpenAI-style request body
class ChatCompletionRequest(BaseModel):
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
    messages: list[dict]
    max_tokens: int = 128
    temperature: float = 0.7
    top_p: float = 0.9
    stream: bool = False  # accepted for compatibility; streaming is not implemented
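
# A request body matching the model above (same example as the docs page):
#
#   {
#     "messages": [{"role": "user", "content": "Explain quantum computing"}],
#     "max_tokens": 150
#   }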

# OpenAI-style response body; id and created are generated per response
class ChatCompletionResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{uuid.uuid4().hex}")
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
    choices: list[dict]
    usage: dict

@app.post("/v1/chat/completions")
async def chat_completion(request: ChatCompletionRequest):
    if request.stream:
        raise HTTPException(status_code=400, detail="Streaming is not supported")
    try:
        # Flatten the chat history into a simple role-prefixed prompt
        prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in request.messages)
        prompt += "\nassistant:"

        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=["</s>"]
        )

        # llama-cpp-python reports real token counts in response["usage"]
        return ChatCompletionResponse(
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response["choices"][0]["text"].strip()
                },
                "finish_reason": response["choices"][0].get("finish_reason", "stop")
            }],
            usage=response["usage"]
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
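
# Alternative sketch (not used here): llama-cpp-python can apply the model's
# own chat template and return an OpenAI-shaped dict directly, replacing the
# hand-rolled "role: content" prompt above:
#
#   result = llm.create_chat_completion(
#       messages=request.messages,
#       max_tokens=request.max_tokens,
#       temperature=request.temperature,
#       top_p=request.top_p,
#   )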

@app.get("/health")
def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
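
# Example client usage (sketch): any OpenAI-compatible client can talk to this
# server. Assumes the `openai` Python package (v1+) and a server running
# locally on port 7860; the api_key is a placeholder since it is never checked.
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   reply = client.chat.completions.create(
#       model="DeepSeek-R1-Distill-Qwen-1.5B",
#       messages=[{"role": "user", "content": "Explain quantum computing"}],
#       max_tokens=150,
#   )
#   print(reply.choices[0].message.content)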