# NOTE(review): Hugging Face Space status banner ("Spaces: Sleeping") removed
# from this scraped copy — it is page chrome, not part of the application code.
| """ | |
| FastAPI server providing OpenAI-compatible endpoints for code generation. | |
| Designed to work with MCP servers and provide unlimited tokens with minimal rate limiting. | |
| """ | |
| import os | |
| import time | |
| import uuid | |
| from typing import Optional, List, Dict, Any | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel, Field | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
| import uvicorn | |
# ============================================================================
# CONFIGURATION
# ============================================================================
# Hugging Face Hub location of the quantized (Q4_K_M) GGUF weights.
MODEL_REPO = "TheBloke/deepseek-coder-1.3b-instruct-GGUF"
MODEL_FILE = "deepseek-coder-1.3b-instruct.Q4_K_M.gguf"
# Model id reported by the OpenAI-compatible endpoints.
MODEL_NAME = "deepseek-coder-1.3b-instruct"
# Context and generation settings for "unlimited" tokens
MAX_CONTEXT = 4096  # Larger context window
MAX_TOKENS = 4096  # Allow very long responses
DEFAULT_TEMP = 0.7  # default sampling temperature when a request omits one
DEFAULT_TOP_P = 0.95  # default nucleus-sampling cutoff when a request omits one
# ============================================================================
# PYDANTIC MODELS (OpenAI-compatible)
# ============================================================================
class Message(BaseModel):
    """One chat message; role is expected to be "system", "user", or
    "assistant" (see messages_to_prompt, which ignores other roles)."""
    role: str
    content: str
class ChatCompletionRequest(BaseModel):
    """Request body for the chat-completions endpoint (OpenAI-compatible subset)."""
    model: str = MODEL_NAME  # model id; echoed back in the response
    messages: List[Message]
    temperature: Optional[float] = DEFAULT_TEMP
    top_p: Optional[float] = DEFAULT_TOP_P
    max_tokens: Optional[int] = MAX_TOKENS  # generous default for "unlimited" feel
    stream: Optional[bool] = False  # streaming is rejected with 501 by the handler
    stop: Optional[List[str]] = None  # optional stop strings; server has its own defaults
class CompletionRequest(BaseModel):
    """Request body for the plain text-completions endpoint (OpenAI-compatible subset)."""
    model: str = MODEL_NAME  # model id; echoed back in the response
    prompt: str
    temperature: Optional[float] = DEFAULT_TEMP
    top_p: Optional[float] = DEFAULT_TOP_P
    max_tokens: Optional[int] = MAX_TOKENS  # generous default for "unlimited" feel
    stop: Optional[List[str]] = None  # optional stop strings; none applied by default
class Usage(BaseModel):
    """Token accounting block; values are rough estimates (see estimate_tokens)."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
class ChatCompletionChoice(BaseModel):
    """A single generated chat alternative within a ChatCompletionResponse."""
    index: int
    message: Message
    finish_reason: str
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible response envelope for chat completions."""
    id: str
    object: str = "chat.completion"  # fixed discriminator per the OpenAI schema
    created: int  # Unix timestamp (seconds)
    model: str
    choices: List[ChatCompletionChoice]
    usage: Usage
class CompletionChoice(BaseModel):
    """A single generated text alternative within a CompletionResponse."""
    index: int
    text: str
    finish_reason: str
class CompletionResponse(BaseModel):
    """OpenAI-compatible response envelope for plain text completions."""
    id: str
    object: str = "text_completion"  # fixed discriminator per the OpenAI schema
    created: int  # Unix timestamp (seconds)
    model: str
    choices: List[CompletionChoice]
    usage: Usage
# ============================================================================
# FASTAPI APP
# ============================================================================
app = FastAPI(
    title="Code LLM API",
    description="OpenAI-compatible API for code generation with minimal rate limiting",
    version="1.0.0"
)
# Enable CORS for MCP server access
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): fully open CORS — acceptable only for a public demo API
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Global model instance
# Stays None until the startup loader finishes; handlers answer 503 meanwhile.
llm: Optional[Llama] = None
# ============================================================================
# MODEL LOADING
# ============================================================================
@app.on_event("startup")
async def load_model():
    """Download (if not cached) and load the GGUF model into the global `llm`.

    Registered as a FastAPI startup hook so the server starts serving right
    away while endpoints report 503 until loading completes.
    NOTE(review): the startup/route decorators appear to have been stripped by
    formatting in this copy; restored here — confirm against the original.
    """
    global llm
    print(f"Downloading model {MODEL_REPO}/{MODEL_FILE}...")
    # hf_hub_download caches locally, so restarts skip the actual download.
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
    print(f"Model downloaded to: {model_path}")
    print("Loading model into memory...")
    llm = Llama(
        model_path=model_path,
        n_ctx=MAX_CONTEXT,
        n_threads=8,     # CPU threads for generation
        n_batch=1024,    # prompt-processing batch size
        verbose=False,
        n_gpu_layers=0,  # CPU-only deployment (no GPU layers offloaded)
    )
    print("Model loaded successfully!")
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def messages_to_prompt(messages: List[Message]) -> str:
    """Render OpenAI-style chat messages as an Alpaca-style instruct prompt.

    Each known role maps to a "### <Section>:" header; messages with any
    other role are silently skipped. A trailing "### Response:" header is
    always appended to cue the model to answer.
    """
    role_headers = {
        "system": "### System:",
        "user": "### Instruction:",
        "assistant": "### Response:",
    }
    rendered = [
        f"{role_headers[message.role]} {message.content}"
        for message in messages
        if message.role in role_headers
    ]
    rendered.append("### Response:")
    return "\n".join(rendered)
def estimate_tokens(text: str) -> int:
    """Crude token count for usage reporting: assume ~4 characters per token."""
    whole_tokens, _remainder = divmod(len(text), 4)
    return whole_tokens
# ============================================================================
# API ENDPOINTS
# ============================================================================
@app.get("/")
async def root():
    """Health check endpoint: reports service status and available routes.

    NOTE(review): route decorators appear to have been stripped by formatting
    in this copy; restored from the paths this payload itself advertises.
    """
    return {
        "status": "online",
        "model": MODEL_NAME,
        "max_context": MAX_CONTEXT,
        "max_tokens": MAX_TOKENS,
        "endpoints": {
            "chat": "/v1/chat/completions",
            "completion": "/v1/completions",
            "models": "/v1/models"
        }
    }
@app.get("/health")
async def health():
    """Health check for monitoring: "healthy" once the model has loaded.

    NOTE(review): route decorator restored (lost in formatting) — confirm
    the path against the original deployment.
    """
    return {
        "status": "healthy" if llm is not None else "loading",
        "model_loaded": llm is not None
    }
@app.get("/v1/models")
async def list_models():
    """List available models (OpenAI-compatible /v1/models payload).

    NOTE(review): route decorator restored (lost in formatting) from the
    path advertised by the root endpoint.
    """
    return {
        "object": "list",
        "data": [
            {
                "id": MODEL_NAME,
                "object": "model",
                "created": int(time.time()),  # no real creation date; use "now"
                "owned_by": "huggingface",
                "permission": [],
                "root": MODEL_NAME,
                "parent": None
            }
        ]
    }
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """
    OpenAI-compatible chat completions endpoint.
    No rate limiting - designed for unlimited use.

    Raises 503 while the model is still loading, 501 for streaming requests,
    and 500 if generation itself fails.
    NOTE(review): route decorator restored (lost in formatting) from the path
    advertised by the root endpoint.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")
    if request.stream:
        raise HTTPException(status_code=501, detail="Streaming not yet implemented")

    # Convert messages to prompt
    prompt = messages_to_prompt(request.messages)

    # Generate response. Explicit None checks keep falsy-but-valid values
    # (temperature=0.0 is common for code) from being silently replaced by
    # the defaults, as the previous `or` fallbacks did.
    try:
        output = llm(
            prompt,
            max_tokens=request.max_tokens if request.max_tokens is not None else MAX_TOKENS,
            temperature=request.temperature if request.temperature is not None else DEFAULT_TEMP,
            top_p=request.top_p if request.top_p is not None else DEFAULT_TOP_P,
            stop=request.stop or ["###", "\n\n\n"],
            echo=False,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}") from e

    choice = output['choices'][0]
    generated_text = choice['text'].strip()

    # Estimate token usage (rough: ~4 chars per token)
    prompt_tokens = estimate_tokens(prompt)
    completion_tokens = estimate_tokens(generated_text)

    return ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
        created=int(time.time()),
        model=request.model,
        choices=[
            ChatCompletionChoice(
                index=0,
                message=Message(role="assistant", content=generated_text),
                # Report the backend's actual finish reason ("length" when
                # max_tokens was hit) instead of hard-coding "stop".
                finish_reason=choice.get('finish_reason') or "stop",
            )
        ],
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )
@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """
    OpenAI-compatible completions endpoint.
    No rate limiting - designed for unlimited use.

    Raises 503 while the model is still loading and 500 if generation fails.
    NOTE(review): route decorator restored (lost in formatting) from the path
    advertised by the root endpoint.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")

    # Explicit None checks keep falsy-but-valid values (e.g. temperature=0.0)
    # from being silently replaced by the defaults, as the old `or` fallbacks did.
    try:
        output = llm(
            request.prompt,
            max_tokens=request.max_tokens if request.max_tokens is not None else MAX_TOKENS,
            temperature=request.temperature if request.temperature is not None else DEFAULT_TEMP,
            top_p=request.top_p if request.top_p is not None else DEFAULT_TOP_P,
            stop=request.stop or [],
            echo=False,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}") from e

    choice = output['choices'][0]
    generated_text = choice['text'].strip()

    # Estimate token usage (rough: ~4 chars per token)
    prompt_tokens = estimate_tokens(request.prompt)
    completion_tokens = estimate_tokens(generated_text)

    return CompletionResponse(
        id=f"cmpl-{uuid.uuid4().hex[:8]}",
        created=int(time.time()),
        model=request.model,
        choices=[
            CompletionChoice(
                index=0,
                text=generated_text,
                # "length" when max_tokens was hit, otherwise "stop".
                finish_reason=choice.get('finish_reason') or "stop",
            )
        ],
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )
# ============================================================================
# SIMPLE ENDPOINTS (for easier testing)
# ============================================================================
@app.get("/generate")
async def generate(prompt: str, max_tokens: int = 512):
    """Simple generation endpoint for quick testing.

    NOTE(review): the route decorator was lost in formatting; GET /generate
    with query parameters is a best guess — confirm the original method/path.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")
    try:
        output = llm(prompt, max_tokens=max_tokens, temperature=0.7)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {
        "prompt": prompt,
        "response": output['choices'][0]['text'].strip(),
        "model": MODEL_NAME
    }
# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    # Hugging Face Spaces serves the app on port 7860 unless PORT overrides it.
    serve_port = int(os.getenv("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=serve_port, log_level="info")