from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import multiprocessing

app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"

model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE
)
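
# hf_hub_download caches the file locally (under ~/.cache/huggingface/hub
# by default), so repeated startups reuse the downloaded weights instead
# of fetching them again.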

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
llm = Llama(
    model_path=model_path,
    # Larger context for coding tasks
    n_ctx=8192,
    # Use all CPU cores
    n_threads=multiprocessing.cpu_count(),
    # CPU inference
    n_gpu_layers=0,
    # PERFORMANCE BOOST
    n_batch=512,
    use_mmap=True,
    use_mlock=True,
)
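
# Note: use_mlock above asks the OS to pin the weights in RAM; on hosts
# with a low memlock limit the lock may be refused, in which case
# llama.cpp logs a warning and continues with plain mmap.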

# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    message: str

# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
    return {"status": "Coding AI engine running"}

# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
    # CODING-SPECIALIZED SYSTEM PROMPT (ChatML format, which Qwen2.5 expects)
    system_prompt = (
        "<|im_start|>system\n"
        "You are an elite senior software engineer AI. "
        "Write clean, production-ready code. "
        "Always include comments. "
        "Use best practices, error handling, and optimization. "
        "Format output in proper markdown with code blocks."
        "<|im_end|>\n"
    )
    prompt = system_prompt + f"<|im_start|>user\n{req.message}<|im_end|>\n<|im_start|>assistant\n"

    output = llm(
        prompt,
        # Larger token output for code
        max_tokens=800,
        # Lower randomness = better code
        temperature=0.4,
        # Stable generation
        top_p=0.9,
        # Prevent repetition loops
        repeat_penalty=1.2,
        # Stop at the ChatML end-of-turn token
        stop=["<|im_end|>"]
    )

    response_text = output["choices"][0]["text"].strip()
    return {"reply": response_text}

# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
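
# ===============================
# EXAMPLE REQUEST (sketch)
# ===============================
# A minimal client sketch, assuming the server is reachable at
# http://localhost:7860 as configured above:
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/chat",
#       json={"message": "Write a Python function that reverses a string."},
#   )
#   print(resp.json()["reply"])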