"""FastAPI service exposing a local Qwen2.5-Coder GGUF model for coding chat.

Downloads the quantized model from the Hugging Face Hub at import time and
serves a single synchronous /chat endpoint backed by llama-cpp-python
CPU inference.
"""

import multiprocessing

from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"

# Cached locally by huggingface_hub; only hits the network on first run.
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
llm = Llama(
    model_path=model_path,
    n_ctx=8192,                             # larger context for coding tasks
    n_threads=multiprocessing.cpu_count(),  # use all CPU cores
    n_gpu_layers=0,                         # pure CPU inference
    n_batch=512,                            # bigger prompt-eval batches
    use_mmap=True,                          # memory-map the model file
    use_mlock=True,                         # pin pages to avoid swapping
)

# Coding-specialized system prompt (plain text — no template tokens; the
# model's own chat template is applied by create_chat_completion below).
SYSTEM_PROMPT = (
    "You are an elite senior software engineer AI. "
    "Write clean, production-ready code. "
    "Always include comments. "
    "Use best practices, error handling, and optimization. "
    "Format output in proper markdown with code blocks."
)


# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    # Single user turn; no conversation history is kept server-side.
    message: str


# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
    """Liveness probe: confirms the service (and loaded model) is up."""
    return {"status": "Coding AI engine running"}


# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a coding-focused reply for a single user message.

    BUG FIX: the original hand-built prompt used Phi-style control tokens
    (<|system|>/<|user|>/<|assistant|>/<|end|>), which are not the Qwen2.5
    chat format (ChatML <|im_start|>/<|im_end|>) — the model saw them as
    literal text and the stop sequence never matched the real end token.
    create_chat_completion applies the chat template embedded in the GGUF
    metadata, so the special tokens are always correct for the loaded model.
    """
    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": req.message},
        ],
        max_tokens=800,      # larger token output for code
        temperature=0.4,     # lower randomness = better code
        top_p=0.9,           # stable generation
        repeat_penalty=1.2,  # prevent repetition loops
    )
    reply = output["choices"][0]["message"]["content"].strip()
    return {"reply": reply}


# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)