File size: 2,289 Bytes
23e0603
 
 
 
c7135bb
23e0603
 
 
c7135bb
 
 
 
23e0603
 
 
 
 
 
 
 
c7135bb
 
 
 
23e0603
 
c7135bb
 
 
 
 
 
 
 
 
 
 
 
 
 
23e0603
 
c7135bb
 
 
 
23e0603
 
 
c7135bb
 
 
 
23e0603
 
c7135bb
 
 
 
 
23e0603
 
 
c7135bb
 
 
 
 
 
 
 
 
 
 
 
 
 
23e0603
c7135bb
 
 
 
 
 
 
 
 
48b1cf8
c7135bb
 
 
 
23e0603
 
48b1cf8
c7135bb
 
 
 
 
 
 
23e0603
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import multiprocessing

app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================

# Hugging Face repo and quantized (Q4_K_M) GGUF weights file to serve.
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"

# Download the weights (cached by huggingface_hub after the first run)
# and get the local path. NOTE(review): this runs at import time, so app
# startup blocks until the download/cache lookup completes.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE
)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================

# Single module-level model instance shared by all requests.
llm = Llama(
    model_path=model_path,

    # Larger context for coding tasks
    n_ctx=8192,

    # Use all CPU cores
    n_threads=multiprocessing.cpu_count(),

    # CPU inference
    n_gpu_layers=0,

    # PERFORMANCE BOOST
    n_batch=512,      # prompt-processing batch size
    use_mmap=True,    # memory-map weights instead of copying into RAM
    use_mlock=True,   # pin mapped pages so the OS doesn't swap them out
)

# ===============================
# REQUEST MODEL
# ===============================

class ChatRequest(BaseModel):
    """Request body for the /chat endpoint."""

    # The user's message/coding question, forwarded verbatim to the model.
    message: str

# ===============================
# HEALTH CHECK
# ===============================

@app.get("/")
def root():
    return {"status": "Coding AI engine running"}

# ===============================
# CHAT ENDPOINT
# ===============================

@app.post("/chat")
def chat(req: ChatRequest):

    # CODING SPECIALIZED SYSTEM PROMPT
    system_prompt = (
        "<|system|>"
        "You are an elite senior software engineer AI. "
        "Write clean, production-ready code. "
        "Always include comments. "
        "Use best practices, error handling, and optimization. "
        "Format output in proper markdown with code blocks."
        "<|end|>"
    )

    prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"

    output = llm(
        prompt,

        # Larger token output for code
        max_tokens=800,

        # Lower randomness = better code
        temperature=0.4,

        # Stable generation
        top_p=0.9,

        # Prevent repetition loops
        repeat_penalty=1.2,

        stop=["<|end|>"]
    )

    response_text = output["choices"][0]["text"].strip()

    return {"reply": response_text}

# ===============================
# LOCAL RUN
# ===============================

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)