File size: 2,355 Bytes
885354f
73a88d0
b9c72be
 
cf95752
59d2308
885354f
3a784e6
e5cd937
 
 
 
bca4678
0eea540
db0a3fa
885354f
 
 
 
59d2308
e5cd937
 
 
 
885354f
 
e5cd937
 
cf95752
e5cd937
 
cf95752
e5cd937
 
 
 
 
 
 
 
885354f
cd2d9ab
e5cd937
 
 
 
73a88d0
 
4f9c2f2
e5cd937
 
 
 
8e15e85
 
e5cd937
 
 
 
 
8e15e85
73a88d0
 
cf95752
e5cd937
 
 
 
 
 
 
 
 
cf95752
 
 
e5cd937
ef3a046
cf95752
e5cd937
 
 
ef3a046
e5cd937
 
ef3a046
 
69bab40
e5cd937
 
 
 
 
 
 
e2bfa03
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import multiprocessing

# FastAPI application instance; route handlers below register against it.
app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================

# Hugging Face repo and the specific 4-bit quantized GGUF file to serve.
MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"

# NOTE: runs at import time — downloads the model (network I/O) on first
# start, then resolves to the local Hugging Face cache path on later runs.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE
)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================

# Module-level llama.cpp model handle, loaded once at import time and
# shared by all requests. NOTE(review): handlers below are sync `def`s, so
# FastAPI runs them in a threadpool — llama.cpp serializes internally, but
# concurrent requests will queue on this single instance.
llm = Llama(
    model_path=model_path,

    # Context window (balance speed + memory) — matches the model's 4k limit
    n_ctx=4096,

    # Use all CPU cores automatically
    n_threads=multiprocessing.cpu_count(),

    # CPU inference (0 = no layers offloaded to GPU)
    n_gpu_layers=0,

    # Performance optimizations
    n_batch=512,            # faster token processing
    use_mmap=True,          # faster loading
    use_mlock=True,         # prevents RAM swapping (may fail without ulimit privileges)
)

# ===============================
# REQUEST MODEL
# ===============================

class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    # Raw user message; inserted verbatim into the model prompt.
    message: str

# ===============================
# HEALTH CHECK
# ===============================

@app.get("/")
def root():
    """Liveness probe: reports that the inference engine is up."""
    payload = {"status": "Speed AI engine running"}
    return payload

# ===============================
# CHAT ENDPOINT
# ===============================

@app.post("/chat")
def chat(req: ChatRequest):
    """Generate one assistant reply for the user's message.

    Builds a Phi-3-format prompt (system turn + user turn), runs a single
    llama.cpp completion, and returns the stripped text as ``{"reply": ...}``.
    """

    # System turn, closed with <|end|> per the Phi-3 chat format.
    system_prompt = (
        "<|system|>"
        "You are a high-speed professional AI assistant. "
        "Respond clearly, concisely, and in structured markdown format. "
        "Use bullet points, headings, and emojis when helpful. "
        "Never include conversation history unless asked."
        "<|end|>"
    )

    # FIX: the user turn must also be terminated with <|end|> before the
    # <|assistant|> tag (Phi-3 template: <|user|>...<|end|><|assistant|>).
    # Without it the model sees an unterminated user turn and may continue
    # or echo the user's text instead of answering.
    prompt = system_prompt + f"<|user|>{req.message}<|end|><|assistant|>"

    # GENERATION SETTINGS (OPTIMIZED BALANCE)
    output = llm(
        prompt,

        max_tokens=400,        # faster than 512
        temperature=0.6,       # less hallucination
        top_p=0.9,             # nucleus sampling cutoff
        repeat_penalty=1.15,   # reduces loops

        stop=["<|end|>"]       # halt at the end-of-turn marker
    )

    # llama-cpp-python returns an OpenAI-style completion dict; take the
    # first (only) choice's text.
    response_text = output["choices"][0]["text"].strip()

    return {"reply": response_text}

# ===============================
# LOCAL RUN
# ===============================

# Local entry point: serve on all interfaces, port 7860 (when run directly
# rather than via an external `uvicorn main:app` command).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)