| | from fastapi import FastAPI |
| | from pydantic import BaseModel |
| | from llama_cpp import Llama |
| | from huggingface_hub import hf_hub_download |
| | import multiprocessing |
| |
|
# FastAPI application instance; routes are registered below via decorators.
app = FastAPI()


# Quantized Qwen2.5 3B Instruct model hosted on the Hugging Face Hub.
# Q4_K_M is a 4-bit quantization chosen to fit CPU-only deployments.
MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

# Download (or reuse the local HF cache copy of) the GGUF weights at import
# time. NOTE(review): this performs network I/O on module import — the first
# start blocks until the multi-GB file is cached; confirm that is acceptable
# for the deployment environment.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE
)
| |
|
| | |
| | |
| | |
| |
|
# Load the model into memory once at import time and share it across requests.
# NOTE(review): llama_cpp.Llama is not guaranteed thread-safe; FastAPI runs
# sync endpoints in a thread pool, so concurrent /chat calls may contend —
# confirm single-worker deployment or add a lock if needed.
llm = Llama(
    model_path=model_path,

    # Context window (prompt + generated tokens), in tokens.
    n_ctx=8192,

    # Use every available core for CPU inference.
    n_threads=multiprocessing.cpu_count(),

    # 0 = pure CPU inference; raise to offload layers when a GPU build
    # of llama.cpp is available.
    n_gpu_layers=0,

    # Prompt-processing batch size.
    n_batch=512,
    # Memory-map the weights (fast start, shared pages) and lock them in RAM
    # so the OS cannot page them out mid-generation.
    use_mmap=True,
    use_mlock=True,
)
| |
|
| | |
| | |
| | |
| |
|
class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    # The user's free-form message to the assistant.
    message: str
| |
|
| | |
| | |
| | |
| |
|
@app.get("/")
def root():
    """Health-check endpoint: confirms the service is up and serving."""
    payload = {"status": "Strategy AI engine running"}
    return payload
| |
|
| | |
| | |
| | |
| |
|
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a strategic-analysis reply for the user's message.

    Fix: the original hand-built a prompt with <|system|>/<|user|>/
    <|assistant|>/<|end|> markers, which is not the ChatML format
    (<|im_start|>role ... <|im_end|>) that Qwen2.5-Instruct was trained on,
    and its stop token never matched the model's real end-of-turn token.
    Using create_chat_completion applies the chat template embedded in the
    GGUF file, so the prompt and stop conditions are always correct for
    the loaded model.

    Returns:
        dict: {"reply": <generated assistant text>}
    """
    system_prompt = (
        "You are an elite strategic intelligence AI. "
        "Think step-by-step before answering. "
        "Provide deep analysis, structured reasoning, and clear actionable insights. "
        "Use bullet points, numbered steps, and markdown formatting."
    )

    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": req.message},
        ],
        # Cap the reply length; well under n_ctx so long inputs still fit.
        max_tokens=900,
        # Low temperature for focused, reproducible analysis.
        temperature=0.35,
        top_p=0.9,
        # Discourage the repetitive phrasing small models fall into.
        repeat_penalty=1.2,
    )

    # Chat-completion responses carry the text under message.content
    # (not the plain-completion "text" key).
    content = output["choices"][0]["message"]["content"] or ""
    response_text = content.strip()

    return {"reply": response_text}
| |
|
| | |
| | |
| | |
| |
|
# Script entry point: serve the app directly with uvicorn.
# Port 7860 is the conventional Hugging Face Spaces port; binding 0.0.0.0
# exposes the server on all interfaces.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
| |
|