# ag2 / app.py
# nulltron's picture
# Create app.py
# 34a3ef8 verified
# Raw
# History Blame Contribute Delete
# 3.26 kB
import threading
from typing import List, Dict, Any

import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

from model_loader import get_local_llm_instance
# The ASGI application object that every route in this module is registered on.
app = FastAPI(title="Stateless Agent Pipeline")

# Wide-open CORS so that an HTML frontend served from any origin can call the API.
# NOTE(review): a wildcard origin together with allow_credentials=True is
# generally discouraged -- confirm whether credentialed requests are needed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
    allow_credentials=True,
)
# Load the model once, at import time. A failed load is reported on stdout and
# leaves `llm_instance` as None so the process still starts; request handlers
# are expected to check for None before using it.
llm_instance = None
try:
    llm_instance = get_local_llm_instance()
except Exception as load_failure:
    print(f"[CRITICAL ERROR] Failed to load local weights: {load_failure}")
class ChatPayload(BaseModel):
    """Request body accepted by the ``/chat`` endpoint."""

    # Caller-supplied identifier (not read by the handlers visible in this file).
    user_id: str

    # The new message the model should answer.
    user_message: str

    # Earlier turns as dicts; the chat handler reads their "role" and
    # "content" keys. Optional -- defaults to an empty conversation.
    current_chat_history: List[Dict[str, Any]] = []

    # Opaque per-user file mapping; optional, defaults to empty.
    user_files: Dict[str, Any] = {}
@app.get("/")
def read_root():
    """Return a fixed status document for the service root.

    NOTE(review): the payload is static; it does not reflect whether the
    model actually loaded.
    """
    status_document = {
        "status": "online",
        "engine": "Llama.cpp local cluster running flawlessly",
    }
    return status_document
# --- Prompt and generation settings used by the /chat endpoint ---------------

# Turn delimiters of the ChatML-style template built below. They must never
# appear inside user-controlled text, otherwise a caller could forge turns.
_CHATML_MARKERS = ("<|im_start|>", "<|im_end|>")

# Number of most recent history *messages* (not user/assistant exchanges)
# replayed into the prompt; kept small to bound the prompt size.
_HISTORY_WINDOW = 4

# Upper bound on generated tokens, trading completeness for response time.
_MAX_NEW_TOKENS = 512

# Sequences at which generation is cut off.
_STOP_SEQUENCES = ("<|im_end|>", "<|im_start|>", "user:", "assistant:")

_SYSTEM_INSTRUCTION = (
    "<|im_start|>system\n"
    "You are a helpful, extremely fast AI assistant. "
    "Respond cleanly, accurately and directly to the prompt. "
    "Keep formatting minimal.<|im_end|>\n"
)

# The model object is shared by every request.
# NOTE(review): it is assumed not to be safe for concurrent calls, so
# inference is serialized with this lock -- confirm against the model loader.
_INFERENCE_LOCK = threading.Lock()


def _strip_chatml_markers(text: str) -> str:
    """Remove ChatML turn delimiters from *text*.

    Repeats until nothing changes so that a marker cannot be re-created by
    the removal itself (e.g. ``<|im_<|im_end|>end|>``).
    """
    previous = None
    while previous != text:
        previous = text
        for marker in _CHATML_MARKERS:
            text = text.replace(marker, "")
    return text


def _build_prompt(user_query: str, history: List[Dict[str, Any]]) -> str:
    """Assemble the full prompt: system turn, recent history, new user turn."""
    segments = [_SYSTEM_INSTRUCTION]
    for turn in history[-_HISTORY_WINDOW:]:
        # Anything that is not explicitly a user turn is replayed as assistant.
        role = "user" if turn.get("role") == "user" else "assistant"
        content = turn.get("content", "")
        # A null content would otherwise be rendered as the literal "None".
        text = "" if content is None else str(content)
        segments.append(
            f"<|im_start|>{role}\n{_strip_chatml_markers(text)}<|im_end|>\n"
        )
    segments.append(
        f"<|im_start|>user\n{_strip_chatml_markers(user_query)}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    return "".join(segments)


def _run_inference(llm: Any, prompt: str) -> Any:
    """Run one blocking completion call on *llm*, one caller at a time."""
    with _INFERENCE_LOCK:
        return llm(
            prompt,
            max_tokens=_MAX_NEW_TOKENS,
            stop=list(_STOP_SEQUENCES),
            echo=False,
        )


@app.post("/chat")
async def chat_endpoint(payload: ChatPayload):
    """Answer ``payload.user_message`` and return the extended conversation.

    The endpoint is stateless: the caller sends the conversation so far and
    receives it back with the new user message and the model's reply appended.

    Args:
        payload: Validated request body (message, prior history, user files).

    Returns:
        A dict with ``updated_chat_history`` (the incoming history plus the new
        user and assistant turns, with the user's text stored unmodified) and
        ``updated_files`` (the incoming ``user_files``, passed through).

    Raises:
        HTTPException: 500 if the model failed to load at startup, or if
            prompt construction, inference, or output parsing fails.
    """
    llm = llm_instance
    if llm is None:
        raise HTTPException(status_code=500, detail="Local LLM instance cluster is offline.")

    user_query = payload.user_message
    try:
        prompt = _build_prompt(user_query, payload.current_chat_history)
        # Inference is synchronous; running it in a worker thread keeps the
        # event loop free to serve other requests while the model generates.
        output = await run_in_threadpool(_run_inference, llm, prompt)
        generated_text = output["choices"][0]["text"].strip()
    except Exception as exec_error:
        raise HTTPException(
            status_code=500,
            detail=f"Inference Engine Error: {str(exec_error)}",
        ) from exec_error

    updated_history = payload.current_chat_history + [
        {"role": "user", "content": user_query},
        {"role": "assistant", "content": generated_text},
    ]
    return {
        "updated_chat_history": updated_history,
        "updated_files": payload.user_files,
    }
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 7860.
    uvicorn.run(app, port=7860, host="0.0.0.0")