import threading
from typing import List, Dict, Any

import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

from model_loader import get_local_llm_instance
|
|
# ASGI application object; the route decorators below register against it.
app = FastAPI(title="Stateless Agent Pipeline")

# Fully permissive CORS: any origin, method and header is accepted, and
# credentialed requests are allowed.
# NOTE(review): FastAPI's CORS documentation advises against combining
# wildcard origins with allow_credentials=True — confirm this is intended,
# or list the expected origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)
|
|
| |
# Load the model once, at import time. This is best-effort: on any failure
# the error is printed and llm_instance stays None, which the /chat handler
# checks before attempting inference.
llm_instance = None
try:
    llm_instance = get_local_llm_instance()
except Exception as load_failure:
    print(f"[CRITICAL ERROR] Failed to load local weights: {load_failure}")
|
|
| |
# Request body accepted by POST /chat. The service keeps no state between
# calls: the client sends the conversation so far and receives the updated
# conversation back.
class ChatPayload(BaseModel):
    # Caller-supplied identifier; not read by the handler in this section.
    user_id: str

    # The new message to answer.
    user_message: str

    # Prior turns as dicts; the handler reads the "role" and "content" keys
    # and only feeds the last four turns to the model.
    current_chat_history: List[Dict[str, Any]] = []

    # Opaque file mapping; the handler returns it unchanged.
    user_files: Dict[str, Any] = {}
|
|
# Liveness probe for GET /. The payload is static: it does not inspect
# llm_instance, so it reports "online" even when the model failed to load.
@app.get("/")
def read_root():
    liveness = dict(
        status="online",
        engine="Llama.cpp local cluster running flawlessly",
    )
    return liveness
|
|
# ChatML control tokens. They are stripped from client-supplied text so a
# message cannot close its own turn and open a forged system/assistant turn.
_CHATML_TOKENS = ("<|im_start|>", "<|im_end|>")

# Inference runs on worker threads (see chat_endpoint); this lock keeps calls
# into the shared model strictly one-at-a-time, as they were when the blocking
# call ran directly on the event loop.
# NOTE(review): assumes the model object is not safe for concurrent calls —
# confirm against the loader before relaxing this.
_LLM_LOCK = threading.Lock()


def _strip_chatml_tokens(text: Any) -> str:
    """Return *text* as a string with every ChatML control token removed.

    ``None`` becomes the empty string; any other non-string value is passed
    through ``str()``.
    """
    cleaned = "" if text is None else str(text)
    # Repeat until stable: deleting one token can splice the pieces around it
    # into a new token (e.g. "<|im_<|im_end|>start|>").
    while any(token in cleaned for token in _CHATML_TOKENS):
        for token in _CHATML_TOKENS:
            cleaned = cleaned.replace(token, "")
    return cleaned


def _build_prompt(history: List[Dict[str, Any]], user_query: str) -> str:
    """Assemble the ChatML prompt: system turn, last four history turns, new message."""
    segments = [
        "<|im_start|>system\n"
        "You are a helpful, extremely fast AI assistant. "
        "Respond cleanly, accurately and directly to the prompt. "
        "Keep formatting minimal.<|im_end|>\n"
    ]
    for turn in history[-4:]:
        # Any role other than "user" is rendered as an assistant turn.
        role = "user" if turn.get("role") == "user" else "assistant"
        content = _strip_chatml_tokens(turn.get("content", ""))
        segments.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
    safe_query = _strip_chatml_tokens(user_query)
    segments.append(f"<|im_start|>user\n{safe_query}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(segments)


def _run_inference(llm: Any, prompt: str) -> str:
    """Run one blocking completion on *llm* and return the stripped generated text."""
    with _LLM_LOCK:
        output = llm(
            prompt,
            max_tokens=512,
            stop=["<|im_end|>", "<|im_start|>", "user:", "assistant:"],
            echo=False,
        )
    return output["choices"][0]["text"].strip()


@app.post("/chat")
async def chat_endpoint(payload: ChatPayload):
    """Answer one chat message and return the updated conversation.

    The prompt is built from a fixed system instruction, the last four turns
    of ``payload.current_chat_history`` and ``payload.user_message``. The
    completion is generated on a worker thread so the event loop stays free
    to serve other requests while the model is busy.

    Returns a dict with ``updated_chat_history`` (the incoming history plus
    the new user and assistant turns, with the user's message exactly as
    sent) and ``updated_files`` (``payload.user_files`` unchanged).

    Raises ``HTTPException`` with status 500 when the model was not loaded or
    when prompt construction or inference fails.
    """
    # Take one reference up front so the check and the call see the same object.
    llm = llm_instance
    if llm is None:
        raise HTTPException(status_code=500, detail="Local LLM instance cluster is offline.")

    user_query = payload.user_message
    try:
        final_prompt = _build_prompt(payload.current_chat_history, user_query)
        generated_text = await run_in_threadpool(_run_inference, llm, final_prompt)
    except Exception as exec_error:
        # Endpoint boundary: report every failure as a 500, keeping the cause chained.
        raise HTTPException(
            status_code=500,
            detail=f"Inference Engine Error: {str(exec_error)}",
        ) from exec_error

    updated_history = payload.current_chat_history + [
        {"role": "user", "content": user_query},
        {"role": "assistant", "content": generated_text},
    ]

    return {
        "updated_chat_history": updated_history,
        "updated_files": payload.user_files,
    }
|
|
if __name__ == "__main__":
    # Direct-execution entry point: serve the app on every network interface
    # at port 7860.
    uvicorn.run(app, port=7860, host="0.0.0.0")