ag2

Paused

App Files Files Community

ag2 / app.py

nulltron

Create app.py

34a3ef8 verified 4 days ago

Raw

History Blame Contribute Delete

3.26 kB

	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel
	from typing import List, Dict, Any
	import uvicorn
	from model_loader import get_local_llm_instance

	# ASGI application object; the route decorators in this file register on it.
	app = FastAPI(title="Stateless Agent Pipeline")

	# Allow any origin to call the API so a separately hosted HTML frontend can
	# reach it. Credentialed CORS is disabled: FastAPI's CORS documentation
	# states that allow_origins / allow_methods / allow_headers must not be
	# ["*"] when allow_credentials is True, and the endpoints visible in this
	# file identify the caller through the request body, not cookies or
	# Authorization headers.
	# NOTE(review): if a frontend really sends credentialed requests, list its
	# exact origin(s) in allow_origins and re-enable allow_credentials.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=False,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# Load the model once, at import time, into a module-level handle.
	# Loading is best-effort: on any failure the error is printed and the handle
	# stays None, so the module still imports without a model.
	llm_instance = None
	try:
	    llm_instance = get_local_llm_instance()
	except Exception as load_failure:
	    print(f"[CRITICAL ERROR] Failed to load local weights: {load_failure}")

	# Request-body schema for the chat route; pydantic validates incoming JSON
	# against these fields.
	class ChatPayload(BaseModel):
	    # Identifier of the calling user, supplied by the client.
	    user_id: str

	    # Text of the new message to be answered.
	    user_message: str

	    # Earlier conversation turns, one dict per message. Optional; defaults to
	    # an empty list.
	    current_chat_history: List[Dict[str, Any]] = []

	    # Arbitrary per-user file mapping carried along with the request.
	    # Optional; defaults to an empty dict.
	    user_files: Dict[str, Any] = {}

	# Root route used as a liveness probe. The response is a fixed payload; it
	# does not inspect whether the model actually loaded.
	@app.get("/")
	def read_root():
	    status_report = {
	        "status": "online",
	        "engine": "Llama.cpp local cluster running flawlessly",
	    }
	    return status_report

	# ChatML control tokens that open and close every turn of the prompt.
	_IM_START = "<|im_start|>"
	_IM_END = "<|im_end|>"

	# How many trailing history *messages* (not user/assistant pairs) are
	# replayed into the prompt; kept small to bound the context size.
	_HISTORY_WINDOW = 4


	def _build_chat_prompt(user_query, history):
	    """Render system directive, recent history and the new query as one ChatML prompt."""
	    system_instruction = (
	        f"{_IM_START}system\n"
	        "You are a helpful, extremely fast AI assistant. "
	        "Respond cleanly, accurately and directly to the prompt. "
	        f"Keep formatting minimal.{_IM_END}\n"
	    )

	    rendered_turns = []
	    for turn in history[-_HISTORY_WINDOW:]:
	        # Any role other than "user" is replayed as an assistant turn.
	        role = "user" if turn.get("role") == "user" else "assistant"
	        content = turn.get("content", "")
	        if content is None:
	            # An explicit null would otherwise be rendered as the text "None".
	            content = ""
	        rendered_turns.append(f"{_IM_START}{role}\n{content}{_IM_END}\n")
	    history_context = "".join(rendered_turns)

	    # The prompt ends with an open assistant turn for the model to complete.
	    return (
	        f"{system_instruction}{history_context}"
	        f"{_IM_START}user\n{user_query}{_IM_END}\n"
	        f"{_IM_START}assistant\n"
	    )


	@app.post("/chat")
	async def chat_endpoint(payload: ChatPayload):
	    """Generate one assistant reply and return the extended conversation.

	    The prompt is built from a fixed system directive, the last
	    ``_HISTORY_WINDOW`` messages of ``payload.current_chat_history`` and
	    ``payload.user_message``, then passed to the module-level
	    ``llm_instance``.

	    Args:
	        payload: Validated request body.

	    Returns:
	        A dict with ``updated_chat_history`` (the incoming history plus the
	        new user message and the generated reply) and ``updated_files``
	        (``payload.user_files`` returned unchanged).

	    Raises:
	        HTTPException: 500 if the model failed to load at startup, or if
	            prompt construction, inference or result parsing raises.
	    """
	    if llm_instance is None:
	        raise HTTPException(status_code=500, detail="Local LLM instance cluster is offline.")

	    try:
	        user_query = payload.user_message
	        final_prompt = _build_chat_prompt(user_query, payload.current_chat_history)

	        # NOTE(review): this call is synchronous; inside an ``async def``
	        # route it holds the event loop until generation finishes, so
	        # requests are effectively served one at a time. Confirm that is
	        # intended before moving it to a worker thread.
	        output = llm_instance(
	            final_prompt,
	            max_tokens=512,  # cap generation length to keep responses fast
	            stop=[_IM_END, _IM_START, "user:", "assistant:"],
	            echo=False,
	        )

	        # Assumes a completion dict shaped like {"choices": [{"text": ...}]}
	        # -- TODO confirm against the object model_loader returns.
	        generated_text = output["choices"][0]["text"].strip()

	        # The full incoming history is returned, not just the prompt window.
	        updated_history = payload.current_chat_history + [
	            {"role": "user", "content": user_query},
	            {"role": "assistant", "content": generated_text},
	        ]

	        return {
	            "updated_chat_history": updated_history,
	            "updated_files": payload.user_files,
	        }

	    except Exception as exec_error:
	        # Chain the cause so the original traceback is preserved in logs.
	        raise HTTPException(
	            status_code=500,
	            detail=f"Inference Engine Error: {str(exec_error)}",
	        ) from exec_error

	# Script entry point: when run directly, serve the app with uvicorn on all
	# network interfaces at port 7860.
	if __name__ == "__main__":
	    uvicorn.run(
	        app,
	        host="0.0.0.0",
	        port=7860,
	    )