# Hugging Face Space status (paste residue): Spaces: Running
# Standard library first, then third-party (PEP 8 grouping).
import multiprocessing

from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

# FastAPI application instance served by uvicorn below.
app = FastAPI()
# ===============================
# MODEL CONFIG
# ===============================
# GGUF build of Phi-3-mini-4k-instruct hosted on the Hugging Face Hub.
MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"

# Fetch the weights at import time (hf_hub_download reuses its local cache
# on subsequent starts) and remember the resolved path for the LLM loader.
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
# CPU-only llama.cpp engine, tuned for throughput.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,                              # context window: balance speed + memory
    n_threads=multiprocessing.cpu_count(),   # use all CPU cores automatically
    n_gpu_layers=0,                          # CPU inference only
    n_batch=512,                             # faster token processing
    use_mmap=True,                           # memory-map weights for faster loading
    use_mlock=True,                          # pin weights in RAM, prevents swapping
)
# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    """JSON body accepted by the chat endpoint: one user message string."""

    message: str
# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
    """Liveness probe: confirms the API process is up and serving.

    Registered at GET / — without the route decorator this handler was
    unreachable over HTTP.
    """
    return {"status": "Speed AI engine running"}
# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate one assistant reply for ``req.message``.

    Builds a Phi-3 chat-template prompt (system turn + user turn), runs the
    local llama.cpp model, and returns the trimmed completion. Registered at
    POST /chat — without the route decorator this handler was unreachable
    over HTTP.
    """
    # PROFESSIONAL SYSTEM PROMPT (Phi-3 special tokens delimit the turn)
    system_prompt = (
        "<|system|>"
        "You are a high-speed professional AI assistant. "
        "Respond clearly, concisely, and in structured markdown format. "
        "Use bullet points, headings, and emojis when helpful. "
        "Never include conversation history unless asked."
        "<|end|>"
    )
    prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"

    # GENERATION SETTINGS (OPTIMIZED BALANCE)
    output = llm(
        prompt,
        max_tokens=400,        # faster than 512
        temperature=0.6,       # less hallucination
        top_p=0.9,
        repeat_penalty=1.15,   # reduces loops
        stop=["<|end|>"],      # stop at the end-of-turn token
    )

    response_text = output["choices"][0]["text"].strip()
    return {"reply": response_text}
# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    # Direct execution: serve on all interfaces at Spaces' default port.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)