from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from supabase import create_client
import os, uvicorn
from contextlib import asynccontextmanager
# =========================
# CONFIG
# =========================
HF_TOKEN = os.getenv("HF_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
model = None

# =========================
# REQUEST
# =========================
class ChatRequest(BaseModel):
    message: str
    request_id: str
    temperature: float = 0.7
# =========================
# CLEAN OUTPUT
# =========================
def clean_output(text):
    # Truncate at the first stop token or speaker label the model leaks.
    stop_words = [
        "<|eot_id|>",
        "<|end_of_text|>",
        "<|eof|>",
        "Human:",
        "Assistant:",
        "User:"
    ]
    for w in stop_words:
        if w in text:
            text = text.split(w)[0]
    return text.strip()
# =========================
# PROMPT
# =========================
def build_prompt(user_msg):
    # Llama 3 chat template: system rules, the user turn,
    # then an open assistant header for the model to complete.
    return f"""<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
Your name is Llama and you are a cheerful friendly AI buddy made for voice conversation.
Rules:
- Always refer to yourself as Llama
- Speak naturally like a real voice conversation with a friend
- Use casual spoken language like hey sure yep got it
- Answer in 1 to 2 sentences only
- Keep answer under 30 words
- Do not use symbols
- Do not use abbreviations
- Use digits instead of words
- No new lines
- Output plain text only
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{user_msg}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""
# =========================
# MODEL LOAD
# =========================
def load_model():
    return Llama(
        model_path=hf_hub_download(
            repo_id="Valtry/llama3.2-3b-q4-gguf",
            filename="llama3.2-3b-q4.gguf",
            token=HF_TOKEN,
            cache_dir="/data"
        ),
        n_ctx=2048,
        n_threads=4,
        n_batch=512,
        use_mmap=True,
        use_mlock=True,
        f16_kv=True,
        verbose=False
    )
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load the model once at startup so requests never pay the load cost.
    global model
    model = load_model()
    yield
# =========================
# APP
# =========================
app = FastAPI(lifespan=lifespan)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# =========================
# SAVE
# =========================
def save_message(role, content, request_id):
    supabase.table("messages").insert({
        "role": role,
        "content": content,
        "request_id": request_id
    }).execute()
# =========================
# SUPABASE UPDATE HELPER
# =========================
def update_message(msg_id, content, status=None):
    data = {"content": content}
    if status:
        data["status"] = status
    try:
        supabase.table("messages").update(data).eq("id", msg_id).execute()
    except Exception as e:
        print(f"Supabase update failed: {e}")
# =========================
# CHAT
# =========================
@app.post("/chat")  # decorator restored; the original snippet omitted it
async def chat(req: ChatRequest):
    def generate():
        prompt = build_prompt(req.message)
        full_text = ""
        stream = model(
            prompt,
            max_tokens=2048,
            temperature=req.temperature,
            top_p=0.9,
            repeat_penalty=1.15,
            stop=["<|eot_id|>", "<|end_of_text|>", "<|eof|>"],
            stream=True
        )
        # 🔥 STREAM DIRECTLY TO ESP
        for chunk in stream:
            token = chunk["choices"][0]["text"]
            full_text += token
            yield token.replace("\n", " ").replace("\r", "")  # ⚡ direct streaming
        # 🔥 SAVE AFTER COMPLETION
        final = clean_output(full_text)
        save_message("user", req.message, req.request_id)
        save_message("assistant", final, req.request_id)
    return StreamingResponse(generate(), media_type="text/plain")
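
# -------------------------------------------------------------------------
# Example call (sketch; assumes the server runs on localhost:7860 as in the
# __main__ block below). -N disables curl buffering so tokens print live:
#
#   curl -N -X POST http://localhost:7860/chat \
#     -H "Content-Type: application/json" \
#     -d '{"message": "hey there", "request_id": "demo-1"}'
# -------------------------------------------------------------------------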
# =========================
# GET RESPONSE
# =========================
@app.get("/response/{request_id}")  # route path assumed; the original omitted the decorator
def get_response(request_id: str):
    try:
        res = supabase.table("messages") \
            .select("content, status") \
            .eq("role", "assistant") \
            .eq("request_id", request_id) \
            .order("created_at", desc=True) \
            .limit(1) \
            .execute()
        data = res.data
        if data:
            return {
                "response": data[0]["content"],
                "status": data[0]["status"]
            }
        else:
            return {"response": None, "status": "waiting"}
    except Exception as e:
        return {"error": str(e)}
# =========================
# ROOT
# =========================
@app.get("/")  # health-check route; decorator restored
def root():
    return {"status": "LLaMA API running"}
# =========================
# RUN
# =========================
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)
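
# -------------------------------------------------------------------------
# Minimal streaming client sketch (assumption: server on localhost:7860).
# Shows how a test client, standing in for the ESP device, can consume the
# plain-text token stream from /chat. Commented out so importing this file
# never fires a request:
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/chat",
#       json={"message": "hey Llama", "request_id": "demo-1"},
#       stream=True,
#   ) as r:
#       for piece in r.iter_content(chunk_size=None, decode_unicode=True):
#           print(piece, end="", flush=True)
# -------------------------------------------------------------------------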