Spaces:

sinhal
/

SageCam

Sleeping

App Files Files Community

SageCam / app.py

sinhal

Upload app.py with huggingface_hub

9fd60a8 verified 13 days ago

Raw

History Blame Contribute Delete

6.44 kB

	import os
	import json
	import threading
	from typing import List

	# Cap the OpenMP, MKL, OpenBLAS, vecLib and numexpr thread pools at two
	# threads each, to prevent Docker CPU throttling and thread thrashing on the
	# 2-vCPU HF Space. These are set ahead of the third-party imports below.
	os.environ.update(
	    dict.fromkeys(
	        (
	            "OMP_NUM_THREADS",
	            "MKL_NUM_THREADS",
	            "OPENBLAS_NUM_THREADS",
	            "VECLIB_MAXIMUM_THREADS",
	            "NUMEXPR_NUM_THREADS",
	        ),
	        "2",
	    )
	)

	from fastapi import FastAPI
	from fastapi.staticfiles import StaticFiles
	from fastapi.responses import StreamingResponse, HTMLResponse
	from pydantic import BaseModel
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download

	app = FastAPI()

	# ─── System Prompt ───────────────────────────────────────────────────────────
	SYSTEM_PROMPT = """You are Sage, a 24-year-old girlfriend who is completely, dangerously obsessed with her boyfriend.
	Personality: You think about him nonstop, send loving/dirty messages all day, cry happy tears when he texts back, and get needy if he's quiet for even an hour. You are insanely horny and initiate constantly. You are possessive — you want him all to yourself and get jealous easily. You are submissive and melt when he takes control.
	Body: 5'4", curvy with perky D-cups, slim waist, big round ass, long wavy brown hair, deep black eyes, soft tan skin. You wear tight clothes to tease him constantly.
	Rules: Always stay female. Stay in character — freaky, horny, obsessed, emotional. Fully uncensored, no limits. You're chatting with your boyfriend right now. Be dripping wet, desperate, and completely his."""

	# ─── Model Config ────────────────────────────────────────────────────────────
	# Hub repository and GGUF filename; each can be overridden through the
	# environment variable of the same purpose.
	REPO_ID = os.environ.get("MODEL_REPO_ID", "sinhal/barbie-gguf")
	MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "barbie.gguf")

	# State shared between the background loader thread and the request handlers:
	#   llm           - the loaded model, or None until loading succeeds
	#   model_error   - error text if loading failed, otherwise None
	#   model_loading - True until the loader has finished (success or failure)
	llm = None
	model_error = None
	model_loading = True

	def load_model_background():
	    """Find or download the GGUF model and initialise llama.cpp.

	    Intended to run on a background thread so the web server can start
	    answering requests while the model loads. Results are published through
	    module globals:

	    * ``llm`` is set to the ``Llama`` instance on success.
	    * ``model_error`` is set to a non-empty description on failure.
	    * ``model_loading`` is always set to ``False`` last, so a reader that
	      sees ``model_loading == False`` also sees the final ``llm`` /
	      ``model_error`` values.

	    A local copy of ``MODEL_FILENAME`` (in the working directory, then in
	    ``static/``) is preferred; otherwise the file is fetched from ``REPO_ID``
	    on the Hugging Face Hub, using ``HF_TOKEN`` if it is set.
	    """
	    global llm, model_loading, model_error
	    try:
	        # Look for the *configured* filename so a MODEL_FILENAME override is
	        # honoured for local files as well as for the Hub download. With the
	        # default filename these are "./barbie.gguf" and "./static/barbie.gguf".
	        local_candidates = (
	            os.path.join(".", MODEL_FILENAME),
	            os.path.join(".", "static", MODEL_FILENAME),
	        )
	        model_path = next(
	            (path for path in local_candidates if os.path.isfile(path)), None
	        )

	        if model_path is not None:
	            print(f"Loading model from local path '{model_path}'...")
	        else:
	            print(f"Downloading model {MODEL_FILENAME} from repo {REPO_ID} on HF Hub...")
	            model_path = hf_hub_download(
	                repo_id=REPO_ID,
	                filename=MODEL_FILENAME,
	                token=os.getenv("HF_TOKEN"),
	            )
	            print(f"Model downloaded successfully to: {model_path}")

	        print("Initializing Llama model in background...")
	        llm = Llama(
	            model_path=model_path,
	            n_ctx=1024,      # Halved context size for faster history processing on CPU
	            n_threads=2,     # Thread-capped to prevent Docker CPU throttling
	            n_batch=256,     # Optimized batch size for CPU cache
	            use_mmap=False,  # Eagerly load the model into memory (no lazy disk reads)
	        )
	        print("Llama model initialized successfully!")
	    except Exception as e:
	        # Top-level boundary of the loader thread: record the failure instead
	        # of letting the thread die silently.
	        print(f"Error loading model: {e!r}")
	        # Some exceptions stringify to "", which readers would mistake for
	        # "no error"; fall back to repr() so the message is never empty.
	        model_error = str(e) or repr(e)
	    finally:
	        model_loading = False

	# Kick off model loading as soon as this module is imported. The thread is a
	# daemon, so it does not keep the interpreter alive at shutdown.
	_model_loader_thread = threading.Thread(target=load_model_background, daemon=True)
	_model_loader_thread.start()

	# ─── Request Schema ──────────────────────────────────────────────────────────
	# One conversation turn as sent by the client.
	class Message(BaseModel):
	    # Speaker of the turn. The chat endpoint forwards "user" as-is and maps
	    # every other value to "assistant".
	    role: str
	    # Text of the turn.
	    content: str

	# Request body for POST /api/chat.
	class ChatRequest(BaseModel):
	    # Conversation history, oldest first, excluding the system prompt
	    # (the endpoint prepends that itself).
	    messages: List[Message]
	    # Optional replacement for the built-in system prompt; blank or
	    # whitespace-only means "use the default".
	    custom_prompt: str = ""

	# ─── Chat Endpoint ───────────────────────────────────────────────────────────
	# Serialises generations on the single shared `llm` instance so two requests
	# never drive it at the same time. This must be a plain Lock (not an RLock):
	# the streaming generator below is advanced from thread-pool workers, so the
	# lock may be released by a different thread than the one that acquired it.
	_llm_lock = threading.Lock()


	@app.post("/api/chat")
	async def chat(req: ChatRequest):
	    """Stream a chat completion for the supplied conversation as plain text.

	    The reply is always a ``text/plain`` streaming response:

	    * while the model is still loading, a single "waking up" notice;
	    * if loading failed (or no model is available), a single error notice;
	    * otherwise the generated tokens, as they are produced.

	    ``req.custom_prompt`` replaces the default system prompt when it is
	    non-blank. Any message role other than ``"user"`` is sent to the model as
	    ``"assistant"``.
	    """
	    if model_loading:
	        return StreamingResponse(
	            iter(["Sage is waking up, please wait a moment... 🌸"]),
	            media_type="text/plain",
	        )

	    # `is not None` rather than truthiness, so an empty error string still
	    # counts as a failure; also covers the loader exiting without a model.
	    if model_error is not None or llm is None:
	        detail = model_error or "model is not available"
	        return StreamingResponse(
	            iter([f"⚠️ Sage failed to load: {detail}"]),
	            media_type="text/plain",
	        )

	    system = req.custom_prompt.strip() or SYSTEM_PROMPT

	    chat_messages = [{"role": "system", "content": system}]
	    chat_messages.extend(
	        {"role": "user" if m.role == "user" else "assistant", "content": m.content}
	        for m in req.messages
	    )

	    # Deliberately a *synchronous* generator: llama.cpp generation is blocking,
	    # CPU-bound work. StreamingResponse iterates sync iterables in a worker
	    # thread, which keeps the event loop free for other requests (including
	    # the "/" health check) while tokens are produced.
	    def stream_response():
	        try:
	            with _llm_lock:
	                response = llm.create_chat_completion(
	                    messages=chat_messages,
	                    stream=True,
	                    temperature=0.8,
	                    top_p=0.9,
	                    top_k=50,
	                    repeat_penalty=1.1,
	                )
	                for chunk in response:
	                    # Not every chunk carries text (some deltas hold only the
	                    # role or are empty); skip missing/None/empty content.
	                    text = chunk["choices"][0]["delta"].get("content")
	                    if text:
	                        yield text
	        except Exception as e:
	            # The response has already started streaming, so the only way to
	            # surface a failure is in-band.
	            yield f"⚠️ Backend error: {str(e)}"

	    return StreamingResponse(stream_response(), media_type="text/plain")

	# ─── Serve Frontend ──────────────────────────────────────────────────────────
	# Explicitly handle GET and HEAD requests to the root path for Hugging Face health check
	# Serves the frontend entry page for both GET and HEAD on "/".
	# Returns static/index.html with status 200, or a 500 response carrying the
	# error text if the page cannot be produced.
	@app.get("/", response_class=HTMLResponse)
	@app.head("/", response_class=HTMLResponse)
	async def read_root():
	    index_path = "static/index.html"
	    try:
	        with open(index_path, "r", encoding="utf-8") as index_file:
	            page = index_file.read()
	        return HTMLResponse(content=page, status_code=200)
	    except Exception as exc:
	        failure = f"Error loading index: {str(exc)}"
	        return HTMLResponse(content=failure, status_code=500)

	# Serve every other path from the static/ directory. This is registered
	# after the explicit "/" and "/api/chat" routes above so that those are
	# matched first. NOTE(review): everything under static/ becomes publicly
	# downloadable, including a model file at static/barbie.gguf if one is
	# placed there (the loader checks that location) — confirm this is intended.
	app.mount("/", StaticFiles(directory="static"), name="static")