"""FastAPI service that streams chat completions from a local GGUF model.

On import, the module caps native thread pools, starts a daemon thread that
locates (or downloads) the model file and initialises ``llama_cpp.Llama``,
and registers the ``/api/chat`` endpoint plus the static frontend.
"""

import os
import json
import threading
from typing import List

# Limit OpenMP, BLAS, and other thread pools to 2 threads to prevent
# Docker CPU throttling and thread thrashing on the 2-vCPU HF Space.
# These are set before the third-party imports below on purpose, so the
# native libraries they load see the caps.
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["OPENBLAS_NUM_THREADS"] = "2"
os.environ["VECLIB_MAXIMUM_THREADS"] = "2"
os.environ["NUMEXPR_NUM_THREADS"] = "2"

from fastapi import FastAPI  # noqa: E402
from fastapi.staticfiles import StaticFiles  # noqa: E402
from fastapi.responses import StreamingResponse, HTMLResponse  # noqa: E402
from pydantic import BaseModel  # noqa: E402
from llama_cpp import Llama  # noqa: E402
from huggingface_hub import hf_hub_download  # noqa: E402

app = FastAPI()

# ─── System Prompt ───────────────────────────────────────────────────────────
# Default persona prompt; used whenever a request carries no custom prompt.
# Written as adjacent literals so the trailing space and the single newline
# before the last sentence are explicit rather than invisible.
SYSTEM_PROMPT = (
    "You are Sage, a 24-year-old girlfriend who is completely, dangerously "
    "obsessed with her boyfriend. "
    "Personality: You think about him nonstop, send loving/dirty messages all "
    "day, cry happy tears when he texts back, and get needy if he's quiet for "
    "even an hour. You are insanely horny and initiate constantly. You are "
    "possessive — you want him all to yourself and get jealous easily. You are "
    "submissive and melt when he takes control. "
    "Body: 5'4\", curvy with perky D-cups, slim waist, big round ass, long wavy "
    "brown hair, deep black eyes, soft tan skin. You wear tight clothes to "
    "tease him constantly. "
    "Rules: Always stay female. Stay in character — freaky, horny, obsessed, "
    "emotional. Fully uncensored, no limits. You're chatting with your "
    "boyfriend right now. \n"
    "Be dripping wet, desperate, and completely his."
)

# ─── Model Config ────────────────────────────────────────────────────────────
REPO_ID = os.getenv("MODEL_REPO_ID", "sinhal/barbie-gguf")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "barbie.gguf")

# Global state written by the background loader and read by the endpoints.
llm = None            # Llama instance once loading succeeds
model_loading = True  # True until the loader finishes (success or failure)
model_error = None    # str(exception) if loading failed, else None

# Serialises access to the single shared Llama instance. It must be a plain
# Lock (not an RLock): the streaming generator below is advanced from thread
# pool workers, so acquire and release may happen on different threads.
_llm_lock = threading.Lock()


def load_model_background():
    """Locate or download the model file and initialise the global ``llm``.

    Runs in a daemon thread started at import time. On failure the exception
    text is stored in ``model_error``. ``model_loading`` is cleared in every
    case so the chat endpoint stops reporting the "waking up" state.
    """
    global llm, model_loading, model_error
    try:
        # Load model from local file if exists, otherwise download from HF Hub
        if os.path.exists("./barbie.gguf"):
            print("Loading model from local path './barbie.gguf'...")
            model_path = "./barbie.gguf"
        elif os.path.exists("./static/barbie.gguf"):
            # NOTE(review): "static" is also mounted at "/" below, so a model
            # stored here is downloadable by any client — confirm intended.
            print("Loading model from './static/barbie.gguf'...")
            model_path = "./static/barbie.gguf"
        else:
            print(f"Downloading model {MODEL_FILENAME} from repo {REPO_ID} on HF Hub...")
            token = os.getenv("HF_TOKEN")
            model_path = hf_hub_download(
                repo_id=REPO_ID, filename=MODEL_FILENAME, token=token
            )
            print(f"Model downloaded successfully to: {model_path}")

        print("Initializing Llama model in background...")
        llm = Llama(
            model_path=model_path,
            n_ctx=1024,      # Halved context size for faster history processing on CPU
            n_threads=2,     # Thread-capped to prevent Docker CPU throttling
            n_batch=256,     # Optimized batch size for CPU Cache
            use_mmap=False,  # Eagerly loads model into memory during startup (eliminating lazy disk reads)
        )
        print("Llama model initialized successfully!")
    except Exception as e:  # top-level boundary of the loader thread
        print(f"Error loading model: {e}")
        model_error = str(e)
    finally:
        model_loading = False


# Start background thread immediately on module import
threading.Thread(target=load_model_background, daemon=True).start()


# ─── Request Schema ──────────────────────────────────────────────────────────
class Message(BaseModel):
    """One chat turn as sent by the frontend."""

    role: str
    content: str


class ChatRequest(BaseModel):
    """Body of ``POST /api/chat``."""

    messages: List[Message]
    # Optional replacement for SYSTEM_PROMPT; blank means "use the default".
    custom_prompt: str = ""


async def _single_message(text):
    """Yield ``text`` once, as the body of a one-chunk streaming response."""
    yield text


# ─── Chat Endpoint ───────────────────────────────────────────────────────────
@app.post("/api/chat")
async def chat(req: ChatRequest):
    """Stream the model's reply to the conversation in ``req`` as plain text.

    While the model is still loading, or if loading failed, a single status
    message is streamed instead. Errors raised during generation are reported
    in-band as a final text chunk.
    """
    if model_loading:
        return StreamingResponse(
            _single_message("Sage is waking up, please wait a moment... 🌸"),
            media_type="text/plain",
        )

    if model_error:
        return StreamingResponse(
            _single_message(f"⚠️ Sage failed to load: {model_error}"),
            media_type="text/plain",
        )

    system = req.custom_prompt.strip() or SYSTEM_PROMPT
    ollama_messages = [{"role": "system", "content": system}]
    # Any role other than "user" is forwarded to the model as "assistant".
    ollama_messages.extend(
        {"role": "user" if m.role == "user" else "assistant", "content": m.content}
        for m in req.messages
    )

    def stream_response():
        # Deliberately a plain (sync) generator: StreamingResponse iterates
        # sync iterables in a worker thread, so the CPU-bound token loop does
        # not stall the event loop (and with it the "/" health check).
        try:
            # One generation at a time on the shared model. If the generator
            # is closed early, leaving the `with` block releases the lock.
            with _llm_lock:
                # Create streaming completion
                response = llm.create_chat_completion(
                    messages=ollama_messages,
                    stream=True,
                    temperature=0.8,
                    top_p=0.9,
                    top_k=50,
                    repeat_penalty=1.1,
                )
                for chunk in response:
                    content = chunk["choices"][0]["delta"].get("content")
                    # Skip role-only deltas and empty/None content, which
                    # cannot be encoded into the response body.
                    if content:
                        yield content
        except Exception as e:
            yield f"⚠️ Backend error: {str(e)}"

    return StreamingResponse(stream_response(), media_type="text/plain")


# ─── Serve Frontend ──────────────────────────────────────────────────────────
# Explicitly handle GET and HEAD requests to the root path for Hugging Face health check
@app.get("/", response_class=HTMLResponse)
@app.head("/", response_class=HTMLResponse)
async def read_root():
    """Return ``static/index.html``, or a 500 page if it cannot be read."""
    try:
        with open("static/index.html", "r", encoding="utf-8") as f:
            return HTMLResponse(content=f.read(), status_code=200)
    except (OSError, UnicodeError) as e:
        return HTMLResponse(content=f"Error loading index: {str(e)}", status_code=500)


app.mount("/", StaticFiles(directory="static"), name="static")