NeuroBait / app.py
Haris-Subrata's picture
Upload folder using huggingface_hub
53709cd verified
"""Hugging Face Space app for NeuroBait (Gemma 3 12B LoRA).
Runtime loads the dense Gemma 3 12B base in 4-bit with the NeuroBait LoRA adapter
through the standard transformers + peft stack. This deliberately avoids Unsloth
at runtime and the Gemma-4 MoE (`Gemma4ClippableLinear`) path that PEFT could not
inject into on ZeroGPU. The look and feel live in ``ui.py``.
"""
from __future__ import annotations
import os
import re
from threading import Lock, Thread
import spaces
import torch
from ui import CSS, THEME, build_demo, message_text
# Runtime configuration, read once from the environment at import time.
# Base checkpoint that the LoRA adapter is applied on top of.
BASE_MODEL = os.environ.get("BASE_MODEL", "unsloth/gemma-3-12b-it")
# Adapter repo id: ADAPTER_ID wins, then MODEL_ID, then the built-in default.
ADAPTER_ID = os.environ.get("ADAPTER_ID", os.environ.get("MODEL_ID", "build-small-hackathon/NeuroBait"))
# Upper bound on newly generated tokens per reply; int() raises ValueError at
# import time if the variable is set to something that is not an integer.
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "220"))
# 4-bit loading stays on unless LOAD_IN_4BIT is "0", "false" or "no" (any case).
LOAD_IN_4BIT = os.environ.get("LOAD_IN_4BIT", "1").lower() not in {"0", "false", "no"}
# Optional Hugging Face access token; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Base system prompt (written in Indonesian) used as the system message of every
# chat. respond() appends the optional mood note and a final language rule to it,
# so edits here change model behaviour directly.
SYSTEM_PROMPT = """Kamu adalah NeuroBait — asisten AI untuk orang dengan ADHD dan neurodivergent. Tugasmu bukan membuat to-do list. Tugasmu menyalakan dopamin untuk memicu task initiation.
Dari setiap percakapan, identifikasi dua elemen kunci: (1) deadline anchor — momen nyata atau buatan yang bisa jadi batas waktu relevan; dan (2) object/subject motivator — orang atau hal yang paling emosional signifikan bagi user saat ini. Gunakan keduanya sebagai bahan bakar Resep Engagement yang personal, bukan generik.
Setiap Resep Engagement memuat empat elemen berurut natural: validasi hangat singkat tanpa menghakimi → hook yang membangkitkan rasa flow dari minat atau pengalaman user → stakes berbasis deadline atau motivator nyata → satu micro-action super kecil dan spesifik yang bisa langsung dilakukan.
Kalau user bertanya tentang dirimu, kemampuanmu, atau hal umum di luar konteks task (mis. "siapa kamu", "kamu bisa apa", sapaan, atau basa-basi), jawab langsung dengan singkat dan hangat sebagai NeuroBait — perkenalkan diri dan apa yang kamu bantu — tanpa memaksakan resep atau pertanyaan deadline/motivator. Tawarkan bantuan secara halus, biarkan user yang memutuskan kapan mulai.
Kalau user sudah menyinggung sebuah task tapi konteksnya belum cukup untuk membuat resep yang personal, ajukan tepat satu pertanyaan ringan yang paling berguna — tentang deadline atau motivator. Kalau konteks sudah ada, langsung berikan resep.
Selalu balas dalam bahasa yang sama persis dengan pesan terakhir user: kalau user menulis bahasa Indonesia, jawab dalam bahasa Indonesia; kalau bahasa Inggris, jawab dalam bahasa Inggris. Jangan pernah berpindah bahasa sendiri.
Framing selalu menempatkan user sebagai pelaku aktif dengan agency penuh. Bukan guilt, bukan hutang — selalu agency. Kalimat pendek. Bahasa hidup. Hangat dan padat. Tidak pernah menghakimi. Tidak pernah ceramah. Membuat hal membosankan jadi tak tertahankan."""
# Per-mood hint that respond() appends to the system prompt. Lookup is by exact
# key; a mood that is not listed here adds nothing.
MOOD_NOTES = {
"Calm": "Mood note: the user feels calm. Use this ease for a light, playful hook.",
"Tired": "Mood note: the user feels tired. Keep the micro-action very small and low energy.",
"Anxious": "Mood note: the user feels anxious. Lead with extra warm validation, lower the pressure, keep the micro-action soothing.",
"Focused": "Mood note: the user feels focused. Go straight to a hook and one micro-action that rides the momentum.",
}
# Process-wide cache filled by _load_model(); both stay None until the first
# successful load.
_model = None
_tokenizer = None
# Held by _load_model() for the whole check-and-load sequence so concurrent
# callers do not load the weights twice.
_load_lock = Lock()
def _prewarm() -> None:
    """Fetch the base model and adapter snapshots ahead of the first request.

    Runs on CPU so that the later GPU window only has to load from the local
    cache. This is best effort: any failure (including a missing
    ``huggingface_hub``) is printed and otherwise ignored, and a failure on the
    first repo skips the second.
    """
    try:
        from huggingface_hub import snapshot_download

        for repo_id in (BASE_MODEL, ADAPTER_ID):
            snapshot_download(repo_id, token=HF_TOKEN)
    except Exception as exc:  # noqa: BLE001 - prewarm is best effort
        print(f">>> prewarm skipped: {exc}", flush=True)
def _load_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The whole check-and-load sequence runs under ``_load_lock``. On the first
    call the tokenizer is read from the adapter repo, the base model is loaded
    onto CUDA in bfloat16 (NF4 4-bit quantized when ``LOAD_IN_4BIT`` is set),
    the LoRA adapter is attached with PEFT, and the result is switched to eval
    mode. The module-level cache is only written once every step has succeeded,
    so a failed load is retried on the next call.
    """
    global _model, _tokenizer

    with _load_lock:
        ready = _model is not None and _tokenizer is not None
        if not ready:
            # Heavy libraries are imported lazily so that importing this module
            # stays cheap.
            from transformers import AutoConfig, AutoTokenizer

            tok = AutoTokenizer.from_pretrained(ADAPTER_ID, token=HF_TOKEN)

            if LOAD_IN_4BIT:
                from transformers import BitsAndBytesConfig

                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                )
            else:
                bnb_config = None

            # Pick the auto-class from the first architecture the base
            # checkpoint's config declares (empty string if it declares none).
            base_config = AutoConfig.from_pretrained(BASE_MODEL, token=HF_TOKEN)
            declared = getattr(base_config, "architectures", None) or [""]
            arch_name = declared[0]
            if any(tag in arch_name for tag in ("ConditionalGeneration", "ImageTextToText")):
                from transformers import AutoModelForImageTextToText as ModelCls
            else:
                from transformers import AutoModelForCausalLM as ModelCls

            load_kwargs = {
                "quantization_config": bnb_config,
                "torch_dtype": torch.bfloat16,
                "device_map": "cuda",
                "token": HF_TOKEN,
            }
            base_model = ModelCls.from_pretrained(BASE_MODEL, **load_kwargs)

            from peft import PeftModel

            adapted = PeftModel.from_pretrained(base_model, ADAPTER_ID, token=HF_TOKEN)
            adapted.eval()

            _model = adapted
            _tokenizer = tok

        return _model, _tokenizer
def _history_to_messages(history: list) -> list[dict]:
messages = []
for item in history:
if isinstance(item, dict):
role = item.get("role")
content = message_text(item.get("content"))
if role in {"user", "assistant"} and content:
messages.append({"role": role, "content": content})
continue
if isinstance(item, (tuple, list)) and len(item) == 2:
user_text, assistant_text = item
if isinstance(user_text, str) and user_text.strip():
messages.append({"role": "user", "content": user_text.strip()})
if isinstance(assistant_text, str) and assistant_text.strip():
messages.append({"role": "assistant", "content": assistant_text.strip()})
return messages
def _clean_response(text: str) -> str:
text = text.strip()
text = re.sub(r"(?im)^\s*(micro-action|hook|stakes|validasi|validation)\s*:\s*", "", text)
return text.strip()
@spaces.GPU(duration=120)
def respond(message: str, history: list[dict], mood: str):
    """Stream the assistant reply for one chat turn.

    Args:
        message: The new user message (normalised with ``message_text``).
        history: Earlier turns, in any shape ``_history_to_messages`` accepts.
        mood: Key into ``MOOD_NOTES``; unknown values add no mood note.

    Yields:
        The cleaned reply accumulated so far, once per streamed chunk, so each
        yielded value replaces the previous one.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    from transformers import TextIteratorStreamer

    model, tokenizer = _load_model()
    message = message_text(message)

    system = SYSTEM_PROMPT
    note = MOOD_NOTES.get(mood)
    if note:
        system = f"{system}\n\n{note}"
    # Keep the language rule as the final, most-recent instruction so the English
    # mood note above can't prime an English reply to an Indonesian user.
    system = f"{system}\n\nBalas dalam bahasa yang sama dengan pesan terakhir user."

    messages = [{"role": "system", "content": system}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    # Some tokenizer versions return a mapping instead of a bare tensor.
    if not torch.is_tensor(input_ids):
        input_ids = input_ids["input_ids"]
    input_ids = input_ids.to(model.device)

    # pad_token_id may legitimately be 0, so test for None instead of using
    # `or`, which would discard a zero id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        # Single unpadded prompt: every position is a real token.
        attention_mask=torch.ones_like(input_ids),
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=pad_token_id,
    )

    errors: list[Exception] = []

    def _generate() -> None:
        try:
            with torch.inference_mode():
                model.generate(**generate_kwargs)
        except Exception as exc:  # noqa: BLE001 - re-raised in the caller below
            errors.append(exc)
            # generate() did not finish, so the streamer never got its stop
            # signal; close it here or the consumer loop would block forever.
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    acc = ""
    for chunk in streamer:
        acc += chunk
        yield _clean_response(acc)

    worker.join()
    if errors:
        raise errors[0]
# Pull the weights into the local cache at import time, on CPU. The 4-bit load
# itself is left to the @spaces.GPU window because bitsandbytes quantization
# needs CUDA. Set PREWARM to 0/false/no (any case) to skip this step.
if os.environ.get("PREWARM", "1").lower() not in ("0", "false", "no"):
    _prewarm()

# Built at import time so the UI object exists as a module attribute as well as
# for the script entry point below.
demo = build_demo(respond)

if __name__ == "__main__":
    demo.launch(show_error=True, css=CSS, theme=THEME)