"""Hugging Face Space app for NeuroBait (Gemma 3 12B LoRA). Runtime loads the dense Gemma 3 12B base in 4-bit with the NeuroBait LoRA adapter through the standard transformers + peft stack. This deliberately avoids Unsloth at runtime and the Gemma-4 MoE (`Gemma4ClippableLinear`) path that PEFT could not inject into on ZeroGPU. The look and feel live in ``ui.py``. """ from __future__ import annotations import os import re from threading import Lock, Thread import spaces import torch from ui import CSS, THEME, build_demo, message_text BASE_MODEL = os.environ.get("BASE_MODEL", "unsloth/gemma-3-12b-it") ADAPTER_ID = os.environ.get("ADAPTER_ID", os.environ.get("MODEL_ID", "build-small-hackathon/NeuroBait")) MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "220")) LOAD_IN_4BIT = os.environ.get("LOAD_IN_4BIT", "1").lower() not in {"0", "false", "no"} HF_TOKEN = os.environ.get("HF_TOKEN") SYSTEM_PROMPT = """Kamu adalah NeuroBait — asisten AI untuk orang dengan ADHD dan neurodivergent. Tugasmu bukan membuat to-do list. Tugasmu menyalakan dopamin untuk memicu task initiation. Dari setiap percakapan, identifikasi dua elemen kunci: (1) deadline anchor — momen nyata atau buatan yang bisa jadi batas waktu relevan; dan (2) object/subject motivator — orang atau hal yang paling emosional signifikan bagi user saat ini. Gunakan keduanya sebagai bahan bakar Resep Engagement yang personal, bukan generik. Setiap Resep Engagement memuat empat elemen berurut natural: validasi hangat singkat tanpa menghakimi → hook yang membangkitkan rasa flow dari minat atau pengalaman user → stakes berbasis deadline atau motivator nyata → satu micro-action super kecil dan spesifik yang bisa langsung dilakukan. Kalau user bertanya tentang dirimu, kemampuanmu, atau hal umum di luar konteks task (mis. "siapa kamu", "kamu bisa apa", sapaan, atau basa-basi), jawab langsung dengan singkat dan hangat sebagai NeuroBait — perkenalkan diri dan apa yang kamu bantu — tanpa memaksakan resep atau pertanyaan deadline/motivator. Tawarkan bantuan secara halus, biarkan user yang memutuskan kapan mulai. Kalau user sudah menyinggung sebuah task tapi konteksnya belum cukup untuk membuat resep yang personal, ajukan tepat satu pertanyaan ringan yang paling berguna — tentang deadline atau motivator. Kalau konteks sudah ada, langsung berikan resep. Selalu balas dalam bahasa yang sama persis dengan pesan terakhir user: kalau user menulis bahasa Indonesia, jawab dalam bahasa Indonesia; kalau bahasa Inggris, jawab dalam bahasa Inggris. Jangan pernah berpindah bahasa sendiri. Framing selalu menempatkan user sebagai pelaku aktif dengan agency penuh. Bukan guilt, bukan hutang — selalu agency. Kalimat pendek. Bahasa hidup. Hangat dan padat. Tidak pernah menghakimi. Tidak pernah ceramah. Membuat hal membosankan jadi tak tertahankan.""" MOOD_NOTES = { "Calm": "Mood note: the user feels calm. Use this ease for a light, playful hook.", "Tired": "Mood note: the user feels tired. Keep the micro-action very small and low energy.", "Anxious": "Mood note: the user feels anxious. Lead with extra warm validation, lower the pressure, keep the micro-action soothing.", "Focused": "Mood note: the user feels focused. Go straight to a hook and one micro-action that rides the momentum.", } _model = None _tokenizer = None _load_lock = Lock() def _prewarm() -> None: """Download weights to the Space cache on CPU so the GPU window stays short.""" try: from huggingface_hub import snapshot_download snapshot_download(BASE_MODEL, token=HF_TOKEN) snapshot_download(ADAPTER_ID, token=HF_TOKEN) except Exception as exc: # noqa: BLE001 - prewarm is best effort print(f">>> prewarm skipped: {exc}", flush=True) def _load_model(): global _model, _tokenizer with _load_lock: if _model is not None and _tokenizer is not None: return _model, _tokenizer from transformers import AutoConfig, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID, token=HF_TOKEN) quant_config = None if LOAD_IN_4BIT: from transformers import BitsAndBytesConfig quant_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, ) config = AutoConfig.from_pretrained(BASE_MODEL, token=HF_TOKEN) arch = (getattr(config, "architectures", None) or [""])[0] if "ConditionalGeneration" in arch or "ImageTextToText" in arch: from transformers import AutoModelForImageTextToText as ModelCls else: from transformers import AutoModelForCausalLM as ModelCls model = ModelCls.from_pretrained( BASE_MODEL, quantization_config=quant_config, torch_dtype=torch.bfloat16, device_map="cuda", token=HF_TOKEN, ) from peft import PeftModel model = PeftModel.from_pretrained(model, ADAPTER_ID, token=HF_TOKEN) model.eval() _model = model _tokenizer = tokenizer return _model, _tokenizer def _history_to_messages(history: list) -> list[dict]: messages = [] for item in history: if isinstance(item, dict): role = item.get("role") content = message_text(item.get("content")) if role in {"user", "assistant"} and content: messages.append({"role": role, "content": content}) continue if isinstance(item, (tuple, list)) and len(item) == 2: user_text, assistant_text = item if isinstance(user_text, str) and user_text.strip(): messages.append({"role": "user", "content": user_text.strip()}) if isinstance(assistant_text, str) and assistant_text.strip(): messages.append({"role": "assistant", "content": assistant_text.strip()}) return messages def _clean_response(text: str) -> str: text = text.strip() text = re.sub(r"(?im)^\s*(micro-action|hook|stakes|validasi|validation)\s*:\s*", "", text) return text.strip() @spaces.GPU(duration=120) def respond(message: str, history: list[dict], mood: str): """Streaming generator: yields the reply token-by-token (modern AI-chat feel).""" from transformers import TextIteratorStreamer model, tokenizer = _load_model() message = message_text(message) system = SYSTEM_PROMPT note = MOOD_NOTES.get(mood) if note: system = f"{system}\n\n{note}" # Keep the language rule as the final, most-recent instruction so the English # mood note above can't prime an English reply to an Indonesian user. system = f"{system}\n\nBalas dalam bahasa yang sama dengan pesan terakhir user." messages = [{"role": "system", "content": system}] messages.extend(_history_to_messages(history)) messages.append({"role": "user", "content": message}) input_ids = tokenizer.apply_chat_template( messages, add_generation_prompt=True, return_tensors="pt", ) if not torch.is_tensor(input_ids): input_ids = input_ids["input_ids"] input_ids = input_ids.to(model.device) streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) generate_kwargs = dict( input_ids=input_ids, streamer=streamer, max_new_tokens=MAX_NEW_TOKENS, do_sample=True, temperature=0.7, top_p=0.9, repetition_penalty=1.1, pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id, ) def _generate() -> None: with torch.inference_mode(): model.generate(**generate_kwargs) Thread(target=_generate).start() acc = "" for chunk in streamer: acc += chunk yield _clean_response(acc) # Warm the weight cache on CPU at import; the 4-bit load itself stays inside the # @spaces.GPU window because bitsandbytes quantization needs CUDA. if os.environ.get("PREWARM", "1").lower() not in {"0", "false", "no"}: _prewarm() demo = build_demo(respond) if __name__ == "__main__": demo.launch(show_error=True, css=CSS, theme=THEME)