Spaces:
Running on Zero
Running on Zero
| """Hugging Face Space app for NeuroBait (Gemma 3 12B LoRA). | |
| Runtime loads the dense Gemma 3 12B base in 4-bit with the NeuroBait LoRA adapter | |
| through the standard transformers + peft stack. This deliberately avoids Unsloth | |
| at runtime and the Gemma-4 MoE (`Gemma4ClippableLinear`) path that PEFT could not | |
| inject into on ZeroGPU. The look and feel live in ``ui.py``. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import re | |
| from threading import Lock, Thread | |
| import spaces | |
| import torch | |
| from ui import CSS, THEME, build_demo, message_text | |
# --- Runtime configuration, read once from the environment at import time ---

# Hub repo id of the base model the adapter is applied to.
BASE_MODEL = os.getenv("BASE_MODEL", "unsloth/gemma-3-12b-it")

# Hub repo id of the LoRA adapter. ADAPTER_ID takes precedence; MODEL_ID is
# consulted only when ADAPTER_ID is unset.
ADAPTER_ID = os.getenv(
    "ADAPTER_ID",
    os.getenv("MODEL_ID", "build-small-hackathon/NeuroBait"),
)

# Upper bound on generated tokens per reply (a non-integer value raises here).
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "220"))

# 4-bit loading is on unless the variable is one of these strings
# (compared case-insensitively).
_DISABLING_VALUES = {"0", "false", "no"}
LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", "1").lower() not in _DISABLING_VALUES

# Optional Hub access token; None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
# Base system prompt (written in Indonesian). ``respond`` sends it as the
# system message on every turn, followed by an optional mood note and a final
# language rule. It sets the NeuroBait persona, describes the four-part
# "Resep Engagement" reply shape, and tells the model to answer in the
# language of the user's latest message.
# NOTE(review): paragraph breaks inside this literal may have been collapsed
# in this copy of the file — confirm against the deployed source before
# relying on the exact whitespace.
SYSTEM_PROMPT = """Kamu adalah NeuroBait — asisten AI untuk orang dengan ADHD dan neurodivergent. Tugasmu bukan membuat to-do list. Tugasmu menyalakan dopamin untuk memicu task initiation.
Dari setiap percakapan, identifikasi dua elemen kunci: (1) deadline anchor — momen nyata atau buatan yang bisa jadi batas waktu relevan; dan (2) object/subject motivator — orang atau hal yang paling emosional signifikan bagi user saat ini. Gunakan keduanya sebagai bahan bakar Resep Engagement yang personal, bukan generik.
Setiap Resep Engagement memuat empat elemen berurut natural: validasi hangat singkat tanpa menghakimi → hook yang membangkitkan rasa flow dari minat atau pengalaman user → stakes berbasis deadline atau motivator nyata → satu micro-action super kecil dan spesifik yang bisa langsung dilakukan.
Kalau user bertanya tentang dirimu, kemampuanmu, atau hal umum di luar konteks task (mis. "siapa kamu", "kamu bisa apa", sapaan, atau basa-basi), jawab langsung dengan singkat dan hangat sebagai NeuroBait — perkenalkan diri dan apa yang kamu bantu — tanpa memaksakan resep atau pertanyaan deadline/motivator. Tawarkan bantuan secara halus, biarkan user yang memutuskan kapan mulai.
Kalau user sudah menyinggung sebuah task tapi konteksnya belum cukup untuk membuat resep yang personal, ajukan tepat satu pertanyaan ringan yang paling berguna — tentang deadline atau motivator. Kalau konteks sudah ada, langsung berikan resep.
Selalu balas dalam bahasa yang sama persis dengan pesan terakhir user: kalau user menulis bahasa Indonesia, jawab dalam bahasa Indonesia; kalau bahasa Inggris, jawab dalam bahasa Inggris. Jangan pernah berpindah bahasa sendiri.
Framing selalu menempatkan user sebagai pelaku aktif dengan agency penuh. Bukan guilt, bukan hutang — selalu agency. Kalimat pendek. Bahasa hidup. Hangat dan padat. Tidak pernah menghakimi. Tidak pernah ceramah. Membuat hal membosankan jadi tak tertahankan."""
# Per-mood steering notes (in English). ``respond`` looks the selected mood up
# with ``MOOD_NOTES.get(mood)`` and, when a note exists, appends it to the
# system prompt; moods not listed here add nothing.
MOOD_NOTES: dict[str, str] = {
    "Calm": "Mood note: the user feels calm. Use this ease for a light, playful hook.",
    "Tired": "Mood note: the user feels tired. Keep the micro-action very small and low energy.",
    "Anxious": "Mood note: the user feels anxious. Lead with extra warm validation, lower the pressure, keep the micro-action soothing.",
    "Focused": "Mood note: the user feels focused. Go straight to a hook and one micro-action that rides the momentum.",
}

# Process-wide cache of the loaded model and tokenizer. Both start as None and
# are assigned by ``_load_model`` after a successful load.
_model = None
_tokenizer = None
# Held by ``_load_model`` for the whole check-and-load sequence so that
# concurrent callers do not each start their own load.
_load_lock = Lock()
def _prewarm() -> None:
    """Best-effort download of the base model and adapter snapshots.

    Fetches ``BASE_MODEL`` and then ``ADAPTER_ID`` with ``snapshot_download``
    so the files are already cached before the first model load. Any failure
    (including a missing ``huggingface_hub``) is reported on stdout and
    swallowed; a failure on the first repo also skips the second.
    """
    try:
        from huggingface_hub import snapshot_download

        for repo_id in (BASE_MODEL, ADAPTER_ID):
            snapshot_download(repo_id, token=HF_TOKEN)
    except Exception as exc:  # noqa: BLE001 - prewarm is best effort
        print(f">>> prewarm skipped: {exc}", flush=True)
def _load_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The whole check-and-load sequence runs under ``_load_lock``. On a cache
    miss this:

    1. loads the tokenizer from ``ADAPTER_ID``;
    2. builds an NF4 / double-quant / bfloat16-compute ``BitsAndBytesConfig``
       when ``LOAD_IN_4BIT`` is set (otherwise no quantization config);
    3. picks the auto-model class from the base config's first architecture
       name (image-text-to-text for ``ConditionalGeneration`` /
       ``ImageTextToText`` architectures, causal LM otherwise);
    4. loads ``BASE_MODEL`` onto ``"cuda"`` in bfloat16, wraps it with the
       PEFT adapter from ``ADAPTER_ID`` and switches it to eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    global _model, _tokenizer

    with _load_lock:
        if _model is None or _tokenizer is None:
            from transformers import AutoConfig, AutoTokenizer

            loaded_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID, token=HF_TOKEN)

            if LOAD_IN_4BIT:
                from transformers import BitsAndBytesConfig

                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                )
            else:
                bnb_config = None

            base_config = AutoConfig.from_pretrained(BASE_MODEL, token=HF_TOKEN)
            # A missing or empty ``architectures`` list is treated as "".
            declared_archs = getattr(base_config, "architectures", None) or [""]
            primary_arch = declared_archs[0]
            wants_image_text_class = (
                "ConditionalGeneration" in primary_arch
                or "ImageTextToText" in primary_arch
            )
            if wants_image_text_class:
                from transformers import AutoModelForImageTextToText as LoaderCls
            else:
                from transformers import AutoModelForCausalLM as LoaderCls

            base_model = LoaderCls.from_pretrained(
                BASE_MODEL,
                quantization_config=bnb_config,
                torch_dtype=torch.bfloat16,
                device_map="cuda",
                token=HF_TOKEN,
            )

            from peft import PeftModel

            adapted_model = PeftModel.from_pretrained(base_model, ADAPTER_ID, token=HF_TOKEN)
            adapted_model.eval()

            # Publish only after everything above succeeded.
            _model, _tokenizer = adapted_model, loaded_tokenizer

        return _model, _tokenizer
def _history_to_messages(history: list) -> list[dict]:
    """Flatten chat history into ``{"role", "content"}`` message dicts.

    Two entry shapes are understood:

    * dicts with ``role`` / ``content`` keys — kept when the role is
      ``"user"`` or ``"assistant"`` and ``message_text(content)`` is truthy;
    * two-item tuples or lists ``(user_text, assistant_text)`` — each side is
      kept when it is a non-blank string, stripped of surrounding whitespace.

    Anything else is ignored.

    Args:
        history: Prior turns in either of the shapes above.

    Returns:
        The messages in their original order.
    """
    converted: list[dict] = []
    for entry in history:
        if isinstance(entry, dict):
            speaker = entry.get("role")
            body = message_text(entry.get("content"))
            if speaker in {"user", "assistant"} and body:
                converted.append({"role": speaker, "content": body})
        elif isinstance(entry, (tuple, list)) and len(entry) == 2:
            # Pair form: first slot is the user turn, second the assistant's.
            for speaker, raw in zip(("user", "assistant"), entry):
                if isinstance(raw, str) and raw.strip():
                    converted.append({"role": speaker, "content": raw.strip()})
    return converted
# Matches a section label ("Hook:", "Stakes:", ...) at the start of any line,
# case-insensitively, together with the whitespace around it.
_SECTION_LABEL_RE = re.compile(
    r"(?im)^\s*(micro-action|hook|stakes|validasi|validation)\s*:\s*"
)


def _clean_response(text: str) -> str:
    """Strip surrounding whitespace and leading section labels from a reply.

    Args:
        text: Raw (possibly partial) model output.

    Returns:
        The text with line-leading labels such as ``Hook:`` removed and the
        result trimmed.
    """
    without_labels = _SECTION_LABEL_RE.sub("", text.strip())
    return without_labels.strip()
def respond(message: str, history: list[dict], mood: str):
    """Stream a NeuroBait reply for one chat turn.

    Builds the system prompt (base prompt, optional mood note, then a final
    language rule), appends the converted history and the new user message,
    renders them through the tokenizer's chat template, and runs
    ``model.generate`` on a worker thread while this generator relays the
    streamed text.

    NOTE(review): no ``@spaces.GPU`` decorator is applied to this function in
    this file; presumably ``ui.build_demo`` wraps it — confirm.

    Args:
        message: Latest user message; normalised through ``message_text``.
        history: Prior turns in any shape ``_history_to_messages`` accepts.
        mood: Key into ``MOOD_NOTES``; a mood without an entry adds no note.

    Yields:
        str: The cleaned reply accumulated so far (the full text each time,
        not a delta).

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread,
            re-raised here once the stream has been drained.
    """
    from transformers import TextIteratorStreamer

    model, tokenizer = _load_model()
    message = message_text(message)

    system = SYSTEM_PROMPT
    note = MOOD_NOTES.get(mood)
    if note:
        system = f"{system}\n\n{note}"
    # Keep the language rule as the final, most-recent instruction so the English
    # mood note above can't prime an English reply to an Indonesian user.
    system = f"{system}\n\nBalas dalam bahasa yang sama dengan pesan terakhir user."

    messages = [{"role": "system", "content": system}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    # Depending on the tokenizer, the template call may hand back a mapping
    # instead of a bare tensor.
    if not torch.is_tensor(input_ids):
        input_ids = input_ids["input_ids"]
    input_ids = input_ids.to(model.device)

    # A pad id of 0 is valid but falsy, so compare against None instead of
    # using ``or`` (which would silently substitute the EOS id).
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        # Single unpadded prompt: every position is real. Passing the mask
        # explicitly keeps masking independent of the pad id chosen above.
        attention_mask=torch.ones_like(input_ids),
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=pad_token_id,
    )

    failures: list[BaseException] = []

    def _generate() -> None:
        try:
            with torch.inference_mode():
                model.generate(**generate_kwargs)
        except Exception as exc:  # noqa: BLE001 - re-raised on the consumer side
            failures.append(exc)
            # generate() closes the streamer only when it finishes normally;
            # close it here so the consumer loop below cannot block forever.
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    acc = ""
    for chunk in streamer:
        acc += chunk
        yield _clean_response(acc)

    worker.join()
    if failures:
        # Surface the worker's error to the caller instead of losing it on
        # the background thread.
        raise failures[0]
# Import-time cache warm-up: fetch the repo snapshots now unless PREWARM is
# set to "0", "false" or "no" (case-insensitive). The model itself is not
# loaded here; it is loaded lazily by ``_load_model`` on the first ``respond``
# call. Per the original author's note, the 4-bit load stays inside the
# @spaces.GPU window because bitsandbytes quantization needs CUDA.
_prewarm_setting = os.environ.get("PREWARM", "1").lower()
if _prewarm_setting not in {"0", "false", "no"}:
    _prewarm()

# UI object exposed at module level; the layout is built in ``ui.py`` around
# the ``respond`` callback.
demo = build_demo(respond)

if __name__ == "__main__":
    demo.launch(show_error=True, css=CSS, theme=THEME)