# core/model_config.py
from dataclasses import dataclass

# UI mock mode. DISABLED for the published Hugging Face Space, which always
# talks to the real Modal-served models. The core wrappers still carry a mock
# branch for offline UI development: flip this to True locally to get canned
# story/audio without calling Modal. Keep it False here.
UI_MOCK: bool = False

# ─────────────────────────────────────────────
# ACTIVE STACK
# Stack A is the sole submission stack (OpenBMB prize path). The StackConfig
# machinery below is retained so a stack could be added or swapped later, but
# only "A" is defined; Stacks B/C were dropped.
ACTIVE_STACK: str = "A"

# ─────────────────────────────────────────────
# WHERE INFERENCE RUNS
#   "modal" → models served on Modal cloud GPUs (current; see CLAUDE.md infra note)
#   "local" → models served on the local machine (legacy offline path; unsupported)
COMPUTE_LOCATION: str = "modal"

# Modal GPU tier for the inference containers. A10G (24 GB) covers Stack A's
# runtime.
MODAL_GPU: str = "A10G"

# Number of containers kept warm to hide cold starts.
#   =1 during a demo: no cold start, but idle GPU is billed — including the
#      A100 FT-serve container — so revert to 0 afterwards.
#   =0 scales to zero and bills per request only (cold start ~3-5 min on the
#      first call).
MODAL_MIN_CONTAINERS: int = 0 # ───────────────────────────────────────────── @dataclass(frozen=True) class StackConfig: name: str description: str vision_model: str # Ollama model tag vision_backend: str # "ollama" (all stacks use Ollama) stt_model: str # faster-whisper model size stt_bn_model: str | None # optional Bengali-specific STT model (HF repo) tts_en_backend: str # "voxcpm2" tts_bn_backend: str # "indic_tts" (chosen) | "indic_parler" (alt) tts_bn_ref_audio: str | None # unused (was for the removed IndicF5 voice-clone) total_params_b: float # informational — for README generation openbmb_prize_eligible: bool STACKS: dict[str, StackConfig] = { "A": StackConfig( name="Stack A — OpenBMB Prize Path", description="MiniCPM-V 4.5 + VoxCPM2 + AI4Bharat Indic-TTS. ~12.4B. OpenBMB prize eligible.", # Default (~Q4, 6.1GB). q8_0 was tested and gave NO Bengali quality gain at # higher cost/latency — precision is not the bottleneck, the 8B model's # Bengali capability is. Bengali quality is addressed via two-pass (Lever C). vision_model="openbmb/minicpm-v4.5", vision_backend="ollama", stt_model="large-v3", stt_bn_model="bangla-asr/whisper-medium-bn", tts_en_backend="voxcpm2", # Bengali TTS: AI4Bharat Indic-TTS (FastPitch). Chosen over Indic Parler-TTS # (sounded artificial) and IndicF5 (voice clone — removed: too slow even on # A100, needs a reference clip). FastPitch is fast, no reference needed. tts_bn_backend="indic_tts", tts_bn_ref_audio=None, total_params_b=12.4, openbmb_prize_eligible=True, ), } def get_config() -> StackConfig: """Returns the active stack config. Import this everywhere model details are needed.""" if ACTIVE_STACK not in STACKS: raise ValueError( f"ACTIVE_STACK='{ACTIVE_STACK}' is not valid. Stack A is the only defined stack." ) return STACKS[ACTIVE_STACK] def get_all_stacks() -> dict[str, StackConfig]: """Returns all defined stacks (currently just Stack A).""" return STACKS # HF repo IDs for the TTS backends. 
# Model names live ONLY in this file — the StackConfig stores the backend *key*
# ('voxcpm2' | 'indic_parler' | 'indic_tts'); this maps that key to the actual
# repo passed to core/modal_infra.py.
TTS_BACKEND_REPOS: dict[str, str] = {
    "voxcpm2": "openbmb/VoxCPM2",  # English (Voice Design)
    "indic_parler": "ai4bharat/indic-parler-tts",
    # AI4Bharat Indic-TTS (FastPitch + HiFi-GAN) — no HF repo; the value is the
    # GitHub-release checkpoint zip (per-language). Bengali = bn.zip (~1.5 GB).
    # Dedicated, MOS-tuned, no reference clip; fixed voice (no persona control).
    "indic_tts": "https://github.com/AI4Bharat/Indic-TTS/releases/download/v1-checkpoints-release/bn.zip",
}


def get_tts_repo(backend: str) -> str:
    """Resolve a TTS backend key to the model location stored for it.

    For 'voxcpm2' and 'indic_parler' the result is a Hugging Face repo ID; for
    'indic_tts' it is the URL of the GitHub-release checkpoint zip.

    Args:
        backend: a key of TTS_BACKEND_REPOS.

    Returns:
        The repo ID or checkpoint URL registered for ``backend``.

    Raises:
        ValueError: if ``backend`` is not a known key; the message lists the
            known keys.
    """
    try:
        return TTS_BACKEND_REPOS[backend]
    except KeyError:
        # The KeyError adds nothing beyond the message below, so suppress it
        # rather than show a misleading "during handling of..." traceback.
        raise ValueError(
            f"Unknown TTS backend '{backend}'. Known: {list(TTS_BACKEND_REPOS)}"
        ) from None


# Per-language decoding params for the vision/story model (passed to Ollama).
# Bengali uses a more conservative profile: lower temperature + min_p floor +
# repetition penalty suppress the wrong-script (Latin) tokens, invented non-words,
# and phrase repetition that high-temperature sampling produces in a lower-resource
# language. English can afford a livelier profile. Tune these here only.
VISION_GEN_OPTIONS: dict[str, dict] = {
    "en": {
        "temperature": 0.8,
        "top_p": 0.95,
        "repeat_penalty": 1.1,
        "num_predict": 500,  # bound the response; a bedtime story is short
    },
    "bn": {
        "temperature": 0.45,
        "top_p": 0.9,
        "top_k": 40,
        "min_p": 0.05,
        "repeat_penalty": 1.18,
        "repeat_last_n": 64,
        "num_predict": 700,  # Bengali uses more tokens per word; still bounded
    },
}


def get_vision_options(language: str) -> dict:
    """Return a copy of the decoding params for the given language ('en'|'bn').

    Any language without its own profile falls back to the English one. The
    result is a fresh dict, so callers may modify it without touching
    VISION_GEN_OPTIONS.
    """
    profile = VISION_GEN_OPTIONS.get(language, VISION_GEN_OPTIONS["en"])
    return dict(profile)


# Translation-pivot path (research option #1): MiniCPM writes the story in English
# (its strength), then IndicTrans2 translates it to fluent Bengali. Model name lives
# here only. 1B (gated, MIT) for quality; dist-200M is the faster/smaller option.
TRANSLATION_MODEL: str = "ai4bharat/indictrans2-en-indic-1B"

# IndicTrans2 FLORES-style language codes.
INDICTRANS_LANG_CODES: dict[str, str] = {"en": "eng_Latn", "bn": "ben_Beng"}


def get_indictrans_code(language: str) -> str:
    """Map our 'en'/'bn' codes to IndicTrans2's FLORES codes.

    Args:
        language: a key of INDICTRANS_LANG_CODES.

    Returns:
        The FLORES-style code registered for ``language``.

    Raises:
        ValueError: if ``language`` has no entry in INDICTRANS_LANG_CODES.
    """
    try:
        return INDICTRANS_LANG_CODES[language]
    except KeyError:
        # Suppress the KeyError context: the ValueError message already names
        # the offending language.
        raise ValueError(f"No IndicTrans2 code for language '{language}'.") from None


# ── Bengali distillation fine-tuning (see finetune/) ────────────────────────
# Teacher that writes native Bengali story labels from a drawing. Gemma 3 is
# multimodal and writes excellent Bengali (চাঁদমামা/পুকুর register). 27B gives the
# best labels (fewer code-switch leaks); 12B is faster. Label quality caps the
# student, and data-gen is a one-time job, so quality is prioritised here.
TEACHER_MODEL: str = "gemma3:27b"

# Base student that gets fine-tuned (the Stack A vision model, HF repo form for
# training — Ollama tag form for serving is in the StackConfig).
STUDENT_BASE_REPO: str = "openbmb/MiniCPM-V-4_5"

# Once a LoRA is trained + merged and served via vLLM, set this to the merged
# model path/repo and route the vision path to it. None = use the base model.
# Set 2026-06-13 after the held-out eval (finetune/eval_ft.py): the fine-tuned # model decisively beats the base on Bengali (base output was garbled + looping; # FT is coherent native রূপকথা), confirmed by a Bengali speaker. Bengali now routes # to the FT model served by finetune/serve_vllm.py (app `rupkotha-ft-serve`). FINETUNED_VISION_MODEL: str | None = "/data/out/minicpm-v-bengali-merged" def get_compute() -> dict: """Returns the active compute-location settings (Modal infra). Import this in core/modal_infra.py and the core/ wrappers — never hardcode GPU tier or location.""" if COMPUTE_LOCATION not in ("modal", "local"): raise ValueError(f"COMPUTE_LOCATION='{COMPUTE_LOCATION}' is not valid. Use 'modal' or 'local'.") return { "location": COMPUTE_LOCATION, "gpu": MODAL_GPU, "min_containers": MODAL_MIN_CONTAINERS, }