Spaces:
Running
Running
| # core/model_config.py | |
| from dataclasses import dataclass | |
# ── UI mock mode ──────────────────────────────────────────────────────────
# Must stay False in the published Hugging Face Space, which always talks to the
# real Modal-served models. The core wrappers still carry a mock branch for
# offline UI development: flip this to True locally to get canned story/audio
# without calling Modal.
UI_MOCK: bool = False

# ── Active stack ──────────────────────────────────────────────────────────
# "A" (the OpenBMB prize path) is the sole submission stack. Stacks B/C were
# dropped; the StackConfig machinery further down is kept so that a stack could
# still be added or swapped in.
ACTIVE_STACK: str = "A"

# ── Where inference runs ──────────────────────────────────────────────────
#   "modal" → models served on Modal cloud GPUs (current; see the infra note
#             in CLAUDE.md)
#   "local" → models served on the local machine (legacy offline path;
#             unsupported)
COMPUTE_LOCATION: str = "modal"

# GPU tier requested for the Modal inference containers. An A10G (24 GB) covers
# Stack A's runtime.
MODAL_GPU: str = "A10G"

# How many containers are kept warm to hide cold starts.
#   1 → use during a demo: no cold start, but idle GPU time is billed (including
#       the A100 FT-serve container), so revert to 0 afterwards.
#   0 → scale to zero and bill per request only; the first call then pays a
#       cold start of roughly 3-5 minutes.
MODAL_MIN_CONTAINERS: int = 0
| # ───────────────────────────────────────────── | |
| class StackConfig: | |
| name: str | |
| description: str | |
| vision_model: str # Ollama model tag | |
| vision_backend: str # "ollama" (all stacks use Ollama) | |
| stt_model: str # faster-whisper model size | |
| stt_bn_model: str | None # optional Bengali-specific STT model (HF repo) | |
| tts_en_backend: str # "voxcpm2" | |
| tts_bn_backend: str # "indic_tts" (chosen) | "indic_parler" (alt) | |
| tts_bn_ref_audio: str | None # unused (was for the removed IndicF5 voice-clone) | |
| total_params_b: float # informational — for README generation | |
| openbmb_prize_eligible: bool | |
# Registry of every defined stack, keyed by stack id (the value ACTIVE_STACK holds).
STACKS: dict[str, StackConfig] = dict(
    A=StackConfig(
        name="Stack A — OpenBMB Prize Path",
        description="MiniCPM-V 4.5 + VoxCPM2 + AI4Bharat Indic-TTS. ~12.4B. OpenBMB prize eligible.",
        total_params_b=12.4,
        openbmb_prize_eligible=True,
        # Vision/story model: the default quantisation (~Q4, 6.1GB). q8_0 was
        # tried and brought NO Bengali quality gain, only higher cost/latency —
        # the bottleneck is the 8B model's Bengali capability, not precision.
        # Bengali quality is handled through two-pass (Lever C) instead.
        vision_backend="ollama",
        vision_model="openbmb/minicpm-v4.5",
        # Speech-to-text.
        stt_model="large-v3",
        stt_bn_model="bangla-asr/whisper-medium-bn",
        # English TTS.
        tts_en_backend="voxcpm2",
        # Bengali TTS: AI4Bharat Indic-TTS (FastPitch). Picked over Indic
        # Parler-TTS (sounded artificial) and IndicF5 (voice clone — removed:
        # too slow even on an A100, and it needs a reference clip). FastPitch
        # is fast and needs no reference.
        tts_bn_backend="indic_tts",
        tts_bn_ref_audio=None,
    ),
)
def get_config() -> StackConfig:
    """Look up and return the StackConfig selected by ACTIVE_STACK.

    Import this wherever model details are needed.

    Raises:
        ValueError: if ACTIVE_STACK is not a key of STACKS.
    """
    if ACTIVE_STACK in STACKS:
        return STACKS[ACTIVE_STACK]
    raise ValueError(
        f"ACTIVE_STACK='{ACTIVE_STACK}' is not valid. Stack A is the only defined stack."
    )
def get_all_stacks() -> dict[str, StackConfig]:
    """Return the full stack registry (at present it holds only Stack A).

    The STACKS mapping itself is returned, not a copy.
    """
    return STACKS
# Model locations for the TTS backends. Model names live ONLY in this file — the
# StackConfig stores the backend *key* ('voxcpm2' | 'indic_parler' | 'indic_tts');
# this maps that key to the actual repo passed to core/modal_infra.py.
# Most values are Hugging Face repo IDs, but not all of them: 'indic_tts' is a
# download URL (see below).
TTS_BACKEND_REPOS: dict[str, str] = {
    "voxcpm2": "openbmb/VoxCPM2",  # English (Voice Design)
    "indic_parler": "ai4bharat/indic-parler-tts",
    # AI4Bharat Indic-TTS (FastPitch + HiFi-GAN) — no HF repo; the value is the
    # GitHub-release checkpoint zip (per-language). Bengali = bn.zip (~1.5 GB).
    # Dedicated, MOS-tuned, no reference clip; fixed voice (no persona control).
    "indic_tts": "https://github.com/AI4Bharat/Indic-TTS/releases/download/v1-checkpoints-release/bn.zip",
}


def get_tts_repo(backend: str) -> str:
    """Resolve a TTS backend key to its model location.

    Args:
        backend: A key of TTS_BACKEND_REPOS ('voxcpm2', 'indic_parler' or
            'indic_tts').

    Returns:
        The string registered for that backend: a Hugging Face repo ID, or,
        for 'indic_tts', the URL of the checkpoint zip.

    Raises:
        ValueError: if ``backend`` is not a known key.
    """
    try:
        return TTS_BACKEND_REPOS[backend]
    except KeyError:
        # The message already names the bad key and the valid ones, so drop the
        # redundant KeyError from the traceback instead of chaining implicitly.
        raise ValueError(
            f"Unknown TTS backend '{backend}'. Known: {list(TTS_BACKEND_REPOS)}"
        ) from None
# Decoding parameters for the vision/story model, one profile per language; these
# are passed to Ollama. Tune them here only.
#   - Bengali gets the conservative profile: a lower temperature, a min_p floor
#     and a repetition penalty hold back the wrong-script (Latin) tokens,
#     invented non-words and phrase repetition that high-temperature sampling
#     produces in a lower-resource language.
#   - English can afford a livelier profile.
VISION_GEN_OPTIONS: dict[str, dict] = {
    "en": {
        "temperature": 0.8,
        "top_p": 0.95,
        "repeat_penalty": 1.1,
        # A bedtime story is short, so bound the response.
        "num_predict": 500,
    },
    "bn": {
        "temperature": 0.45,
        "top_p": 0.9,
        "top_k": 40,
        "min_p": 0.05,
        "repeat_penalty": 1.18,
        "repeat_last_n": 64,
        # Bengali uses more tokens per word; still bounded.
        "num_predict": 700,
    },
}


def get_vision_options(language: str) -> dict:
    """Return a fresh copy of the decoding profile for ``language``.

    Profiles exist for 'en' and 'bn'; any other language code gets the 'en'
    profile. The returned dict is a copy, so callers may modify it without
    touching VISION_GEN_OPTIONS.
    """
    profile_key = language if language in VISION_GEN_OPTIONS else "en"
    return {**VISION_GEN_OPTIONS[profile_key]}
# Translation-pivot path (research option #1): MiniCPM writes the story in English
# (its strength), then IndicTrans2 translates it to fluent Bengali. Model name lives
# here only. 1B (gated, MIT) for quality; dist-200M is the faster/smaller option.
TRANSLATION_MODEL = "ai4bharat/indictrans2-en-indic-1B"

# IndicTrans2 FLORES-style language codes, keyed by our own 'en'/'bn' codes.
INDICTRANS_LANG_CODES: dict[str, str] = {"en": "eng_Latn", "bn": "ben_Beng"}


def get_indictrans_code(language: str) -> str:
    """Map our 'en'/'bn' codes to IndicTrans2's FLORES codes.

    Args:
        language: A key of INDICTRANS_LANG_CODES ('en' or 'bn').

    Returns:
        The matching FLORES-style code, e.g. 'ben_Beng' for 'bn'.

    Raises:
        ValueError: if no code is registered for ``language``.
    """
    try:
        return INDICTRANS_LANG_CODES[language]
    except KeyError:
        # The ValueError already names the offending language; suppress the
        # redundant KeyError context rather than chaining it implicitly.
        raise ValueError(f"No IndicTrans2 code for language '{language}'.") from None
| # ── Bengali distillation fine-tuning (see finetune/) ──────────────────────── | |
| # Teacher that writes native Bengali story labels from a drawing. Gemma 3 is | |
| # multimodal and writes excellent Bengali (চাঁদমামা/পুকুর register). 27B gives the | |
| # best labels (fewer code-switch leaks); 12B is faster. Label quality caps the | |
| # student, and data-gen is a one-time job, so quality is prioritised here. | |
| TEACHER_MODEL = "gemma3:27b" | |
| # Base student that gets fine-tuned (the Stack A vision model, HF repo form for | |
| # training — Ollama tag form for serving is in the StackConfig). | |
| STUDENT_BASE_REPO = "openbmb/MiniCPM-V-4_5" | |
| # Once a LoRA is trained + merged and served via vLLM, set this to the merged | |
| # model path/repo and route the vision path to it. None = use the base model. | |
| # Set 2026-06-13 after the held-out eval (finetune/eval_ft.py): the fine-tuned | |
| # model decisively beats the base on Bengali (base output was garbled + looping; | |
| # FT is coherent native রূপকথা), confirmed by a Bengali speaker. Bengali now routes | |
| # to the FT model served by finetune/serve_vllm.py (app `rupkotha-ft-serve`). | |
| FINETUNED_VISION_MODEL: str | None = "/data/out/minicpm-v-bengali-merged" | |
def get_compute() -> dict:
    """Return the active compute-location settings (Modal infra).

    Intended to be imported by core/modal_infra.py and the core/ wrappers so
    that the GPU tier and location are never hardcoded there.

    Returns:
        A dict with the keys "location", "gpu" and "min_containers", taken from
        COMPUTE_LOCATION, MODAL_GPU and MODAL_MIN_CONTAINERS.

    Raises:
        ValueError: if COMPUTE_LOCATION is neither 'modal' nor 'local'.
    """
    location_is_known = COMPUTE_LOCATION == "modal" or COMPUTE_LOCATION == "local"
    if not location_is_known:
        raise ValueError(f"COMPUTE_LOCATION='{COMPUTE_LOCATION}' is not valid. Use 'modal' or 'local'.")
    return dict(
        location=COMPUTE_LOCATION,
        gpu=MODAL_GPU,
        min_containers=MODAL_MIN_CONTAINERS,
    )