Joseph Pollack committed
initial commit

Files changed:
- README.md +31 -5
- __pycache__/app.cpython-313.pyc +0 -0
- __pycache__/inference.cpython-313.pyc +0 -0
- __pycache__/model_config.cpython-313.pyc +0 -0
- app.py +255 -4
- inference.py +144 -0
- model_config.py +143 -0
- requirements.txt +6 -0
README.md
CHANGED
@@ -1,14 +1,40 @@
 ---
-title:
+title: Baguettotron vs Luth models
-emoji:
+emoji: 🥖
 colorFrom: blue
 colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: "4"
 app_file: app.py
 pinned: false
 license: mit
-short_description:
+short_description: All models, all outputs — apples-to-apples comparison by parameter size
 ---
 
+# Baguettotron vs Luth models
+
+Apples-to-apples comparison of **Baguettotron** (PleIAs, 321M) and **5 Luth models** (kurakurai, 0.4B–1.7B) from the [Luth Models collection](https://huggingface.co/collections/kurakurai/luth-models).
+
+## Features
+
+- **All models, all outputs:** Each prompt runs through all 6 models; outputs appear in tabs grouped by parameter size.
+- **Ultimate footprint:** Per-model disk size and VRAM estimates; combined footprint for all models.
+- **Per-tier hyperparameters:** Temperature, max_tokens, top_p, top_k, repeat_penalty per size tier.
+- **Transformers-only:** No quantization; all models run in BF16/FP16.
+
+## Size tiers
+
+| Tier | Models |
+|------|--------|
+| ~0.3–0.4B (Small) | Baguettotron, Luth-LFM2-350M |
+| ~0.6–0.7B (Medium) | Luth-0.6B-Instruct, Luth-LFM2-700M |
+| ~1–2B (Large) | Luth-LFM2-1.2B, Luth-1.7B-Instruct |
+
+## Baguettotron EOS quirk
+
+Baguettotron's tokenizer uses `"<|im_end>"` (no trailing pipe) for EOS. The app uses manual prompt formatting and stop sequences to avoid multi-token tokenization. See [quirk.md](quirk.md) for details.
+
+## Deployment
+
+- **Hugging Face Spaces:** Set hardware to **Zero GPU** (or standard GPU). The app uses `@spaces.GPU` when available.
+- **Local:** Run `python app.py`; requires a GPU with ~10 GB VRAM for all 6 models.
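As an aside, the EOS quirk called out in the README is easy to check directly. The sketch below is illustrative only (not part of the commit); it assumes `transformers` is installed and the Hub is reachable, and it takes the `"<|im_end>"` string from the note above rather than verifying it independently.

```python
# Sketch: inspect Baguettotron's EOS token and compare how the usual ChatML
# closer tokenizes. Assumes transformers is installed and network access works.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("PleIAs/Baguettotron")

print(repr(tok.eos_token))  # per the README note, expected: '<|im_end>'
print(tok.encode("<|im_end>", add_special_tokens=False))   # a single EOS id if the note holds
print(tok.encode("<|im_end|>", add_special_tokens=False))  # likely several ids, which is why
                                                           # the app formats prompts manually and
                                                           # truncates at "<|im_end>" / "</think>"
```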
__pycache__/app.cpython-313.pyc
ADDED
Binary file (8.72 kB).

__pycache__/inference.cpython-313.pyc
ADDED
Binary file (6.57 kB).

__pycache__/model_config.cpython-313.pyc
ADDED
Binary file (4.76 kB).
app.py
CHANGED
@@ -1,7 +1,258 @@

"""
Baguettotron vs Luth models — Gradio comparison app.
All models, all outputs; tabbed by parameter size.
"""

import gradio as gr

from inference import run_all
from model_config import (
    TIER_LABELS,
    combined_footprint,
    footprint_table_data,
    get_models_by_tier,
    MODEL_IDS,
)

# Optional: use @spaces.GPU for ZeroGPU deployment
try:
    import spaces

    GPU_DECORATOR = spaces.GPU
except ImportError:
    GPU_DECORATOR = lambda f: f  # no-op when not on Spaces


def build_params_by_model(
    temp_small: float,
    max_tok_small: int,
    top_p_small: float,
    top_k_small: int,
    rep_small: float,
    temp_med: float,
    max_tok_med: int,
    top_p_med: float,
    top_k_med: int,
    rep_med: float,
    temp_large: float,
    max_tok_large: int,
    top_p_large: float,
    top_k_large: int,
    rep_large: float,
) -> dict[str, dict]:
    """Build params dict keyed by model_id from tier-level controls."""
    tier_params = {
        "small": {
            "temperature": temp_small,
            "max_tokens": max_tok_small,
            "top_p": top_p_small,
            "top_k": top_k_small,
            "repeat_penalty": rep_small,
        },
        "medium": {
            "temperature": temp_med,
            "max_tokens": max_tok_med,
            "top_p": top_p_med,
            "top_k": top_k_med,
            "repeat_penalty": rep_med,
        },
        "large": {
            "temperature": temp_large,
            "max_tokens": max_tok_large,
            "top_p": top_p_large,
            "top_k": top_k_large,
            "repeat_penalty": rep_large,
        },
    }
    models_by_tier = get_models_by_tier()
    params_by_model: dict[str, dict] = {}
    for tier, models in models_by_tier.items():
        p = tier_params[tier]
        for m in models:
            params_by_model[m.repo_id] = p.copy()
    return params_by_model


@GPU_DECORATOR
def generate_all(
    prompt: str,
    temp_small: float,
    max_tok_small: int,
    top_p_small: float,
    top_k_small: int,
    rep_small: float,
    temp_med: float,
    max_tok_med: int,
    top_p_med: float,
    top_k_med: int,
    rep_med: float,
    temp_large: float,
    max_tok_large: int,
    top_p_large: float,
    top_k_large: int,
    rep_large: float,
) -> tuple[str, str, str, str, str, str]:
    """Run all 6 models, return outputs in tab order: small (2), medium (2), large (2)."""
    if not prompt.strip():
        return ("",) * 6

    params = build_params_by_model(
        temp_small,
        max_tok_small,
        top_p_small,
        top_k_small,
        rep_small,
        temp_med,
        max_tok_med,
        top_p_med,
        top_k_med,
        rep_med,
        temp_large,
        max_tok_large,
        top_p_large,
        top_k_large,
        rep_large,
    )

    results = run_all(prompt, params)

    models_by_tier = get_models_by_tier()
    outputs: list[str] = []
    for tier in ["small", "medium", "large"]:
        for m in models_by_tier[tier]:
            outputs.append(results.get(m.repo_id, ""))

    return tuple(outputs)


def create_ui():
    total_disk, total_vram = combined_footprint()
    footprint_md = f"""
**Combined footprint —** Total disk: {total_disk:,} MB | Total VRAM (est.): {total_vram:.2f} GB
"""

    with gr.Blocks(title="Baguettotron vs Luth models") as demo:
        gr.Markdown("# Baguettotron vs Luth models")
        gr.Markdown(
            "All models, all outputs — apples-to-apples comparison by parameter size."
        )

        # Row 1: Footprint table
        gr.Markdown("## Model footprint")
        footprint_df = gr.Dataframe(
            value=footprint_table_data(),
            headers=["Model", "Params", "File size (MB)", "Est. VRAM (MB)"],
            interactive=False,
        )
        gr.Markdown(footprint_md)

        # Row 2: Per-tier hyperparameters
        gr.Markdown("## Generation settings (by size tier)")
        with gr.Accordion("~0.3–0.4B (Small)", open=False):
            temp_small = gr.Slider(0, 2, value=0.7, label="Temperature")
            max_tok_small = gr.Number(value=256, label="Max tokens", minimum=64, maximum=2048)
            top_p_small = gr.Slider(0, 1, value=0.9, label="Top p")
            top_k_small = gr.Number(value=40, label="Top k")
            rep_small = gr.Slider(1.0, 1.5, value=1.1, label="Repeat penalty")

        with gr.Accordion("~0.6–0.7B (Medium)", open=False):
            temp_med = gr.Slider(0, 2, value=0.7, label="Temperature")
            max_tok_med = gr.Number(value=256, label="Max tokens", minimum=64, maximum=2048)
            top_p_med = gr.Slider(0, 1, value=0.9, label="Top p")
            top_k_med = gr.Number(value=40, label="Top k")
            rep_med = gr.Slider(1.0, 1.5, value=1.1, label="Repeat penalty")

        with gr.Accordion("~1–2B (Large)", open=False):
            temp_large = gr.Slider(0, 2, value=0.7, label="Temperature")
            max_tok_large = gr.Number(value=256, label="Max tokens", minimum=64, maximum=2048)
            top_p_large = gr.Slider(0, 1, value=0.9, label="Top p")
            top_k_large = gr.Number(value=40, label="Top k")
            rep_large = gr.Slider(1.0, 1.5, value=1.1, label="Repeat penalty")

        # Row 3: Prompt + Generate + tabbed outputs
        gr.Markdown("## Live inference")
        prompt_in = gr.Textbox(
            label="Prompt",
            placeholder="Enter your prompt here...",
            lines=3,
        )
        gen_btn = gr.Button("Generate", variant="primary")

        models_by_tier = get_models_by_tier()
        with gr.Tabs():
            with gr.Tab(TIER_LABELS["small"]):
                with gr.Row():
                    out_baguettotron = gr.Textbox(
                        label="Baguettotron (321M)",
                        lines=12,
                        max_lines=24,
                    )
                    out_luth_350 = gr.Textbox(
                        label="Luth-LFM2-350M (0.4B)",
                        lines=12,
                        max_lines=24,
                    )
            with gr.Tab(TIER_LABELS["medium"]):
                with gr.Row():
                    out_luth_06 = gr.Textbox(
                        label="Luth-0.6B-Instruct",
                        lines=12,
                        max_lines=24,
                    )
                    out_luth_07 = gr.Textbox(
                        label="Luth-LFM2-700M",
                        lines=12,
                        max_lines=24,
                    )
            with gr.Tab(TIER_LABELS["large"]):
                with gr.Row():
                    out_luth_12 = gr.Textbox(
                        label="Luth-LFM2-1.2B",
                        lines=12,
                        max_lines=24,
                    )
                    out_luth_17 = gr.Textbox(
                        label="Luth-1.7B-Instruct",
                        lines=12,
                        max_lines=24,
                    )

        all_inputs = [
            prompt_in,
            temp_small,
            max_tok_small,
            top_p_small,
            top_k_small,
            rep_small,
            temp_med,
            max_tok_med,
            top_p_med,
            top_k_med,
            rep_med,
            temp_large,
            max_tok_large,
            top_p_large,
            top_k_large,
            rep_large,
        ]
        all_outputs = [
            out_baguettotron,
            out_luth_350,
            out_luth_06,
            out_luth_07,
            out_luth_12,
            out_luth_17,
        ]

        gen_btn.click(
            fn=generate_all,
            inputs=all_inputs,
            outputs=all_outputs,
        )

    return demo


if __name__ == "__main__":
    demo = create_ui()
    demo.launch()
inference.py
ADDED
@@ -0,0 +1,144 @@

"""
Parallel load and inference for all 6 models (Baguettotron + 5 Luth).
Baguettotron uses EOS-safe formatting: "<|im_end>" (no trailing pipe), stop=["<|im_end>", "</think>"].
"""

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any

import torch

from model_config import MODEL_IDS
from transformers import AutoModelForCausalLM, AutoTokenizer

# In-memory cache: model_id -> (model, tokenizer)
_model_cache: dict[str, tuple[Any, Any]] = {}
_cache_lock = __import__("threading").Lock()

# Baguettotron repo_id for EOS quirk handling
BAGUETTOTRON_ID = "PleIAs/Baguettotron"


def _format_prompt_baguettotron(prompt: str) -> tuple[str, list[str]]:
    """
    Manual prompt build for Baguettotron. Uses "<|im_end>" (no trailing pipe)
    per tokenizer; stop=["<|im_end>", "</think>"] for generation.
    """
    # Qwen-style: <|im_start|>user\n{content}<|im_end>\n<|im_start|>assistant\n<think>\n
    text = f"<|im_start|>user\n{prompt}<|im_end>\n<|im_start|>assistant\n<think>\n"
    stop = ["<|im_end>", "</think>"]
    return text, stop


def _format_prompt_luth(prompt: str, tokenizer: Any) -> tuple[dict[str, Any], list[str] | None]:
    """Use tokenizer's chat template for Luth models."""
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    )
    return inputs, None  # no custom stop for Luth


def _get_device() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"


def _load_model(model_id: str, device: str | None = None) -> tuple[Any, Any]:
    """Load model and tokenizer; cache by model_id."""
    if device is None:
        device = _get_device()
    with _cache_lock:
        if model_id in _model_cache:
            return _model_cache[model_id]

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto" if device == "cuda" else device,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    with _cache_lock:
        _model_cache[model_id] = (model, tokenizer)

    return model, tokenizer


def _generate_one(
    model_id: str,
    prompt: str,
    params: dict[str, Any],
    device: str = "cuda",
) -> tuple[str, str]:
    """Load (or use cached) model, run inference, return (model_id, text)."""
    model, tokenizer = _load_model(model_id, device)

    device = next(model.parameters()).device
    gen_kwargs: dict[str, Any] = {
        "max_new_tokens": params.get("max_tokens", 256),
        "temperature": params.get("temperature", 0.7),
        "top_p": params.get("top_p", 0.9),
        "top_k": params.get("top_k", 40),
        "repetition_penalty": params.get("repeat_penalty", 1.1),
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id or tokenizer.pad_token_id,
    }

    if model_id == BAGUETTOTRON_ID:
        text_prompt, _stop = _format_prompt_baguettotron(prompt)
        inputs = tokenizer(text_prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
    else:
        inputs_dict, _ = _format_prompt_luth(prompt, tokenizer)
        inputs = {k: v.to(device) for k, v in inputs_dict.items()}

    outputs = model.generate(**inputs, **gen_kwargs)
    input_len = inputs["input_ids"].shape[-1]
    text = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)

    # Post-process: truncate at stop strings for Baguettotron
    if model_id == BAGUETTOTRON_ID:
        for s in ["<|im_end>", "</think>"]:
            if s in text:
                text = text.split(s)[0].strip()

    return model_id, text


def run_all(
    prompt: str,
    params_by_model: dict[str, dict[str, Any]],
    device: str | None = None,
    max_workers: int = 6,
) -> dict[str, str]:
    """
    Load all 6 models in parallel, run all 6 inferences in parallel.
    Returns dict {model_id: text}.
    """
    if device is None:
        device = _get_device()
    default_params = {
        "temperature": 0.7,
        "max_tokens": 256,
        "top_p": 0.9,
        "top_k": 40,
        "repeat_penalty": 1.1,
    }

    def task(model_id: str):
        p = {**default_params, **(params_by_model.get(model_id) or {})}
        return _generate_one(model_id, prompt, p, device)

    results: dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = {ex.submit(task, mid): mid for mid in MODEL_IDS}
        for fut in as_completed(futures):
            model_id, text = fut.result()
            results[model_id] = text

    return results
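A quick way to exercise `run_all` outside the Gradio UI might look like the sketch below. This is not part of the commit; it assumes all six checkpoints can be downloaded and fit in memory, and the prompt is arbitrary. Keys omitted from the per-model dicts fall back to the defaults inside `run_all`.

```python
# Hypothetical smoke test for inference.run_all; illustration only.
from inference import run_all
from model_config import MODEL_IDS

# Small generation budgets so the test stays cheap.
params = {mid: {"max_tokens": 64, "temperature": 0.7} for mid in MODEL_IDS}

outputs = run_all("Explique la photosynthèse en une phrase.", params)
for model_id, text in outputs.items():
    print(f"--- {model_id} ---\n{text}\n")
```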
model_config.py
ADDED
@@ -0,0 +1,143 @@

"""
Model registry for Baguettotron vs Luth comparison app.
All 6 models with footprint data and size tiers for tab grouping.
"""

from dataclasses import dataclass
from typing import Literal

SizeTier = Literal["small", "medium", "large"]


@dataclass
class ModelEntry:
    repo_id: str
    name: str
    author: str
    params: int
    params_display: str
    file_size_mb: int
    vram_estimate_mb: int
    size_tier: SizeTier
    description: str
    architecture: str = "decoder"
    license: str = "apache-2.0"
    model_card_url: str = ""


# Baguettotron: 321M, ~642 MB (BF16)
# Luth models: from HF safetensors metadata where available; else params * 2 bytes
MODELS: list[ModelEntry] = [
    ModelEntry(
        repo_id="PleIAs/Baguettotron",
        name="Baguettotron",
        author="PleIAs",
        params=320_956_992,
        params_display="321M",
        file_size_mb=642,
        vram_estimate_mb=642,
        size_tier="small",
        description="321M generalist reasoning model, SYNTH, 80 layers",
        model_card_url="https://huggingface.co/PleIAs/Baguettotron",
    ),
    ModelEntry(
        repo_id="kurakurai/Luth-LFM2-350M",
        name="Luth-LFM2-350M",
        author="kurakurai",
        params=354_483_968,
        params_display="0.4B",
        file_size_mb=709,
        vram_estimate_mb=709,
        size_tier="small",
        description="French fine-tuned LFM2-350M",
        model_card_url="https://huggingface.co/kurakurai/Luth-LFM2-350M",
    ),
    ModelEntry(
        repo_id="kurakurai/Luth-0.6B-Instruct",
        name="Luth-0.6B-Instruct",
        author="kurakurai",
        params=600_000_000,
        params_display="0.6B",
        file_size_mb=1200,
        vram_estimate_mb=1200,
        size_tier="medium",
        description="Luth 0.6B Instruct",
        model_card_url="https://huggingface.co/kurakurai/Luth-0.6B-Instruct",
    ),
    ModelEntry(
        repo_id="kurakurai/Luth-LFM2-700M",
        name="Luth-LFM2-700M",
        author="kurakurai",
        params=700_000_000,
        params_display="0.7B",
        file_size_mb=1400,
        vram_estimate_mb=1400,
        size_tier="medium",
        description="Luth LFM2 700M",
        model_card_url="https://huggingface.co/kurakurai/Luth-LFM2-700M",
    ),
    ModelEntry(
        repo_id="kurakurai/Luth-LFM2-1.2B",
        name="Luth-LFM2-1.2B",
        author="kurakurai",
        params=1_200_000_000,
        params_display="1.2B",
        file_size_mb=2400,
        vram_estimate_mb=2400,
        size_tier="large",
        description="Luth LFM2 1.2B",
        model_card_url="https://huggingface.co/kurakurai/Luth-LFM2-1.2B",
    ),
    ModelEntry(
        repo_id="kurakurai/Luth-1.7B-Instruct",
        name="Luth-1.7B-Instruct",
        author="kurakurai",
        params=1_700_000_000,
        params_display="1.7B",
        file_size_mb=3400,
        vram_estimate_mb=3400,
        size_tier="large",
        description="Luth 1.7B Instruct",
        model_card_url="https://huggingface.co/kurakurai/Luth-1.7B-Instruct",
    ),
]

# Model IDs for inference (repo_id as key)
MODEL_IDS = [m.repo_id for m in MODELS]

# Group by size tier for tabs
TIER_ORDER: list[SizeTier] = ["small", "medium", "large"]
TIER_LABELS: dict[SizeTier, str] = {
    "small": "~0.3–0.4B (Small)",
    "medium": "~0.6–0.7B (Medium)",
    "large": "~1–2B (Large)",
}


def get_models_by_tier() -> dict[SizeTier, list[ModelEntry]]:
    out: dict[SizeTier, list[ModelEntry]] = {t: [] for t in TIER_ORDER}
    for m in MODELS:
        out[m.size_tier].append(m)
    return out


def get_model_by_id(repo_id: str) -> ModelEntry | None:
    for m in MODELS:
        if m.repo_id == repo_id:
            return m
    return None


def footprint_table_data() -> list[list[str]]:
    """Rows for gr.Dataframe: Model | Params | File size (MB) | Est. VRAM (MB)"""
    return [
        [m.name, m.params_display, str(m.file_size_mb), str(m.vram_estimate_mb)]
        for m in MODELS
    ]


def combined_footprint() -> tuple[int, float]:
    """Total disk (MB) and total VRAM (GB) for all 6 models."""
    total_disk = sum(m.file_size_mb for m in MODELS)
    total_vram_mb = sum(m.vram_estimate_mb for m in MODELS)
    return total_disk, total_vram_mb / 1024
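For reference, the combined-footprint line shown in the app rolls up directly from this registry, and the registry comment above notes that Luth sizes default to params * 2 bytes (BF16). A small sketch of that arithmetic (illustration only, assuming model_config.py is importable):

```python
# Sketch: reproduce the combined footprint and the 2-bytes-per-parameter estimate.
from model_config import MODELS, combined_footprint

total_disk_mb, total_vram_gb = combined_footprint()
print(f"Total disk: {total_disk_mb:,} MB | Total VRAM (est.): {total_vram_gb:.2f} GB")

for m in MODELS:
    approx_mb = m.params * 2 / 1024**2  # BF16: ~2 bytes per parameter
    print(f"{m.name:<22} listed {m.file_size_mb:>5} MB, ~{approx_mb:,.0f} MB at 2 B/param")
```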
requirements.txt
ADDED
@@ -0,0 +1,6 @@

gradio>=4.0
transformers>=4.36
accelerate
safetensors
huggingface_hub
torch