Spaces:

markury
/

midmid3

Running on Zero

App Files Files Community

markury committed on Mar 28

Commit

d171350

0 Parent(s):

Initial commit

Browse files

Files changed (19) hide show

.gitignore +5 -0
README.md +26 -0
app.py +108 -0
convert_checkpoint.py +57 -0
midmid/__init__.py +1 -0
midmid/audio_prep.py +16 -0
midmid/beat_tracker.py +42 -0
midmid/constraints.py +79 -0
midmid/datatypes.py +23 -0
midmid/inference.py +335 -0
midmid/ini_writer.py +32 -0
midmid/midi_writer.py +104 -0
midmid/nn.py +222 -0
midmid/offset.py +29 -0
midmid/sections.py +90 -0
midmid/tempo_map.py +80 -0
pipeline.py +304 -0
requirements.txt +12 -0
visualizer.py +360 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.venv/
+__pycache__/
+*.pyc
+*.egg-info/
+model_upload/

README.md ADDED Viewed

	@@ -0,0 +1,26 @@

+---
+title: Midmid - Guitar Hero Chart Generator
+emoji: 🎸
+colorFrom: purple
+colorTo: yellow
+sdk: gradio
+sdk_version: "5.23.0"
+app_file: app.py
+pinned: true
+license: mit
+hardware: zero-a10g
+---
+# Midmid — AI Guitar Hero Chart Generator
+Upload a song, get a playable Guitar Hero chart. Powered by a 19M-parameter
+masked-prediction transformer trained on thousands of community-charted songs.
+**How it works:**
+1. Upload an audio file (MP3, FLAC, OGG, WAV)
+2. Enter song metadata (title, artist, etc.)
+3. Hit Generate — the model analyzes beats, structure, and audio features,
+   then predicts note placements for all four difficulty levels
+4. Preview the chart in-browser, then download the ready-to-play song package
+The output folder drops straight into GHWT:DE's MODS directory.

app.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""Midmid — AI Guitar Hero Chart Generator (Hugging Face Space)."""
+import os
+import gradio as gr
+# ZeroGPU: import spaces if available (no-op locally)
+try:
+    import spaces
+    ON_ZEROGPU = True
+except ImportError:
+    ON_ZEROGPU = False
+from pipeline import ensure_model, generate_chart
+from visualizer import build_visualizer_html
+# Pre-load model on CPU at startup
+ensure_model()
+PLACEHOLDER_HTML = """
+<div style="font-family: system-ui, sans-serif; background: #111; border-radius: 12px;
+    padding: 60px 20px; text-align: center; color: #666; max-width: 900px; margin: 0 auto;">
+  <div style="font-size: 48px; margin-bottom: 12px;">🎸</div>
+  <div style="font-size: 16px;">Upload a song and hit Generate to see your chart here</div>
+</div>
+"""
def _generate_wrapper(audio_path, title, artist, album, year, genre, progress=gr.Progress()):
    """Validate the form inputs, run chart generation, and render the preview.

    Args:
        audio_path: Path of the uploaded audio file (empty/None if nothing
            was uploaded).
        title: Song title; required.
        artist: Artist name; required.
        album: Optional album name.
        year: Optional release year, as text.
        genre: Optional genre; falls back to "rock" when blank.
        progress: Gradio progress tracker, forwarded to ``generate_chart``
            as ``progress_cb``.

    Returns:
        ``(html, zip_path)``: the visualizer HTML built from the chart JSON
        and the path of the generated zip package.

    Raises:
        gr.Error: If the audio file, title, or artist is missing.
    """
    def _clean(value):
        # Textboxes can deliver None or whitespace-only text.
        return value.strip() if value else ""

    if not audio_path:
        raise gr.Error("Please upload an audio file.")
    title = _clean(title)
    if not title:
        raise gr.Error("Song title is required.")
    artist = _clean(artist)
    if not artist:
        raise gr.Error("Artist name is required.")

    zip_path, chart_json = generate_chart(
        audio_path=audio_path,
        title=title,
        artist=artist,
        album=_clean(album),
        year=_clean(year),
        # A whitespace-only genre must also fall back to the default,
        # otherwise an empty genre is passed downstream.
        genre=_clean(genre) or "rock",
        progress_cb=progress,
    )
    html = build_visualizer_html(chart_json)
    return html, zip_path
+# Apply ZeroGPU decorator if running on HF Spaces
+if ON_ZEROGPU:
+    _generate_wrapper = spaces.GPU(duration=180)(_generate_wrapper)
+# --- UI ---
+with gr.Blocks(
+    title="Midmid — Guitar Hero Chart Generator",
+    theme=gr.themes.Base(primary_hue="purple", neutral_hue="gray"),
+    css="""
+    .gradio-container { max-width: 960px !important; }
+    #generate-btn { min-height: 48px; font-size: 16px; }
+    """,
+) as demo:
+    gr.Markdown(
+        "# Midmid — AI Guitar Hero Chart Generator\n"
+        "Upload a song, get a playable chart with 4 difficulty levels. "
+        "Preview it here, then download the GHWT:DE-ready package."
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                label="Upload audio",
+                type="filepath",
+                sources=["upload"],
+            )
+            title_input = gr.Textbox(label="Song title *", placeholder="e.g. Through the Fire and Flames")
+            artist_input = gr.Textbox(label="Artist *", placeholder="e.g. DragonForce")
+            with gr.Row():
+                album_input = gr.Textbox(label="Album", placeholder="(optional)")
+                year_input = gr.Textbox(label="Year", placeholder="(optional)")
+            genre_input = gr.Textbox(label="Genre", placeholder="rock", value="rock")
+            generate_btn = gr.Button("Generate Chart", variant="primary", elem_id="generate-btn")
+        with gr.Column(scale=2):
+            viz_output = gr.HTML(value=PLACEHOLDER_HTML, label="Chart Preview")
+            zip_output = gr.File(label="Download song package (.zip)")
+    generate_btn.click(
+        fn=_generate_wrapper,
+        inputs=[audio_input, title_input, artist_input, album_input, year_input, genre_input],
+        outputs=[viz_output, zip_output],
+    )
+    gr.Markdown(
+        "---\n"
+        "*Charts generated by [Midmid](https://github.com/markury/midmid) — "
+        "a 19M-parameter masked transformer trained on community Guitar Hero charts. "
+        "Model: `markury/midmid3-19m-0326`*"
+    )
+if __name__ == "__main__":
+    demo.launch()

convert_checkpoint.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""Convert a midmid PyTorch checkpoint to safetensors + config.json.
+Usage:
+    python convert_checkpoint.py path/to/best.pt --output-dir ./model_upload
+This produces:
+    model_upload/model.safetensors   (weights only, no pickle)
+    model_upload/config.json         (model hyperparameters)
+Then upload to HF:
+    huggingface-cli upload markury/midmid3-19m-0326 ./model_upload
+"""
+import argparse
+import json
+from pathlib import Path
+import torch
+from safetensors.torch import save_file
def main(argv=None):
    """Convert a ``.pt`` checkpoint into ``config.json`` + ``model.safetensors``.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None,
            so the command-line behaviour is unchanged.

    The checkpoint is expected to be a dict holding ``"config"`` (a
    JSON-serializable mapping) and ``"model_state_dict"``; the parser exits
    with an error message if either key is missing.
    """
    parser = argparse.ArgumentParser(description="Convert midmid checkpoint to safetensors")
    parser.add_argument("checkpoint", type=Path, help="Path to .pt checkpoint")
    parser.add_argument("--output-dir", type=Path, default=Path("model_upload"),
                        help="Output directory (default: ./model_upload)")
    args = parser.parse_args(argv)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading checkpoint: {args.checkpoint}")
    # SECURITY: weights_only=False unpickles arbitrary objects. Only run this
    # on checkpoints you produced yourself or otherwise fully trust.
    ckpt = torch.load(args.checkpoint, map_location="cpu", weights_only=False)

    missing = [key for key in ("config", "model_state_dict") if key not in ckpt]
    if missing:
        parser.error(f"checkpoint is missing required key(s): {', '.join(missing)}")

    # Save config
    config = ckpt["config"]
    config_path = args.output_dir / "config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    print(f"Config saved: {config_path}")
    print(f"  {json.dumps(config, indent=2)}")

    # Save weights as safetensors. save_file rejects non-contiguous tensors,
    # so normalise the memory layout first (a no-op for contiguous tensors).
    state_dict = {name: tensor.contiguous()
                  for name, tensor in ckpt["model_state_dict"].items()}
    safetensors_path = args.output_dir / "model.safetensors"
    save_file(state_dict, str(safetensors_path))
    print(f"Weights saved: {safetensors_path}")

    # Summary
    n_params = sum(p.numel() for p in state_dict.values())
    print(f"  {n_params:,} parameters ({n_params / 1e6:.1f}M)")
    print("\nUpload to HF with:")
    print(f"  huggingface-cli upload markury/midmid3-19m-0326 {args.output_dir}")
+if __name__ == "__main__":
+    main()

midmid/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Midmid — Guitar Hero chart generation core

midmid/audio_prep.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""Audio preparation — silence prepend, format conversion."""
+from pydub import AudioSegment
def prepare_audio(
    audio_path: str,
    output_path: str,
    silence_duration_sec: float = 3.0,
    output_format: str = "ogg",
) -> None:
    """Write a copy of the audio with leading silence in the target format.

    Args:
        audio_path: Source audio file, decoded via ``AudioSegment.from_file``.
        output_path: Destination file path.
        silence_duration_sec: Length of the silent lead-in, in seconds
            (truncated to whole milliseconds).
        output_format: Format name handed to ``AudioSegment.export``.
    """
    song = AudioSegment.from_file(audio_path)
    lead_in_ms = int(silence_duration_sec * 1000)
    lead_in = AudioSegment.silent(duration=lead_in_ms)
    (lead_in + song).export(output_path, format=output_format)

midmid/beat_tracker.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""Beat and downbeat tracking via beat_this (CPJKU)."""
+from dataclasses import dataclass
+import numpy as np
+from beat_this.inference import File2Beats
@dataclass
class BeatData:
    """Result of beat tracking for one audio file."""

    # Beat positions as returned by the tracker.
    beats: np.ndarray
    # Downbeat positions as returned by the tracker.
    downbeats: np.ndarray
    # Per-beat counter that restarts at 1 on every downbeat.
    beat_numbers: np.ndarray
def track_beats(audio_path: str, device: str = "cuda") -> BeatData:
    """Track beats and downbeats in an audio file with beat_this.

    Args:
        audio_path: Path of the audio file to analyse.
        device: Device string handed to ``File2Beats``.

    Returns:
        A ``BeatData`` holding the beat and downbeat arrays plus the
        within-bar beat numbers derived from them.
    """
    tracker = File2Beats(checkpoint_path="final0", device=device)
    beat_times, downbeat_times = tracker(audio_path)
    return BeatData(
        beats=np.asarray(beat_times),
        downbeats=np.asarray(downbeat_times),
        beat_numbers=_assign_beat_numbers(beat_times, downbeat_times),
    )
def _assign_beat_numbers(beats: np.ndarray, downbeats: np.ndarray) -> np.ndarray:
    """Number each beat within its bar, restarting at 1 on every downbeat.

    A beat counts as a downbeat when its time, rounded to 6 decimals, equals
    a rounded downbeat time. Beats before the first downbeat are counted
    upward from 1.

    Args:
        beats: Beat times.
        downbeats: Downbeat times.

    Returns:
        Integer array with one beat number per entry of ``beats``.
    """
    beat_times = np.asarray(beats)
    # Round both sides the same way (Python float + round). Mixing np.round
    # with round(float(...)) can yield different keys for the same time,
    # e.g. for float32 input, so the downbeat lookup would silently miss.
    downbeat_keys = {round(float(t), 6) for t in np.asarray(downbeats).ravel()}

    numbers = np.zeros(len(beat_times), dtype=int)
    current = 0
    for idx, time in enumerate(beat_times):
        current = 1 if round(float(time), 6) in downbeat_keys else current + 1
        numbers[idx] = current
    return numbers

midmid/constraints.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""Difficulty constraints — enforce per-difficulty fret and chord rules."""
+from midmid.datatypes import NoteEvent
+ALLOWED_FRETS = {
+    "easy":   {0, 1, 2},
+    "medium": {0, 1, 2, 3},
+    "hard":   {0, 1, 2, 3, 4},
+    "expert": {0, 1, 2, 3, 4},
+}
+MAX_CHORD_SIZE = {
+    "easy":   1,
+    "medium": 2,
+    "hard":   3,
+    "expert": 5,
+}
+MIN_NOTE_SPACING = {
+    "easy":   192,
+    "medium": 96,
+    "hard":   48,
+    "expert": 0,
+}
def enforce_constraints(
    notes: list[NoteEvent], difficulty: str, resolution: int = 192,
) -> list[NoteEvent]:
    """Filter and adjust notes so they obey the rules of one difficulty.

    For each input note, in order:
      1. Keep only frets allowed for the difficulty; if none survive, the
         lowest original fret is replaced by the nearest allowed fret.
      2. Truncate chords to the lowest ``MAX_CHORD_SIZE`` frets.
      3. Drop the note if it is closer than ``MIN_NOTE_SPACING`` ticks to the
         previously kept note, or starts inside that note's sustain.
      4. After keeping a note, shorten the previous note's sustain so it ends
         at least one sixteenth before the new note, snapped down to a
         sixteenth multiple (sustains shorter than a sixteenth become 0).

    Args:
        notes: Notes to process; iterated in the order given.
        difficulty: Key into the constraint tables; unknown names fall back
            to all five frets, chords of up to 5 and no spacing limit.
        resolution: Ticks per quarter note; a sixteenth is ``resolution // 4``.

    Returns:
        A new list of ``NoteEvent`` objects; the input notes are not modified.
    """
    allowed = ALLOWED_FRETS.get(difficulty, {0, 1, 2, 3, 4})
    max_chord = MAX_CHORD_SIZE.get(difficulty, 5)
    min_spacing = MIN_NOTE_SPACING.get(difficulty, 0)
    sixteenth = resolution // 4

    kept = []
    for note in notes:
        frets = note.fret_set & allowed
        if not frets and note.fret_set:
            # Nothing playable: remap the lowest fret to its nearest allowed one.
            lowest = min(note.fret_set)
            frets = {min(allowed, key=lambda candidate: abs(candidate - lowest))}
        if not frets:
            continue
        if len(frets) > max_chord:
            frets = set(sorted(frets)[:max_chord])

        if kept:
            last = kept[-1]
            if min_spacing > 0 and note.tick - last.tick < min_spacing:
                continue
            if last.sustain_ticks > 0 and note.tick < last.tick + last.sustain_ticks:
                continue

        kept.append(NoteEvent(
            tick=note.tick,
            fret_set=frets,
            sustain_ticks=note.sustain_ticks,
            is_hopo=note.is_hopo,
        ))

        if len(kept) < 2:
            continue
        held = kept[-2]
        if held.sustain_ticks <= 0:
            continue
        # Leave a sixteenth of air before the new note, on the sixteenth grid.
        ceiling = ((note.tick - held.tick - sixteenth) // sixteenth) * sixteenth
        if ceiling < sixteenth:
            ceiling = 0
        if held.sustain_ticks > ceiling:
            kept[-2] = NoteEvent(
                tick=held.tick,
                fret_set=held.fret_set,
                sustain_ticks=ceiling,
                is_hopo=held.is_hopo,
            )
    return kept

midmid/datatypes.py ADDED Viewed

	@@ -0,0 +1,23 @@

+"""Shared data types used across the pipeline."""
+from dataclasses import dataclass, field
@dataclass
class NoteEvent:
    """One note or chord placed at a tick position."""

    # Position of the note.
    tick: int
    # Frets sounding together: {0, 1, 2, 3, 4} where 0=Green, 4=Orange.
    fret_set: set[int]
    # Sustain length; 0 means no sustain.
    sustain_ticks: int = 0
    # Whether the note carries the HOPO marker.
    is_hopo: bool = False
@dataclass
class ChartData:
    """Everything needed to serialize a chart to MIDI."""

    # Ticks per quarter note.
    resolution: int = 192
    # [(tick, bpm), ...]; defaults to 120 BPM from tick 0.
    tempo_events: list = field(default_factory=lambda: [(0, 120.0)])
    # [(tick, numerator, denominator), ...]; defaults to 4/4 from tick 0.
    time_signatures: list = field(default_factory=lambda: [(0, 4, 4)])
    # [(tick, label), ...]
    sections: list = field(default_factory=list)
    # {"expert": [NoteEvent, ...], ...}
    notes: dict = field(default_factory=dict)
    # [(tick, is_downbeat), ...]
    beats: list = field(default_factory=list)

midmid/inference.py ADDED Viewed

	@@ -0,0 +1,335 @@

+"""Audio encoding and iterative unmasking inference.
+Adapted from midmid/prediction/model.py for standalone use.
+Device management is caller-controlled (for ZeroGPU compatibility).
+"""
+import itertools as _it
+import json
+import math
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import torch
+from midmid.nn import (
+    ChartMaskPredictor, ChartMaskPredictorConfig,
+    MASK_TOKEN, SILENCE_TOKEN,
+)
+from midmid.datatypes import NoteEvent
+MERT_MODEL_ID = "m-a-p/MERT-v1-95M"
+DIFF_ID = {"easy": 0, "medium": 1, "hard": 2, "expert": 3}
+# Class ID -> fret tuple
+_CLASS_TO_FRETS: list[tuple[int, ...]] = []
+for _r in range(1, 6):
+    _CLASS_TO_FRETS.extend(_it.combinations(range(5), _r))
+_CLASS_TO_FRETS.append((7,))  # class 31 = open
+# Sustain bucket center values in beats
+_BUCKET_BEATS = [0.0, 1.0, 2.0, 4.0, 8.0, 16.0]
+# ---------------------------------------------------------------------------
+# Model loading (safetensors from HF Hub)
+# ---------------------------------------------------------------------------
def load_model_from_hub(
    repo_id: str = "markury/midmid3-19m-0326",
    device: str = "cpu",
) -> ChartMaskPredictor:
    """Fetch ``config.json`` and ``model.safetensors`` from the Hub and build the model.

    Args:
        repo_id: Hugging Face repository to download from.
        device: Device the weights are loaded onto and the model is moved to.

    Returns:
        The model with weights loaded, on ``device``, in eval mode.
    """
    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file

    config_file = hf_hub_download(repo_id, "config.json")
    weights_file = hf_hub_download(repo_id, "model.safetensors")

    with open(config_file) as handle:
        hyperparams = json.load(handle)

    model = ChartMaskPredictor(ChartMaskPredictorConfig(**hyperparams))
    model.load_state_dict(load_file(weights_file, device=device))
    model.to(device)
    model.eval()
    return model
+# ---------------------------------------------------------------------------
+# MERT audio encoding (lazy-loaded)
+# ---------------------------------------------------------------------------
+_mert_model = None
+_mert_processor = None
+_mert_frame_rate = None
def _ensure_mert(device: torch.device):
    """Make sure the cached MERT model and processor exist and sit on ``device``.

    On first use this loads the processor and model, then measures the frame
    rate as the number of output frames produced for exactly one second of
    silence and stores it in ``_mert_frame_rate``. On later calls it only
    moves the cached model when its parameters are on a different device.
    """
    global _mert_model, _mert_processor, _mert_frame_rate

    if _mert_model is not None:
        current = next(_mert_model.parameters()).device
        if current != device:
            _mert_model.to(device)
        return

    from transformers import AutoModel, Wav2Vec2FeatureExtractor

    print(f"Loading MERT ({MERT_MODEL_ID}) ...")
    _mert_processor = Wav2Vec2FeatureExtractor.from_pretrained(
        MERT_MODEL_ID, trust_remote_code=True,
    )
    _mert_model = AutoModel.from_pretrained(MERT_MODEL_ID, trust_remote_code=True)
    _mert_model.to(device)
    _mert_model.eval()

    # Probe with one second of zeros: frames-out per second-in is the rate.
    rate = _mert_processor.sampling_rate
    probe = _mert_processor(
        np.zeros(rate, dtype=np.float32), sampling_rate=rate, return_tensors="pt",
    )
    probe = {name: tensor.to(device) for name, tensor in probe.items()}
    with torch.no_grad():
        result = _mert_model(**probe, output_hidden_states=False)
    _mert_frame_rate = float(result.last_hidden_state.shape[1])
    print(f"  MERT frame rate: {_mert_frame_rate:.2f} Hz")
def move_models_to_device(device: torch.device):
    """Move the cached MERT model to ``device`` (for ZeroGPU).

    Does nothing when MERT has not been loaded yet.
    """
    if _mert_model is None:
        return
    _mert_model.to(device)
def _chunk_bounds(total_samples: int, chunk_samples: int, stride_samples: int):
    """Return ``[(start, end), ...]`` sample spans of overlapping chunks.

    Chunks start every ``stride_samples`` and are at most ``chunk_samples``
    long. The list ends with the first chunk that reaches ``total_samples``,
    so the tail of the audio is never covered by a second, redundant chunk.

    Raises:
        ValueError: If ``stride_samples`` is not positive.
    """
    if stride_samples <= 0:
        raise ValueError("chunk length must exceed the overlap")
    bounds = []
    start = 0
    while start < total_samples:
        end = min(start + chunk_samples, total_samples)
        bounds.append((start, end))
        if end >= total_samples:
            break  # this chunk already reaches the end of the audio
        start += stride_samples
    return bounds


@torch.no_grad()
def encode_audio_mert(
    audio_path: str,
    device: torch.device,
    chunk_sec: float = 60.0,
) -> tuple[torch.Tensor, float]:
    """Encode an audio file with MERT.

    Audio no longer than ``chunk_sec`` is encoded in one pass. Longer audio
    is split into chunks overlapping by 5 seconds; half of each overlap is
    trimmed from the adjoining chunk edges before the pieces are
    concatenated.

    Args:
        audio_path: Audio file, loaded mono at the processor's sampling rate.
        device: Device MERT runs on.
        chunk_sec: Chunk length in seconds; must be longer than the overlap.

    Returns:
        ``(embeddings, frame_rate)``: the last hidden state on CPU with the
        batch dimension removed, and the cached MERT frame rate.

    Raises:
        ValueError: If ``chunk_sec`` does not exceed the 5 second overlap
            and the audio needs chunking.
    """
    import librosa

    _ensure_mert(device)
    sr = _mert_processor.sampling_rate
    wav, _ = librosa.load(audio_path, sr=sr, mono=True)

    chunk_samples = int(chunk_sec * sr)
    overlap_sec = 5.0
    overlap_samples = int(overlap_sec * sr)
    stride_samples = chunk_samples - overlap_samples

    if len(wav) <= chunk_samples:
        inputs = _mert_processor(wav, sampling_rate=sr, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        out = _mert_model(**inputs, output_hidden_states=False)
        return out.last_hidden_state.squeeze(0).cpu(), _mert_frame_rate

    # Chunked processing for long audio
    all_emb = []
    min_len = chunk_samples // 4
    spans = _chunk_bounds(len(wav), chunk_samples, stride_samples)
    for idx, (pos, end) in enumerate(spans):
        chunk = wav[pos:end]
        if len(chunk) < min_len:
            # Pad very short tails with zeros up to a quarter chunk.
            chunk = np.pad(chunk, (0, min_len - len(chunk)))
        inputs = _mert_processor(chunk, sampling_rate=sr, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        out = _mert_model(**inputs, output_hidden_states=False)
        emb = out.last_hidden_state.squeeze(0)

        n = emb.shape[0]
        fps = n / (len(chunk) / sr)
        half_overlap = int(round((overlap_sec / 2) * fps))
        is_last = end >= len(wav)

        if idx == 0:
            keep = n if is_last else n - half_overlap
            all_emb.append(emb[:keep].cpu())
        elif is_last:
            all_emb.append(emb[half_overlap:].cpu())
        else:
            keep = int(round((len(chunk) / sr - overlap_sec) * fps))
            all_emb.append(emb[half_overlap:half_overlap + keep].cpu())

    return torch.cat(all_emb, dim=0), _mert_frame_rate
+# ---------------------------------------------------------------------------
+# Grid helpers
+# ---------------------------------------------------------------------------
def _build_16th_grid(fretbars):
    """Subdivide consecutive beat times into four equal steps each.

    Every interval between neighbouring beats contributes its start plus
    three evenly spaced interior points; the final beat is appended once at
    the end. With fewer than two beats the input is returned as a list.
    """
    if len(fretbars) < 2:
        return list(fretbars)
    grid = [
        left + step * (right - left) / 4.0
        for left, right in zip(fretbars, fretbars[1:])
        for step in range(4)
    ]
    grid.append(fretbars[-1])
    return grid
def _get_local_beat_ms(grid_idx, fretbars):
    """Return the length of the beat interval containing a grid position.

    The beat index is ``grid_idx // 4``, clamped to the last available
    interval. When there is no interval at all (fewer than two beats),
    500.0 is returned.
    """
    last_interval = len(fretbars) - 2
    beat = max(0, min(grid_idx // 4, last_interval))
    if beat + 1 >= len(fretbars):
        return 500.0
    return fretbars[beat + 1] - fretbars[beat]
+# ---------------------------------------------------------------------------
+# Main inference
+# ---------------------------------------------------------------------------
@torch.no_grad()
def predict_notes(
    audio_path: str,
    model: ChartMaskPredictor,
    beat_times: list[float],
    difficulty: str = "expert",
    device: torch.device = None,
    num_steps: int = 12,
    temperature: float = 0.9,
) -> list[NoteEvent]:
    """MaskGIT-style iterative unmasking inference.

    Builds a 16th-note grid from the beat times, samples MERT embeddings
    (plus onset/RMS/centroid features when the model's ``audio_dim`` is
    larger than the embedding width) at every grid position, then fills a
    fully masked token sequence over ``num_steps`` rounds following a cosine
    schedule. A final forward pass provides the sustain predictions.

    Args:
        audio_path: Audio file to chart.
        model: Trained mask predictor; moved to ``device`` and set to eval.
        beat_times: Beat positions in seconds.
        difficulty: Difficulty name; unknown names are treated as expert.
        device: Torch device; defaults to CUDA when available, else CPU.
        num_steps: Number of unmasking rounds.
        temperature: Softmax temperature used when sampling tokens.

    Returns:
        One ``NoteEvent`` per grid position holding a note token. Note that
        ``tick`` is the index into the 16th-note grid, and ``sustain_ticks``
        is the sustain bucket (in beats) times the local beat length in
        milliseconds. NOTE(review): presumably the caller converts both to
        MIDI ticks; confirm against the pipeline.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dev = device
    model.to(dev)
    model.eval()

    fretbars = [t * 1000.0 for t in beat_times]
    if len(fretbars) < 2:
        return []

    # MERT embeddings
    embeddings, frame_rate = encode_audio_mert(audio_path, dev)

    # Build grid and sample MERT frames with windowing
    grid_times = _build_16th_grid(fretbars)
    num_positions = len(grid_times)
    max_frame = embeddings.shape[0] - 1
    frame_indices = torch.tensor(
        [min(int(round(t / 1000.0 * frame_rate)), max_frame)
         for t in grid_times], dtype=torch.long,
    )
    window = 2
    if window > 0 and max_frame >= window * 2:
        # Average each grid frame with its +/- `window` neighbours; edges
        # are handled by replicate-padding the embedding sequence.
        padded = torch.nn.functional.pad(
            embeddings.unsqueeze(0), (0, 0, window, window), mode="replicate",
        ).squeeze(0)
        shifted = frame_indices + window
        stacked = torch.stack(
            [padded[shifted + d] for d in range(-window, window + 1)], dim=0,
        )
        grid_emb = stacked.mean(dim=0)
    else:
        grid_emb = embeddings[frame_indices]

    # Compute and concat audio features if model expects them
    if model.config.audio_dim > grid_emb.shape[-1]:
        import librosa as _lr
        wav, _ = _lr.load(audio_path, sr=24000, mono=True)
        hop = 320
        onset = _lr.onset.onset_strength(y=wav, sr=24000, hop_length=hop)
        rms_arr = _lr.feature.rms(y=wav, hop_length=hop)[0]
        centroid = _lr.feature.spectral_centroid(y=wav, sr=24000, hop_length=hop)[0]

        def _norm(x):
            # Min-max scale to [0, 1]; the floor avoids dividing by zero.
            mn, mx = x.min(), x.max()
            return (x - mn) / max(mx - mn, 1e-8)

        onset, rms_arr, centroid = _norm(onset), _norm(rms_arr), _norm(centroid)
        af_rate = 24000 / hop
        af_max = len(onset) - 1
        af_indices = [min(int(round(t / 1000.0 * af_rate)), af_max) for t in grid_times]
        af_tensor = torch.tensor(
            [[onset[i], rms_arr[i], centroid[i]] for i in af_indices],
            dtype=torch.float32,
        )
        grid_emb = torch.cat([grid_emb, af_tensor], dim=-1)

    audio_features = grid_emb.unsqueeze(0).to(dev)
    diff_id = DIFF_ID.get(difficulty, 3)
    diff_tensor = torch.tensor([diff_id], dtype=torch.long, device=dev)
    padding_mask = torch.ones(1, num_positions, dtype=torch.bool, device=dev)

    # Start fully masked
    chart_tokens = torch.full(
        (1, num_positions), MASK_TOKEN, dtype=torch.long, device=dev,
    )

    # Cosine unmasking schedule
    schedule = []
    for step in range(num_steps):
        r_prev = math.cos(math.pi / 2 * step / num_steps)
        r_next = math.cos(math.pi / 2 * (step + 1) / num_steps)
        n_unmask = max(1, int((r_prev - r_next) * num_positions))
        schedule.append(n_unmask)

    # Iterative unmasking
    for step in range(num_steps):
        outputs = model(audio_features, chart_tokens, diff_tensor, padding_mask)
        token_logits = outputs["token_logits"].squeeze(0)

        is_masked = (chart_tokens.squeeze(0) == MASK_TOKEN)
        masked_indices = is_masked.nonzero(as_tuple=True)[0]
        if len(masked_indices) == 0:
            break

        probs = torch.softmax(token_logits / temperature, dim=-1)
        sampled = torch.multinomial(probs, num_samples=1).squeeze(-1)

        if step == num_steps - 1:
            # int() truncation in the schedule can leave a few positions
            # unscheduled; the last round must resolve every remaining mask
            # so none survive into the final sustain pass.
            n_unmask = len(masked_indices)
        else:
            n_unmask = min(schedule[step], len(masked_indices))
        perm = torch.randperm(len(masked_indices), device=dev)
        unmask_idx = masked_indices[perm[:n_unmask]]
        chart_tokens[0, unmask_idx] = sampled[unmask_idx]

    # Final pass for sustain predictions
    outputs = model(audio_features, chart_tokens, diff_tensor, padding_mask)
    # Pull results to CPU once instead of syncing per element in the loop.
    sustain_prob = outputs["sustain_logits"].squeeze(0).squeeze(-1).sigmoid().cpu()
    dur_pred = outputs["duration_logits"].squeeze(0).argmax(dim=-1).cpu()

    # Convert tokens to NoteEvents
    tokens = chart_tokens.squeeze(0).cpu().tolist()
    notes = []
    for i, tok in enumerate(tokens):
        if tok >= SILENCE_TOKEN or tok < 0:
            continue
        fret_set = set(_CLASS_TO_FRETS[tok])
        if not fret_set:
            continue

        sustain_ticks = 0
        if sustain_prob[i] >= 0.5:
            bucket = dur_pred[i].item()
            beat_ms = _get_local_beat_ms(i, fretbars)
            sustain_ticks = _BUCKET_BEATS[bucket] * beat_ms

        notes.append(NoteEvent(
            tick=i,
            fret_set=fret_set,
            sustain_ticks=sustain_ticks,
        ))
    return notes

midmid/ini_writer.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Generate song.ini metadata for GHWT:DE."""
+from pathlib import Path
def write_ini(
    output_path: str,
    title: str = "Unknown Song",
    artist: str = "Unknown Artist",
    album: str = "",
    genre: str = "rock",
    year: str = "2024",
    charter: str = "Midmid",
    diff_guitar: int = 0,
    preview_start_time: int = 30000,
    song_length: int = 0,
) -> None:
    """Write a ``song.ini`` file with a single ``[Song]`` section.

    Args:
        output_path: Destination file path (written as UTF-8).
        title, artist, album, genre, year, charter: Text metadata. Line
            breaks inside a value are replaced by spaces so that a value can
            never spill into additional INI lines.
        diff_guitar: Value written to the ``diff_guitar`` key.
        preview_start_time: Value written to the ``preview_start_time`` key.
        song_length: Written as ``song_length`` only when greater than 0.
    """
    def _one_line(value) -> str:
        # Free-text metadata comes from user input; keep each value on a
        # single line so it cannot inject extra keys or sections.
        return " ".join(str(value).splitlines())

    lines = [
        "[Song]",
        f"name = {_one_line(title)}",
        f"artist = {_one_line(artist)}",
        f"album = {_one_line(album)}",
        f"genre = {_one_line(genre)}",
        f"year = {_one_line(year)}",
        f"charter = {_one_line(charter)}",
        f"diff_guitar = {diff_guitar}",
        f"preview_start_time = {preview_start_time}",
    ]
    if song_length > 0:
        lines.append(f"song_length = {song_length}")
    Path(output_path).write_text("\n".join(lines) + "\n", encoding="utf-8")

midmid/midi_writer.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""MIDI serialization: write ChartData to a GH-format .mid file."""
+import mido
+from midmid.datatypes import ChartData, NoteEvent
+DIFFICULTY_OFFSETS = {"easy": 60, "medium": 72, "hard": 84, "expert": 96}
+HOPO_NOTE = {"easy": 65, "medium": 77, "hard": 89, "expert": 101}
+NOTE_VELOCITY = 100
def write_midi(chart: ChartData, output_path: str) -> None:
    """Serialize a chart to a MIDI file.

    Tracks are written in this order: tempo/time-signature, EVENTS,
    PART GUITAR, and, only when the chart has beats, BEAT.
    """
    midi_file = mido.MidiFile(ticks_per_beat=chart.resolution)
    builders = [_build_tempo_track, _build_events_track, _build_guitar_track]
    for build in builders:
        midi_file.tracks.append(build(chart))
    if chart.beats:
        midi_file.tracks.append(_build_beat_track(chart))
    midi_file.save(output_path)
def _build_tempo_track(chart):
    """Build the track holding tempo changes and time signatures."""
    tempo_changes = [
        (tick, mido.MetaMessage("set_tempo", tempo=mido.bpm2tempo(bpm), time=0))
        for tick, bpm in chart.tempo_events
    ]
    signatures = [
        (tick, mido.MetaMessage(
            "time_signature", numerator=num, denominator=den, time=0))
        for tick, num, den in chart.time_signatures
    ]
    track = mido.MidiTrack()
    # Tempo events are listed first so they precede a signature on the same tick.
    _write_sorted_events(track, tempo_changes + signatures)
    return track
def _build_events_track(chart):
    """Build the EVENTS track with one ``[section <label>]`` text event per section."""
    track = mido.MidiTrack()
    track.append(mido.MetaMessage("track_name", name="EVENTS", time=0))
    markers = [
        (tick, mido.MetaMessage("text", text=f"[section {label}]", time=0))
        for tick, label in chart.sections
    ]
    _write_sorted_events(track, markers)
    return track
def _build_guitar_track(chart):
    """Build the PART GUITAR track for every difficulty present in the chart.

    Each fret becomes a note-on/note-off pair at pitch
    ``DIFFICULTY_OFFSETS[difficulty] + fret``; the note-off lands at least
    one tick after the note-on. HOPO notes additionally get a one-tick
    marker note at ``HOPO_NOTE[difficulty]``.
    """
    track = mido.MidiTrack()
    track.append(mido.MetaMessage("track_name", name="PART GUITAR", time=0))

    timeline = []

    def add_note(start, pitch, length):
        # Queue a note-on followed by its matching note-off.
        timeline.append((start, mido.Message(
            "note_on", note=pitch, velocity=NOTE_VELOCITY, time=0)))
        timeline.append((start + length, mido.Message(
            "note_off", note=pitch, velocity=0, time=0)))

    for difficulty, base_pitch in DIFFICULTY_OFFSETS.items():
        if difficulty not in chart.notes:
            continue
        for note in chart.notes[difficulty]:
            for fret in note.fret_set:
                add_note(note.tick, base_pitch + fret, max(note.sustain_ticks, 1))
            if note.is_hopo:
                add_note(note.tick, HOPO_NOTE[difficulty], 1)

    _write_sorted_events(track, timeline)
    return track
def _build_beat_track(chart):
    """Build the BEAT track: note 12 on downbeats, note 13 on other beats.

    Every beat is a one-tick note-on/note-off pair.
    """
    track = mido.MidiTrack()
    track.append(mido.MetaMessage("track_name", name="BEAT", time=0))

    timeline = []
    for tick, is_downbeat in chart.beats:
        pitch = 12 if is_downbeat else 13
        timeline.append((tick, mido.Message(
            "note_on", note=pitch, velocity=NOTE_VELOCITY, time=0)))
        timeline.append((tick + 1, mido.Message(
            "note_off", note=pitch, velocity=0, time=0)))

    _write_sorted_events(track, timeline)
    return track
def _write_sorted_events(track, events):
    """Append ``(absolute_tick, message)`` pairs to a track as delta-timed messages.

    ``events`` is sorted in place by tick (stable, so same-tick events keep
    their insertion order). Each message's ``time`` is overwritten with the
    tick distance from the previous message.
    """
    events.sort(key=lambda pair: pair[0])
    cursor = 0
    for tick, message in events:
        message.time, cursor = tick - cursor, tick
        track.append(message)

midmid/nn.py ADDED Viewed

	@@ -0,0 +1,222 @@

+"""Chart prediction model architecture.
+FiLM-conditioned masked transformer for Guitar Hero chart generation.
+"""
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# ---------------------------------------------------------------------------
+# Utility layers
+# ---------------------------------------------------------------------------
def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0):
    """Clamped SwiGLU over interleaved channels.

    Even-indexed channels of the last dimension form the gate (clamped from
    above at ``limit``); odd-indexed channels form the linear branch (clamped
    to ``[-limit, limit]``). The result is
    ``gate * sigmoid(alpha * gate) * (linear + 1)`` and has half the input's
    last-dimension size.
    """
    gate = x[..., 0::2].clamp(max=limit)
    linear = x[..., 1::2].clamp(min=-limit, max=limit)
    activated = gate * torch.sigmoid(alpha * gate)
    return activated * (linear + 1)
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learned scale."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize in float32, apply ``scale``, and cast back to the input dtype."""
        x32 = x.float()
        inv_rms = torch.rsqrt(x32.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        normalized = x32 * inv_rms
        return (normalized * self.scale).to(x.dtype)
class FeedForward(nn.Module):
    """Bias-free feed-forward block with a SwiGLU activation.

    ``linear1`` expands to ``d_ff`` channels; ``swiglu`` halves that, so
    ``linear_out`` maps ``d_ff // 2`` channels back to ``d_model``.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff, bias=False)
        self.linear_out = nn.Linear(d_ff // 2, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Expand, activate, apply dropout, and project back to ``d_model``."""
        hidden = swiglu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear_out(hidden)
+# ---------------------------------------------------------------------------
+# Rotary position embeddings
+# ---------------------------------------------------------------------------
def apply_rotary_emb(
    x: torch.Tensor, dim: int, base: float = 10000.0,
) -> torch.Tensor:
    """Apply RoPE to a tensor of shape [B, heads, T, head_dim].

    The first ``dim`` channels are treated as two halves that are rotated
    against each other by a position-dependent angle; frequencies are
    ``base ** (-2i / dim)``. Angles are computed in the dtype of ``x``.
    Only the rotated ``dim`` channels are returned.
    """
    n_positions = x.size(2)
    exponents = torch.arange(0, dim, 2, device=x.device, dtype=x.dtype) / dim
    inv_freq = base ** (-exponents)
    positions = torch.arange(n_positions, device=x.device, dtype=x.dtype)

    # [T, dim/2] angle table, broadcast over batch and heads.
    angles = positions[:, None] * inv_freq[None, :]
    sin = angles.sin()[None, None]
    cos = angles.cos()[None, None]

    half = dim // 2
    first, second = x[..., :half], x[..., half:dim]
    rotated_first = first * cos - second * sin
    rotated_second = first * sin + second * cos
    return torch.cat([rotated_first, rotated_second], dim=-1)
+# ---------------------------------------------------------------------------
+# Bidirectional multi-head self-attention
+# ---------------------------------------------------------------------------
class BidirectionalAttention(nn.Module):
    """Non-causal multi-head self-attention with rotary position embeddings.

    All projections are bias-free. Queries and keys are rotated with RoPE
    before ``scaled_dot_product_attention``; attention dropout is active
    only in training mode.
    """

    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1,
                 rope_base: float = 10000.0):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.rope_base = rope_base
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor,
                attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Attend over the sequence dimension of ``x`` ([B, T, d_model]).

        ``attn_mask``, when given, is cast to bool and broadcast over heads
        and query positions as a per-key mask.
        NOTE(review): presumably shaped [B, T] with True marking valid
        positions; confirm against the caller.
        """
        batch, seq_len, _ = x.shape

        def to_heads(projection):
            # [B, T, d_model] -> [B, heads, T, d_k]
            projected = projection(x).view(batch, seq_len, self.n_heads, self.d_k)
            return projected.transpose(1, 2)

        queries = apply_rotary_emb(to_heads(self.w_q), dim=self.d_k, base=self.rope_base)
        keys = apply_rotary_emb(to_heads(self.w_k), dim=self.d_k, base=self.rope_base)
        values = to_heads(self.w_v)

        key_mask = None if attn_mask is None else attn_mask[:, None, None, :].bool()
        drop_p = self.dropout.p if self.training else 0.0
        attended = F.scaled_dot_product_attention(
            queries, keys, values, attn_mask=key_mask,
            dropout_p=drop_p,
            is_causal=False,
        )
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, self.d_model)
        return self.out_proj(merged)
+# ---------------------------------------------------------------------------
+# FiLM-conditioned encoder block
+# ---------------------------------------------------------------------------
+class FiLMEncoderBlock(nn.Module):
+    """Encoder block with FiLM difficulty conditioning.
+    After the feedforward, the output is modulated:
+        h = (1 + gamma) * h + beta
+    where gamma, beta are derived from the difficulty embedding.
+    """
+    def __init__(self, d_model: int, d_ff: int, n_heads: int,
+                 dropout: float = 0.1, rope_base: float = 10000.0):
+        super().__init__()
+        self.norm1 = RMSNorm(d_model)
+        self.attn = BidirectionalAttention(d_model, n_heads, dropout, rope_base)
+        self.norm2 = RMSNorm(d_model)
+        self.ff = FeedForward(d_model, d_ff, dropout)
+        self.dropout = nn.Dropout(dropout)
+        self.film_proj = nn.Linear(d_model, d_model * 2)
+        nn.init.zeros_(self.film_proj.weight)
+        nn.init.zeros_(self.film_proj.bias)
+    def forward(self, x: torch.Tensor, diff_emb: torch.Tensor,
+                attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = x + self.dropout(self.attn(self.norm1(x), attn_mask))
+        h = self.ff(self.norm2(x))
+        film = self.film_proj(diff_emb).unsqueeze(1)
+        gamma, beta = film.chunk(2, dim=-1)
+        h = (1 + gamma) * h + beta
+        x = x + self.dropout(h)
+        return x
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+SILENCE_TOKEN = 32  # token id for silence (ids 0-31 are fret combos)
+MASK_TOKEN = 33  # token id of the MASK placeholder; excluded from the token head
+VOCAB_SIZE = 34  # 32 fret combos + silence + MASK
+NUM_SUSTAIN_BUCKETS = 6  # number of classes produced by the duration head
+# ---------------------------------------------------------------------------
+# Main model
+# ---------------------------------------------------------------------------
+class ChartMaskPredictor(nn.Module):
+    """Masked prediction chart model (v3).
+    Token vocabulary: 0-31 fret combos, 32 silence, 33 MASK.
+    """
+    def __init__(self, config: "ChartMaskPredictorConfig"):
+        super().__init__()
+        self.config = config
+        d = config.d_model
+        self.audio_projection = nn.Linear(config.audio_dim, d, bias=False)
+        self.chart_embedding = nn.Embedding(VOCAB_SIZE, d)
+        self.input_dropout = nn.Dropout(config.dropout)
+        self.difficulty_embedding = nn.Embedding(4, d)
+        self.layers = nn.ModuleList([
+            FiLMEncoderBlock(
+                d_model=d, d_ff=config.d_ff, n_heads=config.n_heads,
+                dropout=config.dropout, rope_base=config.rope_base,
+            )
+            for _ in range(config.n_layers)
+        ])
+        self.final_norm = RMSNorm(d)
+        self.token_head = nn.Linear(d, VOCAB_SIZE - 1)  # 33 classes (no MASK)
+        self.sustain_head = nn.Linear(d, 1)
+        self.duration_head = nn.Linear(d, NUM_SUSTAIN_BUCKETS)
+    def forward(self, audio_features: torch.Tensor, chart_tokens: torch.Tensor,
+                difficulty: torch.Tensor,
+                padding_mask: Optional[torch.Tensor] = None) -> dict[str, torch.Tensor]:
+        audio = self.audio_projection(audio_features)
+        chart = self.chart_embedding(chart_tokens)
+        x = audio + chart
+        x = self.input_dropout(x)
+        diff_emb = self.difficulty_embedding(difficulty)
+        for layer in self.layers:
+            x = layer(x, diff_emb, attn_mask=padding_mask)
+        x = self.final_norm(x)
+        return {
+            "token_logits": self.token_head(x),
+            "sustain_logits": self.sustain_head(x),
+            "duration_logits": self.duration_head(x),
+        }
+@dataclass
+class ChartMaskPredictorConfig:
+    """Hyperparameters read by ``ChartMaskPredictor`` when it builds its layers."""
+    audio_dim: int = 771  # input width of the audio projection
+    d_model: int = 512  # model width shared by embeddings, blocks and heads
+    n_heads: int = 8  # attention heads per encoder block
+    n_layers: int = 6  # number of FiLM encoder blocks
+    d_ff: int = 2048  # feed-forward width passed to each block
+    dropout: float = 0.15  # dropout rate for the input and every block
+    rope_base: float = 10000.0  # RoPE frequency base passed to each block

midmid/offset.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""Offset / silence duration calculation."""
+from midmid.beat_tracker import BeatData
+def calculate_offset(
+    beat_data: BeatData,
+    bpm: float,
+    beats_per_measure: int = 4,
+    min_lead_in: float = 2.0,
+) -> float:
+    """Calculate silence duration to prepend to the audio."""
+    if len(beat_data.downbeats) == 0:
+        return min_lead_in
+    first_downbeat = float(beat_data.downbeats[0])
+    measure_duration = beats_per_measure * 60.0 / bpm
+    n = 1
+    while n * measure_duration < min_lead_in:
+        n += 1
+    silence = n * measure_duration - first_downbeat
+    while silence < 0:
+        n += 1
+        silence = n * measure_duration - first_downbeat
+    return silence

midmid/sections.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""Structural segmentation (intro, verse, chorus, etc.)."""
+import numpy as np
+import librosa
+def detect_sections(
+    audio_path: str,
+    min_section_duration: float = 8.0,
+) -> list[tuple[float, str]]:
+    """Detect structural sections in an audio file."""
+    y, sr = librosa.load(audio_path, sr=22050, mono=True)
+    duration = len(y) / sr
+    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+    n_frames = mfcc.shape[1]
+    k = max(2, min(n_frames - 1, int(duration / 25)))
+    bounds = librosa.segment.agglomerative(mfcc, k=k)
+    bound_times = librosa.frames_to_time(bounds, sr=sr)
+    if len(bound_times) == 0 or bound_times[0] > 0.5:
+        bound_times = np.concatenate([[0.0], bound_times])
+    bound_times = _merge_short_segments(bound_times, duration, min_section_duration)
+    labels = _assign_labels(y, sr, bound_times, duration)
+    return list(zip(bound_times.tolist(), labels))
+def _merge_short_segments(bounds, duration, min_dur):
+    merged = [bounds[0]]
+    for t in bounds[1:]:
+        if t - merged[-1] >= min_dur:
+            merged.append(t)
+    return np.array(merged)
+def _assign_labels(y, sr, bound_times, duration):
+    n = len(bound_times)
+    if n == 0:
+        return []
+    if n == 1:
+        return ["Intro"]
+    segment_features = []
+    for i in range(n):
+        start_sample = int(bound_times[i] * sr)
+        end_sample = int(bound_times[i + 1] * sr) if i + 1 < n else len(y)
+        seg = y[start_sample:end_sample]
+        if len(seg) < sr // 4:
+            segment_features.append(np.zeros(13))
+        else:
+            mfcc = librosa.feature.mfcc(y=seg, sr=sr, n_mfcc=13)
+            segment_features.append(np.mean(mfcc, axis=1))
+    labels = ["Intro"]
+    letter_idx = 0
+    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    assigned = {}
+    for i in range(1, n):
+        best_sim = -1
+        best_j = -1
+        for j in range(i):
+            sim = _cosine_sim(segment_features[i], segment_features[j])
+            if sim > best_sim:
+                best_sim = sim
+                best_j = j
+        if best_sim > 0.85 and best_j in assigned:
+            labels.append(f"Section {assigned[best_j]}")
+        else:
+            letter = letters[letter_idx % len(letters)]
+            letter_idx += 1
+            assigned[i] = letter
+            labels.append(f"Section {letter}")
+        if best_j not in assigned and best_j > 0:
+            assigned[best_j] = labels[best_j].split()[-1] if " " in labels[best_j] else "A"
+    return labels
+def _cosine_sim(a, b):
+    norm_a = np.linalg.norm(a)
+    norm_b = np.linalg.norm(b)
+    if norm_a == 0 or norm_b == 0:
+        return 0.0
+    return float(np.dot(a, b) / (norm_a * norm_b))

midmid/tempo_map.py ADDED Viewed

	@@ -0,0 +1,80 @@

+"""Tempo map derivation from beat tracker output."""
+import numpy as np
+from midmid.beat_tracker import BeatData
+def derive_tempo_map(
+    beat_data: BeatData, change_threshold: float = 0.08,
+) -> list[tuple[float, float]]:
+    """Derive a tempo map from beat data.
+    Returns list of (time_seconds, bpm) tuples, sorted by time.
+    """
+    beats = beat_data.beats
+    if len(beats) < 2:
+        return [(0.0, 120.0)]
+    intervals = np.diff(beats)
+    bpms = 60.0 / intervals
+    median_bpm = np.median(bpms)
+    valid = (bpms > median_bpm * 0.6) & (bpms < median_bpm * 1.6)
+    if not np.any(valid):
+        return [(0.0, float(median_bpm))]
+    valid_bpms = bpms[valid]
+    if np.std(valid_bpms) / np.mean(valid_bpms) < change_threshold:
+        avg_bpm = float(np.mean(valid_bpms))
+        return [(0.0, _round_bpm(avg_bpm))]
+    tempo_map = []
+    current_bpm = float(bpms[0]) if valid[0] else float(median_bpm)
+    tempo_map.append((0.0, _round_bpm(current_bpm)))
+    window = 4
+    for i in range(window, len(bpms) - window + 1, window):
+        chunk = bpms[i : i + window]
+        chunk_valid = chunk[(chunk > median_bpm * 0.6) & (chunk < median_bpm * 1.6)]
+        if len(chunk_valid) == 0:
+            continue
+        local_bpm = float(np.mean(chunk_valid))
+        if abs(local_bpm - current_bpm) / current_bpm > change_threshold:
+            current_bpm = local_bpm
+            tempo_map.append((float(beats[i]), _round_bpm(current_bpm)))
+    return tempo_map
+def get_median_bpm(beat_data: BeatData) -> float:
+    if len(beat_data.beats) < 2:
+        return 120.0
+    intervals = np.diff(beat_data.beats)
+    bpms = 60.0 / intervals
+    return float(_round_bpm(np.median(bpms)))
+def estimate_time_signature(beat_data: BeatData) -> int:
+    if len(beat_data.downbeats) < 2:
+        return 4
+    beats = beat_data.beats
+    downbeats = beat_data.downbeats
+    counts = []
+    for i in range(len(downbeats) - 1):
+        start, end = downbeats[i], downbeats[i + 1]
+        n = np.sum((beats >= start) & (beats < end))
+        if 2 <= n <= 7:
+            counts.append(n)
+    if not counts:
+        return 4
+    values, freq = np.unique(counts, return_counts=True)
+    return int(values[np.argmax(freq)])
+def _round_bpm(bpm: float) -> float:
+    return round(float(bpm), 2)

pipeline.py ADDED Viewed

	@@ -0,0 +1,304 @@

+"""Generation pipeline — callable from Gradio, ZeroGPU-compatible.
+Wraps the full audio→chart pipeline into a single function that returns
+a zip file path and chart JSON for the visualizer.
+"""
+import base64
+import json
+import os
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+import numpy as np
+import torch
+from midmid.beat_tracker import track_beats
+from midmid.tempo_map import derive_tempo_map, get_median_bpm, estimate_time_signature
+from midmid.offset import calculate_offset
+from midmid.sections import detect_sections
+from midmid.constraints import enforce_constraints
+from midmid.datatypes import ChartData, NoteEvent
+from midmid.inference import load_model_from_hub, predict_notes, move_models_to_device
+from midmid.midi_writer import write_midi
+from midmid.audio_prep import prepare_audio
+from midmid.ini_writer import write_ini
+RESOLUTION = 192  # chart ticks per beat
+MODEL_REPO = "markury/midmid3-19m-0326"  # repo id passed to load_model_from_hub
+# Loaded once at startup (on CPU)
+_chart_model = None
+def ensure_model():
+    """Pre-load model on CPU (called at app startup)."""
+    global _chart_model
+    if _chart_model is None:
+        print("Loading chart model from HF Hub...")
+        _chart_model = load_model_from_hub(MODEL_REPO, device="cpu")
+        print("Chart model loaded.")
+    return _chart_model
+def generate_chart(
+    audio_path: str,
+    title: str,
+    artist: str,
+    album: str = "",
+    year: str = "",
+    genre: str = "rock",
+    temperature: float = 0.8,
+    num_steps: int = 12,
+    progress_cb=None,
+) -> tuple[str, dict]:
+    """Run the full generation pipeline.
+    Args:
+        audio_path: Path to uploaded audio file.
+        title: Song title.
+        artist: Artist name.
+        album: Album name (optional).
+        year: Release year (optional).
+        genre: Genre string (optional).
+        temperature: Sampling temperature.
+        num_steps: Unmasking steps.
+        progress_cb: Optional callable(step, total, message) for progress.
+    Returns:
+        (zip_path, chart_json) where chart_json has the data for the visualizer.
+    """
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = ensure_model()
+    model.to(device)
+    move_models_to_device(device)
+    if not year:
+        year = str(datetime.now().year)
+    # Create temp output dir
+    tmp_dir = tempfile.mkdtemp(prefix="midmid_")
+    song_dir = Path(tmp_dir) / f"{title} - {artist}"
+    song_dir.mkdir(parents=True, exist_ok=True)
+    def _progress(step, total, msg):
+        if progress_cb:
+            progress_cb(step / total, desc=msg)
+    # --- Stage 1: Audio analysis ---
+    _progress(0, 8, "Tracking beats...")
+    beat_data = track_beats(audio_path, device=str(device))
+    _progress(1, 8, "Analyzing tempo...")
+    tempo_map = derive_tempo_map(beat_data)
+    bpm = get_median_bpm(beat_data)
+    time_sig = estimate_time_signature(beat_data)
+    offset_sec = calculate_offset(beat_data, bpm, beats_per_measure=time_sig)
+    _progress(2, 8, "Detecting sections...")
+    raw_sections = detect_sections(audio_path)
+    # --- Stage 2: Note prediction ---
+    beat_times = list(beat_data.beats)
+    difficulties = ["expert", "hard", "medium", "easy"]
+    all_notes = {}
+    for i, diff_name in enumerate(difficulties):
+        _progress(3 + i * 0.2, 8, f"Generating {diff_name} chart...")
+        raw_notes = predict_notes(
+            audio_path=audio_path,
+            model=model,
+            beat_times=beat_times,
+            difficulty=diff_name,
+            device=device,
+            temperature=temperature,
+            num_steps=num_steps,
+        )
+        notes = _grid_to_musical_ticks(raw_notes, beat_times, offset_sec, bpm, RESOLUTION)
+        notes = enforce_constraints(notes, diff_name, RESOLUTION)
+        last_beat_sec = float(beat_data.beats[-1]) if len(beat_data.beats) > 0 else 0
+        last_beat_tick = int(round((last_beat_sec + offset_sec) * bpm / 60.0 * RESOLUTION))
+        notes = [n for n in notes if n.tick <= last_beat_tick]
+        all_notes[diff_name] = notes
+    # Fill missing difficulties
+    required = ["expert", "hard", "medium", "easy"]
+    for diff in required:
+        if diff not in all_notes:
+            for fallback in required:
+                if fallback in all_notes:
+                    all_notes[diff] = all_notes[fallback]
+                    break
+    # --- Stage 3: Assembly ---
+    _progress(5, 8, "Building chart...")
+    tempo_events = _tempo_map_to_ticks(tempo_map, offset_sec, bpm, RESOLUTION)
+    section_events = _sections_to_ticks(raw_sections, tempo_map, offset_sec, RESOLUTION)
+    all_ticks = [n.tick for ns in all_notes.values() for n in ns]
+    last_tick = max(all_ticks) + RESOLUTION * time_sig if all_ticks else RESOLUTION * time_sig * 4
+    beat_markers = _build_beat_markers(last_tick, RESOLUTION, time_sig)
+    chart = ChartData(
+        resolution=RESOLUTION,
+        tempo_events=tempo_events,
+        time_signatures=[(0, time_sig, 4)],
+        sections=section_events,
+        notes=all_notes,
+        beats=beat_markers,
+    )
+    # --- Stage 4: Write outputs ---
+    _progress(6, 8, "Writing MIDI...")
+    write_midi(chart, str(song_dir / "notes.mid"))
+    _progress(7, 8, "Preparing audio...")
+    prepare_audio(
+        audio_path=audio_path,
+        output_path=str(song_dir / "song.ogg"),
+        silence_duration_sec=offset_sec,
+    )
+    write_ini(
+        output_path=str(song_dir / "song.ini"),
+        title=title,
+        artist=artist,
+        album=album,
+        genre=genre,
+        year=year,
+    )
+    # --- Zip it ---
+    zip_base = Path(tmp_dir) / f"{title} - {artist}"
+    zip_path = shutil.make_archive(str(zip_base), "zip", tmp_dir, song_dir.name)
+    # --- Build chart JSON for the visualizer ---
+    chart_json = _build_chart_json(
+        chart, bpm, offset_sec, audio_path, str(song_dir / "song.ogg"),
+    )
+    _progress(8, 8, "Done!")
+    return zip_path, chart_json
+def _build_chart_json(chart, bpm, offset_sec, original_audio_path, prepared_audio_path):
+    """Build JSON payload for the client-side visualizer."""
+    # Encode prepared audio as base64 for the HTML player
+    with open(prepared_audio_path, "rb") as f:
+        audio_b64 = base64.b64encode(f.read()).decode("ascii")
+    notes_json = {}
+    for diff, note_list in chart.notes.items():
+        notes_json[diff] = [
+            {
+                "tick": n.tick,
+                "frets": sorted(n.fret_set),
+                "sustain": n.sustain_ticks,
+                "hopo": n.is_hopo,
+            }
+            for n in note_list
+        ]
+    return {
+        "resolution": chart.resolution,
+        "bpm": bpm,
+        "offset_sec": offset_sec,
+        "tempo_events": [{"tick": t, "bpm": b} for t, b in chart.tempo_events],
+        "time_signatures": [{"tick": t, "num": n, "den": d} for t, n, d in chart.time_signatures],
+        "sections": [{"tick": t, "label": l} for t, l in chart.sections],
+        "beats": [{"tick": t, "downbeat": d} for t, d in chart.beats],
+        "notes": notes_json,
+        "audio_b64": audio_b64,
+        "audio_format": "ogg",
+    }
+# ---------------------------------------------------------------------------
+# Grid index -> musical tick conversion (from generate.py)
+# ---------------------------------------------------------------------------
+def _grid_to_musical_ticks(notes, beat_times, offset_sec, bpm, resolution):
+    if len(beat_times) < 2:
+        return notes
+    sixteenth = resolution // 4
+    fretbars_ms = [t * 1000.0 for t in beat_times]
+    grid_times_ms = []
+    for i in range(len(fretbars_ms) - 1):
+        start = fretbars_ms[i]
+        interval = fretbars_ms[i + 1] - start
+        for sub in range(4):
+            grid_times_ms.append(start + sub * interval / 4.0)
+    grid_times_ms.append(fretbars_ms[-1])
+    result = []
+    for note in notes:
+        grid_idx = note.tick
+        if grid_idx < 0 or grid_idx >= len(grid_times_ms):
+            continue
+        time_sec = grid_times_ms[grid_idx] / 1000.0 + offset_sec
+        tick = round(time_sec * bpm / 60.0 * resolution)
+        tick = round(tick / sixteenth) * sixteenth
+        tick = max(0, tick)
+        sustain_ticks = 0
+        if note.sustain_ticks > 0:
+            sustain_sec = note.sustain_ticks / 1000.0
+            raw = sustain_sec * bpm / 60.0 * resolution
+            sustain_ticks = max(sixteenth, round(raw / sixteenth) * sixteenth)
+        result.append(NoteEvent(
+            tick=tick,
+            fret_set=note.fret_set,
+            sustain_ticks=sustain_ticks,
+            is_hopo=note.is_hopo,
+        ))
+    return result
+def _tempo_map_to_ticks(tempo_map, offset_sec, bpm, resolution):
+    events = []
+    for i, (time_sec, bpm_val) in enumerate(tempo_map):
+        if i == 0:
+            events.append((0, bpm_val))
+        else:
+            adjusted_time = time_sec + offset_sec
+            prev_time = tempo_map[i - 1][0] + offset_sec if i > 0 else 0
+            dt_sec = adjusted_time - prev_time
+            prev_tick = events[-1][0]
+            prev_bpm = events[-1][1]
+            tick = prev_tick + int(round(dt_sec * prev_bpm / 60.0 * resolution))
+            events.append((tick, bpm_val))
+    return events
+def _sections_to_ticks(sections, tempo_map, offset_sec, resolution):
+    if not tempo_map:
+        return []
+    result = []
+    bpm = tempo_map[0][1]
+    for time_sec, label in sections:
+        adjusted = time_sec + offset_sec
+        tick = int(round(adjusted * bpm / 60.0 * resolution))
+        tick = max(0, tick)
+        result.append((tick, label))
+    return result
+def _build_beat_markers(last_tick, resolution, beats_per_measure):
+    beats = []
+    tick = 0
+    beat_in_measure = 0
+    while tick <= last_tick:
+        beats.append((tick, beat_in_measure == 0))
+        beat_in_measure = (beat_in_measure + 1) % beats_per_measure
+        tick += resolution
+    return beats

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+gradio>=5.0
+spaces
+safetensors
+transformers<5
+huggingface-hub
+mido
+beat_this @ git+https://github.com/CPJKU/beat_this.git
+librosa
+pydub
+numpy
+scipy
+tqdm

visualizer.py ADDED Viewed

	@@ -0,0 +1,360 @@

+"""Build the HTML/JS/CSS for the chart visualizer.
+Returns a self-contained HTML string that Gradio embeds via gr.HTML().
+The chart data + audio are injected as a JSON blob.
+"""
+import json
+def build_visualizer_html(chart_json: dict) -> str:
+    """Return a self-contained HTML string for the chart visualizer."""
+    data_json = json.dumps(chart_json, separators=(",", ":"))
+    return TEMPLATE.replace("__CHART_DATA__", data_json)
+# HTML/CSS/JS template for the visualizer widget. The ``__CHART_DATA__``
+# placeholder inside the <script> block is replaced by build_visualizer_html
+# with the JSON payload (chart data plus base64 audio).
+TEMPLATE = r"""
+<div id="midmid-viz" style="font-family: system-ui, -apple-system, sans-serif; background: #111; border-radius: 12px; overflow: hidden; max-width: 900px; margin: 0 auto;">
+<!-- Controls bar -->
+<div style="display:flex; align-items:center; gap:12px; padding:10px 16px; background:#1a1a1a; border-bottom:1px solid #333;">
+  <button id="viz-play" style="background:none; border:none; color:#fff; font-size:22px; cursor:pointer; padding:4px 8px;" title="Play/Pause">&#9654;</button>
+  <div id="viz-time" style="color:#aaa; font-size:13px; min-width:80px;">0:00 / 0:00</div>
+  <div style="flex:1; position:relative; height:6px; background:#333; border-radius:3px; cursor:pointer;" id="viz-seekbar">
+    <div id="viz-seekfill" style="height:100%; background:#7c3aed; border-radius:3px; width:0%; pointer-events:none;"></div>
+  </div>
+  <select id="viz-diff" style="background:#222; color:#fff; border:1px solid #444; border-radius:4px; padding:2px 6px; font-size:13px;">
+    <option value="expert">Expert</option>
+    <option value="hard">Hard</option>
+    <option value="medium">Medium</option>
+    <option value="easy">Easy</option>
+  </select>
+</div>
+<!-- Canvas -->
+<canvas id="viz-canvas" style="width:100%; display:block;"></canvas>
+<!-- Section labels row -->
+<div id="viz-sections" style="padding:6px 16px 10px; background:#1a1a1a; border-top:1px solid #333; color:#888; font-size:11px; min-height:20px; white-space:nowrap; overflow:hidden; text-overflow:ellipsis;"></div>
+<script>
+(function() {
+  const DATA = __CHART_DATA__;
+  // --- Constants ---
+  const FRET_COLORS = ['#22c55e','#ef4444','#eab308','#3b82f6','#f97316']; // G R Y B O
+  const FRET_GLOW   = ['#4ade80','#f87171','#facc15','#60a5fa','#fb923c'];
+  const LANE_COUNT = 5;
+  const NOTE_RADIUS = 14;
+  const LANE_WIDTH = 48;
+  const HIGHWAY_WIDTH = LANE_COUNT * LANE_WIDTH;
+  const CANVAS_PAD_LEFT = 80;
+  const CANVAS_PAD_RIGHT = 20;
+  const RES = DATA.resolution; // 192
+  // Timing: convert tick to seconds using tempo events
+  const tempoMap = DATA.tempo_events.map(e => ({tick: e.tick, bpm: e.bpm}));
+  function tickToSec(tick) {
+    let sec = 0;
+    let prevTick = 0;
+    let bpm = tempoMap[0].bpm;
+    for (let i = 1; i < tempoMap.length; i++) {
+      if (tempoMap[i].tick > tick) break;
+      sec += (tempoMap[i].tick - prevTick) / RES * 60.0 / bpm;
+      prevTick = tempoMap[i].tick;
+      bpm = tempoMap[i].bpm;
+    }
+    sec += (tick - prevTick) / RES * 60.0 / bpm;
+    return sec;
+  }
+  function secToTick(sec) {
+    let accSec = 0;
+    let prevTick = 0;
+    let bpm = tempoMap[0].bpm;
+    for (let i = 1; i < tempoMap.length; i++) {
+      const dt = (tempoMap[i].tick - prevTick) / RES * 60.0 / bpm;
+      if (accSec + dt > sec) break;
+      accSec += dt;
+      prevTick = tempoMap[i].tick;
+      bpm = tempoMap[i].bpm;
+    }
+    return prevTick + (sec - accSec) * bpm / 60.0 * RES;
+  }
+  // --- Audio setup ---
+  const audio = new Audio();
+  audio.src = 'data:audio/' + DATA.audio_format + ';base64,' + DATA.audio_b64;
+  audio.preload = 'auto';
+  // --- Canvas setup ---
+  const canvas = document.getElementById('viz-canvas');
+  const ctx = canvas.getContext('2d');
+  let W, H, pxPerSec;
+  const VISIBLE_SEC = 8; // seconds visible on screen
+  function resize() {
+    const container = canvas.parentElement;
+    W = container.clientWidth;
+    H = 360;
+    canvas.width = W * devicePixelRatio;
+    canvas.height = H * devicePixelRatio;
+    canvas.style.height = H + 'px';
+    ctx.setTransform(devicePixelRatio, 0, 0, devicePixelRatio, 0, 0);
+    pxPerSec = (W - CANVAS_PAD_LEFT - CANVAS_PAD_RIGHT) / VISIBLE_SEC;
+  }
+  resize();
+  new ResizeObserver(resize).observe(canvas.parentElement);
+  // --- State ---
+  let currentDiff = 'expert';
+  let playing = false;
+  // Precompute note positions in seconds
+  function buildNoteCache(diff) {
+    return (DATA.notes[diff] || []).map(n => ({
+      sec: tickToSec(n.tick),
+      frets: n.frets,
+      sustainSec: n.sustain > 0 ? tickToSec(n.tick + n.sustain) - tickToSec(n.tick) : 0,
+      hopo: n.hopo,
+    }));
+  }
+  let noteCache = buildNoteCache(currentDiff);
+  // Precompute beats in seconds
+  const beatCache = DATA.beats.map(b => ({
+    sec: tickToSec(b.tick),
+    downbeat: b.downbeat,
+  }));
+  // Sections in seconds
+  const sectionCache = DATA.sections.map(s => ({
+    sec: tickToSec(s.tick),
+    label: s.label,
+  }));
+  // Total duration
+  let totalDuration = 0;
+  audio.addEventListener('loadedmetadata', () => {
+    totalDuration = audio.duration;
+  });
+  // Fallback: estimate from last note
+  const allNoteSecs = Object.values(DATA.notes).flat().map(n => tickToSec(n.tick + (n.sustain || 0)));
+  const estimatedDuration = allNoteSecs.length ? Math.max(...allNoteSecs) + 5 : 120;
+  function getDuration() { return totalDuration || estimatedDuration; }
+  // --- Controls ---
+  const playBtn = document.getElementById('viz-play');
+  const timeDiv = document.getElementById('viz-time');
+  const seekBar = document.getElementById('viz-seekbar');
+  const seekFill = document.getElementById('viz-seekfill');
+  const diffSelect = document.getElementById('viz-diff');
+  const sectionsDiv = document.getElementById('viz-sections');
+  playBtn.addEventListener('click', () => {
+    if (playing) {
+      audio.pause();
+      playing = false;
+      playBtn.textContent = '\u25B6';
+    } else {
+      audio.play();
+      playing = true;
+      playBtn.textContent = '\u23F8';
+    }
+  });
+  seekBar.addEventListener('click', (e) => {
+    const rect = seekBar.getBoundingClientRect();
+    const frac = (e.clientX - rect.left) / rect.width;
+    audio.currentTime = frac * getDuration();
+  });
+  diffSelect.addEventListener('change', () => {
+    currentDiff = diffSelect.value;
+    noteCache = buildNoteCache(currentDiff);
+  });
+  audio.addEventListener('ended', () => {
+    playing = false;
+    playBtn.textContent = '\u25B6';
+  });
+  function formatTime(s) {
+    const m = Math.floor(s / 60);
+    const sec = Math.floor(s % 60);
+    return m + ':' + (sec < 10 ? '0' : '') + sec;
+  }
+  // --- Rendering ---
+  function draw() {
+    const t = audio.currentTime || 0;
+    const dur = getDuration();
+    // Update controls
+    seekFill.style.width = (t / dur * 100) + '%';
+    timeDiv.textContent = formatTime(t) + ' / ' + formatTime(dur);
+    // Update section label
+    let currentSection = '';
+    for (let i = sectionCache.length - 1; i >= 0; i--) {
+      if (sectionCache[i].sec <= t) { currentSection = sectionCache[i].label; break; }
+    }
+    sectionsDiv.textContent = currentSection;
+    // Clear
+    ctx.fillStyle = '#111';
+    ctx.fillRect(0, 0, W, H);
+    // The highway: current time is at the left edge + small offset
+    const playheadX = CANVAS_PAD_LEFT + 40;
+    const secToX = (sec) => playheadX + (sec - t) * pxPerSec;
+    const viewStart = t - 1;
+    const viewEnd = t + VISIBLE_SEC + 1;
+    // Draw lane backgrounds
+    const laneTop = 20;
+    const laneBottom = H - 20;
+    const laneHeight = laneBottom - laneTop;
+    const highwayLeft = playheadX - 20;
+    // Subtle lane separators
+    for (let i = 0; i <= LANE_COUNT; i++) {
+      const y = laneTop + (laneHeight / LANE_COUNT) * i;
+      ctx.strokeStyle = '#2a2a2a';
+      ctx.lineWidth = 1;
+      ctx.beginPath();
+      ctx.moveTo(CANVAS_PAD_LEFT - 10, y);
+      ctx.lineTo(W - CANVAS_PAD_RIGHT, y);
+      ctx.stroke();
+    }
+    // Draw beat lines (vertical)
+    for (const beat of beatCache) {
+      if (beat.sec < viewStart || beat.sec > viewEnd) continue;
+      const x = secToX(beat.sec);
+      ctx.strokeStyle = beat.downbeat ? '#444' : '#222';
+      ctx.lineWidth = beat.downbeat ? 1.5 : 0.5;
+      ctx.beginPath();
+      ctx.moveTo(x, laneTop);
+      ctx.lineTo(x, laneBottom);
+      ctx.stroke();
+      // Measure number for downbeats
+      if (beat.downbeat && x > CANVAS_PAD_LEFT) {
+        ctx.fillStyle = '#555';
+        ctx.font = '9px system-ui';
+        ctx.fillText('|', x - 2, laneTop - 4);
+      }
+    }
+    // Draw section boundaries
+    for (const sec of sectionCache) {
+      if (sec.sec < viewStart || sec.sec > viewEnd) continue;
+      const x = secToX(sec.sec);
+      ctx.strokeStyle = '#7c3aed55';
+      ctx.lineWidth = 2;
+      ctx.beginPath();
+      ctx.moveTo(x, laneTop);
+      ctx.lineTo(x, laneBottom);
+      ctx.stroke();
+      ctx.fillStyle = '#7c3aed';
+      ctx.font = '10px system-ui';
+      ctx.fillText(sec.label, x + 4, laneTop - 4);
+    }
+    // Draw playhead
+    ctx.strokeStyle = '#fff';
+    ctx.lineWidth = 2;
+    ctx.beginPath();
+    ctx.moveTo(playheadX, laneTop - 2);
+    ctx.lineTo(playheadX, laneBottom + 2);
+    ctx.stroke();
+    // Draw notes
+    const laneH = laneHeight / LANE_COUNT;
+    for (const note of noteCache) {
+      if (note.sec + note.sustainSec < viewStart || note.sec > viewEnd) continue;
+      const x = secToX(note.sec);
+      for (const fret of note.frets) {
+        if (fret > 4) continue; // skip open chord marker
+        const laneY = laneTop + fret * laneH + laneH / 2;
+        const color = FRET_COLORS[fret];
+        const glow = FRET_GLOW[fret];
+        // Draw sustain tail first (behind note)
+        if (note.sustainSec > 0) {
+          const endX = secToX(note.sec + note.sustainSec);
+          ctx.fillStyle = color + '55';
+          ctx.fillRect(x, laneY - 4, endX - x, 8);
+          ctx.fillStyle = color + '99';
+          ctx.fillRect(x, laneY - 2, endX - x, 4);
+        }
+        // Note circle
+        const isPast = note.sec < t;
+        ctx.beginPath();
+        ctx.arc(x, laneY, NOTE_RADIUS - 2, 0, Math.PI * 2);
+        if (isPast) {
+          ctx.fillStyle = color + '44';
+          ctx.fill();
+          ctx.strokeStyle = color + '66';
+          ctx.lineWidth = 1.5;
+          ctx.stroke();
+        } else {
+          // Glow for upcoming notes near playhead
+          const dist = note.sec - t;
+          if (dist < 0.3) {
+            ctx.shadowColor = glow;
+            ctx.shadowBlur = 12;
+          }
+          ctx.fillStyle = color;
+          ctx.fill();
+          ctx.shadowBlur = 0;
+          ctx.strokeStyle = '#fff';
+          ctx.lineWidth = 2;
+          ctx.stroke();
+          // HOPO = open center
+          if (note.hopo) {
+            ctx.beginPath();
+            ctx.arc(x, laneY, NOTE_RADIUS - 6, 0, Math.PI * 2);
+            ctx.fillStyle = '#111';
+            ctx.fill();
+          }
+        }
+      }
+    }
+    // Fret labels on the left
+    const fretNames = ['Green', 'Red', 'Yellow', 'Blue', 'Orange'];
+    const fretAbbrev = ['G', 'R', 'Y', 'B', 'O'];
+    ctx.font = 'bold 13px system-ui';
+    for (let i = 0; i < LANE_COUNT; i++) {
+      const y = laneTop + i * laneH + laneH / 2;
+      ctx.fillStyle = FRET_COLORS[i];
+      ctx.textAlign = 'right';
+      ctx.fillText(fretAbbrev[i], CANVAS_PAD_LEFT - 20, y + 5);
+    }
+    ctx.textAlign = 'left';
+    // Note count overlay
+    const noteCount = noteCache.length;
+    ctx.fillStyle = '#666';
+    ctx.font = '11px system-ui';
+    ctx.fillText(noteCount + ' notes (' + currentDiff + ')', W - CANVAS_PAD_RIGHT - 140, laneTop - 4);
+    requestAnimationFrame(draw);
+  }
+  requestAnimationFrame(draw);
+})();
+</script>
+</div>
+"""