final
app.py CHANGED
@@ -1,5 +1,5 @@
 import os
-import spaces # Enables ZeroGPU on Hugging Face
+#import spaces # Enables ZeroGPU on Hugging Face
 import gradio as gr
 import torch
 from dataclasses import asdict

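Both ZeroGPU hooks are disabled in this commit: the `spaces` import above and the matching `@spaces.GPU` decorator further down are commented out, so the Space runs on its default hardware. For reference, a minimal sketch of re-enabling ZeroGPU (the `duration` value is illustrative, not taken from app.py):

```python
import spaces  # ZeroGPU helper available on Hugging Face Spaces

@spaces.GPU(duration=120)  # request a GPU for up to ~120 s per call (illustrative value)
def generate_accompaniment(midi_path: str, model_choice: str, history_length: float):
    ...  # unchanged body; the CUDA work must happen inside the decorated function
```
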
@@ -25,10 +25,10 @@ LARGE_MODEL = "stanford-crfm/music-large-800k"
 model_card = ModelCard(
     name="Anticipatory Music Transformer",
     description=(
-        "Generate musical accompaniment for your existing
-        "Input: a MIDI file
-        "Output: a new MIDI file with extended accompaniment
-        "Use the
+        "Generate musical accompaniment for your existing vamp using the Anticipatory Music Transformer. "
+        "Input: a MIDI file with a short accompaniment (vamp) followed by a melody line. "
+        "Output: a new MIDI file with extended accompaniment matching the melody continuation. "
+        "Use the sliders to choose model size and how much of the song is used as context."
     ),
     author="John Thickstun, David Hall, Chris Donahue, Percy Liang",
     tags=["midi", "generation", "accompaniment"]

@@ -60,135 +60,59 @@ def load_amt_model(model_choice: str):
 return model

 def find_melody_program(mid, debug=False):
-    """
-    Automatically detect the melody track's program number from a MIDI file.
-    Uses a balanced heuristic: pitch + note count + temporal coverage.
-    """
     track_stats = []
-    total_duration = 0
-
     for i, track in enumerate(mid.tracks):
         pitches, times = [], []
         current_time = 0
-        current_program = None
-        track_note_count = 0
-
         for msg in track:
-            current_time += msg.time
-            if msg.type == "program_change":
-                current_program = msg.program
-                continue
-
-            # note_on event
-            if msg.velocity > 0:
+            current_time += getattr(msg, "time", 0)
+            if msg.type == "note_on" and msg.velocity > 0:
                 pitches.append(msg.note)
                 times.append(current_time)
-
-        # Skip empty or trivial tracks
-        if not pitches:
-            continue
-
-        # Compute duration for this track and update total_duration
-        track_duration = max(times) - min(times)
-        total_duration = max(total_duration, current_time)
-
-        mean_pitch = sum(pitches) / len(pitches)
-        polyphony = len(set(pitches)) / len(pitches)
-        coverage = track_duration / total_duration if total_duration > 0 else 0
-
-        track_stats.append((i, mean_pitch, len(pitches), current_program, polyphony, coverage))

     if not track_stats:
-        return None, False
-
-    if len(track_stats) == 1:
-        prog = track_stats[0][3]
         if debug:
-
-
-        else:
-            print(f"Single-track MIDI detected, using program {prog or 'None'}")
-        return prog, prog is not None
-
-    candidates = [t for t in track_stats if t[3] is not None and t[3] > 0]
-    has_valid_programs = len(candidates) > 0
-    if not candidates:
-        candidates = track_stats
-
-    if debug:
-        print(f"\nCandidates: {len(candidates)} tracks")
-
-    max_notes = max(t[2] for t in candidates)
-    max_pitch = max(t[1] for t in candidates)
-    min_pitch = min(t[1] for t in candidates)
-    pitch_span = max_pitch - min_pitch if max_pitch > min_pitch else 1
-
-    best_score = -1
-    best_program = None
-    best_track = None
-    best_pitch = None
-
-    for t in candidates:
-        idx, pitch, notes, prog, poly, coverage = t
-        pitch_norm = (pitch - min_pitch) / pitch_span
-        notes_norm = notes / max_notes
-
-        score *= 0.95
-        if 55 <= pitch <= 75:
-            score *= 1.1
-        if notes >= 30:
-            score *= 1.05
-        if coverage > 0.7:
-            score *= 1.15
-
-        if score > best_score:
-            best_score = score
-            best_program = prog
-            best_track = idx
-            best_pitch = pitch
-
-
+            print("No notes detected in any track.")
+        return 0
+
+    melody_idx = sorted(track_stats, key=lambda x: (-x[1], -x[3]))[0][0]
+
+    return melody_idx
+
+
+def get_program_number(mid, track_index):
+    for msg in mid.tracks[track_index]:
+        if msg.type == "program_change":
+            return msg.program
+    return None


 def auto_extract_melody(mid, debug=False):
-    """
-    Extract melody events from a MIDI object (already loaded via MidiFile).
-    Optimized to avoid re-reading the file from disk.
-    Returns: (all_events, melody_events)
-    """
     events = midi_to_events(mid)
-
-    if
+    melody_track = find_melody_program(mid, debug=debug)
+    melody_program = get_program_number(mid, melody_track)
+
+    if debug:
+        print(f"Melody Track: {melody_track} | Program: {melody_program}")
+
+    if melody_program is not None:
+        events, melody = extract_instruments(events, [melody_program])
     if debug:
-        print("
-
-        events, melody = extract_instruments(events, [melody_program])
-
-    if len(melody) == 0:
+        print(f"Extracted {len(melody)} melody events from program {melody_program}")
+    else:
         if debug:
-            print("No
-
-    if debug:
-        print(f"Extracted {len(melody)} melody events from program {melody_program}")
+            print("No program number found; using all events as melody.")
+        melody = events

     return events, melody

-@spaces.GPU
+#@spaces.GPU
 # Core generation
 def generate_accompaniment(midi_path: str, model_choice: str, history_length: float):
     """

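The 100-plus lines of scoring heuristics above are replaced by a much simpler rule: collect per-track statistics, pick the track with the highest mean pitch, and break ties by note density. The General MIDI program number is then looked up separately, so `find_melody_program` now returns a track index rather than a `(program, found)` pair. A usage sketch, assuming mido is installed (`example.mid` is an illustrative file name):

```python
from mido import MidiFile

mid = MidiFile("example.mid")
track_idx = find_melody_program(mid, debug=True)  # highest mean pitch wins; density breaks ties
program = get_program_number(mid, track_idx)      # None if the track never sends program_change
print(f"melody: track {track_idx}, program {program}")
```
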
@@ -199,7 +123,7 @@ def generate_accompaniment(midi_path: str, model_choice: str, history_length: float):

     # Parse MIDI correctly, then convert to events
     mid = MidiFile(midi_path)
-
+    print(f"Loaded MIDI file: type {mid.type} ({'single track' if mid.type == 0 else 'multi-track'})")

     # Automatically detect and extract melody
     all_events, melody = auto_extract_melody(mid, debug=True)

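The added print reports mido's `MidiFile.type`, which follows the Standard MIDI File spec: 0 is a single multi-channel track, 1 is multiple synchronous tracks, and 2 is multiple asynchronous tracks. The inline conditional labels everything non-zero as multi-track, which glosses over type 2; a slightly fuller sketch:

```python
# Hypothetical lookup table; mido itself only exposes the integer mid.type.
MIDI_TYPES = {0: "single track", 1: "synchronous multi-track", 2: "asynchronous multi-track"}
print(f"Loaded MIDI file: type {mid.type} ({MIDI_TYPES.get(mid.type, 'unknown')})")
```
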
@@ -207,16 +131,18 @@ def generate_accompaniment(midi_path: str, model_choice: str, history_length: float):
         print("No melody detected; using all events")
         melody = all_events

-    total_time = round(ops.max_time(all_events, seconds=True))
-
     # History portion
     history = ops.clip(all_events, 0, history_length, clip_duration=False)
+    start_time = ops.max_time(history, seconds=True)
+
+    mid_time = mid.length or 0
+    ops_time = ops.max_time(all_events, seconds=True)
+    total_time = round(max(mid_time, ops_time))

-    # Generate accompaniment for the remaining duration
     accompaniment = generate(
         model,
-        start_time=history_length,
-        end_time=total_time,
+        start_time=history_length,
+        end_time=total_time,
         inputs=history,
         controls=melody,
         top_p=0.95,

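Two things change here. First, `total_time` is now the larger of mido's `mid.length` (the file duration in seconds) and the last event time from `ops.max_time`, so generation no longer stops short when the tokenized event stream ends before the file does. Second, a `start_time` is computed from the clipped history, though the `generate` call below still passes the slider value `history_length`. Pulled out of the handler, the flow looks roughly like this (a sketch assuming the anticipation library's `ops` and `generate` as imported in app.py, with a 5-second history and an illustrative file name):

```python
mid = MidiFile("example.mid")
all_events, melody = auto_extract_melody(mid, debug=True)

history = ops.clip(all_events, 0, 5, clip_duration=False)  # first 5 s as context
total_time = round(max(mid.length or 0, ops.max_time(all_events, seconds=True)))

accompaniment = generate(model, start_time=5, end_time=total_time,
                         inputs=history, controls=melody, top_p=0.95)
```
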
@@ -273,16 +199,11 @@ with gr.Blocks() as demo:
         choices=[SMALL_MODEL, MEDIUM_MODEL, LARGE_MODEL],
         value=MEDIUM_MODEL,
         label="Select AMT Model (Faster vs. Higher Quality)"
-    ).set_info(
-        "Choose the model size: Smaller models generate faster but may be less detailed. \n larger models produce richer, more expressive accompaniment."
     )

     history_slider = gr.Slider(
         minimum=1, maximum=10, step=1, value=5,
-        label="
-    ).set_info(
-        "Controls how much of the beginning of your song is used as context for generation.\n "
-        "A longer history helps the model better understand the style and rhythm before extending the accompaniment."
+        label="Select History Length (seconds)"
     )

     # Outputs (JSON FIRST)
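The removed `.set_info(...)` calls would fail at runtime: Gradio's `Dropdown` and `Slider` have no such method. If the helper text is still wanted, the supported mechanism is the `info=` constructor argument; a sketch reusing the old wording:

```python
model_dropdown = gr.Dropdown(
    choices=[SMALL_MODEL, MEDIUM_MODEL, LARGE_MODEL],
    value=MEDIUM_MODEL,
    label="Select AMT Model (Faster vs. Higher Quality)",
    info="Smaller models generate faster but may be less detailed; "
         "larger models produce richer, more expressive accompaniment.",
)

history_slider = gr.Slider(
    minimum=1, maximum=10, step=1, value=5,
    label="Select History Length (seconds)",
    info="How much of the beginning of your song is used as context for generation; "
         "a longer history helps the model pick up style and rhythm.",
)
```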