JacobLinCool committed on
Commit 7cdcef9 · unverified · 1 Parent(s): 44d0a59

Delete exp/baseline

exp/baseline/__init__.py DELETED
File without changes
exp/baseline/data.py DELETED
@@ -1,128 +0,0 @@
import torch
from torch.utils.data import Dataset
import numpy as np
from tqdm import tqdm
from .utils import extract_context


class BeatTrackingDataset(Dataset):
    def __init__(
        self, hf_dataset, target_type="beats", sample_rate=16000, hop_length=160
    ):
        """
        Args:
            hf_dataset: HuggingFace dataset object
            target_type (str): "beats" or "downbeats". Determines which labels are treated as positive.
        """
        self.sr = sample_rate
        self.hop_length = hop_length
        self.target_type = target_type

        # Context window size in samples (7 frames = 70ms at 100fps)
        self.context_frames = 7
        self.context_samples = (self.context_frames * 2 + 1) * hop_length + max(
            [368, 736, 1488]
        )  # extra for FFT window

        # Cache audio arrays in memory for fast access
        self.audio_cache = []
        self.indices = []
        self._prepare_indices(hf_dataset)

    def _prepare_indices(self, hf_dataset):
        """
        Prepares balanced indices and caches audio.
        Paper Section 4.5: Uses "Fuzzier" training examples (neighbors weighted less).
        """
        print(f"Preparing dataset indices for target: {self.target_type}...")

        for i, item in tqdm(
            enumerate(hf_dataset), total=len(hf_dataset), desc="Building indices"
        ):
            # Cache audio array (convert to numpy if tensor)
            audio = item["audio"]["array"]
            if hasattr(audio, "numpy"):
                audio = audio.numpy()
            self.audio_cache.append(audio)

            # Calculate total frames available in audio
            audio_len = len(audio)
            n_frames = int(audio_len / self.hop_length)

            # Select ground truth based on target_type
            if self.target_type == "downbeats":
                # Only downbeats are positives
                gt_times = item["downbeats"]
            else:
                # All beats are positives (downbeats are also beats)
                gt_times = item["beats"]

            # Convert to list if tensor
            if hasattr(gt_times, "tolist"):
                gt_times = gt_times.tolist()

            gt_frames = set([int(t * self.sr / self.hop_length) for t in gt_times])

            # --- Positive Examples (with Fuzziness) ---
            # "define a single frame before and after each annotated onset to be additional positive examples"
            pos_frames = set()
            for bf in gt_frames:
                if 0 <= bf < n_frames:
                    self.indices.append((i, bf, 1.0))  # Center frame (sharp onset)
                    pos_frames.add(bf)

                # Neighbors weighted at 0.25
                if 0 <= bf - 1 < n_frames:
                    self.indices.append((i, bf - 1, 0.25))
                    pos_frames.add(bf - 1)
                if 0 <= bf + 1 < n_frames:
                    self.indices.append((i, bf + 1, 0.25))
                    pos_frames.add(bf + 1)

            # --- Negative Examples ---
            # Paper uses "all others as negative", but we balance 2:1 for stable SGD.
            num_pos = len(pos_frames)
            num_neg = num_pos * 2

            count = 0
            attempts = 0
            while count < num_neg and attempts < num_neg * 5:
                f = np.random.randint(0, n_frames)
                if f not in pos_frames:
                    self.indices.append((i, f, 0.0))
                    count += 1
                attempts += 1

        print(
            f"Dataset ready. {len(self.indices)} samples, {len(self.audio_cache)} tracks cached."
        )

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        track_idx, frame_idx, label = self.indices[idx]

        # Fast lookup from cache
        audio = self.audio_cache[track_idx]
        audio_len = len(audio)

        # Calculate sample range for context window
        center_sample = frame_idx * self.hop_length
        half_context = self.context_samples // 2
        start = center_sample - half_context
        end = center_sample + half_context

        # Handle padding if needed
        pad_left = max(0, -start)
        pad_right = max(0, end - audio_len)
        start = max(0, start)
        end = min(audio_len, end)

        # Extract audio chunk
        chunk = audio[start:end]
        if pad_left > 0 or pad_right > 0:
            chunk = np.pad(chunk, (pad_left, pad_right), mode="constant")

        waveform = torch.tensor(chunk, dtype=torch.float32)
        return waveform, torch.tensor([label], dtype=torch.float32)
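To make the fuzzy labeling above concrete: at the defaults used here (sample_rate=16000, hop_length=160, i.e. 100 frames per second), one annotated beat expands into three weighted targets. A minimal standalone sketch (the beat time is a made-up value, not from any dataset):

sr, hop = 16000, 160  # 100 frames per second
beat_time = 2.437  # hypothetical annotation at 2.437 s

center = int(beat_time * sr / hop)  # frame 243
targets = [(center, 1.0), (center - 1, 0.25), (center + 1, 0.25)]
print(targets)  # [(243, 1.0), (242, 0.25), (244, 0.25)]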
exp/baseline/eval.py DELETED
@@ -1,326 +0,0 @@
import torch
import numpy as np
from tqdm import tqdm
from scipy.signal import find_peaks
import argparse
import os

from .model import ODCNN
from .utils import MultiViewSpectrogram
from ..data.load import ds
from ..data.eval import evaluate_all, format_results


def get_activation_function(model, waveform, device):
    """
    Computes the probability curve over time.
    """
    processor = MultiViewSpectrogram().to(device)
    waveform = waveform.unsqueeze(0).to(device)

    with torch.no_grad():
        spec = processor(waveform)

        # Normalize
        mean = spec.mean(dim=(2, 3), keepdim=True)
        std = spec.std(dim=(2, 3), keepdim=True) + 1e-6
        spec = (spec - mean) / std

        # Batchify with sliding window
        spec = torch.nn.functional.pad(spec, (7, 7))  # Pad time
        windows = spec.unfold(3, 15, 1)  # (1, 3, 80, Time, 15)
        windows = windows.permute(3, 0, 1, 2, 4).squeeze(1)  # (Time, 3, 80, 15)

        # Inference
        activations = []
        batch_size = 512
        for i in range(0, len(windows), batch_size):
            batch = windows[i : i + batch_size]
            out = model(batch)
            activations.append(out.cpu().numpy())

    return np.concatenate(activations).flatten()


def pick_peaks(activations, hop_length=160, sr=16000):
    """
    Smooth with a Hamming window and report local maxima.
    """
    # Smoothing
    window = np.hamming(5)
    window /= window.sum()
    smoothed = np.convolve(activations, window, mode="same")

    # Peak picking
    peaks, _ = find_peaks(smoothed, height=0.5, distance=5)

    timestamps = peaks * hop_length / sr
    return timestamps.tolist()


def visualize_track(
    audio: np.ndarray,
    sr: int,
    pred_beats: list[float],
    pred_downbeats: list[float],
    gt_beats: list[float],
    gt_downbeats: list[float],
    output_dir: str,
    track_idx: int,
    time_range: tuple[float, float] | None = None,
):
    """
    Create and save visualizations for a single track.
    """
    from ..data.viz import plot_waveform_with_beats, save_figure

    os.makedirs(output_dir, exist_ok=True)

    # Full waveform plot
    fig = plot_waveform_with_beats(
        audio,
        sr,
        pred_beats,
        gt_beats,
        pred_downbeats,
        gt_downbeats,
        title=f"Track {track_idx}: Beat Comparison",
        time_range=time_range,
    )
    save_figure(fig, os.path.join(output_dir, f"track_{track_idx:03d}.png"))


def synthesize_audio(
    audio: np.ndarray,
    sr: int,
    pred_beats: list[float],
    pred_downbeats: list[float],
    gt_beats: list[float],
    gt_downbeats: list[float],
    output_dir: str,
    track_idx: int,
    click_volume: float = 0.5,
):
    """
    Create and save audio files with click tracks for a single track.
    """
    from ..data.audio import create_comparison_audio, save_audio

    os.makedirs(output_dir, exist_ok=True)

    # Create comparison audio
    audio_pred, audio_gt, audio_both = create_comparison_audio(
        audio,
        pred_beats,
        pred_downbeats,
        gt_beats,
        gt_downbeats,
        sr=sr,
        click_volume=click_volume,
    )

    # Save audio files
    save_audio(
        audio_pred, os.path.join(output_dir, f"track_{track_idx:03d}_pred.wav"), sr
    )
    save_audio(audio_gt, os.path.join(output_dir, f"track_{track_idx:03d}_gt.wav"), sr)
    save_audio(
        audio_both, os.path.join(output_dir, f"track_{track_idx:03d}_both.wav"), sr
    )


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate beat tracking models with visualization and audio synthesis"
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default="outputs/baseline",
        help="Base directory containing trained models (with 'beats' and 'downbeats' subdirs)",
    )
    parser.add_argument(
        "--num-samples",
        type=int,
        default=20,
        help="Number of samples to evaluate",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="outputs/eval",
        help="Directory to save visualizations and audio",
    )
    parser.add_argument(
        "--visualize",
        action="store_true",
        help="Generate visualization plots for each track",
    )
    parser.add_argument(
        "--synthesize",
        action="store_true",
        help="Generate audio files with click tracks",
    )
    parser.add_argument(
        "--viz-tracks",
        type=int,
        default=5,
        help="Number of tracks to visualize/synthesize (default: 5)",
    )
    parser.add_argument(
        "--time-range",
        type=float,
        nargs=2,
        default=None,
        metavar=("START", "END"),
        help="Time range for visualization in seconds (default: full track)",
    )
    parser.add_argument(
        "--click-volume",
        type=float,
        default=0.5,
        help="Volume of click sounds relative to audio (0.0 to 1.0)",
    )
    parser.add_argument(
        "--summary-plot",
        action="store_true",
        help="Generate summary evaluation plot",
    )
    args = parser.parse_args()

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Load BOTH models using from_pretrained
    beat_model = None
    downbeat_model = None

    has_beats = False
    has_downbeats = False

    beats_dir = os.path.join(args.model_dir, "beats")
    downbeats_dir = os.path.join(args.model_dir, "downbeats")

    if os.path.exists(os.path.join(beats_dir, "model.safetensors")) or os.path.exists(
        os.path.join(beats_dir, "pytorch_model.bin")
    ):
        beat_model = ODCNN.from_pretrained(beats_dir).to(DEVICE)
        beat_model.eval()
        has_beats = True
        print(f"Loaded beat model from {beats_dir}")
    else:
        print(f"Warning: No beat model found in {beats_dir}")

    if os.path.exists(
        os.path.join(downbeats_dir, "model.safetensors")
    ) or os.path.exists(os.path.join(downbeats_dir, "pytorch_model.bin")):
        downbeat_model = ODCNN.from_pretrained(downbeats_dir).to(DEVICE)
        downbeat_model.eval()
        has_downbeats = True
        print(f"Loaded downbeat model from {downbeats_dir}")
    else:
        print(f"Warning: No downbeat model found in {downbeats_dir}")

    if not has_beats and not has_downbeats:
        print("No models found. Please run training first.")
        return

    predictions = []
    ground_truths = []
    audio_data = []  # Store audio for visualization/synthesis

    # Eval on the specified number of tracks (note: drawn from the train split)
    test_set = ds["train"].select(range(args.num_samples))

    print("Running evaluation...")
    for i, item in enumerate(tqdm(test_set)):
        waveform = torch.tensor(item["audio"]["array"], dtype=torch.float32)
        waveform_device = waveform.to(DEVICE)

        pred_entry = {"beats": [], "downbeats": []}

        # 1. Predict beats
        if has_beats:
            act_b = get_activation_function(beat_model, waveform_device, DEVICE)
            pred_entry["beats"] = pick_peaks(act_b)

        # 2. Predict downbeats
        if has_downbeats:
            act_d = get_activation_function(downbeat_model, waveform_device, DEVICE)
            pred_entry["downbeats"] = pick_peaks(act_d)

        predictions.append(pred_entry)
        ground_truths.append({"beats": item["beats"], "downbeats": item["downbeats"]})

        # Store audio for later visualization/synthesis
        if args.visualize or args.synthesize:
            if i < args.viz_tracks:
                audio_data.append(
                    {
                        "audio": waveform.numpy(),
                        "sr": item["audio"]["sampling_rate"],
                        "pred": pred_entry,
                        "gt": ground_truths[-1],
                    }
                )

    # Run evaluation
    results = evaluate_all(predictions, ground_truths)
    print(format_results(results))

    # Create output directory
    if args.visualize or args.synthesize or args.summary_plot:
        os.makedirs(args.output_dir, exist_ok=True)

    # Generate visualizations
    if args.visualize:
        print(f"\nGenerating visualizations for {len(audio_data)} tracks...")
        viz_dir = os.path.join(args.output_dir, "plots")
        for i, data in enumerate(tqdm(audio_data, desc="Visualizing")):
            time_range = tuple(args.time_range) if args.time_range else None
            visualize_track(
                data["audio"],
                data["sr"],
                data["pred"]["beats"],
                data["pred"]["downbeats"],
                data["gt"]["beats"],
                data["gt"]["downbeats"],
                viz_dir,
                i,
                time_range=time_range,
            )
        print(f"Saved visualizations to {viz_dir}")

    # Generate audio with clicks
    if args.synthesize:
        print(f"\nSynthesizing audio for {len(audio_data)} tracks...")
        audio_dir = os.path.join(args.output_dir, "audio")
        for i, data in enumerate(tqdm(audio_data, desc="Synthesizing")):
            synthesize_audio(
                data["audio"],
                data["sr"],
                data["pred"]["beats"],
                data["pred"]["downbeats"],
                data["gt"]["beats"],
                data["gt"]["downbeats"],
                audio_dir,
                i,
                click_volume=args.click_volume,
            )
        print(f"Saved audio files to {audio_dir}")
        print("  *_pred.wav - Original audio with predicted beat clicks")
        print("  *_gt.wav   - Original audio with ground truth beat clicks")
        print("  *_both.wav - Original audio with both predicted and GT clicks")

    # Generate summary plot
    if args.summary_plot:
        from ..data.viz import plot_evaluation_summary, save_figure

        print("\nGenerating summary plot...")
        fig = plot_evaluation_summary(results, title="Beat Tracking Evaluation Summary")
        summary_path = os.path.join(args.output_dir, "evaluation_summary.png")
        save_figure(fig, summary_path)
        print(f"Saved summary plot to {summary_path}")


if __name__ == "__main__":
    main()
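As a quick sanity check of the post-processing above, a self-contained sketch (synthetic activations, not repository code) that applies the same Hamming smoothing and find_peaks call as pick_peaks; soft bumps every 50 frames come back as beats every 0.5 s:

import numpy as np
from scipy.signal import find_peaks

hop, sr = 160, 16000  # 100 frames per second

# Synthetic activation curve: a soft 3-frame bump every 50 frames
act = np.zeros(500)
for k in range(50, 500, 50):
    act[k - 1 : k + 2] = [0.6, 1.0, 0.6]

# Same smoothing and peak picking as pick_peaks() above
window = np.hamming(5)
window /= window.sum()
smoothed = np.convolve(act, window, mode="same")
peaks, _ = find_peaks(smoothed, height=0.5, distance=5)
print(peaks * hop / sr)  # [0.5 1.  1.5 ... 4.5]

Note that the bumps must span a few frames: the normalized 5-tap Hamming window scales an isolated single-frame spike down to roughly 0.45, just under the 0.5 height threshold.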
exp/baseline/model.py DELETED
@@ -1,62 +0,0 @@
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin


class ODCNN(nn.Module, PyTorchModelHubMixin):
    def __init__(self, dropout_rate=0.5):
        super().__init__()

        # Input: 3 channels, 80 mel bands
        # Conv 1: 3x7 (freq x time) filters -> 10 maps
        self.conv1 = nn.Conv2d(3, 10, kernel_size=(3, 7))
        self.relu1 = nn.ReLU()  # ReLU improvement
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1), stride=(3, 1))

        # Conv 2: 3x3 filters -> 20 maps
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(3, 3))
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1), stride=(3, 1))

        # Flatten size calculation based on architecture
        # (20 feature maps * 8 freq bands * 7 time frames)
        self.flatten_size = 20 * 8 * 7

        # Dropout on FC inputs
        self.dropout = nn.Dropout(p=dropout_rate)

        # 256 hidden units
        self.fc1 = nn.Linear(self.flatten_size, 256)
        self.relu_fc = nn.ReLU()

        # Output unit
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)

        x = x.view(x.size(0), -1)

        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu_fc(x)

        x = self.dropout(x)
        x = self.fc2(x)
        x = self.sigmoid(x)

        return x


if __name__ == "__main__":
    from torchinfo import summary

    model = ODCNN()
    summary(model, (1, 3, 80, 15))
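The flatten_size = 20 * 8 * 7 above can be verified with a quick shape walk-through on the (3, 80, 15) input; a standalone sketch, not part of the file:

import torch

x = torch.randn(1, 3, 80, 15)
x = torch.nn.Conv2d(3, 10, kernel_size=(3, 7))(x)  # -> (1, 10, 78, 9)
x = torch.nn.MaxPool2d((3, 1), stride=(3, 1))(x)  # -> (1, 10, 26, 9)
x = torch.nn.Conv2d(10, 20, kernel_size=(3, 3))(x)  # -> (1, 20, 24, 7)
x = torch.nn.MaxPool2d((3, 1), stride=(3, 1))(x)  # -> (1, 20, 8, 7)
print(x.flatten(1).shape)  # torch.Size([1, 1120]) == 20 * 8 * 7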
exp/baseline/train.py DELETED
@@ -1,183 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import argparse
import os

from .model import ODCNN
from .data import BeatTrackingDataset
from .utils import MultiViewSpectrogram
from ..data.load import ds


def train(target_type: str, output_dir: str):
    # Note: Paper uses SGD with momentum, dropout, and ReLU
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    BATCH_SIZE = 512
    EPOCHS = 50
    LR = 0.05
    MOMENTUM = 0.9
    NUM_WORKERS = 4

    print(f"--- Training model for target: {target_type} ---")
    print(f"Output directory: {output_dir}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # TensorBoard writer
    writer = SummaryWriter(log_dir=os.path.join(output_dir, "logs"))

    # Data - use existing train/test splits
    train_dataset = BeatTrackingDataset(ds["train"], target_type=target_type)
    val_dataset = BeatTrackingDataset(ds["test"], target_type=target_type)

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        prefetch_factor=4,
        persistent_workers=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        prefetch_factor=4,
        persistent_workers=True,
    )

    print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

    # Model
    model = ODCNN(dropout_rate=0.5).to(DEVICE)

    # GPU spectrogram preprocessor
    preprocessor = MultiViewSpectrogram(sample_rate=16000, hop_length=160).to(DEVICE)

    # Optimizer
    optimizer = optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    criterion = nn.BCELoss()  # Binary cross-entropy

    best_val_loss = float("inf")
    global_step = 0

    for epoch in range(EPOCHS):
        # Training
        model.train()
        total_train_loss = 0
        for waveform, y in tqdm(
            train_loader,
            desc=f"[{target_type}] Epoch {epoch + 1}/{EPOCHS} Train",
            leave=False,
        ):
            waveform, y = waveform.to(DEVICE), y.to(DEVICE)

            # Compute spectrogram on GPU
            with torch.no_grad():
                spec = preprocessor(waveform)  # (B, 3, 80, T)
                # Normalize
                mean = spec.mean(dim=(2, 3), keepdim=True)
                std = spec.std(dim=(2, 3), keepdim=True) + 1e-6
                spec = (spec - mean) / std
                # Extract center context (T should be ~15 frames)
                x = spec[:, :, :, 7:22]  # center 15 frames

            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            global_step += 1

            # Log batch loss
            writer.add_scalar("train/batch_loss", loss.item(), global_step)

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for waveform, y in tqdm(
                val_loader,
                desc=f"[{target_type}] Epoch {epoch + 1}/{EPOCHS} Val",
                leave=False,
            ):
                waveform, y = waveform.to(DEVICE), y.to(DEVICE)

                # Compute spectrogram on GPU
                spec = preprocessor(waveform)  # (B, 3, 80, T)
                # Normalize
                mean = spec.mean(dim=(2, 3), keepdim=True)
                std = spec.std(dim=(2, 3), keepdim=True) + 1e-6
                spec = (spec - mean) / std
                # Extract center context
                x = spec[:, :, :, 7:22]

                output = model(x)
                loss = criterion(output, y)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        # Log epoch metrics
        writer.add_scalar("train/epoch_loss", avg_train_loss, epoch)
        writer.add_scalar("val/loss", avg_val_loss, epoch)
        writer.add_scalar("train/learning_rate", scheduler.get_last_lr()[0], epoch)

        # Step the scheduler
        scheduler.step()

        print(
            f"[{target_type}] Epoch {epoch + 1}/{EPOCHS} - "
            f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}"
        )

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model.save_pretrained(output_dir)
            print(f"  -> Saved best model (val_loss: {best_val_loss:.4f})")

    writer.close()

    # Save final model
    final_dir = os.path.join(output_dir, "final")
    model.save_pretrained(final_dir)
    print(f"Saved final model to {final_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--target",
        type=str,
        choices=["beats", "downbeats"],
        default=None,
        help="Train a model for 'beats' or 'downbeats'. If not specified, trains both.",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="outputs/baseline",
        help="Directory to save model and logs",
    )
    args = parser.parse_args()

    # Determine which targets to train
    targets = [args.target] if args.target else ["beats", "downbeats"]

    for target in targets:
        output_dir = os.path.join(args.output_dir, target)
        train(target, output_dir)
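Unlike the fixed learning rate the paper's setup suggests, this script anneals LR = 0.05 with a cosine schedule over the 50 epochs. A standalone sketch of the closed form that CosineAnnealingLR applies (with eta_min = 0):

import math

lr0, T = 0.05, 50
lrs = [lr0 * (1 + math.cos(math.pi * t / T)) / 2 for t in range(T)]
print(lrs[0], lrs[25], round(lrs[49], 6))  # 0.05, 0.025, ~5e-05

Since the module uses package-relative imports, it is presumably launched as python -m exp.baseline.train --target beats (or with no --target to train both models).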
exp/baseline/utils.py DELETED
@@ -1,53 +0,0 @@
import torch
import torch.nn as nn
import torchaudio.transforms as T
import numpy as np


class MultiViewSpectrogram(nn.Module):
    def __init__(self, sample_rate=16000, n_mels=80, hop_length=160):
        super().__init__()
        # Window sizes: 23ms, 46ms, 93ms at 16 kHz
        self.win_lengths = [368, 736, 1488]
        self.transforms = nn.ModuleList()

        for win_len in self.win_lengths:
            n_fft = 2 ** int(np.ceil(np.log2(win_len)))
            mel = T.MelSpectrogram(
                sample_rate=sample_rate,
                n_fft=n_fft,
                win_length=win_len,
                hop_length=hop_length,
                f_min=27.5,
                f_max=16000.0,
                n_mels=n_mels,
                power=1.0,
                center=True,
            )
            self.transforms.append(mel)

    def forward(self, waveform):
        specs = []
        for transform in self.transforms:
            # Scale magnitudes logarithmically
            s = transform(waveform)
            s = torch.log(s + 1e-9)
            specs.append(s)
        return torch.stack(specs, dim=1)


def extract_context(spec, center_frame, context=7):
    # Context of +/- 70ms (7 frames)
    channels, n_mels, total_time = spec.shape
    start = center_frame - context
    end = center_frame + context + 1

    pad_left = max(0, -start)
    pad_right = max(0, end - total_time)

    if pad_left > 0 or pad_right > 0:
        spec = torch.nn.functional.pad(spec, (pad_left, pad_right))
        start += pad_left
        end += pad_left

    return spec[:, :, start:end]
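For reference, each window length is padded up to the next power of two for the FFT; a standalone sketch of the sizes this yields:

import numpy as np

for win_len in [368, 736, 1488]:
    n_fft = 2 ** int(np.ceil(np.log2(win_len)))
    print(f"{win_len / 16:.0f} ms window -> n_fft = {n_fft}")
# 23 ms window -> n_fft = 512
# 46 ms window -> n_fft = 1024
# 93 ms window -> n_fft = 2048

One caveat worth noting: f_max=16000.0 is above the 8 kHz Nyquist frequency for 16 kHz audio, so the highest mel filters come out empty and the effective analysis range ends at 8 kHz; torchaudio accepts this but may warn about empty filterbanks.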