File size: 5,052 Bytes

344b95e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "shellscript"
    }
   },
   "outputs": [],
   "source": [
    "# Install every package this notebook imports. The %pip magic targets the\n",
    "# environment of the running kernel rather than whatever `pip` is first on PATH.\n",
    "# accelerate is needed by the transformers Trainer when training with PyTorch.\n",
    "%pip install -q transformers accelerate torch librosa pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WhisperForAudioClassification\n",
    "\n",
    "# Fetch the pre-trained Whisper checkpoint wrapped with an audio-classification head\n",
    "model = WhisperForAudioClassification.from_pretrained(\n",
    "    pretrained_model_name_or_path=\"openai/whisper-medium\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the dataset index; later cells look up the 'file_path' and 'label' columns\n",
    "df = pd.read_csv(filepath_or_buffer='dataset.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WhisperProcessor\n",
    "\n",
    "# Load the processor published alongside the same checkpoint as the model\n",
    "processor = WhisperProcessor.from_pretrained(\n",
    "    pretrained_model_name_or_path=\"openai/whisper-medium\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import librosa\n",
    "import torch\n",
    "\n",
    "# Sampling rate (Hz) the Whisper feature extractor expects\n",
    "TARGET_SR = 16000\n",
    "\n",
    "\n",
    "class LispDataset(torch.utils.data.Dataset):\n",
    "    \"\"\"Map-style dataset that turns each row of `df` into one training example.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a 'file_path' column (audio file to decode) and a\n",
    "            'label' column (classification target).\n",
    "        feature_extractor: Callable that converts a 1-D waveform into Whisper\n",
    "            input features. When omitted, the feature extractor of the\n",
    "            notebook-level `processor` is used.\n",
    "        target_sr: Sampling rate in Hz at which audio files are decoded.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, df, feature_extractor=None, target_sr=TARGET_SR):\n",
    "        self.df = df\n",
    "        # Keep `LispDataset(df)` working by falling back to the processor that\n",
    "        # was loaded in an earlier cell.\n",
    "        if feature_extractor is None:\n",
    "            feature_extractor = processor.feature_extractor\n",
    "        self.feature_extractor = feature_extractor\n",
    "        self.target_sr = target_sr\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Return the number of rows (examples) in the DataFrame.\"\"\"\n",
    "        return len(self.df)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        \"\"\"Load the audio for row `idx` and return its features and label.\n",
    "\n",
    "        Returns:\n",
    "            dict with 'input_features' (a float tensor of log-mel features) and\n",
    "            'labels' (the raw value of the row's 'label' column).\n",
    "        \"\"\"\n",
    "        row = self.df.iloc[idx]\n",
    "        audio_path = row['file_path']\n",
    "        # NOTE(review): assumes 'label' already holds integer class ids that the\n",
    "        # Trainer's collator can stack into a tensor - confirm against the CSV.\n",
    "        label = row['label']\n",
    "\n",
    "        # Decode directly at the target rate: librosa resamples while loading, so a\n",
    "        # second resampling pass is unnecessary.\n",
    "        audio, _ = librosa.load(audio_path, sr=self.target_sr)\n",
    "\n",
    "        # The feature extractor pads or truncates the clip to the fixed window the\n",
    "        # encoder expects and applies the same log-mel transform used when the\n",
    "        # checkpoint was pre-trained, so no manual padding is needed here.\n",
    "        extracted = self.feature_extractor(\n",
    "            audio, sampling_rate=self.target_sr, return_tensors='pt'\n",
    "        )\n",
    "        # Drop the batch dimension added by the extractor (batch of one clip)\n",
    "        input_features = extracted.input_features[0]\n",
    "\n",
    "        # Keys match the keyword arguments of the model's forward pass\n",
    "        return {'input_features': input_features, 'labels': label}\n",
    "\n",
    "\n",
    "# Create the training dataset\n",
    "train_dataset = LispDataset(df, feature_extractor=processor.feature_extractor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import TrainingArguments\n",
    "\n",
    "# fp16 mixed precision targets GPU training, so requesting it together with\n",
    "# use_cpu=True is contradictory (and is rejected by some transformers / PyTorch\n",
    "# versions). Derive both flags from what the machine actually offers.\n",
    "has_cuda = torch.cuda.is_available()\n",
    "\n",
    "# Training arguments (adjust learning rate as needed)\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./results\",\n",
    "    num_train_epochs=10,\n",
    "    per_device_train_batch_size=2,\n",
    "    learning_rate=5e-5,\n",
    "    fp16=has_cuda,\n",
    "    use_cpu=not has_cuda,\n",
    "    warmup_ratio=0.1,\n",
    "    # Only consulted when evaluation is configured; no eval set is passed yet.\n",
    "    metric_for_best_model=\"accuracy\",\n",
    "    gradient_accumulation_steps=1,  # one optimizer update per batch (no accumulation)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.optim import AdamW  # PyTorch's own AdamW implementation\n",
    "\n",
    "# Optimise every model parameter, reusing the learning rate from training_args\n",
    "optimizer = AdamW(\n",
    "    params=model.parameters(),\n",
    "    lr=training_args.learning_rate,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "from torch.optim.lr_scheduler import LambdaLR\n",
    "\n",
    "# Trainer advances the scheduler once per optimizer update (not once per epoch),\n",
    "# so the schedule is expressed in update steps. Passing a custom scheduler also\n",
    "# bypasses the Trainer's built-in warm-up, so warmup_ratio is applied here.\n",
    "batches_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)\n",
    "updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)\n",
    "total_updates = math.ceil(training_args.num_train_epochs * updates_per_epoch)\n",
    "warmup_updates = math.ceil(total_updates * training_args.warmup_ratio)\n",
    "\n",
    "\n",
    "def lambda1(step):\n",
    "    \"\"\"Return the multiplier applied to the base learning rate at update `step`.\n",
    "\n",
    "    The factor rises linearly from 0 to 1 during warm-up and then falls linearly\n",
    "    to 0 at the last update, so it always stays within [0, 1].\n",
    "    \"\"\"\n",
    "    if step < warmup_updates:\n",
    "        return step / max(1, warmup_updates)\n",
    "    remaining = total_updates - step\n",
    "    return max(0.0, remaining / max(1, total_updates - warmup_updates))\n",
    "\n",
    "\n",
    "# A single callable is applied to every parameter group of the optimizer\n",
    "scheduler = LambdaLR(optimizer, lr_lambda=lambda1)\n",
    "\n",
    "optimizertuple = (optimizer, scheduler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Trainer\n",
    "\n",
    "# Wire the model, data and (optimizer, scheduler) pair into a Trainer\n",
    "trainer = Trainer(\n",
    "    args=training_args,\n",
    "    model=model,\n",
    "    optimizers=optimizertuple,  # (optimizer, lr_scheduler) built in the cells above\n",
    "    train_dataset=train_dataset,\n",
    ")\n",
    "\n",
    "# Run the training loop\n",
    "trainer.train()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}