ArseniyPerchik committed
Commit d250443 · 1 Parent(s): 9bf5ef6
app.py CHANGED
@@ -7,6 +7,7 @@ import matplotlib.animation as animation
7
  import tempfile
8
  import torch
9
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 
10
  import torchaudio
11
  import torchaudio.transforms as T
12
  from matplotlib.patches import Circle
@@ -20,22 +21,10 @@ from types import SimpleNamespace
20
  # ---------------------------- #
21
  # models
22
  # a model for the automatic-speech-recognition task
23
- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
24
- # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
25
- # model_id = "./models_for_proj/librispeech_asr_dummy"
26
- # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
27
- # model.to(device)
28
- # processor = AutoProcessor.from_pretrained(model_id)
29
- # asr_pipe = pipeline(
30
- # "automatic-speech-recognition",
31
- # model=model,
32
- # tokenizer=processor.tokenizer,
33
- # feature_extractor=processor.feature_extractor,
34
- # max_new_tokens=128,
35
- # torch_dtype=torch_dtype,
36
- # device=device,
37
- # )
38
- asr_pipe_default = pipeline("automatic-speech-recognition")
39
 
40
 
41
  # env variables
@@ -62,10 +51,10 @@ r_coverage = 10
62
  # ---------------------------- #
63
  def create_standing_animation():
64
  path = [(agent_pos.x, agent_pos.y)]
65
- return create_animation(path, r_coverage)
66
 
67
 
68
- def create_animation(path, r_coverage):
69
  # path = [(i,i) for i in range(90)]
70
  # targets_x = [20, 80, 80, 20]
71
  # targets_y = [20, 20, 80, 80]
@@ -135,7 +124,7 @@ def move_agent(target_input: int):
135
  agent_pos.x = path[-1][0]
136
  agent_pos.y = path[-1][1]
137
  # create animation
138
- video_output = create_animation(path, r_coverage)
139
 
140
  # update status
141
  status = f'Went to target {target_input}.'
@@ -147,24 +136,33 @@ def load_image_on_start():
147
  return np.random.rand(700, 700)
148
 
149
  def get_text_request(audio_input):
 
150
  audio_input_sr, audio_input_np = audio_input
151
  audio_input_t = torch.tensor(audio_input_np, dtype=torch.float32)
152
  target_sr = 16000
153
  resampler = T.Resample(audio_input_sr, target_sr, dtype=audio_input_t.dtype)
154
  resampled_audio_input_t: torch.Tensor = resampler(audio_input_t)
155
  resampled_audio_input_np = resampled_audio_input_t.numpy()
156
- # result = asr_pipe(resampled_audio_input_np)
157
- result = asr_pipe_default(resampled_audio_input_np)
158
- return result["text"]
159
 
160
  def get_target_from_request(request_text):
161
- if 'ONE' in request_text:
162
  return 1
163
- if 'TWO' in request_text:
164
  return 2
165
- if 'THREE' in request_text:
166
  return 3
167
- if 'FOUR' in request_text:
168
  return 4
169
  return 'NO TARGET FOUND'
170
 
@@ -190,6 +188,7 @@ def create_demo():
190
  - to insert a model that understands the desired goal, rather than a simple keyword-matching function that can produce false goals
191
  - to incorporate a longer chain of goals; for example, go there, pick up the package, then come back
192
  - to introduce additional learnt capabilities
 
193
  """)
194
 
195
  # EVENTS:
@@ -208,3 +207,21 @@ def create_demo():
208
  # ---------------------------- #
209
  demo = create_demo()
210
  demo.launch()
7
  import tempfile
8
  import torch
9
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
10
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
11
  import torchaudio
12
  import torchaudio.transforms as T
13
  from matplotlib.patches import Circle
 
21
  # ---------------------------- #
22
  # models
23
  # a model for the automatic-speech-recognition task
24
+ # asr_pipe_default = pipeline("automatic-speech-recognition")
25
+ save_dir = './models_for_proj/wav2vec2-base-960h'
26
+ model = Wav2Vec2ForCTC.from_pretrained(save_dir)
27
+ processor = Wav2Vec2Processor.from_pretrained(save_dir)
28
 
29
 
30
  # env variables
 
51
  # ---------------------------- #
52
  def create_standing_animation():
53
  path = [(agent_pos.x, agent_pos.y)]
54
+ return create_animation(path)
55
 
56
 
57
+ def create_animation(path):
58
  # path = [(i,i) for i in range(90)]
59
  # targets_x = [20, 80, 80, 20]
60
  # targets_y = [20, 20, 80, 80]
 
124
  agent_pos.x = path[-1][0]
125
  agent_pos.y = path[-1][1]
126
  # create animation
127
+ video_output = create_animation(path)
128
 
129
  # update status
130
  status = f'Went to target {target_input}.'
 
136
  return np.random.rand(700, 700)
137
 
138
  def get_text_request(audio_input):
139
+ # --------------------------------------------------------------------------- #
140
  audio_input_sr, audio_input_np = audio_input
141
  audio_input_t = torch.tensor(audio_input_np, dtype=torch.float32)
142
  target_sr = 16000
143
  resampler = T.Resample(audio_input_sr, target_sr, dtype=audio_input_t.dtype)
144
  resampled_audio_input_t: torch.Tensor = resampler(audio_input_t)
145
  resampled_audio_input_np = resampled_audio_input_t.numpy()
146
+ # --------------------------------------------------------------------------- #
147
+ # result = asr_pipe_default(resampled_audio_input_np)
148
+ inputs = processor(resampled_audio_input_np, sampling_rate=16000, return_tensors="pt", padding=True)
149
+ # Inference
150
+ with torch.no_grad():
151
+ logits = model(**inputs).logits
152
+ # Decode
153
+ predicted_ids = torch.argmax(logits, dim=-1)
154
+ transcription = processor.decode(predicted_ids[0])
155
+ # print("Transcription:", transcription)
156
+ return transcription
157
 
158
  def get_target_from_request(request_text):
159
+ if any(item in request_text for item in ['ONE', 'FIRST']):
160
  return 1
161
+ if any(item in request_text for item in ['TWO', 'SECOND']):
162
  return 2
163
+ if any(item in request_text for item in ['THREE', 'THIRD']):
164
  return 3
165
+ if any(item in request_text for item in ['FOUR', 'FOURTH']):
166
  return 4
167
  return 'NO TARGET FOUND'
168
 
 
188
  - to insert a model that understands the desired goal, rather than a simple keyword-matching function that can produce false goals
189
  - to incorporate a longer chain of goals; for example, go there, pick up the package, then come back
190
  - to introduce additional learnt capabilities
191
+ - to build more complex environments where the movement is not so straightforward
192
  """)
193
 
194
  # EVENTS:
 
207
  # ---------------------------- #
208
  demo = create_demo()
209
  demo.launch()
210
+
211
+
212
+
213
+ # device = "cuda:0" if torch.cuda.is_available() else "cpu"
214
+ # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
215
+ # model_id = "./models_for_proj/librispeech_asr_dummy"
216
+ # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
217
+ # model.to(device)
218
+ # processor = AutoProcessor.from_pretrained(model_id)
219
+ # asr_pipe = pipeline(
220
+ # "automatic-speech-recognition",
221
+ # model=model,
222
+ # tokenizer=processor.tokenizer,
223
+ # feature_extractor=processor.feature_extractor,
224
+ # max_new_tokens=128,
225
+ # torch_dtype=torch_dtype,
226
+ # device=device,
227
+ # )
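
For reference, the ASR flow this commit switches app.py to can be exercised on its own, outside Gradio. The sketch below is a minimal consolidation of the added lines above together with the "TO SAVE THE MODEL" / "TO REUSE IT" cells of draft_1.ipynb; it assumes the one-off save step has been run first so that ./models_for_proj/wav2vec2-base-960h exists, and that audio arrives as the (sample_rate, numpy_array) tuple the Gradio audio component passes to get_text_request. It is a sketch of the idea, not the app itself.

# One-off: download facebook/wav2vec2-base-960h and cache it locally, mirroring
# the "TO SAVE THE MODEL" cells in draft_1.ipynb (assumed to be run once, offline).
import torch
import torchaudio.transforms as T
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

save_dir = "./models_for_proj/wav2vec2-base-960h"
Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").save_pretrained(save_dir)
Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h").save_pretrained(save_dir)

# Load from the local directory, as app.py now does at import time.
model = Wav2Vec2ForCTC.from_pretrained(save_dir)
processor = Wav2Vec2Processor.from_pretrained(save_dir)

def transcribe(audio_input):
    # audio_input is a (sample_rate, np.ndarray) tuple, the format gr.Audio provides.
    audio_sr, audio_np = audio_input
    audio_t = torch.tensor(audio_np, dtype=torch.float32)
    # Wav2Vec2 expects 16 kHz input, so resample whatever rate was recorded.
    resampler = T.Resample(audio_sr, 16000, dtype=audio_t.dtype)
    resampled_np = resampler(audio_t).numpy()
    inputs = processor(resampled_np, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Greedy CTC decoding: argmax over the vocabulary, then collapse repeats and blanks.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])

The transcription comes back upper-case (THIS IS A SIMPLE TEXT in the notebook run below), which is why get_target_from_request matches upper-case keywords such as 'ONE' and 'FIRST'.
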
draft_1.ipynb CHANGED
@@ -1,618 +1,229 @@
1
  {
2
  "cells": [
3
  {
4
- "cell_type": "code",
5
- "id": "bf22c176a849df32",
6
  "metadata": {
7
  "ExecuteTime": {
8
- "end_time": "2025-04-21T06:10:01.065321Z",
9
- "start_time": "2025-04-21T06:10:01.060267Z"
10
  }
11
  },
 
12
  "source": [
13
- "from transformers import pipeline\n",
14
- "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer, pipeline, AutoFeatureExtractor\n",
15
  "import torchaudio\n",
16
- "import torchaudio.transforms as T"
17
  ],
 
18
  "outputs": [],
19
- "execution_count": 32
20
  },
21
  {
22
  "metadata": {
23
- "collapsed": true,
24
  "ExecuteTime": {
25
- "end_time": "2025-04-21T05:03:48.582040Z",
26
- "start_time": "2025-04-21T04:51:46.343821Z"
27
  }
28
  },
29
  "cell_type": "code",
 
 
30
  "outputs": [
31
- {
32
- "data": {
33
- "text/plain": [
34
- "model.safetensors: 0%| | 0.00/151M [00:00<?, ?B/s]"
35
- ],
36
- "application/vnd.jupyter.widget-view+json": {
37
- "version_major": 2,
38
- "version_minor": 0,
39
- "model_id": "51ffb4afb57446278c28d690aa1b22e4"
40
- }
41
- },
42
- "metadata": {},
43
- "output_type": "display_data"
44
- },
45
- {
46
- "data": {
47
- "text/plain": [
48
- "generation_config.json: 0%| | 0.00/3.75k [00:00<?, ?B/s]"
49
- ],
50
- "application/vnd.jupyter.widget-view+json": {
51
- "version_major": 2,
52
- "version_minor": 0,
53
- "model_id": "86143c7cd15341e39db1e81231d4fd7e"
54
- }
55
- },
56
- "metadata": {},
57
- "output_type": "display_data"
58
- },
59
- {
60
- "data": {
61
- "text/plain": [
62
- "tokenizer_config.json: 0%| | 0.00/283k [00:00<?, ?B/s]"
63
- ],
64
- "application/vnd.jupyter.widget-view+json": {
65
- "version_major": 2,
66
- "version_minor": 0,
67
- "model_id": "01d06664af1c4c169175cd38b00fa78e"
68
- }
69
- },
70
- "metadata": {},
71
- "output_type": "display_data"
72
- },
73
- {
74
- "data": {
75
- "text/plain": [
76
- "vocab.json: 0%| | 0.00/836k [00:00<?, ?B/s]"
77
- ],
78
- "application/vnd.jupyter.widget-view+json": {
79
- "version_major": 2,
80
- "version_minor": 0,
81
- "model_id": "8833df76fdf24e92bf51c748aa71bc48"
82
- }
83
- },
84
- "metadata": {},
85
- "output_type": "display_data"
86
- },
87
- {
88
- "data": {
89
- "text/plain": [
90
- "tokenizer.json: 0%| | 0.00/2.48M [00:00<?, ?B/s]"
91
- ],
92
- "application/vnd.jupyter.widget-view+json": {
93
- "version_major": 2,
94
- "version_minor": 0,
95
- "model_id": "3f5dfd342c574c2698f42c51a567a77e"
96
- }
97
- },
98
- "metadata": {},
99
- "output_type": "display_data"
100
- },
101
- {
102
- "data": {
103
- "text/plain": [
104
- "merges.txt: 0%| | 0.00/494k [00:00<?, ?B/s]"
105
- ],
106
- "application/vnd.jupyter.widget-view+json": {
107
- "version_major": 2,
108
- "version_minor": 0,
109
- "model_id": "e9363f15071d48878ef8230bd6c39177"
110
- }
111
- },
112
- "metadata": {},
113
- "output_type": "display_data"
114
- },
115
- {
116
- "data": {
117
- "text/plain": [
118
- "normalizer.json: 0%| | 0.00/52.7k [00:00<?, ?B/s]"
119
- ],
120
- "application/vnd.jupyter.widget-view+json": {
121
- "version_major": 2,
122
- "version_minor": 0,
123
- "model_id": "d0e777a74b9a47f3ad6a18254825122b"
124
- }
125
- },
126
- "metadata": {},
127
- "output_type": "display_data"
128
- },
129
- {
130
- "data": {
131
- "text/plain": [
132
- "added_tokens.json: 0%| | 0.00/34.6k [00:00<?, ?B/s]"
133
- ],
134
- "application/vnd.jupyter.widget-view+json": {
135
- "version_major": 2,
136
- "version_minor": 0,
137
- "model_id": "fd44e41876984d81a593d55e07e71be6"
138
- }
139
- },
140
- "metadata": {},
141
- "output_type": "display_data"
142
- },
143
- {
144
- "data": {
145
- "text/plain": [
146
- "special_tokens_map.json: 0%| | 0.00/2.19k [00:00<?, ?B/s]"
147
- ],
148
- "application/vnd.jupyter.widget-view+json": {
149
- "version_major": 2,
150
- "version_minor": 0,
151
- "model_id": "c3baafd24e1c4c4d9dcaf5a4715e846e"
152
- }
153
- },
154
- "metadata": {},
155
- "output_type": "display_data"
156
- },
157
- {
158
- "data": {
159
- "text/plain": [
160
- "preprocessor_config.json: 0%| | 0.00/185k [00:00<?, ?B/s]"
161
- ],
162
- "application/vnd.jupyter.widget-view+json": {
163
- "version_major": 2,
164
- "version_minor": 0,
165
- "model_id": "3a671889c0504b50bcff2aec93497d78"
166
- }
167
- },
168
- "metadata": {},
169
- "output_type": "display_data"
170
- },
171
  {
172
  "name": "stderr",
173
  "output_type": "stream",
174
  "text": [
175
  "Device set to use mps:0\n"
176
  ]
177
  }
178
  ],
179
- "execution_count": 4,
180
- "source": [
181
- "\n",
182
- "pipe = pipeline(model=\"openai/whisper-tiny\", task=\"automatic-speech-recognition\")\n"
183
- ],
184
- "id": "initial_id"
185
- },
186
- {
187
- "metadata": {},
188
- "cell_type": "code",
189
- "outputs": [],
190
- "execution_count": null,
191
- "source": [
192
- "# Load audio file\n",
193
- "waveform_1, sample_rate = torchaudio.load(\"sample.wav\")\n",
194
- "# Target sampling rate (e.g., 16000 Hz for Whisper)\n",
195
- "target_sr = 16000\n",
196
- "\n",
197
- "resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)\n",
198
- "waveform = resampler(waveform_1)\n",
199
- "waveform_np = waveform.squeeze().numpy()\n",
200
- "\n",
201
- "print(waveform.shape) # (channels, samples) — usually (1, N)\n",
202
- "print(sample_rate)\n",
203
- "print(waveform_np)"
204
- ],
205
- "id": "dc202f529230fa87"
206
- },
207
- {
208
- "metadata": {
209
- "ExecuteTime": {
210
- "end_time": "2025-04-21T05:08:38.144954Z",
211
- "start_time": "2025-04-21T05:08:38.087644Z"
212
- }
213
- },
214
- "cell_type": "code",
215
- "source": [
216
- "save_dir = \"./models_for_proj/whisper-tiny\"\n",
217
- "device = 'cpu'\n",
218
- "pipe.generation_config.save_pretrained(save_dir)\n",
219
- "pipe.tokenizer.save_pretrained(save_dir)\n",
220
- "pipe.feature_extractor.save_pretrained(save_dir)\n"
221
- ],
222
- "id": "ed09605af0b78939",
223
- "outputs": [
224
- {
225
- "data": {
226
- "text/plain": [
227
- "['./models_for_proj/whisper-tiny/preprocessor_config.json']"
228
- ]
229
- },
230
- "execution_count": 6,
231
- "metadata": {},
232
- "output_type": "execute_result"
233
- }
234
- ],
235
- "execution_count": 6
236
- },
237
- {
238
- "metadata": {
239
- "ExecuteTime": {
240
- "end_time": "2025-04-21T05:35:59.540770Z",
241
- "start_time": "2025-04-21T05:35:59.476164Z"
242
- }
243
- },
244
- "cell_type": "code",
245
- "source": [
246
- "\n",
247
- "# model = AutoModelForSpeechSeq2Seq.from_pretrained(save_dir, device=device)\n",
248
- "# model.config.forced_decoder_ids = None\n",
249
- "# processor = AutoProcessor.from_pretrained(save_dir, device=device)\n",
250
- "# tokenizer = AutoTokenizer.from_pretrained(save_dir, device=device)\n",
251
- "# feature_extractor = AutoFeatureExtractor.from_pretrained(save_dir, device=device)\n",
252
- "# pipe = pipeline(\"automatic-speech-recognition\", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)\n",
253
- "# result = pipe(\"sample.wav\")\n",
254
- "# result[\"text\"]"
255
- ],
256
- "id": "1dcd38e5ca08781b",
257
- "outputs": [
258
- {
259
- "ename": "TypeError",
260
- "evalue": "WhisperForConditionalGeneration.__init__() got an unexpected keyword argument 'device'",
261
- "output_type": "error",
262
- "traceback": [
263
- "\u001B[31m---------------------------------------------------------------------------\u001B[39m",
264
- "\u001B[31mTypeError\u001B[39m Traceback (most recent call last)",
265
- "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[30]\u001B[39m\u001B[32m, line 3\u001B[39m\n\u001B[32m 1\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mtransformers\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer, pipeline, AutoFeatureExtractor\n\u001B[32m 2\u001B[39m device = \u001B[33m'\u001B[39m\u001B[33mcpu\u001B[39m\u001B[33m'\u001B[39m\n\u001B[32m----> \u001B[39m\u001B[32m3\u001B[39m model = \u001B[43mAutoModelForSpeechSeq2Seq\u001B[49m\u001B[43m.\u001B[49m\u001B[43mfrom_pretrained\u001B[49m\u001B[43m(\u001B[49m\u001B[43msave_dir\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdevice\u001B[49m\u001B[43m=\u001B[49m\u001B[43mdevice\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 4\u001B[39m model.config.forced_decoder_ids = \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[32m 5\u001B[39m processor = AutoProcessor.from_pretrained(save_dir, device=device)\n",
266
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py:573\u001B[39m, in \u001B[36m_BaseAutoModelClass.from_pretrained\u001B[39m\u001B[34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001B[39m\n\u001B[32m 571\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(config) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mcls\u001B[39m._model_mapping.keys():\n\u001B[32m 572\u001B[39m model_class = _get_model_class(config, \u001B[38;5;28mcls\u001B[39m._model_mapping)\n\u001B[32m--> \u001B[39m\u001B[32m573\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mmodel_class\u001B[49m\u001B[43m.\u001B[49m\u001B[43mfrom_pretrained\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 574\u001B[39m \u001B[43m \u001B[49m\u001B[43mpretrained_model_name_or_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43mmodel_args\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mconfig\u001B[49m\u001B[43m=\u001B[49m\u001B[43mconfig\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mhub_kwargs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\n\u001B[32m 575\u001B[39m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 576\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[32m 577\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mUnrecognized configuration class \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mconfig.\u001B[34m__class__\u001B[39m\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m for this kind of AutoModel: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mcls\u001B[39m.\u001B[34m__name__\u001B[39m\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m.\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m\"\u001B[39m\n\u001B[32m 578\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mModel type should be one of \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m, \u001B[39m\u001B[33m'\u001B[39m.join(c.\u001B[34m__name__\u001B[39m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mfor\u001B[39;00m\u001B[38;5;250m \u001B[39mc\u001B[38;5;250m \u001B[39m\u001B[38;5;129;01min\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28mcls\u001B[39m._model_mapping.keys())\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m.\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 579\u001B[39m )\n",
267
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py:272\u001B[39m, in \u001B[36mrestore_default_torch_dtype.<locals>._wrapper\u001B[39m\u001B[34m(*args, **kwargs)\u001B[39m\n\u001B[32m 270\u001B[39m old_dtype = torch.get_default_dtype()\n\u001B[32m 271\u001B[39m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[32m--> \u001B[39m\u001B[32m272\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfunc\u001B[49m\u001B[43m(\u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 273\u001B[39m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[32m 274\u001B[39m torch.set_default_dtype(old_dtype)\n",
268
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py:4401\u001B[39m, in \u001B[36mPreTrainedModel.from_pretrained\u001B[39m\u001B[34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)\u001B[39m\n\u001B[32m 4395\u001B[39m config = \u001B[38;5;28mcls\u001B[39m._autoset_attn_implementation(\n\u001B[32m 4396\u001B[39m config, use_flash_attention_2=use_flash_attention_2, torch_dtype=torch_dtype, device_map=device_map\n\u001B[32m 4397\u001B[39m )\n\u001B[32m 4399\u001B[39m \u001B[38;5;28;01mwith\u001B[39;00m ContextManagers(model_init_context):\n\u001B[32m 4400\u001B[39m \u001B[38;5;66;03m# Let's make sure we don't run the init function of buffer modules\u001B[39;00m\n\u001B[32m-> \u001B[39m\u001B[32m4401\u001B[39m model = \u001B[38;5;28;43mcls\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mconfig\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43mmodel_args\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mmodel_kwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 4403\u001B[39m \u001B[38;5;66;03m# Make sure to tie the weights correctly\u001B[39;00m\n\u001B[32m 4404\u001B[39m model.tie_weights()\n",
269
- "\u001B[31mTypeError\u001B[39m: WhisperForConditionalGeneration.__init__() got an unexpected keyword argument 'device'"
270
- ]
271
- }
272
- ],
273
- "execution_count": 30
274
  },
275
  {
276
  "metadata": {
277
  "ExecuteTime": {
278
- "end_time": "2025-04-21T06:13:00.420733Z",
279
- "start_time": "2025-04-21T06:13:00.033330Z"
280
  }
281
  },
282
  "cell_type": "code",
283
  "source": [
284
- "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
285
- "# load dummy dataset and read audio files\n",
286
  "\n",
287
- "# input\n",
288
  "waveform, sample_rate = torchaudio.load(\"sample.wav\")\n",
289
  "target_sr = 16000\n",
290
  "resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)\n",
291
  "waveform = resampler(waveform)\n",
292
  "waveform_np = waveform.squeeze().numpy()\n",
 
293
  "\n",
294
- "\n",
295
- "processor = WhisperProcessor.from_pretrained(save_dir)\n",
296
- "model = WhisperForConditionalGeneration.from_pretrained(save_dir)\n",
297
- "model.config.forced_decoder_ids = None\n",
298
- "\n",
299
- "input_features = processor(waveform_np, sampling_rate=target_sr, return_tensors=\"pt\", device=device).input_features\n",
300
- "\n",
301
- "# generate token ids\n",
302
- "predicted_ids = model.generate(input_features)\n",
303
- "# decode token ids to text\n",
304
- "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)\n",
305
- "# ['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|>']\n",
306
- "print(transcription)\n",
307
- "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
308
- "# [' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']\n",
309
- "print(transcription)"
310
  ],
311
- "id": "b0865456fed26d31",
312
  "outputs": [
313
  {
314
- "name": "stderr",
315
  "output_type": "stream",
316
  "text": [
317
- "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
318
- ]
319
- },
320
- {
321
- "ename": "ValueError",
322
- "evalue": "You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument in favour of `input_ids` or `decoder_input_ids` respectively.",
323
- "output_type": "error",
324
- "traceback": [
325
- "\u001B[31m---------------------------------------------------------------------------\u001B[39m",
326
- "\u001B[31mValueError\u001B[39m Traceback (most recent call last)",
327
- "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[34]\u001B[39m\u001B[32m, line 19\u001B[39m\n\u001B[32m 16\u001B[39m input_features = processor(waveform_np, sampling_rate=target_sr, return_tensors=\u001B[33m\"\u001B[39m\u001B[33mpt\u001B[39m\u001B[33m\"\u001B[39m, device=device).input_features\n\u001B[32m 18\u001B[39m \u001B[38;5;66;03m# generate token ids\u001B[39;00m\n\u001B[32m---> \u001B[39m\u001B[32m19\u001B[39m predicted_ids = \u001B[43mmodel\u001B[49m\u001B[43m.\u001B[49m\u001B[43mgenerate\u001B[49m\u001B[43m(\u001B[49m\u001B[43minput_features\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 20\u001B[39m \u001B[38;5;66;03m# decode token ids to text\u001B[39;00m\n\u001B[32m 21\u001B[39m transcription = processor.batch_decode(predicted_ids, skip_special_tokens=\u001B[38;5;28;01mFalse\u001B[39;00m)\n",
328
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/models/whisper/generation_whisper.py:774\u001B[39m, in \u001B[36mWhisperGenerationMixin.generate\u001B[39m\u001B[34m(self, input_features, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, return_timestamps, task, language, is_multilingual, prompt_ids, prompt_condition_type, condition_on_prev_tokens, temperature, compression_ratio_threshold, logprob_threshold, no_speech_threshold, num_segment_frames, attention_mask, time_precision, time_precision_features, return_token_timestamps, return_segments, return_dict_in_generate, force_unique_generate_call, **kwargs)\u001B[39m\n\u001B[32m 765\u001B[39m proc.set_begin_index(decoder_input_ids.shape[-\u001B[32m1\u001B[39m])\n\u001B[32m 767\u001B[39m \u001B[38;5;66;03m# 6.6 Run generate with fallback\u001B[39;00m\n\u001B[32m 768\u001B[39m (\n\u001B[32m 769\u001B[39m seek_sequences,\n\u001B[32m 770\u001B[39m seek_outputs,\n\u001B[32m 771\u001B[39m should_skip,\n\u001B[32m 772\u001B[39m do_condition_on_prev_tokens,\n\u001B[32m 773\u001B[39m model_output_type,\n\u001B[32m--> \u001B[39m\u001B[32m774\u001B[39m ) = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mgenerate_with_fallback\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 775\u001B[39m \u001B[43m \u001B[49m\u001B[43msegment_input\u001B[49m\u001B[43m=\u001B[49m\u001B[43msegment_input\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 776\u001B[39m \u001B[43m \u001B[49m\u001B[43mdecoder_input_ids\u001B[49m\u001B[43m=\u001B[49m\u001B[43mdecoder_input_ids\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 777\u001B[39m \u001B[43m \u001B[49m\u001B[43mcur_bsz\u001B[49m\u001B[43m=\u001B[49m\u001B[43mcur_bsz\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 778\u001B[39m \u001B[43m \u001B[49m\u001B[43mbatch_idx_map\u001B[49m\u001B[43m=\u001B[49m\u001B[43mbatch_idx_map\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 779\u001B[39m \u001B[43m \u001B[49m\u001B[43mseek\u001B[49m\u001B[43m=\u001B[49m\u001B[43mseek\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 780\u001B[39m \u001B[43m \u001B[49m\u001B[43mnum_segment_frames\u001B[49m\u001B[43m=\u001B[49m\u001B[43mnum_segment_frames\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 781\u001B[39m \u001B[43m \u001B[49m\u001B[43mmax_frames\u001B[49m\u001B[43m=\u001B[49m\u001B[43mmax_frames\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 782\u001B[39m \u001B[43m \u001B[49m\u001B[43mtemperatures\u001B[49m\u001B[43m=\u001B[49m\u001B[43mtemperatures\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 783\u001B[39m \u001B[43m \u001B[49m\u001B[43mgeneration_config\u001B[49m\u001B[43m=\u001B[49m\u001B[43mgeneration_config\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 784\u001B[39m \u001B[43m \u001B[49m\u001B[43mlogits_processor\u001B[49m\u001B[43m=\u001B[49m\u001B[43mlogits_processor\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 785\u001B[39m \u001B[43m \u001B[49m\u001B[43mstopping_criteria\u001B[49m\u001B[43m=\u001B[49m\u001B[43mstopping_criteria\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 786\u001B[39m \u001B[43m \u001B[49m\u001B[43mprefix_allowed_tokens_fn\u001B[49m\u001B[43m=\u001B[49m\u001B[43mprefix_allowed_tokens_fn\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 787\u001B[39m \u001B[43m \u001B[49m\u001B[43msynced_gpus\u001B[49m\u001B[43m=\u001B[49m\u001B[43msynced_gpus\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 788\u001B[39m \u001B[43m 
\u001B[49m\u001B[43mreturn_token_timestamps\u001B[49m\u001B[43m=\u001B[49m\u001B[43mreturn_token_timestamps\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 789\u001B[39m \u001B[43m \u001B[49m\u001B[43mdo_condition_on_prev_tokens\u001B[49m\u001B[43m=\u001B[49m\u001B[43mdo_condition_on_prev_tokens\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 790\u001B[39m \u001B[43m \u001B[49m\u001B[43mis_shortform\u001B[49m\u001B[43m=\u001B[49m\u001B[43mis_shortform\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 791\u001B[39m \u001B[43m \u001B[49m\u001B[43mbatch_size\u001B[49m\u001B[43m=\u001B[49m\u001B[43mbatch_size\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 792\u001B[39m \u001B[43m \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[43m=\u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 793\u001B[39m \u001B[43m \u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m=\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 794\u001B[39m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 796\u001B[39m \u001B[38;5;66;03m# 6.7 In every generated sequence, split by timestamp tokens and extract segments\u001B[39;00m\n\u001B[32m 797\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m i, seek_sequence \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(seek_sequences):\n",
329
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/models/whisper/generation_whisper.py:950\u001B[39m, in \u001B[36mWhisperGenerationMixin.generate_with_fallback\u001B[39m\u001B[34m(self, segment_input, decoder_input_ids, cur_bsz, batch_idx_map, seek, num_segment_frames, max_frames, temperatures, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, return_token_timestamps, do_condition_on_prev_tokens, is_shortform, batch_size, attention_mask, kwargs)\u001B[39m\n\u001B[32m 945\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m generate_kwargs.get(\u001B[33m\"\u001B[39m\u001B[33mencoder_outputs\u001B[39m\u001B[33m\"\u001B[39m) \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 946\u001B[39m generate_kwargs[\u001B[33m\"\u001B[39m\u001B[33mencoder_outputs\u001B[39m\u001B[33m\"\u001B[39m] = F.pad(\n\u001B[32m 947\u001B[39m generate_kwargs[\u001B[33m\"\u001B[39m\u001B[33mencoder_outputs\u001B[39m\u001B[33m\"\u001B[39m], (\u001B[32m0\u001B[39m, \u001B[32m0\u001B[39m, \u001B[32m0\u001B[39m, \u001B[32m0\u001B[39m, \u001B[32m0\u001B[39m, batch_size - cur_bsz), value=\u001B[32m0\u001B[39m\n\u001B[32m 948\u001B[39m )\n\u001B[32m--> \u001B[39m\u001B[32m950\u001B[39m seek_outputs = \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43mgenerate\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 951\u001B[39m \u001B[43m \u001B[49m\u001B[43msegment_input\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 952\u001B[39m \u001B[43m \u001B[49m\u001B[43mgeneration_config\u001B[49m\u001B[43m=\u001B[49m\u001B[43mgeneration_config\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 953\u001B[39m \u001B[43m \u001B[49m\u001B[43mlogits_processor\u001B[49m\u001B[43m=\u001B[49m\u001B[43mlogits_processor\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 954\u001B[39m \u001B[43m \u001B[49m\u001B[43mstopping_criteria\u001B[49m\u001B[43m=\u001B[49m\u001B[43mstopping_criteria\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 955\u001B[39m \u001B[43m \u001B[49m\u001B[43mprefix_allowed_tokens_fn\u001B[49m\u001B[43m=\u001B[49m\u001B[43mprefix_allowed_tokens_fn\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 956\u001B[39m \u001B[43m \u001B[49m\u001B[43msynced_gpus\u001B[49m\u001B[43m=\u001B[49m\u001B[43msynced_gpus\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 957\u001B[39m \u001B[43m \u001B[49m\u001B[43mdecoder_input_ids\u001B[49m\u001B[43m=\u001B[49m\u001B[43mdecoder_input_ids\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 958\u001B[39m \u001B[43m \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[43m=\u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 959\u001B[39m \u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mgenerate_kwargs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 960\u001B[39m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 962\u001B[39m model_output_type = \u001B[38;5;28mtype\u001B[39m(seek_outputs)\n\u001B[32m 964\u001B[39m \u001B[38;5;66;03m# post-process sequence tokens and outputs to be in list form\u001B[39;00m\n",
330
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py:116\u001B[39m, in \u001B[36mcontext_decorator.<locals>.decorate_context\u001B[39m\u001B[34m(*args, **kwargs)\u001B[39m\n\u001B[32m 113\u001B[39m \u001B[38;5;129m@functools\u001B[39m.wraps(func)\n\u001B[32m 114\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34mdecorate_context\u001B[39m(*args, **kwargs):\n\u001B[32m 115\u001B[39m \u001B[38;5;28;01mwith\u001B[39;00m ctx_factory():\n\u001B[32m--> \u001B[39m\u001B[32m116\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfunc\u001B[49m\u001B[43m(\u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
331
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/generation/utils.py:2219\u001B[39m, in \u001B[36mGenerationMixin.generate\u001B[39m\u001B[34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, **kwargs)\u001B[39m\n\u001B[32m 2208\u001B[39m warnings.warn(\n\u001B[32m 2209\u001B[39m \u001B[33m\"\u001B[39m\u001B[33mYou are calling .generate() with the `input_ids` being on a device type different\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 2210\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m than your model\u001B[39m\u001B[33m'\u001B[39m\u001B[33ms device. `input_ids` is on \u001B[39m\u001B[38;5;132;01m{\u001B[39;00minput_ids.device.type\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m, whereas the model\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m (...)\u001B[39m\u001B[32m 2215\u001B[39m \u001B[38;5;167;01mUserWarning\u001B[39;00m,\n\u001B[32m 2216\u001B[39m )\n\u001B[32m 2218\u001B[39m \u001B[38;5;66;03m# 9. prepare logits processors and stopping criteria\u001B[39;00m\n\u001B[32m-> \u001B[39m\u001B[32m2219\u001B[39m prepared_logits_processor = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_get_logits_processor\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 2220\u001B[39m \u001B[43m \u001B[49m\u001B[43mgeneration_config\u001B[49m\u001B[43m=\u001B[49m\u001B[43mgeneration_config\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2221\u001B[39m \u001B[43m \u001B[49m\u001B[43minput_ids_seq_length\u001B[49m\u001B[43m=\u001B[49m\u001B[43minput_ids_length\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2222\u001B[39m \u001B[43m \u001B[49m\u001B[43mencoder_input_ids\u001B[49m\u001B[43m=\u001B[49m\u001B[43minputs_tensor\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2223\u001B[39m \u001B[43m \u001B[49m\u001B[43mprefix_allowed_tokens_fn\u001B[49m\u001B[43m=\u001B[49m\u001B[43mprefix_allowed_tokens_fn\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2224\u001B[39m \u001B[43m \u001B[49m\u001B[43mlogits_processor\u001B[49m\u001B[43m=\u001B[49m\u001B[43mlogits_processor\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2225\u001B[39m \u001B[43m \u001B[49m\u001B[43mdevice\u001B[49m\u001B[43m=\u001B[49m\u001B[43minputs_tensor\u001B[49m\u001B[43m.\u001B[49m\u001B[43mdevice\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2226\u001B[39m \u001B[43m \u001B[49m\u001B[43mmodel_kwargs\u001B[49m\u001B[43m=\u001B[49m\u001B[43mmodel_kwargs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2227\u001B[39m \u001B[43m \u001B[49m\u001B[43mnegative_prompt_ids\u001B[49m\u001B[43m=\u001B[49m\u001B[43mnegative_prompt_ids\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2228\u001B[39m \u001B[43m \u001B[49m\u001B[43mnegative_prompt_attention_mask\u001B[49m\u001B[43m=\u001B[49m\u001B[43mnegative_prompt_attention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[32m 2229\u001B[39m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 2230\u001B[39m prepared_stopping_criteria = \u001B[38;5;28mself\u001B[39m._get_stopping_criteria(\n\u001B[32m 2231\u001B[39m generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs\n\u001B[32m 2232\u001B[39m )\n\u001B[32m 2234\u001B[39m \u001B[38;5;66;03m# Set model_kwargs `use_cache` so we can use it later in forward runs\u001B[39;00m\n",
332
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/generation/utils.py:1083\u001B[39m, in \u001B[36mGenerationMixin._get_logits_processor\u001B[39m\u001B[34m(self, generation_config, input_ids_seq_length, encoder_input_ids, prefix_allowed_tokens_fn, logits_processor, device, model_kwargs, negative_prompt_ids, negative_prompt_attention_mask)\u001B[39m\n\u001B[32m 1074\u001B[39m processors.append(\n\u001B[32m 1075\u001B[39m SuppressTokensAtBeginLogitsProcessor(\n\u001B[32m 1076\u001B[39m generation_config.begin_suppress_tokens,\n\u001B[32m (...)\u001B[39m\u001B[32m 1079\u001B[39m )\n\u001B[32m 1080\u001B[39m )\n\u001B[32m 1081\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m generation_config.forced_decoder_ids \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 1082\u001B[39m \u001B[38;5;66;03m# TODO (sanchit): move this exception to GenerationConfig.validate() when TF & FLAX are aligned with PT\u001B[39;00m\n\u001B[32m-> \u001B[39m\u001B[32m1083\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[32m 1084\u001B[39m \u001B[33m\"\u001B[39m\u001B[33mYou have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument \u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 1085\u001B[39m \u001B[33m\"\u001B[39m\u001B[33min favour of `input_ids` or `decoder_input_ids` respectively.\u001B[39m\u001B[33m\"\u001B[39m,\n\u001B[32m 1086\u001B[39m )\n\u001B[32m 1088\u001B[39m \u001B[38;5;66;03m# TODO (joao): find a strategy to specify the order of the processors\u001B[39;00m\n\u001B[32m 1089\u001B[39m processors = \u001B[38;5;28mself\u001B[39m._merge_criteria_processor_list(processors, logits_processor)\n",
333
- "\u001B[31mValueError\u001B[39m: You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument in favour of `input_ids` or `decoder_input_ids` respectively."
334
  ]
335
  }
336
  ],
337
- "execution_count": 34
338
  },
339
  {
340
- "metadata": {
341
- "ExecuteTime": {
342
- "end_time": "2025-04-21T06:15:41.079099Z",
343
- "start_time": "2025-04-21T06:15:37.277194Z"
344
- }
345
- },
346
- "cell_type": "code",
347
- "source": [
348
- "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
349
- "from datasets import load_dataset\n",
350
- "\n",
351
- "# load model and processor\n",
352
- "processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\")\n",
353
- "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")\n",
354
- "model.config.forced_decoder_ids = None\n",
355
- "\n",
356
- "# load dummy dataset and read audio files\n",
357
- "ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
358
- "sample = ds[0][\"audio\"]\n",
359
- "input_features = processor(sample[\"array\"], sampling_rate=sample[\"sampling_rate\"], return_tensors=\"pt\").input_features\n",
360
- "\n",
361
- "# generate token ids\n",
362
- "predicted_ids = model.generate(input_features)\n",
363
- "# decode token ids to text\n",
364
- "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)\n",
365
- "processor(transcription)\n",
366
- "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
367
- "processor(transcription)\n"
368
- ],
369
- "id": "b4137e08d1a516e5",
370
- "outputs": [
371
- {
372
- "name": "stderr",
373
- "output_type": "stream",
374
- "text": [
375
- "It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.\n"
376
- ]
377
- },
378
- {
379
- "ename": "ValueError",
380
- "evalue": "could not convert string to float: ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'",
381
- "output_type": "error",
382
- "traceback": [
383
- "\u001B[31m---------------------------------------------------------------------------\u001B[39m",
384
- "\u001B[31mValueError\u001B[39m Traceback (most recent call last)",
385
- "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[37]\u001B[39m\u001B[32m, line 18\u001B[39m\n\u001B[32m 16\u001B[39m \u001B[38;5;66;03m# decode token ids to text\u001B[39;00m\n\u001B[32m 17\u001B[39m transcription = processor.batch_decode(predicted_ids, skip_special_tokens=\u001B[38;5;28;01mFalse\u001B[39;00m)\n\u001B[32m---> \u001B[39m\u001B[32m18\u001B[39m \u001B[43mprocessor\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtranscription\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 19\u001B[39m transcription = processor.batch_decode(predicted_ids, skip_special_tokens=\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[32m 20\u001B[39m processor(transcription)\n",
386
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/models/whisper/processing_whisper.py:69\u001B[39m, in \u001B[36mWhisperProcessor.__call__\u001B[39m\u001B[34m(self, *args, **kwargs)\u001B[39m\n\u001B[32m 66\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[33m\"\u001B[39m\u001B[33mYou need to specify either an `audio` or `text` input to process.\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 68\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m audio \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m---> \u001B[39m\u001B[32m69\u001B[39m inputs = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mfeature_extractor\u001B[49m\u001B[43m(\u001B[49m\u001B[43maudio\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msampling_rate\u001B[49m\u001B[43m=\u001B[49m\u001B[43msampling_rate\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m*\u001B[49m\u001B[43m*\u001B[49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 70\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m text \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 71\u001B[39m encodings = \u001B[38;5;28mself\u001B[39m.tokenizer(text, **kwargs)\n",
387
- "\u001B[36mFile \u001B[39m\u001B[32m~/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/models/whisper/feature_extraction_whisper.py:281\u001B[39m, in \u001B[36mWhisperFeatureExtractor.__call__\u001B[39m\u001B[34m(self, raw_speech, truncation, pad_to_multiple_of, return_tensors, return_attention_mask, padding, max_length, sampling_rate, do_normalize, device, return_token_timestamps, **kwargs)\u001B[39m\n\u001B[32m 279\u001B[39m raw_speech = [np.asarray([speech], dtype=np.float32).T \u001B[38;5;28;01mfor\u001B[39;00m speech \u001B[38;5;129;01min\u001B[39;00m raw_speech]\n\u001B[32m 280\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_batched \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(raw_speech, np.ndarray):\n\u001B[32m--> \u001B[39m\u001B[32m281\u001B[39m raw_speech = \u001B[43mnp\u001B[49m\u001B[43m.\u001B[49m\u001B[43masarray\u001B[49m\u001B[43m(\u001B[49m\u001B[43mraw_speech\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdtype\u001B[49m\u001B[43m=\u001B[49m\u001B[43mnp\u001B[49m\u001B[43m.\u001B[49m\u001B[43mfloat32\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 282\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(raw_speech, np.ndarray) \u001B[38;5;129;01mand\u001B[39;00m raw_speech.dtype \u001B[38;5;129;01mis\u001B[39;00m np.dtype(np.float64):\n\u001B[32m 283\u001B[39m raw_speech = raw_speech.astype(np.float32)\n",
388
- "\u001B[31mValueError\u001B[39m: could not convert string to float: ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'"
389
- ]
390
- }
391
- ],
392
- "execution_count": 37
393
  },
394
  {
395
  "metadata": {
396
  "ExecuteTime": {
397
- "end_time": "2025-04-21T05:31:26.352787Z",
398
- "start_time": "2025-04-21T05:31:26.343398Z"
399
  }
400
  },
401
  "cell_type": "code",
402
- "source": "",
403
- "id": "37fa63b1c22f4a69",
404
- "outputs": [
405
- {
406
- "name": "stdout",
407
- "output_type": "stream",
408
- "text": [
409
- "torch.Size([1, 24192])\n",
410
- "24000\n",
411
- "[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... -1.3932839e-05\n",
412
- " -3.6663318e-05 -1.3932839e-05]\n"
413
- ]
414
- }
415
- ],
416
- "execution_count": 25
417
  },
418
  {
419
  "metadata": {
420
  "ExecuteTime": {
421
- "end_time": "2025-04-21T06:28:40.294060Z",
422
- "start_time": "2025-04-21T06:28:35.493462Z"
423
  }
424
  },
425
  "cell_type": "code",
426
  "source": [
427
- "import torch\n",
428
- "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\n",
429
- "from datasets import load_dataset\n",
430
- "\n",
431
- "\n",
432
- "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
433
- "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
434
- "\n",
435
- "# model_id = \"distil-whisper/distil-small.en\"\n",
436
- "model_id = \"./models_for_proj/librispeech_asr_dummy\"\n",
437
- "\n",
438
- "model = AutoModelForSpeechSeq2Seq.from_pretrained(\n",
439
- " model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True\n",
440
- ")\n",
441
- "model.to(device)\n",
442
- "\n",
443
- "processor = AutoProcessor.from_pretrained(model_id)\n",
444
- "\n",
445
- "pipe = pipeline(\n",
446
- " \"automatic-speech-recognition\",\n",
447
- " model=model,\n",
448
- " tokenizer=processor.tokenizer,\n",
449
- " feature_extractor=processor.feature_extractor,\n",
450
- " max_new_tokens=128,\n",
451
- " torch_dtype=torch_dtype,\n",
452
- " device=device,\n",
453
- ")\n",
454
- "\n",
455
- "# dataset = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
456
- "# sample = dataset[0][\"audio\"]\n",
457
- "# result = pipe(sample)\n",
458
  "\n",
459
- "# input\n",
460
- "waveform, sample_rate = torchaudio.load(\"sample.wav\")\n",
461
- "target_sr = 16000\n",
462
- "resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)\n",
463
- "waveform = resampler(waveform)\n",
464
- "waveform_np = waveform.squeeze().numpy()\n",
465
- "# sample = dataset[2][\"audio\"]\n",
466
  "\n",
467
- "# result = pipe(sample)\n",
468
- "result = pipe(waveform_np)\n",
469
- "print(result[\"text\"])\n"
470
- ],
471
- "id": "e7f0a5bccb4e204f",
472
- "outputs": [
473
- {
474
- "name": "stderr",
475
- "output_type": "stream",
476
- "text": [
477
- "Device set to use cpu\n",
478
- "/Users/perchik/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/pipelines/automatic_speech_recognition.py:312: FutureWarning: `max_new_tokens` is deprecated and will be removed in version 4.49 of Transformers. To remove this warning, pass `max_new_tokens` as a key inside `generate_kwargs` instead.\n",
479
- " warnings.warn(\n",
480
- "/Users/perchik/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n",
481
- " warnings.warn(\n",
482
- "`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}. If this is not desired, please set these values explicitly.\n",
483
- "A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.\n",
484
- "A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> to see related `.generate()` flags.\n"
485
- ]
486
- },
487
- {
488
- "name": "stdout",
489
- "output_type": "stream",
490
- "text": [
491
- " Mr. Quilter is the Apostle of the Middle Classes, and we are glad to welcome his Gospel.\n"
492
- ]
493
- }
494
  ],
495
- "execution_count": 46
496
- },
497
- {
498
- "metadata": {
499
- "ExecuteTime": {
500
- "end_time": "2025-04-21T06:27:16.239153Z",
501
- "start_time": "2025-04-21T06:27:15.587609Z"
502
- }
503
- },
504
- "cell_type": "code",
505
- "source": [
506
- "save_dir = \"./models_for_proj/librispeech_asr_dummy\"\n",
507
- "pipe.model.save_pretrained(save_dir)\n",
508
- "pipe.tokenizer.save_pretrained(save_dir)\n",
509
- "pipe.feature_extractor.save_pretrained(save_dir)"
510
- ],
511
- "id": "81b57090829a7294",
512
  "outputs": [
513
  {
514
  "name": "stderr",
515
  "output_type": "stream",
516
  "text": [
517
- "/Users/perchik/PycharmProjects/Learning_LLMs/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py:3353: UserWarning: Moving the following attributes in the config to the generation config: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361]}. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.\n",
518
- " warnings.warn(\n"
519
  ]
520
  },
521
  {
522
  "data": {
523
  "text/plain": [
524
- "['./models_for_proj/librispeech_asr_dummy/preprocessor_config.json']"
525
  ]
526
  },
527
- "execution_count": 45,
528
  "metadata": {},
529
  "output_type": "execute_result"
530
  }
531
  ],
532
- "execution_count": 45
533
  },
534
  {
535
- "metadata": {
536
- "ExecuteTime": {
537
- "end_time": "2025-04-21T05:31:45.237137Z",
538
- "start_time": "2025-04-21T05:31:45.234474Z"
539
- }
540
- },
541
- "cell_type": "code",
542
- "source": "target_sr",
543
- "id": "61b31c4b81fd098f",
544
- "outputs": [
545
- {
546
- "data": {
547
- "text/plain": [
548
- "16000"
549
- ]
550
- },
551
- "execution_count": 26,
552
- "metadata": {},
553
- "output_type": "execute_result"
554
- }
555
- ],
556
- "execution_count": 26
557
  },
558
  {
559
  "metadata": {
560
  "ExecuteTime": {
561
- "end_time": "2025-04-21T11:20:26.931270Z",
562
- "start_time": "2025-04-21T11:20:24.762498Z"
563
  }
564
  },
565
  "cell_type": "code",
566
  "source": [
567
- "# input\n",
 
 
568
  "waveform, sample_rate = torchaudio.load(\"sample.wav\")\n",
569
  "target_sr = 16000\n",
570
  "resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)\n",
571
  "waveform = resampler(waveform)\n",
572
- "waveform_np = waveform.squeeze().numpy()\n",
573
- "# sample = dataset[2][\"audio\"]\n",
574
- "\n",
575
- "# result = pipe(sample)\n",
576
- "result = pipe(waveform_np)\n",
577
- "print(result[\"text\"])"
578
  ],
579
- "id": "5c9f9ff839e346f8",
580
- "outputs": [
581
- {
582
- "name": "stdout",
583
- "output_type": "stream",
584
- "text": [
585
- " This is a simple text.\n"
586
- ]
587
- }
588
- ],
589
- "execution_count": 48
590
  },
591
  {
592
  "metadata": {
593
  "ExecuteTime": {
594
- "end_time": "2025-04-21T11:54:49.800197Z",
595
- "start_time": "2025-04-21T11:54:47.143900Z"
596
  }
597
  },
598
  "cell_type": "code",
599
  "source": [
600
- "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC\n",
601
- "processor = Wav2Vec2Processor.from_pretrained(\"facebook/wav2vec2-base-960h\")\n",
602
- "model = Wav2Vec2ForCTC.from_pretrained(\"facebook/wav2vec2-base-960h\")"
603
  ],
604
- "id": "a7084d040f38e0f5",
605
  "outputs": [
606
  {
607
- "name": "stderr",
608
  "output_type": "stream",
609
  "text": [
610
- "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']\n",
611
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
612
  ]
613
  }
614
  ],
615
- "execution_count": 49
616
  },
617
  {
618
  "metadata": {},
@@ -620,7 +231,7 @@
620
  "outputs": [],
621
  "execution_count": null,
622
  "source": "",
623
- "id": "f886807e783c9532"
624
  }
625
  ],
626
  "metadata": {
 
1
  {
2
  "cells": [
3
  {
4
+ "metadata": {},
5
+ "cell_type": "markdown",
6
+ "source": "## FIRST CHECK",
7
+ "id": "518bcf10bfff3063"
8
+ },
9
+ {
10
  "metadata": {
11
  "ExecuteTime": {
12
+ "end_time": "2025-04-21T15:45:34.883735Z",
13
+ "start_time": "2025-04-21T15:45:33.734296Z"
14
  }
15
  },
16
+ "cell_type": "code",
17
  "source": [
18
+ "# gradio app.py --watch-dirs app.py\n",
19
+ "\n",
20
+ "import gradio as gr\n",
21
+ "import numpy as np\n",
22
+ "import matplotlib.pyplot as plt\n",
23
+ "import matplotlib.animation as animation\n",
24
+ "import tempfile\n",
25
+ "import torch\n",
26
+ "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\n",
27
  "import torchaudio\n",
28
+ "import torchaudio.transforms as T\n",
29
+ "from matplotlib.patches import Circle\n",
30
+ "from stable_baselines3 import SAC\n",
31
+ "from warehouse_env import WarehouseEnv\n",
32
+ "from types import SimpleNamespace"
33
  ],
34
+ "id": "f861a8e81b92bceb",
35
  "outputs": [],
36
+ "execution_count": 50
37
  },
38
  {
39
  "metadata": {
 
40
  "ExecuteTime": {
41
+ "end_time": "2025-04-21T15:45:58.508916Z",
42
+ "start_time": "2025-04-21T15:45:53.686659Z"
43
  }
44
  },
45
  "cell_type": "code",
46
+ "source": "asr_pipe_default = pipeline(\"automatic-speech-recognition\")",
47
+ "id": "90ddbbf24fac7b1f",
48
  "outputs": [
49
  {
50
  "name": "stderr",
51
  "output_type": "stream",
52
  "text": [
53
+ "No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 22aad52 (https://huggingface.co/facebook/wav2vec2-base-960h).\n",
54
+ "Using a pipeline without specifying a model name and revision in production is not recommended.\n",
55
+ "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']\n",
56
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
57
  "Device set to use mps:0\n"
58
  ]
59
  }
60
  ],
61
+ "execution_count": 51
62
  },
63
  {
64
  "metadata": {
65
  "ExecuteTime": {
66
+ "end_time": "2025-04-21T15:46:03.873405Z",
67
+ "start_time": "2025-04-21T15:46:02.219145Z"
68
  }
69
  },
70
  "cell_type": "code",
71
  "source": [
72
  "\n",
73
  "waveform, sample_rate = torchaudio.load(\"sample.wav\")\n",
74
  "target_sr = 16000\n",
75
  "resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)\n",
76
  "waveform = resampler(waveform)\n",
77
  "waveform_np = waveform.squeeze().numpy()\n",
78
+ "# sample = dataset[2][\"audio\"]\n",
79
  "\n",
80
+ "# result = pipe(sample)\n",
81
+ "result = asr_pipe_default(waveform_np)\n",
82
+ "print(result[\"text\"])\n"
83
  ],
84
+ "id": "75dbfd85403eb511",
85
  "outputs": [
86
  {
87
+ "name": "stdout",
88
  "output_type": "stream",
89
  "text": [
90
+ "THIS IS A SIMPLE TEXT\n"
91
  ]
92
  }
93
  ],
94
+ "execution_count": 52
95
  },
96
  {
97
+ "metadata": {},
98
+ "cell_type": "markdown",
99
+ "source": "## TO SAVE THE MODEL",
100
+ "id": "e0a9c2fd7bce280a"
101
  },
102
  {
103
  "metadata": {
104
  "ExecuteTime": {
105
+ "end_time": "2025-04-21T15:51:20.114613Z",
106
+ "start_time": "2025-04-21T15:51:20.106995Z"
107
  }
108
  },
109
  "cell_type": "code",
110
+ "source": "save_dir = './models_for_proj/wav2vec2-base-960h'",
111
+ "id": "10f2808d5da846b9",
112
+ "outputs": [],
113
+ "execution_count": 53
114
  },
115
  {
116
  "metadata": {
117
  "ExecuteTime": {
118
+ "end_time": "2025-04-21T15:54:16.050333Z",
119
+ "start_time": "2025-04-21T15:54:12.432304Z"
120
  }
121
  },
122
  "cell_type": "code",
123
  "source": [
124
+ "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
125
  "\n",
126
+ "# Load pretrained model and processor\n",
127
+ "model = Wav2Vec2ForCTC.from_pretrained(\"facebook/wav2vec2-base-960h\")\n",
128
+ "processor = Wav2Vec2Processor.from_pretrained(\"facebook/wav2vec2-base-960h\")\n",
129
  "\n",
130
+ "# Save locally\n",
131
+ "model.save_pretrained(save_dir)\n",
132
+ "processor.save_pretrained(save_dir)"
133
  ],
134
+ "id": "c22c64edf17613a0",
135
  "outputs": [
136
  {
137
  "name": "stderr",
138
  "output_type": "stream",
139
  "text": [
140
+ "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']\n",
141
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
142
  ]
143
  },
144
  {
145
  "data": {
146
  "text/plain": [
147
+ "[]"
148
  ]
149
  },
150
+ "execution_count": 57,
151
  "metadata": {},
152
  "output_type": "execute_result"
153
  }
154
  ],
155
+ "execution_count": 57
156
  },
157
  {
158
+ "metadata": {},
159
+ "cell_type": "markdown",
160
+ "source": "## TO REUSE IT",
161
+ "id": "b2e0767904efbbb3"
162
  },
163
  {
164
  "metadata": {
165
  "ExecuteTime": {
166
+ "end_time": "2025-04-21T15:59:35.714597Z",
167
+ "start_time": "2025-04-21T15:59:35.705418Z"
168
  }
169
  },
170
  "cell_type": "code",
171
  "source": [
172
+ "import torchaudio\n",
173
+ "import torchaudio.transforms as T\n",
174
+ "\n",
175
  "waveform, sample_rate = torchaudio.load(\"sample.wav\")\n",
176
  "target_sr = 16000\n",
177
  "resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)\n",
178
  "waveform = resampler(waveform)\n",
179
+ "waveform_np = waveform.squeeze().numpy()"
180
  ],
181
+ "id": "394c5b342a6510",
182
+ "outputs": [],
183
+ "execution_count": 61
184
  },
185
  {
186
  "metadata": {
187
  "ExecuteTime": {
188
+ "end_time": "2025-04-21T15:59:36.498222Z",
189
+ "start_time": "2025-04-21T15:59:36.361763Z"
190
  }
191
  },
192
  "cell_type": "code",
193
  "source": [
194
+ "import torch\n",
195
+ "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
196
+ "\n",
197
+ "save_dir = './models_for_proj/wav2vec2-base-960h'\n",
198
+ "\n",
199
+ "# load\n",
200
+ "model = Wav2Vec2ForCTC.from_pretrained(save_dir)\n",
201
+ "processor = Wav2Vec2Processor.from_pretrained(save_dir)\n",
202
+ "\n",
203
+ "# Preprocess\n",
204
+ "inputs = processor(waveform_np, sampling_rate=16000, return_tensors=\"pt\", padding=True)\n",
205
+ "\n",
206
+ "# Inference\n",
207
+ "with torch.no_grad():\n",
208
+ " logits = model(**inputs).logits\n",
209
+ "\n",
210
+ "# Decode\n",
211
+ "predicted_ids = torch.argmax(logits, dim=-1)\n",
212
+ "transcription = processor.decode(predicted_ids[0])\n",
213
+ "\n",
214
+ "print(\"Transcription:\", transcription)\n"
215
  ],
216
+ "id": "af430cf9e1e42318",
217
  "outputs": [
218
  {
219
+ "name": "stdout",
220
  "output_type": "stream",
221
  "text": [
222
+ "Transcription: THIS IS A SIMPLE TEXT\n"
223
  ]
224
  }
225
  ],
226
+ "execution_count": 62
227
  },
228
  {
229
  "metadata": {},
 
231
  "outputs": [],
232
  "execution_count": null,
233
  "source": "",
234
+ "id": "113500626c003f89"
235
  }
236
  ],
237
  "metadata": {
models_for_proj/wav2vec2-base-960h/config.json ADDED
@@ -0,0 +1,109 @@
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "adapter_attn_dim": null,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.1,
55
+ "gradient_checkpointing": false,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_dropout_prob": 0.1,
59
+ "hidden_size": 768,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 3072,
62
+ "layer_norm_eps": 1e-05,
63
+ "layerdrop": 0.1,
64
+ "mask_feature_length": 10,
65
+ "mask_feature_min_masks": 0,
66
+ "mask_feature_prob": 0.0,
67
+ "mask_time_length": 10,
68
+ "mask_time_min_masks": 2,
69
+ "mask_time_prob": 0.05,
70
+ "model_type": "wav2vec2",
71
+ "num_adapter_layers": 3,
72
+ "num_attention_heads": 12,
73
+ "num_codevector_groups": 2,
74
+ "num_codevectors_per_group": 320,
75
+ "num_conv_pos_embedding_groups": 16,
76
+ "num_conv_pos_embeddings": 128,
77
+ "num_feat_extract_layers": 7,
78
+ "num_hidden_layers": 12,
79
+ "num_negatives": 100,
80
+ "output_hidden_size": 768,
81
+ "pad_token_id": 0,
82
+ "proj_codevector_dim": 256,
83
+ "tdnn_dilation": [
84
+ 1,
85
+ 2,
86
+ 3,
87
+ 1,
88
+ 1
89
+ ],
90
+ "tdnn_dim": [
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 512,
95
+ 1500
96
+ ],
97
+ "tdnn_kernel": [
98
+ 5,
99
+ 3,
100
+ 3,
101
+ 1,
102
+ 1
103
+ ],
104
+ "torch_dtype": "float32",
105
+ "transformers_version": "4.50.3",
106
+ "use_weighted_layer_sum": false,
107
+ "vocab_size": 32,
108
+ "xvector_output_dim": 512
109
+ }
models_for_proj/wav2vec2-base-960h/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75cf04071a643e1f23b8bb1571cde28cab80e3ff3a822ef0073d26f8fe98afdc
3
+ size 377611120
models_for_proj/wav2vec2-base-960h/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
models_for_proj/wav2vec2-base-960h/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
models_for_proj/wav2vec2-base-960h/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "do_lower_case": false,
39
+ "do_normalize": true,
40
+ "eos_token": "</s>",
41
+ "extra_special_tokens": {},
42
+ "model_max_length": 1000000000000000019884624838656,
43
+ "pad_token": "<pad>",
44
+ "processor_class": "Wav2Vec2Processor",
45
+ "replace_word_delimiter_char": " ",
46
+ "return_attention_mask": false,
47
+ "target_lang": null,
48
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
49
+ "unk_token": "<unk>",
50
+ "word_delimiter_token": "|"
51
+ }
models_for_proj/wav2vec2-base-960h/vocab.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "'": 27,
3
+ "</s>": 2,
4
+ "<pad>": 0,
5
+ "<s>": 1,
6
+ "<unk>": 3,
7
+ "A": 7,
8
+ "B": 24,
9
+ "C": 19,
10
+ "D": 14,
11
+ "E": 5,
12
+ "F": 20,
13
+ "G": 21,
14
+ "H": 11,
15
+ "I": 10,
16
+ "J": 29,
17
+ "K": 26,
18
+ "L": 15,
19
+ "M": 17,
20
+ "N": 9,
21
+ "O": 8,
22
+ "P": 23,
23
+ "Q": 30,
24
+ "R": 13,
25
+ "S": 12,
26
+ "T": 6,
27
+ "U": 16,
28
+ "V": 25,
29
+ "W": 18,
30
+ "X": 28,
31
+ "Y": 22,
32
+ "Z": 31,
33
+ "|": 4
34
+ }
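
For reference, a minimal sketch (not part of this commit) of the greedy CTC decoding that processor.decode(predicted_ids[0]) performs with the vocabulary saved above; the helper greedy_ctc_decode and the reuse of save_dir here are illustrative assumptions, not code from the repository.

import json
import torch

save_dir = "./models_for_proj/wav2vec2-base-960h"

# token -> id mapping, exactly as stored in the vocab.json added above
with open(f"{save_dir}/vocab.json") as f:
    vocab = json.load(f)
id_to_token = {i: t for t, i in vocab.items()}

def greedy_ctc_decode(predicted_ids: torch.Tensor) -> str:
    # Hypothetical illustration of CTC greedy decoding:
    # collapse consecutive repeated ids, drop the <pad> (blank) token,
    # then map the "|" word-delimiter token back to spaces.
    tokens = []
    prev_id = None
    for idx in predicted_ids.tolist():
        if idx != prev_id and id_to_token[idx] != "<pad>":
            tokens.append(id_to_token[idx])
        prev_id = idx
    return "".join(tokens).replace("|", " ").strip()

# With the notebook's variables, greedy_ctc_decode(predicted_ids[0]) should
# agree with processor.decode(predicted_ids[0]) for simple inputs.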