CindyDelage committed on
Commit
8251cd0
·
verified ·
1 Parent(s): 96069bd

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +8 -12
tools.py CHANGED
@@ -146,7 +146,7 @@ class audio_or_mp3__interpreter(Tool):
146
  name="audio_tool"
147
  description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
148
  inputs = {
149
- 'audio': {"type": "audio", "description": "the audio of interest"}
150
  }
151
  output_type = "string"
152
 
@@ -155,12 +155,8 @@ class audio_or_mp3__interpreter(Tool):
155
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
156
 
157
  model_id = "openai/whisper-large-v3"
158
-
159
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
160
- model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
161
- )
162
  model.to(device)
163
-
164
  processor = AutoProcessor.from_pretrained(model_id)
165
 
166
  pipe = pipeline(
@@ -169,14 +165,14 @@ class audio_or_mp3__interpreter(Tool):
169
  tokenizer=processor.tokenizer,
170
  feature_extractor=processor.feature_extractor,
171
  torch_dtype=torch_dtype,
172
- device=device,
173
  )
174
- mp3_file = AudioSegment.from_mp3({audio})
175
- wav_file = mp3_file.export("output.wav", format="wav")
176
- sample_rate, audio_data = wavfile.read("output.wav")
177
- audio_data = audio_data / 32768.0
178
 
179
- result = pipe(audio_data)
 
 
 
 
180
  return result["text"]
181
 
182
  class Wikipedia_reader(Tool):
 
146
  name="audio_tool"
147
  description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
148
  inputs = {
149
+ 'audio': {"type": "audio", "description": "the audio of interest. Must be in the format mp3."}
150
  }
151
  output_type = "string"
152
 
 
155
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
156
 
157
  model_id = "openai/whisper-large-v3"
158
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
 
 
 
159
  model.to(device)
 
160
  processor = AutoProcessor.from_pretrained(model_id)
161
 
162
  pipe = pipeline(
 
165
  tokenizer=processor.tokenizer,
166
  feature_extractor=processor.feature_extractor,
167
  torch_dtype=torch_dtype,
168
+ device=device
169
  )
 
 
 
 
170
 
171
+ audio = AudioSegment.from_mp3({audio})
172
+ audio = audio.set_channels(1).set_frame_rate(16000)
173
+ audio.export("output.wav", format="wav")
174
+
175
+ result = pipe("output.wav", return_timestamps=True)
176
  return result["text"]
177
 
178
  class Wikipedia_reader(Tool):