Update tools.py
Browse files
tools.py
CHANGED
|
@@ -146,7 +146,7 @@ class audio_or_mp3__interpreter(Tool):
|
|
| 146 |
name="audio_tool"
|
| 147 |
description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
|
| 148 |
inputs = {
|
| 149 |
-
'audio': {"type": "audio", "description": "the audio of interest"}
|
| 150 |
}
|
| 151 |
output_type = "string"
|
| 152 |
|
|
@@ -155,12 +155,8 @@ class audio_or_mp3__interpreter(Tool):
|
|
| 155 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 156 |
|
| 157 |
model_id = "openai/whisper-large-v3"
|
| 158 |
-
|
| 159 |
-
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 160 |
-
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
| 161 |
-
)
|
| 162 |
model.to(device)
|
| 163 |
-
|
| 164 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 165 |
|
| 166 |
pipe = pipeline(
|
|
@@ -169,14 +165,14 @@ class audio_or_mp3__interpreter(Tool):
|
|
| 169 |
tokenizer=processor.tokenizer,
|
| 170 |
feature_extractor=processor.feature_extractor,
|
| 171 |
torch_dtype=torch_dtype,
|
| 172 |
-
device=device
|
| 173 |
)
|
| 174 |
-
mp3_file = AudioSegment.from_mp3({audio})
|
| 175 |
-
wav_file = mp3_file.export("output.wav", format="wav")
|
| 176 |
-
sample_rate, audio_data = wavfile.read("output.wav")
|
| 177 |
-
audio_data = audio_data / 32768.0
|
| 178 |
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
return result["text"]
|
| 181 |
|
| 182 |
class Wikipedia_reader(Tool):
|
|
|
|
| 146 |
name="audio_tool"
|
| 147 |
description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
|
| 148 |
inputs = {
|
| 149 |
+
'audio': {"type": "audio", "description": "the audio of interest. Must be in the format mp3."}
|
| 150 |
}
|
| 151 |
output_type = "string"
|
| 152 |
|
|
|
|
| 155 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 156 |
|
| 157 |
model_id = "openai/whisper-large-v3"
|
| 158 |
+
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
|
|
|
|
|
|
|
|
|
|
| 159 |
model.to(device)
|
|
|
|
| 160 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 161 |
|
| 162 |
pipe = pipeline(
|
|
|
|
| 165 |
tokenizer=processor.tokenizer,
|
| 166 |
feature_extractor=processor.feature_extractor,
|
| 167 |
torch_dtype=torch_dtype,
|
| 168 |
+
device=device
|
| 169 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
+
audio = AudioSegment.from_mp3({audio})
|
| 172 |
+
audio = audio.set_channels(1).set_frame_rate(16000)
|
| 173 |
+
audio.export("output.wav", format="wav")
|
| 174 |
+
|
| 175 |
+
result = pipe("output.wav", return_timestamps=True)
|
| 176 |
return result["text"]
|
| 177 |
|
| 178 |
class Wikipedia_reader(Tool):
|