InventorsHub committed on
Commit
2aee77d
·
verified ·
1 Parent(s): 244a409

Update speech_processing.py

Browse files
Files changed (1) hide show
  1. speech_processing.py +43 -40
speech_processing.py CHANGED
@@ -1,40 +1,43 @@
1
- from transformers import SeamlessM4Tv2Model, AutoProcessor
2
- import numpy as np
3
- import torch
4
- from pydub import AudioSegment
5
-
6
- # Load processor and model
7
- processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
8
- model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
9
-
10
- def translate_audio(audio_file):
11
- if audio_file is None:
12
- return "No audio file detected. Please try again."
13
-
14
- try:
15
- # Set the device (use GPU if available)
16
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
- model.to(device)
18
-
19
- # Reset audio file pointer and load audio
20
- audio = AudioSegment.from_file(audio_file, format="wav")
21
- audio = audio.set_frame_rate(16000).set_channels(1)
22
-
23
- # Convert audio to float32 NumPy array
24
- audio_array = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0
25
-
26
- # Process input
27
- audio_inputs = processor(audios=audio_array, sampling_rate=16000, return_tensors="pt")
28
- audio_inputs = {key: val.to(device) for key, val in audio_inputs.items()} # Ensure tensors are on the correct device
29
-
30
- # Generate translation
31
- output_tokens = model.generate(**audio_inputs, tgt_lang="eng", generate_speech=False)
32
-
33
- # Extract token IDs from the generated output
34
- token_ids = output_tokens.sequences
35
- # Decode token IDs to text
36
- translated_text_from_audio = processor.batch_decode(token_ids, skip_special_tokens=True)[0]
37
-
38
- return translated_text_from_audio
39
- except Exception as e:
40
- return f"Error during audio translation: {e}"
 
 
 
 
1
from transformers import SeamlessM4Tv2Model, AutoProcessor
import numpy as np
import torch
from pydub import AudioSegment
import spaces  # Hugging Face Spaces ZeroGPU decorator (provides @spaces.GPU)

# Load the SeamlessM4T v2 processor and model once at import time so every
# request reuses the same in-memory weights (fetched from the Hugging Face
# Hub on first run).
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
10
+
11
+
12
@spaces.GPU()
def translate_audio(audio_file):
    """Translate speech in an audio file to English text.

    Parameters
    ----------
    audio_file : str | file-like | None
        Path (or file object) of the input audio. Any container format
        pydub/ffmpeg can decode is accepted; the audio is converted to
        16 kHz mono 16-bit before feature extraction.

    Returns
    -------
    str
        The translated English text, or a human-readable error message.
        This function never raises: errors are returned as strings so the
        calling UI can display them.
    """
    if audio_file is None:
        return "No audio file detected. Please try again."

    try:
        # Use the GPU when available (the @spaces.GPU decorator requests one
        # on Spaces ZeroGPU hardware).
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # Decode the audio, letting pydub/ffmpeg infer the container format
        # instead of assuming WAV, then normalize to what the model expects.
        audio = AudioSegment.from_file(audio_file)
        # Force 16-bit samples so the /32768.0 scaling below is correct even
        # for 8-, 24-, or 32-bit inputs.
        audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

        # Convert to float32 in [-1.0, 1.0), the range the processor expects.
        audio_array = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0

        # Build model inputs and move every tensor to the model's device.
        audio_inputs = processor(audios=audio_array, sampling_rate=16000, return_tensors="pt")
        audio_inputs = {key: val.to(device) for key, val in audio_inputs.items()}

        # Generate the English translation. inference_mode avoids building
        # autograd state (lower memory use, slightly faster generation).
        with torch.inference_mode():
            output_tokens = model.generate(**audio_inputs, tgt_lang="eng", generate_speech=False)

        # With generate_speech=False the output carries text token IDs in
        # `.sequences`; decode the first (only) sequence to a string.
        token_ids = output_tokens.sequences
        translated_text_from_audio = processor.batch_decode(token_ids, skip_special_tokens=True)[0]

        return translated_text_from_audio
    except Exception as e:
        # Best-effort error reporting back to the UI rather than crashing.
        return f"Error during audio translation: {e}"