Spaces:
Sleeping
Sleeping
Fix the encoder for low-end execution
Browse files
app.py
CHANGED
|
@@ -15,8 +15,8 @@ tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
|
| 15 |
|
| 16 |
# Load speaker embeddings from dataset
|
| 17 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
| 18 |
-
speaker_embeddings = embeddings_dataset[7306]["xvector"]
|
| 19 |
-
speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)
|
| 20 |
|
| 21 |
# Ensure cache directory for output files
|
| 22 |
os.makedirs("output", exist_ok=True)
|
|
@@ -43,14 +43,10 @@ def process_audio(audio, target_language):
|
|
| 43 |
|
| 44 |
# Step 3: Generate speech from translated text
|
| 45 |
inputs = processor(text=translated_text, return_tensors="pt")
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
# Convert input_ids to embeddings using model embeddings
|
| 49 |
-
input_embeddings = tts.encoder.embed_tokens(input_ids)
|
| 50 |
|
| 51 |
with torch.no_grad():
|
| 52 |
-
|
| 53 |
-
speech = tts.generate_speech(input_embeddings, speaker_embeddings)
|
| 54 |
|
| 55 |
# Save generated speech
|
| 56 |
output_audio_path = "output/generated_speech.wav"
|
|
|
|
| 15 |
|
| 16 |
# Load speaker embeddings from dataset
|
| 17 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
| 18 |
+
speaker_embeddings = embeddings_dataset[7306]["xvector"]
|
| 19 |
+
speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)
|
| 20 |
|
| 21 |
# Ensure cache directory for output files
|
| 22 |
os.makedirs("output", exist_ok=True)
|
|
|
|
| 43 |
|
| 44 |
# Step 3: Generate speech from translated text
|
| 45 |
inputs = processor(text=translated_text, return_tensors="pt")
|
| 46 |
+
input_features = inputs.input_features
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
with torch.no_grad():
|
| 49 |
+
speech = tts.generate_speech(input_features, speaker_embeddings)
|
|
|
|
| 50 |
|
| 51 |
# Save generated speech
|
| 52 |
output_audio_path = "output/generated_speech.wav"
|