File size: 3,548 Bytes
43ec985 b7217f7 43ec985 e365862 43ec985 e365862 2342a7b b7217f7 43ec985 f47a2b0 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b 43ec985 2860b2a 43ec985 b7217f7 43ec985 b7217f7 43ec985 2342a7b 43ec985 2342a7b 43ec985 2342a7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import numpy as np
# --- Configuration ---
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- MAKE SURE YOU HAVE UPLOADED THESE FILES ---
# These files must live in the Hugging Face Space, in the same folder as "app.py".
VOICE_SAMPLE_FILES = ["1.wav"]

# Directory to store speaker embedding files (created at import time if missing).
EMBEDDING_DIR = "speaker_embeddings"
os.makedirs(EMBEDDING_DIR, exist_ok=True)
# --- Load Models ---
# All models are loaded once, at import time, and kept as module-level globals:
#   processor     - SpeechT5 processor from "microsoft/speecht5_tts"
#   model         - SpeechT5 text-to-speech checkpoint "Somalitts/8aad"
#   vocoder       - SpeechT5 HiFi-GAN vocoder from "microsoft/speecht5_hifigan"
#   speaker_model - SpeechBrain encoder "speechbrain/spkrec-xvect-voxceleb",
#                   used to compute speaker embeddings
try:
    print("Loading models... This may take a moment.")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        run_opts={"device": device},
        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb"),
    )
    print("Models loaded successfully.")
except Exception as e:
    # Chain the original exception: the message always suggests a network
    # problem, so the real cause (missing checkpoint, out-of-memory, ...)
    # must stay visible in the traceback.
    raise gr.Error(f"Error loading models: {e}. Check your internet connection.") from e
# In-memory cache: voice sample path -> speaker embedding tensor.
speaker_embeddings_cache = {}


def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for ``wav_file_path``, computing it at most once.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache``;
      2. a saved tensor at ``EMBEDDING_DIR/<basename>.pt``;
      3. a fresh computation from the audio file with ``speaker_model``,
         which is then written to both caches.

    Args:
        wav_file_path: Path to the voice sample audio file.

    Returns:
        The embedding tensor. On the compute path it is normalized along
        dim 2, squeezed, and moved to ``device``; on the disk path it is
        loaded with ``map_location=device``.

    Raises:
        FileNotFoundError: No cached embedding exists and the audio file is
            missing.
        gr.Error: Loading or encoding the audio file failed; the original
            exception is attached as ``__cause__``.
    """
    if wav_file_path in speaker_embeddings_cache:
        return speaker_embeddings_cache[wav_file_path]

    # NOTE(review): the on-disk key is the basename only, so two samples with
    # the same file name in different folders would share one .pt file.
    embedding_path = os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")
    if os.path.exists(embedding_path):
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    if not os.path.exists(wav_file_path):
        # This is the error that was reported; once the voice files are
        # uploaded, execution gets past this point.
        raise FileNotFoundError(f"Lama helin faylka codka: {wav_file_path}")

    try:
        audio, sr = torchaudio.load(wav_file_path)
        # Resample to 16 kHz and down-mix multi-channel audio to mono before encoding.
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        with torch.no_grad():
            embedding = speaker_model.encode_batch(audio.to(device))
            embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
        # Persist a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
    except Exception as e:
        # Keep the underlying error chained for debugging.
        raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}") from e

    # Move to the target device once and reuse the same tensor for the cache
    # entry and the return value.
    embedding = embedding.to(device)
    speaker_embeddings_cache[wav_file_path] = embedding
    return embedding
# ... (The rest of the code is correct) ...
# --- Main Text-to-Speech Function ---
def text_to_speech(text, voice_choice):
    """Placeholder for the main text-to-speech entry point.

    The real implementation is elided in this listing ("same as before").
    As written, the function ignores both arguments and returns ``None``.
    """
    # Insert the rest of the code here.
    return None
# --- Gradio Interface ---
# Builds the Gradio interface object that the launch section starts.
# NOTE(review): the constructor arguments are elided in this listing; the
# `pass` below is a placeholder and must be replaced with the real arguments
# before this call can parse.
iface = gr.Interface(
# ... (same as before) ...
pass # Insert the rest of the code here
)
# --- Launch the web interface ---
# When run as a script: verify that every voice sample exists, pre-compute the
# speaker embeddings, and then start the Gradio interface.
if __name__ == "__main__":
    print("Hubinta faylasha codadka...")
    for sample_path in VOICE_SAMPLE_FILES:
        if not os.path.exists(sample_path):
            # The reported error originates here: a listed voice sample has
            # not been uploaded next to app.py.
            raise FileNotFoundError(
                f"Mid ka mid ah faylasha lama helin: '{sample_path}'. "
                "Fadlan hubi inaad soo gelisay Hugging Face Spaces."
            )

    print("Diyaarinta astaamaha codadka...")
    # Warm both embedding caches so the first request does not pay for encoding.
    for voice_file in VOICE_SAMPLE_FILES:
        get_speaker_embedding(voice_file)

    print("Dhammaan codadka waa diyaar. Waxaa la furayaa interface-ka.")
    # NOTE(review): share=True requests a public share link; confirm whether
    # it is needed when the app is hosted on Hugging Face Spaces.
    iface.launch(share=True)