import functools
import re

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from IPython.display import Audio
from transformers import SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from transformers import SpeechT5Processor


# Lookup table used by number_to_words: English words for 0-19, the round
# tens 20-90, and the scale words for 100 and 1000.
number_words = {
    # Units.
    0: "zero", 1: "one", 2: "two", 3: "three", 4: "four",
    5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
    # Ten through nineteen have irregular names and need their own entries.
    10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
    15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen",
    # Round tens.
    20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
    60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety",
    # Scale words.
    100: "hundred",
    1000: "thousand",
}
# Ordered (source, target) substitutions applied by cleanup_text.
#
# The typographic entries are written as escapes so they survive any file
# re-encoding. NOTE(review): the previous literals were mis-encoded (all four
# had collapsed to the same character, which made three of them unreachable);
# they are reconstructed here as the curly double quotes, the low-9 quote
# that looks like a comma, and the en/em dashes -- confirm against the
# training-time normalization.
#
# Symbol words are padded with spaces so they do not fuse with neighbouring
# words ("50%" must become "50 percent", not "50percent"); any surplus
# whitespace is collapsed later by normalize_text.
replacements = [
    ("\u201c", '"'),  # left double quotation mark
    ("\u201d", '"'),  # right double quotation mark
    ("\u201a", ","),  # single low-9 quotation mark
    ("_", " "),
    ("\xa0", " "),  # non-breaking space
    ("\n", " "),
    ("$", " dollar "),
    ("%", " percent "),
    ("&", " and "),
    ("*", " star "),
    ("+", " plus "),
    ("\u2013", "-"),  # en dash
    ("\u2014", "-"),  # em dash
]
def number_to_words(number):
    """Spell out an integer in English words.

    Magnitudes below one trillion are fully spelled out, e.g. ``1234``
    becomes ``"one thousand two hundred thirty four"``. Every scale word
    carries an explicit count, so ``100`` is ``"one hundred"`` and ``1500``
    is ``"one thousand five hundred"``, matching how millions and billions
    are rendered. Negative values are prefixed with ``"minus"``.

    Args:
        number: The integer to convert.

    Returns:
        The spelled-out number. When the magnitude is one trillion or more
        the digits are returned unchanged via ``str``.
    """
    if number < 0:
        # number_words has no negative keys; spell the magnitude instead.
        return "minus " + number_to_words(-number)
    if number < 20:
        return number_words[number]
    if number < 100:
        tens, unit = divmod(number, 10)
        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
    if number >= 1_000_000_000_000:
        # Beyond the largest supported scale word: leave the digits as they are.
        return str(number)

    # Largest scale first, so the first one that fits is the leading group.
    scales = (
        (1_000_000_000, "billion"),
        (1_000_000, "million"),
        (1000, "thousand"),
        (100, "hundred"),
    )
    scale_value, scale_name = next(
        (value, name) for value, name in scales if number >= value
    )
    count, remainder = divmod(number, scale_value)
    spoken = number_to_words(count) + " " + scale_name
    if remainder:
        spoken += " " + number_to_words(remainder)
    return spoken


def replace_numbers_with_words(text):
    """Replace every standalone number in *text* with its spelled-out form.

    A number is a run of digits delimited by word boundaries, so digits glued
    to letters (``"v2"``, ``"5kg"``) are left untouched. Two written forms
    are recognised on top of plain integers:

    * comma-grouped thousands: ``"1,000"`` is read as the single value 1000
      instead of the two numbers ``1`` and ``000``;
    * dotted groups: ``"3.14"`` becomes ``"<three> point <one> <four>"``; the
      part before the first dot is read as a number and every later group is
      read digit by digit, so ``"1.2.3"`` reads "... point ... point ...".

    Args:
        text: The input string.

    Returns:
        *text* with each matched number replaced by the words produced by
        ``number_to_words``.
    """
    # The lookbehind keeps the tail of a comma-separated list such as
    # "10,20,300" from being mistaken for a grouped thousand ("20,300").
    number_pattern = r"\b(?:(?<!\d,)\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)*\b"

    def spell_digits(digits):
        return " ".join(number_to_words(int(digit)) for digit in digits)

    def replace(match):
        whole, *fraction_groups = match.group().split(".")
        spoken = [number_to_words(int(whole.replace(",", "")))]
        spoken.extend(spell_digits(group) for group in fraction_groups)
        return " point ".join(spoken)

    return re.sub(number_pattern, replace, text)


def cleanup_text(text):
    """Apply every (source, target) pair in ``replacements`` to *text*.

    The pairs are applied one after another in list order, each as a plain
    substring replacement over the result of the previous one.
    """
    cleaned = text
    for pair in replacements:
        cleaned = cleaned.replace(*pair)
    return cleaned


def normalize_text(text):
    """Lowercase *text*, strip punctuation and collapse whitespace.

    Only word characters, whitespace and apostrophes survive; every other
    character is deleted outright (not replaced by a space). Runs of
    whitespace become a single space and the ends are trimmed.
    """
    lowered = text.lower()
    # Delete anything that is not a word character, whitespace or apostrophe.
    without_punctuation = re.sub(r'[^\w\s\']', '', lowered)
    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    words = without_punctuation.split()
    return ' '.join(words)


# Text processor, taken from the base SpeechT5 TTS checkpoint.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

# Text-to-speech model used for synthesis (presumably a fine-tuned variant,
# judging by its repository id). Loaded once at import time.
model = SpeechT5ForTextToSpeech.from_pretrained(
    "Yassmen/speecht5_finetuned_english_tehnical"
)


@functools.lru_cache(maxsize=1)
def _load_vocoder():
    """Load the HiFi-GAN vocoder once and reuse it for every request."""
    return SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


def generate_wav_file(text):
    """Synthesize *text* to speech and return the path of the WAV file.

    The text is passed through ``replace_numbers_with_words``,
    ``cleanup_text`` and ``normalize_text`` before being tokenized. The
    speaker embedding is read from ``speaker_embedding.npy`` in the current
    working directory, and the audio is written to ``output.wav`` at 16 kHz.

    Args:
        text: Raw text entered by the user.

    Returns:
        ``"output.wav"`` on success, or ``None`` when nothing speakable is
        left after normalization or when any step fails (the error is
        printed rather than raised).
    """
    try:
        converted_text = replace_numbers_with_words(text)
        cleaned_text = cleanup_text(converted_text)
        final_text = normalize_text(cleaned_text)
        if not final_text:
            # Nothing pronounceable is left; skip the model call entirely.
            print("Error: no speakable text after normalization")
            return None

        inputs = processor(text=final_text, return_tensors="pt")

        speaker_embeddings = torch.tensor(np.load('speaker_embedding.npy'))
        if speaker_embeddings.dim() == 1:
            # generate_speech expects a leading batch axis on the embedding.
            speaker_embeddings = speaker_embeddings.unsqueeze(0)

        # The vocoder is cached: reloading it on every request was by far
        # the most expensive avoidable step.
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=_load_vocoder()
        )

        output_file = "output.wav"
        sf.write(output_file, speech.detach().cpu().numpy(), 16000)
        return output_file
    except Exception as e:
        # Top-level boundary for the UI callback: report and return no audio.
        print(f"Error: {e}")
        return None


# Gradio front end: one text box in, one audio player out. The audio
# component is configured for a file path, which is what generate_wav_file
# returns.
_text_input = gr.Textbox(lines=3, label="Enter text to convert to speech")
_audio_output = gr.Audio(type="filepath", label="Generated Audio")

iface = gr.Interface(
    fn=generate_wav_file,
    inputs=_text_input,
    outputs=_audio_output,
    title="Text-to-Speech Technical EN",
)

# Only start the web server when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()