| | import streamlit as st |
| | import torch |
| | import torchaudio |
| | import numpy as np |
| | import librosa |
| | import soundfile as sf |
| | from TTS.api import TTS |
| | from fairseq import checkpoint_utils |
| | import wget |
| | import os |
| | from io import BytesIO |
| | import tempfile |
| | import huggingface_hub |
| |
|
class VoiceConverter:
    """Two-stage voice conversion: a VITS checkpoint followed by Coqui TTS.

    On construction the Coqui ``TTS`` model is created and a VITS checkpoint
    is downloaded (once) into ``MODELS_DIR`` and loaded with ``torch.load``.
    ``convert_voice`` then runs an audio file through both models.
    """

    # Sample rate (Hz) that every input is decoded/resampled to.
    TARGET_SAMPLE_RATE = 22050
    # Directory (relative to the working directory) for downloaded weights.
    MODELS_DIR = "pretrained_models"
    TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/your_tts"
    VITS_CHECKPOINT_URL = (
        "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/"
        "resolve/main/G_953000.pth"
    )

    def __init__(self):
        """Pick the compute device and load both models."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_models()

    def load_models(self):
        """Create the TTS model and load the VITS checkpoint.

        Sets ``self.tts`` and ``self.vits_model``. The checkpoint is only
        downloaded when it is not already present on disk.

        Raises:
            TypeError: if the checkpoint file does not contain an object
                with an ``eval()`` method (e.g. it is a bare state dict).
        """
        os.makedirs(self.MODELS_DIR, exist_ok=True)

        self.tts = TTS(self.TTS_MODEL_NAME, progress_bar=False)

        vits_path = os.path.join(self.MODELS_DIR, "vits_female.pth")
        if not os.path.exists(vits_path):
            wget.download(self.VITS_CHECKPOINT_URL, vits_path)

        # SECURITY: torch.load unpickles the file, which can execute arbitrary
        # code. The checkpoint is fetched from the network, so only keep this
        # URL pointed at a source you trust.
        loaded = torch.load(vits_path, map_location=self.device)
        if not callable(getattr(loaded, "eval", None)):
            # Fail with an actionable message instead of an AttributeError on
            # the .eval() call below.
            raise TypeError(
                f"{vits_path} contains a {type(loaded).__name__}, not a model "
                "object; build the model and load these weights into it."
            )
        self.vits_model = loaded
        self.vits_model.eval()

    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
        """Convert the audio file at ``audio_path`` and return the result.

        Args:
            audio_path: Path of the input audio file.
            speaker_id: Target speaker index forwarded to the VITS model.
            emotion: Emotion label forwarded to the TTS call.

        Returns:
            A ``(audio, sample_rate)`` tuple, where ``audio`` is whatever
            ``self.tts.tts_with_vc`` returns and ``sample_rate`` is
            ``TARGET_SAMPLE_RATE``.
        """
        sr = self.TARGET_SAMPLE_RATE
        # Passing sr makes librosa resample while decoding, so no separate
        # resample step is needed (the default sr is also 22050).
        wav, _ = librosa.load(audio_path, sr=sr)

        # Add a leading batch dimension before handing the signal to the model.
        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)

        with torch.no_grad():
            converted = self.vits_model.voice_conversion(
                wav_tensor,
                speaker_id=speaker_id,
            )

        # soundfile treats a 2-D array as (frames, channels); drop singleton
        # dimensions so a (1, N) result is written as N mono frames.
        # assumes the model returns a tensor with only singleton extra dims
        # -- TODO confirm
        converted_wav = np.squeeze(converted.cpu().numpy())

        # Use a unique temporary file rather than a fixed "temp.wav" so that
        # concurrent conversions cannot overwrite each other, and always
        # remove it afterwards.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            sf.write(wav_path, converted_wav, sr)

            # NOTE(review): the first positional argument here is a file path
            # and an `emotion` keyword is passed -- confirm this matches the
            # installed TTS API's `tts_with_vc` signature, and that its output
            # sample rate really is `sr`.
            emotional_wav = self.tts.tts_with_vc(
                wav_path,
                speaker_wav=wav_path,
                emotion=emotion,
            )
        finally:
            if os.path.exists(wav_path):
                os.remove(wav_path)

        return emotional_wav, sr
| |
|
def save_audio(audio_data, sr):
    """Encode ``audio_data`` as WAV into an in-memory buffer.

    Args:
        audio_data: Samples to write; passed straight to ``sf.write``.
        sr: Sample rate in Hz.

    Returns:
        A ``BytesIO`` holding the WAV file, positioned at the start so that
        callers can ``read()`` it directly.
    """
    buffer = BytesIO()
    sf.write(buffer, audio_data, sr, format='WAV')
    # sf.write leaves the position at end-of-data; rewind so a plain read()
    # returns the file instead of b"".
    buffer.seek(0)
    return buffer
| |
|
| | |
# ---------------------------------------------------------------------------
# Streamlit UI. Streamlit re-executes this script from the top on every
# widget interaction, so expensive work is cached or deferred until the
# "Convert Voice" button is pressed.
# ---------------------------------------------------------------------------

# Maps the "Select Voice Character" choice to the speaker id passed to
# VoiceConverter.convert_voice.
SPEAKER_IDS = {
    "Anime Female": 0,
    "Natural Female": 1,
    "Young Female": 2,
    "Mature Female": 3,
}


@st.cache_resource
def _get_converter():
    """Return a shared VoiceConverter, building it only on first use.

    Without caching the models would be reloaded on every script rerun.
    """
    return VoiceConverter()


st.title("AI Voice Converter - Female Voice Transformation")

# NOTE(review): `model_type` and the three advanced settings below are
# collected but not passed to the converter anywhere in this script, so they
# currently have no effect on the output.
model_type = st.selectbox(
    "Select Voice Model",
    ["VITS Female", "YourTTS Female", "Mixed Model"],
)

voice_character = st.selectbox(
    "Select Voice Character",
    ["Anime Female", "Natural Female", "Young Female", "Mature Female"],
)

emotion = st.selectbox(
    "Select Emotion",
    ["Happy", "Sad", "Angry", "Neutral", "Excited"],
)

with st.expander("Advanced Settings"):
    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)

uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    if st.button("Convert Voice"):
        # Keep the upload's own extension (.wav or .mp3) on the temp file
        # instead of always labelling it .wav.
        suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
        tmp_path = None
        try:
            with st.spinner("Converting voice... This may take a few moments."):
                # Loaded inside the try so that model-loading failures are
                # reported through st.error like any other conversion error.
                converter = _get_converter()

                # The upload is written to disk only when a conversion is
                # requested, not on every rerun.
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                    tmp_path = tmp_file.name
                    tmp_file.write(uploaded_file.getvalue())

                converted_audio, sr = converter.convert_voice(
                    tmp_path,
                    speaker_id=SPEAKER_IDS[voice_character],
                    emotion=emotion,
                )

                # Take the raw bytes once so the player and the download
                # button do not share a stream position.
                audio_bytes = save_audio(converted_audio, sr).getvalue()

            st.audio(audio_bytes, format='audio/wav')

            st.download_button(
                label="Download Converted Audio",
                data=audio_bytes,
                file_name="ai_converted_voice.wav",
                mime="audio/wav",
            )

        except Exception as e:
            # Top-level UI boundary: show the failure instead of crashing.
            st.error(f"Error during conversion: {str(e)}")
        finally:
            # delete=False means the temp file must be removed explicitly.
            if tmp_path is not None and os.path.exists(tmp_path):
                os.remove(tmp_path)

st.markdown("""
### Model Information:
1. **VITS Female**: Pre-trained on a large dataset of female voices
2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
3. **Mixed Model**: Combination of multiple models for better quality

### Voice Characters:
- **Anime Female**: High-pitched, animated style voice
- **Natural Female**: Realistic female voice
- **Young Female**: Young adult female voice
- **Mature Female**: Mature female voice

### Tips for Best Results:
- Use clear audio input with minimal background noise
- Short audio clips (5-30 seconds) work best
- Experiment with different emotions and voice characters
- Adjust advanced settings for fine-tuning
""")
| |
|
# Requirements (install each with pip):
#   TTS
#   fairseq
#   torch
#   torchaudio
#   streamlit
#   librosa
#   soundfile
#   numpy
#   wget
#   huggingface_hub
#
# Kept as comments rather than a bare string literal: Streamlit's "magic"
# feature writes standalone string expressions into the app page, which would
# display this list to end users.