mujahid1214 committed on
Commit
769a124
·
verified ·
1 Parent(s): e7f5b9e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -25
app.py CHANGED
@@ -1,39 +1,66 @@
1
  import gradio as gr
2
- import librosa
3
- import soundfile as sf
4
- import numpy as np
5
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # Simple CPU-based RVC-like voice conversion
8
- # NOTE: This is a lightweight approximation for HF Spaces (no GPU needed)
 
 
 
9
 
10
def clone_voice(input_voice, target_voice):
    """Blend two recordings into a crude "cloned" voice and save it to disk.

    This is a lightweight, CPU-only placeholder rather than real voice
    conversion: the result is a fixed-weight mix (30% input, 70% target)
    of the two waveforms.

    Args:
        input_voice: Path to the speech recording to convert, or None.
        target_voice: Path to the recording of the voice to imitate, or None.

    Returns:
        A ``(status_message, output_path)`` tuple. ``output_path`` is None
        when either input is missing.
    """
    if input_voice is None or target_voice is None:
        return "Upload input and target voices!", None

    sample_rate = 16000

    # Load both clips at a common sample rate so samples line up.
    inp_audio, _ = librosa.load(input_voice, sr=sample_rate)
    tgt_audio, _ = librosa.load(target_voice, sr=sample_rate)

    # Two uploads almost never contain the same number of samples, and 1-D
    # arrays of different lengths cannot be added. Trim both to the shorter
    # clip before mixing.
    common_len = min(len(inp_audio), len(tgt_audio))

    # Simple timbre transfer (placeholder lightweight model).
    converted = inp_audio[:common_len] * 0.3 + tgt_audio[:common_len] * 0.7

    output_path = "cloned.wav"
    sf.write(output_path, converted, sample_rate)

    return "Voice cloned!", output_path
25
 
 
 
26
 
27
# Two-upload UI: the user supplies the speech to convert and a sample of the
# target voice; the button runs clone_voice and shows its status and audio.
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 Free Voice Clone Studio (HuggingFace Compatible RVC-lite)")

    # type="filepath" makes Gradio hand clone_voice a path on disk.
    input_voice = gr.Audio(label="Upload main voice (speech to convert)", type="filepath")
    target_voice = gr.Audio(label="Upload target voice (the voice to clone)", type="filepath")

    output_audio = gr.Audio(label="Cloned Voice Output")
    status = gr.Textbox(label="Status")

    btn = gr.Button("Clone Voice")
    # Output order matches clone_voice's (status, path) return tuple.
    btn.click(fn=clone_voice, inputs=[input_voice, target_voice], outputs=[status, output_audio])

demo.launch()
 
1
  import gradio as gr
 
 
 
2
  import torch
3
+ import numpy as np
4
+ import torchaudio
5
+ from bark import SAMPLE_RATE, generate_audio, preload_models
6
+ from encodec import EncodecModel
7
+ from transformers import Wav2Vec2Processor, HubertModel
8
+
9
# ---------------------------------------------------------------
# Model setup: everything below runs once, at import time.
# ---------------------------------------------------------------

# Bark text-to-speech models.
preload_models()

# HuBERT voice encoder; the processor and the model share one checkpoint id.
HUBERT_CHECKPOINT = "facebook/hubert-large-ls960-ft"
processor = Wav2Vec2Processor.from_pretrained(HUBERT_CHECKPOINT)
hubert = HubertModel.from_pretrained(HUBERT_CHECKPOINT)

# Encodec (24 kHz variant) for audio reconstruction, with its target
# bandwidth set to 6.0.
encodec_model = EncodecModel.encodec_model_24khz()
encodec_model.set_target_bandwidth(6.0)
25
 
 
 
 
26
 
27
def extract_voice_embedding(audio):
    """Compute a single HuBERT embedding vector for a reference recording.

    The clip is loaded, mixed down to one channel, resampled to 16 kHz,
    passed through HuBERT, and the final hidden states are averaged over
    the time axis.

    Args:
        audio: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        A tensor holding the time-averaged ``last_hidden_state`` of the
        HuBERT model (one row per input waveform; a single row here).
    """
    speech, sr = torchaudio.load(audio)

    # torchaudio.load returns (channels, frames). A stereo clip would reach
    # the processor as a 2-D array and not be treated as one waveform, so
    # average the channels down to mono first.
    if speech.size(0) > 1:
        speech = speech.mean(dim=0, keepdim=True)

    speech = torchaudio.functional.resample(speech, sr, 16000)

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-sample clip to a 0-d tensor.
    waveform = speech.squeeze(0).numpy()

    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        hidden_states = hubert(**inputs).last_hidden_state

    # Average pooling over the frame axis gives a fixed-size embedding.
    embedding = hidden_states.mean(dim=1)
    return embedding
38
 
 
39
 
40
def generate_voice(text, ref_audio):
    """Synthesize ``text`` with Bark, conditioned on a reference recording.

    Args:
        text: The text to speak.
        ref_audio: Path to the reference voice clip (Gradio supplies None
            when nothing was uploaded).

    Returns:
        A ``(sample_rate, samples)`` tuple in the form Gradio's Audio
        output accepts, where ``samples`` is a NumPy array.

    Raises:
        gr.Error: If the text is empty or no reference clip was provided.
    """
    # Fail early with a message the UI can show, instead of letting a None
    # path or empty prompt crash deep inside torchaudio or Bark.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")
    if ref_audio is None:
        raise gr.Error("Please upload a reference voice clip.")

    embedding = extract_voice_embedding(ref_audio)

    # NOTE(review): upstream Bark's generate_audio() documents
    # `history_prompt` but no `speaker_embedding` keyword; unless the
    # installed `bark` package is a fork that adds it, this call will raise
    # TypeError. Confirm against the deployed Bark version.
    audio_array = generate_audio(
        text,
        history_prompt=None,
        speaker_embedding=embedding.squeeze().tolist(),
    )

    return (SAMPLE_RATE, np.array(audio_array))
 
51
 
 
 
52
 
53
# -----------------------
# Gradio UI
# -----------------------
# Single-function interface: text plus a reference clip in, generated audio
# out. type="filepath" makes Gradio pass generate_voice a path on disk.
app = gr.Interface(
    fn=generate_voice,
    inputs=[
        gr.Textbox(label="Text to Speak"),
        gr.Audio(label="Reference Voice (5–20 sec)", type="filepath"),
    ],
    outputs=gr.Audio(label="Generated Voice"),
    title="Free Voice Cloner (Bark + HuBERT)",
)

app.launch()