rishidahiya committed on
Commit
f42952b
·
verified ·
1 Parent(s): 0962eaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -48
app.py CHANGED
@@ -1,79 +1,112 @@
1
- from flask import Flask, request, jsonify, send_file
2
- from flask_cors import CORS
3
  from encoder import inference as encoder_inference
4
  from synthesizer.inference import Synthesizer
5
  from vocoder import inference as vocoder_inference
6
  import librosa
7
  import soundfile as sf
 
8
  from io import BytesIO
9
  import os
10
 
11
- app = Flask(__name__)
12
- CORS(app)
13
-
14
- # Load models once at startup
15
  print("Loading models...")
16
  encoder_inference.load_model("saved_models/encoder.pt")
17
  synthesizer = Synthesizer("saved_models/synthesizer.pt")
18
  vocoder_inference.load_model("saved_models/vocoder.pt")
19
  print("✓ Models loaded!")
20
 
21
- @app.route('/health', methods=['GET'])
22
- def health():
23
- return jsonify({"status": "ok"}), 200
24
-
25
- @app.route('/clone', methods=['POST'])
26
- def clone_voice():
27
- """Clone voice and synthesize speech"""
28
  try:
29
- # Get text and voice sample
30
- text = request.form.get('text')
31
- voice_sample = request.files.get('voice_sample')
 
 
32
 
33
- if not text or not voice_sample:
34
- return jsonify({"error": "Missing 'text' or 'voice_sample'"}), 400
 
 
 
 
35
 
36
- # Save uploaded file temporarily
37
- temp_path = f"/tmp/{voice_sample.filename}"
38
- voice_sample.save(temp_path)
39
 
40
- # Load and preprocess audio
41
- wav, sr = librosa.load(temp_path, sr=16000)
 
 
 
42
  wav = encoder_inference.preprocess_wav(wav)
 
43
 
44
  # Generate speaker embedding
45
  embed = encoder_inference.embed_utterance(wav)
 
46
 
47
- # Synthesize speech
48
  mels = synthesizer.synthesize_spectrograms([text], [embed])
 
49
 
50
  # Vocode to audio
51
- audio = vocoder_inference.vocoder(mels[0])
52
-
53
- # Save to bytes
54
- audio_io = BytesIO()
55
- sf.write(audio_io, audio, 22050, format='WAV')
56
- audio_io.seek(0)
57
-
58
- # Cleanup
59
- os.remove(temp_path)
60
 
61
- return send_file(audio_io, mimetype='audio/wav', as_attachment=True, download_name='cloned_voice.wav')
62
 
63
  except Exception as e:
64
- return jsonify({"error": str(e)}), 400
 
 
 
65
 
66
- @app.route('/', methods=['GET'])
67
- def index():
68
- return '''
69
- <h1>Voice Cloning API</h1>
70
- <p>POST to /clone with:</p>
71
- <ul>
72
- <li>text: Hindi/Kannada text to synthesize</li>
73
- <li>voice_sample: WAV/OGG audio file (5-10 seconds)</li>
74
- </ul>
75
- <p>Returns: WAV audio with cloned voice</p>
76
- '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- if __name__ == '__main__':
79
- app.run(host='0.0.0.0', port=7860, debug=False)
 
1
+ import gradio as gr
 
2
  from encoder import inference as encoder_inference
3
  from synthesizer.inference import Synthesizer
4
  from vocoder import inference as vocoder_inference
5
  import librosa
6
  import soundfile as sf
7
+ import numpy as np
8
  from io import BytesIO
9
  import os
10
 
11
# Load the encoder, synthesizer and vocoder once, at import time, so that
# clone_voice() can reuse them on every call instead of reloading per request.
# The checkpoint paths are relative, so they resolve against the current
# working directory of the process.
print("Loading models...")
encoder_inference.load_model("saved_models/encoder.pt")
synthesizer = Synthesizer("saved_models/synthesizer.pt")
vocoder_inference.load_model("saved_models/vocoder.pt")
print("✓ Models loaded!")
17
 
18
# Sample rate (Hz) the reference audio is brought to before preprocessing.
_INPUT_SAMPLE_RATE = 16000
# Sample rate (Hz) reported to Gradio for the generated waveform.
_OUTPUT_SAMPLE_RATE = 22050


def _to_mono_float32(audio_data):
    """Return *audio_data* as a 1-D float32 array scaled to roughly [-1, 1]."""
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale by the full-scale value of the actual integer width; for the
        # usual int16 input this is exactly the original division by 32768.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        # Float input is assumed to already be normalised; do not rescale it.
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Multi-channel audio arrives as (samples, channels); average the
        # channels so resampling and the encoder see a mono signal.
        samples = samples.mean(axis=1)
    return samples


def _to_int16_pcm(wav):
    """Convert a float waveform to int16 PCM, saturating instead of wrapping."""
    scaled = np.asarray(wav) * 32768
    # A sample of exactly 1.0 scales to 32768, which does not fit in int16
    # and would wrap to -32768 (a loud click) without the clip.
    return np.clip(scaled, -32768, 32767).astype(np.int16)


def clone_voice(voice_sample, text):
    """Synthesize *text* in the voice of *voice_sample*.

    Args:
        voice_sample: Either a ``(sample_rate, samples)`` tuple, as produced
            by a ``gr.Audio(type="numpy")`` input, or anything
            ``librosa.load`` accepts (e.g. a file path). ``None`` means no
            audio was supplied.
        text: The text to speak.

    Returns:
        A pair ``(audio, status)``. On success ``audio`` is
        ``(22050, int16_samples)`` and ``status`` is ``"✓ Success!"``. On any
        failure ``audio`` is ``None`` and ``status`` starts with ``"Error: "``.
        The function never raises; unexpected exceptions are logged with a
        traceback and reported through ``status``.
    """
    try:
        if voice_sample is None:
            return None, "Error: No voice sample provided"

        if not text or not text.strip():
            return None, "Error: No text provided"

        # Extract audio data and sample rate
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            wav = _to_mono_float32(audio_data)
        else:
            wav, sr = librosa.load(voice_sample, sr=_INPUT_SAMPLE_RATE)

        if wav.size == 0:
            return None, "Error: Voice sample is empty"

        print(f"Audio loaded: sr={sr}, shape={wav.shape}")

        # Resample if needed (wav is guaranteed mono at this point)
        if sr != _INPUT_SAMPLE_RATE:
            wav = librosa.resample(
                wav, orig_sr=sr, target_sr=_INPUT_SAMPLE_RATE
            )

        # Preprocess audio
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Generate speaker embedding
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Synthesize
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Vocode to audio
        wav_generated = vocoder_inference.vocoder(mels[0])
        print(f"Generated audio: {wav_generated.shape}")

        return (_OUTPUT_SAMPLE_RATE, _to_int16_pcm(wav_generated)), "✓ Success!"

    except Exception as e:
        # UI boundary: report the failure in the status box rather than
        # letting the exception escape, but keep the traceback in the logs.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"Error: {str(e)}"
63
 
64
# Build the web UI: inputs (voice sample + text) in the left column, outputs
# (generated audio + status message) in the right column, and a single button
# that runs clone_voice() on the inputs.
with gr.Blocks(title="Voice Cloning - Real-Time Test") as demo:
    gr.Markdown("# 🎤 Voice Cloning Test")
    gr.Markdown("Record your voice, enter text, and hear it synthesized in your voice!")

    with gr.Row():
        # Left column: everything the user supplies.
        with gr.Column():
            gr.Markdown("### Step 1: Record Your Voice")
            voice_input = gr.Audio(
                label="Record or Upload Voice Sample (5-10 seconds)",
                type="numpy",
                sources=["microphone", "upload"],
            )

            gr.Markdown("### Step 2: Enter Text")
            text_input = gr.Textbox(
                label="Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3,
            )

        # Right column: everything clone_voice() returns.
        with gr.Column():
            gr.Markdown("### Step 3: Generated Speech")
            audio_output = gr.Audio(label="Cloned Voice Output", type="numpy")
            status_output = gr.Textbox(label="Status", interactive=False)

    clone_button = gr.Button(
        "🎯 Clone Voice & Generate Speech",
        variant="primary",
        size="lg",
    )
    # The order of inputs/outputs mirrors clone_voice's parameters and its
    # (audio, status) return pair.
    clone_button.click(
        fn=clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output],
    )

    gr.Markdown("""
    ### Instructions:
    1. **Record your voice** using the microphone (5-10 seconds in Hindi/Kannada) OR upload a WAV/OGG file
    2. **Enter text** you want to generate in your voice (Hindi or Kannada)
    3. **Click "Clone Voice & Generate Speech"**
    4. **Wait** (10-30 seconds on CPU) and hear the result!

    ### Tips:
    - Clearer voice samples = better results
    - Longer samples (10 seconds) = better voice cloning
    - Same language as input voice works best
    - Be patient - CPU processing takes time!
    """)
110
 
111
# Start the UI only when this file is run as a script, not when it is
# imported. share=True asks Gradio to also create a public share link.
if __name__ == "__main__":
    demo.launch(share=True)