harmonicsnail committed on
Commit
16f5077
·
1 Parent(s): 7f6a0d4

updated app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -4
app.py CHANGED
@@ -1,7 +1,95 @@
 
1
  import gradio as gr
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app.py
# Gradio demo: predict ARPAbet phonemes for a word with a NetTALK-style model
# and synthesize an audio preview of the result.
import gradio as gr
import os
import numpy as np
import soundfile as sf
import tempfile
from model_inference import NetTALKWrapper

# choose TTS backend: "gtts" or "coqui" (TTS) or "none"
# Read from the environment so Spaces / deployments can switch backends
# without a code change; defaults to the lightweight gTTS path.
TTS_BACKEND = os.environ.get("TTS_BACKEND", "gtts")

# load model once (fast startup if model is cached)
# NOTE(review): loading happens at import time — any failure here prevents
# the app from starting; presumably the .pt file ships with the Space.
MODEL_PATH = "nettalk_model.pt"
model = NetTALKWrapper(MODEL_PATH)
16
# optional: simple gTTS-based synth (works by speaking the phoneme string as text)
def synthesize_gtts(phoneme_text):
    """Synthesize *phoneme_text* with gTTS and return a path to an audio file.

    Returns a ``.wav`` path when pydub (ffmpeg) is available to convert the
    gTTS mp3 output, otherwise falls back to returning the raw ``.mp3`` path
    (Gradio accepts mp3 as audio). Propagates gTTS errors (e.g. no network).
    """
    from gtts import gTTS

    # Reserve the target wav path, then close the handle immediately: the
    # original left the file object open, which leaks a descriptor and (on
    # Windows) locks the path so pydub cannot export to it.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    wav_path = tmp.name
    tmp.close()

    # gTTS only emits mp3; write it next to the reserved wav path.
    mp3_path = wav_path + ".mp3"
    tts = gTTS(phoneme_text, lang="en")
    tts.save(mp3_path)

    # soundfile cannot reliably read mp3; convert via pydub when available.
    try:
        import pydub
        audio = pydub.AudioSegment.from_mp3(mp3_path)
        audio.export(wav_path, format="wav")
    except Exception:
        # fallback: return mp3 (Gradio accepts mp3 as audio)
        return mp3_path

    # Conversion succeeded — remove the intermediate mp3 instead of leaving
    # it behind in the temp directory (the original leaked it every call).
    try:
        os.remove(mp3_path)
    except OSError:
        pass  # best-effort cleanup; a stray temp file is not fatal
    return wav_path
33
+
34
+ # optional: Coqui TTS (phoneme-aware) - heavier but can take ARPAbet inputs
35
# optional: Coqui TTS (phoneme-aware) - heavier but can take ARPAbet inputs
def synthesize_coqui(arpabet):
    """Synthesize *arpabet* with Coqui TTS; return the path to a temp .wav.

    Raises:
        RuntimeError: if the `TTS` package is not installed / importable.
    """
    # This requires the `TTS` package and an appropriate model that accepts phoneme input.
    try:
        from TTS.api import TTS
    except Exception as e:
        raise RuntimeError("TTS package not installed or failed to import.") from e

    # choose a model name you installed / that exists; example placeholder:
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)
    # Some TTS models accept `phoneme` argument or `phoneme_input=True`. Check the model docs.
    wav = tts.tts(arpabet, speaker=None, phoneme_input=False)
    # wav is a numpy array; the sample rate lives on the synthesizer when present.
    sr = getattr(tts.synthesizer, "output_sample_rate", 22050)
    # Reserve the output path and close the handle before soundfile writes to
    # it — the original kept the NamedTemporaryFile open (descriptor leak,
    # and a write conflict on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    sf.write(tmp.name, wav, sr)
    return tmp.name
51
+
52
def predict_and_speak(word):
    """Predict ARPAbet phonemes for *word* and synthesize a preview clip.

    Returns a ``(phoneme_string, audio_path_or_None)`` pair. Audio is ``None``
    when no word was given or when the synthesis backend failed.
    """
    # Guard clause: empty / whitespace-only input gets a prompt, no prediction.
    if not (word and word.strip()):
        return "Please enter a word", None

    phonemes = model.predict(word)

    # Dispatch on the configured backend; anything other than "coqui" takes
    # the lightweight gTTS path (same behavior as the original if/else).
    synth = synthesize_coqui if TTS_BACKEND == "coqui" else synthesize_gtts
    try:
        audio_path = synth(phonemes)
    except Exception as exc:
        # Synthesis is best-effort: still return the phonemes with no audio.
        print("Synthesis failed:", exc)
        audio_path = None

    # gr.Audio accepts: filename (wav/mp3), numpy array, or (np, sr)
    return phonemes, audio_path
72
+
73
# ---- Gradio UI ----
# Page styling for the Blocks app: dark gradient background, centered column.
PAGE_CSS = """
body { background: linear-gradient(135deg,#0f172a,#020617); color: #e6eef8; }
.gradio-container { max-width: 900px; margin: auto; padding: 20px; }
"""

with gr.Blocks(css=PAGE_CSS, theme=gr.themes.Default()) as demo:
    gr.Markdown("# 🧠 NetTALK → ARPAbet demo")
    gr.Markdown("Enter a word, get predicted ARPAbet phonemes and a synthesized audio preview.")

    # Input row: free-text word entry plus the trigger button.
    with gr.Row():
        word_box = gr.Textbox(label="Enter word", placeholder="example: 'computer'", lines=1)
        predict_button = gr.Button("Predict")

    # Outputs: the predicted phoneme string and an audio preview of it.
    phoneme_box = gr.Textbox(label="Predicted ARPAbet Phonemes")
    audio_player = gr.Audio(label="Synthesized audio (preview)")

    # Wire the button to the predictor: one input, two outputs.
    predict_button.click(
        fn=predict_and_speak,
        inputs=[word_box],
        outputs=[phoneme_box, audio_player],
    )

    gr.Markdown("Tip: Replace `preprocess()` and `decode_to_arpabet()` in `model_inference.py` with your real model code.")

# Launch only when executed as a script; importers (e.g. Spaces) use `demo`.
if __name__ == "__main__":
    demo.launch()