Mohit0708 committed on
Commit
4cdc4e7
·
verified ·
1 Parent(s): 607fd46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -64
app.py CHANGED
@@ -1,65 +1,78 @@
1
- # app.py
2
- import streamlit as st
3
- import numpy as np
4
- import matplotlib.pyplot as plt
5
- from model.inference import TTSInference
6
-
7
- # Page Config
8
- st.set_page_config(page_title="My Custom TTS Engine", layout="wide")
9
-
10
- st.title("🎙️ Custom Architecture TTS Playground")
11
- st.markdown("This project demonstrates a custom PyTorch implementation of a Transformer-based TTS.")
12
-
13
- # Sidebar for Model Controls
14
- with st.sidebar:
15
- st.header("Model Settings")
16
- checkpoint = st.selectbox("Select Checkpoint", ["checkpoints/checkpoint_epoch_50c.pth", "checkpoints/checkpoint_epoch_3c.pth", "checkpoints/checkpoint_epoch_8.pth"])
17
- device = st.radio("Device", ["cpu", "cuda"])
18
- st.info("Load a specific training checkpoint to compare progress.")
19
-
20
- # Initialize the Inference Engine
21
- # (In a real app, use @st.cache_resource to load this once)
22
- tts_engine = TTSInference(checkpoint_path=checkpoint, device=device)
23
-
24
- # Main Input Area
25
- text_input = st.text_area("Enter Text to Speak:", "Deep learning is fascinating.", height=100)
26
-
27
- col1, col2 = st.columns([1, 2])
28
-
29
- with col1:
30
- if st.button("Generate Audio", type="primary"):
31
- with st.spinner("Running Inference..."):
32
- # Call your backend
33
- audio_data, sample_rate, mel_spec = tts_engine.predict(text_input)
34
-
35
- # Play Audio
36
- st.success("Generation Complete!")
37
- st.audio(audio_data, sample_rate=sample_rate)
38
-
39
- # --- VISUALIZATION (Crucial for Path 2) ---
40
- # Showing the spectrogram proves you understand the data, not just the result.
41
- st.subheader("Mel Spectrogram Analysis")
42
- fig, ax = plt.subplots(figsize=(10, 3))
43
- im = ax.imshow(mel_spec, aspect='auto', origin='lower', cmap='inferno')
44
- plt.colorbar(im, ax=ax)
45
- plt.title("Generated Mel Spectrogram")
46
- plt.xlabel("Time Frames")
47
- plt.ylabel("Mel Channels")
48
- st.pyplot(fig)
49
-
50
- with col2:
51
- st.subheader("Architecture Details")
52
- st.code("""
53
-
54
- class TextToMel(nn.Module):
55
- def __init__(self):
56
- super().__init__()
57
- self.encoder = TransformerEncoder(...)
58
- self.decoder = TransformerDecoder(...)
59
-
60
- def forward(self, text):
61
- # 1. Embed text
62
- # 2. Add Positional Encodings
63
- # 3. Predict Mel Frames
64
- return mel_output
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  """, language="python")
 
# app.py
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import os

from model.inference import TTSInference

# --- Page configuration ---
st.set_page_config(page_title="My Custom TTS Engine", layout="wide")

st.title("🎙️ Custom Architecture TTS Playground")
st.markdown("This project demonstrates a custom PyTorch implementation of a Transformer-based TTS.")

# Training checkpoints offered in the sidebar selector.
CHECKPOINT_OPTIONS = [
    "checkpoints/checkpoint_epoch_50c.pth",
    "checkpoints/checkpoint_epoch_3c.pth",
    "checkpoints/checkpoint_epoch_8.pth",
]

# --- Sidebar: model controls ---
with st.sidebar:
    st.header("Model Settings")
    checkpoint = st.selectbox("Select Checkpoint", CHECKPOINT_OPTIONS)
    # Only CPU is offered: the Hugging Face free tier has no GPU, and
    # exposing "cuda" here would fail at inference time.
    device = st.radio("Device", ["cpu"])
    st.info("Load a specific training checkpoint to compare progress.")
25
+
26
# --- Model loading (cached) ---
# NOTE(review): st.cache_resource keys on the arguments. The previous version
# cached the None returned for a missing checkpoint file, so uploading the
# file later kept serving the stale None until a full app restart. The
# existence check now lives OUTSIDE the cached builder, so a rerun after the
# upload picks the file up.

@st.cache_resource
def _build_engine(ckpt_path: str, dev: str):
    """Construct the TTS inference engine once per (checkpoint, device) pair."""
    return TTSInference(checkpoint_path=ckpt_path, device=dev)


def load_engine(ckpt_path: str, dev: str):
    """Return a cached TTSInference for ckpt_path, or None if the file is absent.

    Returning None instead of raising lets the UI render a friendly error
    when the checkpoint has not been uploaded yet.
    """
    if not os.path.exists(ckpt_path):
        return None  # not cached: a later upload is found on the next rerun
    return _build_engine(ckpt_path, dev)


# Initialize the inference engine for the currently selected settings.
tts_engine = load_engine(checkpoint, device)
35
+
36
# --- Main input area ---
text_input = st.text_area("Enter Text to Speak:", "Deep learning is fascinating.", height=100)

col1, col2 = st.columns([1, 2])

with col1:
    if st.button("Generate Audio", type="primary"):
        if tts_engine is None:
            st.error(f"⚠️ Error: Could not find '{checkpoint}'. Did you upload it to the 'checkpoints' folder on Hugging Face?")
        else:
            with st.spinner("Running Inference..."):
                # Backend returns the raw waveform, its sample rate, and the
                # mel spectrogram used to synthesize it.
                audio_data, sample_rate, mel_spec = tts_engine.predict(text_input)

                # Play the generated audio.
                st.success("Generation Complete!")
                st.audio(audio_data, sample_rate=sample_rate)

                # --- Visualization ---
                # Use the object-oriented Matplotlib API (fig/ax) rather than
                # the pyplot state machine: Streamlit reruns this script on
                # every interaction, and global pyplot state is not rerun-safe.
                st.subheader("Mel Spectrogram Analysis")
                fig, ax = plt.subplots(figsize=(10, 3))
                im = ax.imshow(mel_spec, aspect='auto', origin='lower', cmap='inferno')
                fig.colorbar(im, ax=ax)
                ax.set_title("Generated Mel Spectrogram")
                ax.set_xlabel("Time Frames")
                ax.set_ylabel("Mel Channels")
                st.pyplot(fig)
                # Close the figure; otherwise each rerun leaks a live figure.
                plt.close(fig)
63
+
64
with col2:
    st.subheader("Architecture Details")
    # Static snippet illustrating the model's high-level structure.
    arch_snippet = """
class TextToMel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = TransformerEncoder(...)
        self.decoder = TransformerDecoder(...)

    def forward(self, text):
        # 1. Embed text
        # 2. Add Positional Encodings
        # 3. Predict Mel Frames
        return mel_output
"""
    st.code(arch_snippet, language="python")