Spaces:

E-motionAssistant
/

Space2

Sleeping

App Files Files Community

amasha03 committed 24 days ago

Commit

c36673e

verified ·

1 Parent(s): 754e278

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -36

app.py CHANGED Viewed

@@ -1,63 +1,52 @@
 import gradio as gr
 import torch
-from TTS.utils.synthesizer import Synthesizer
-from TTS.tts.models.vits import Vits
-from TTS.tts.configs.vits_config import VitsConfig
 from huggingface_hub import hf_hub_download
 import os
 def load_eng_model():  # Download the VITS checkpoint and config, build the model by hand, return a Synthesizer wrapping it.
     repo_id = "E-motionAssistant/text-to-speech-VITS-english"
-    print(f"--- Bypassing TTS Library English Defaults ---")
     model_path = hf_hub_download(repo_id=repo_id, filename="best_model.pth")
     config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
-    # 1. Load the downloaded config.json into a fresh VitsConfig.
-    config = VitsConfig()
-    config.load_json(config_path)
-    # 2. Override the configured vocabulary: force num_chars to 137 and clear `characters`.
-    # NOTE(review): intended to stop the library from sizing the model for 131 symbols - confirm.
-    config.model_args.num_chars = 137
-    if hasattr(config, 'characters'):
-        config.characters = None # NOTE(review): meant to make the library ignore the config's character table - TODO confirm
-    # 3. Instantiate the Vits model from the modified config.
-    model = Vits.init_from_config(config)
-    # 4. Load the checkpoint onto the CPU and copy its "model" state dict into the model.
-    checkpoint = torch.load(model_path, map_location="cpu")
-    # strict=False only ignores missing/unexpected keys in the state dict.
-    model.load_state_dict(checkpoint["model"], strict=False)
-    model.eval()
-    # 5. Build a CPU Synthesizer from the same files, then swap in the manually built model.
-    syn = Synthesizer(
-        tts_checkpoint=model_path,
-        tts_config_path=config_path,
-        use_cuda=False
-    )
-    syn.tts_model = model
-    return syn
 # --- Initialization: load the model once at module level; any failure leaves eng_tts as None ---
 try:
     eng_tts = load_eng_model()
-    print("--- SUCCESS: LIBRARY BYPASSED, MODEL LOADED ---")
 except Exception as e:
-    print(f"LOAD FAILED: {e}")
     eng_tts = None  # generate_voice returns None when eng_tts is falsy
 def generate_voice(text):
     if not eng_tts: return None
     try:
-        output_path = os.path.join(os.getcwd(), "output.wav")
-        # Synthesize using the manual model
-        wav = eng_tts.tts(text=str(text))
-        eng_tts.save_wav(wav, output_path)
         return output_path
     except Exception as e:
         print(f"Synthesis Error: {e}")
@@ -67,7 +56,7 @@ demo = gr.Interface(
     fn=generate_voice,
     inputs=gr.Textbox(label="English Text"),
     outputs=gr.Audio(label="Result", type="filepath"),
-    title="TTS Library Override"
 )
 if __name__ == "__main__":

 import gradio as gr
 import torch
+from TTS.api import TTS
 from huggingface_hub import hf_hub_download
 import os
 def load_eng_model():  # Download the checkpoint and config, trim the text embedding if needed, return a TTS instance.
     repo_id = "E-motionAssistant/text-to-speech-VITS-english"
+    print("--- Starting Weights Surgery ---")
     model_path = hf_hub_download(repo_id=repo_id, filename="best_model.pth")
     config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
+    # 1. Load the checkpoint onto the CPU with torch.load.
+    checkpoint = torch.load(model_path, map_location="cpu")
+    # 2. If the text-encoder embedding has 137 rows, keep only the first 131.
+    # NOTE(review): rows 131-136 are discarded; presumably the config defines 131 symbols - confirm.
+    raw_weights = checkpoint['model']['text_encoder.emb.weight']
+    print(f"Original weight shape: {raw_weights.shape}")
+    if raw_weights.shape[0] == 137:
+        print("Trimming 137 -> 131...")
+        checkpoint['model']['text_encoder.emb.weight'] = raw_weights[:131, :]
+    # 3. Write the checkpoint (trimmed or not) to fixed_model.pth in the current working directory.
+    fixed_model_path = os.path.join(os.getcwd(), "fixed_model.pth")
+    torch.save(checkpoint, fixed_model_path)
+    print("Surgery complete. Fixed model saved.")
+    # 4. Load the saved checkpoint through the standard TTS API on CPU (gpu=False).
+    # NOTE(review): assumes the library expects 131 embedding rows for this config - TODO confirm.
+    tts = TTS(model_path=fixed_model_path, config_path=config_path, gpu=False)
+    return tts
 # --- Initialization: load the model once at module level; any failure leaves eng_tts as None ---
 try:
     eng_tts = load_eng_model()
+    print("--- SUCCESS: SURGERY WORKED, SYSTEM ONLINE ---")
 except Exception as e:
+    print(f"CRITICAL ERROR: {e}")
     eng_tts = None  # generate_voice returns None when eng_tts is falsy
 def generate_voice(text):
     if not eng_tts: return None
     try:
+        output_path = "output.wav"
+        eng_tts.tts_to_file(text=str(text), file_path=output_path)
         return output_path
     except Exception as e:
         print(f"Synthesis Error: {e}")
     fn=generate_voice,
     inputs=gr.Textbox(label="English Text"),
     outputs=gr.Audio(label="Result", type="filepath"),
+    title="English TTS (Surgery Version)"
 )
 if __name__ == "__main__":