Spaces:

GAASH-Lab
/

Matcha-TTS-Kashmiri-Demo

Running

App Files Files Community

saeedabdulmuizz committed on Feb 1

Commit

2a22d6f

verified ·

1 Parent(s): 56b24d2

Update app.py

Browse files

Added option for translation

Files changed (1) hide show

app.py +78 -11

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import urllib.request
 import os  # Add this import at the top
 import soundfile as sf
 from huggingface_hub import hf_hub_download
 from matcha.models.matcha_tts import MatchaTTS
 from matcha.hifigan.models import Generator as HiFiGAN
 from matcha.hifigan.config import v1
@@ -40,7 +42,52 @@ def load_models():
     return model, vocoder
 model, vocoder = load_models()
 # --- Update the function signature to accept two arguments ---
 @torch.inference_mode()
@@ -74,16 +121,36 @@ def process(text, speaker_id):
     sf.write(output_path, audio, 22050)
     return output_path
-# --- Update the Interface inputs to match (2 inputs) ---
-demo = gr.Interface(
-    fn=process,
-    inputs=[
-        gr.Textbox(label="Kashmiri Text"),
-        # Added a slider so you can select the voice (0 is usually the default)
-        gr.Slider(0, model.n_spks - 1, step=1, value=0, label="Speaker ID")
-    ],
-    outputs=gr.Audio(label="Audio", type="filepath"),
-    title="GAASH-Lab: Kashmiri TTS"
-)
 demo.launch()

 import os  # Add this import at the top
 import soundfile as sf
 from huggingface_hub import hf_hub_download
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
 from matcha.models.matcha_tts import MatchaTTS
 from matcha.hifigan.models import Generator as HiFiGAN
 from matcha.hifigan.config import v1
     return model, vocoder
+# Translation Config
+TRANSLATION_BASE_MODEL = "sarvamai/sarvam-translate"
+TRANSLATION_ADAPTER = "GAASH-Lab/Sarvam-Kashmiri-finetuned"
+def load_translation_models():
+    print("[*] Loading Sarvam Translate Adapter...")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_BASE_MODEL)
+        base_model = AutoModelForCausalLM.from_pretrained(
+            TRANSLATION_BASE_MODEL,
+            device_map="auto",
+            torch_dtype=torch.float16
+        )
+        model = PeftModel.from_pretrained(base_model, TRANSLATION_ADAPTER)
+        model.eval()
+        return tokenizer, model
+    except Exception as e:
+        print(f"Error loading translation model: {e}")
+        return None, None
 model, vocoder = load_models()
+trans_tokenizer, trans_model = load_translation_models()
+def translate(text):
+    if trans_model is None:
+        return "Translation model unavailable."
+    messages = [
+        {"role": "system", "content": "Translate the text below to Kashmiri."},
+        {"role": "user", "content": text},
+    ]
+    try:
+        # Note: apply_chat_template returns input_ids tensor directly if tokenize=True and return_tensors="pt"
+        input_ids = trans_tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(trans_model.device)
+    except Exception as e:
+        print(f"Chat template error: {e}")
+        return "Error in translation template."
+    with torch.no_grad():
+        outputs = trans_model.generate(input_ids, max_new_tokens=256)
+    # Slice reusing the input length
+    decoded = trans_tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
+    return decoded.strip()
 # --- Update the function signature to accept two arguments ---
 @torch.inference_mode()
     sf.write(output_path, audio, 22050)
     return output_path
+# --- Gradio UI with Translation Option ---
+with gr.Blocks(title="GAASH-Lab: Kashmiri TTS & Translation") as demo:
+    gr.Markdown("# GAASH-Lab: Kashmiri TTS & Translation")
+    gr.Markdown("Enter text in English (check the box) or Kashmiri directly.")
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(label="Input Text", placeholder="Type here...")
+            is_english = gr.Checkbox(label="Input is English (Translate first)", value=False)
+            speaker_slider = gr.Slider(0, model.n_spks - 1, step=1, value=0, label="Speaker ID")
+            gen_btn = gr.Button("Generate Speech", variant="primary")
+        with gr.Column():
+            trans_view = gr.Textbox(label="Processed/Translated Kashmiri Text", interactive=False)
+            audio_output = gr.Audio(label="Audio", type="filepath")
+    def pipeline(text, is_eng, spk_id):
+        processed_text = text
+        if is_eng:
+            print(f"Translating input: {text}")
+            processed_text = translate(text)
+        print(f"Synthesizing for: {processed_text}")
+        audio_path = process(processed_text, spk_id)
+        return processed_text, audio_path
+    gen_btn.click(
+        pipeline,
+        inputs=[input_text, is_english, speaker_slider],
+        outputs=[trans_view, audio_output]
+    )
 demo.launch()