# NOTE(review): the three lines below were HuggingFace web-viewer residue
# (uploader name / commit message / commit hash) accidentally pasted into the
# source; kept as comments so the file parses.
# saeedabdulmuizz's picture
# Update app.py
# aa24966 verified
import gradio as gr
from pathlib import Path
import torch
import urllib.request
import os

# HuggingFace Spaces GPU support: the `spaces` package only exists inside a
# Spaces runtime, so fall back gracefully when running elsewhere.
try:
    import spaces
    SPACES_AVAILABLE = True
except ImportError:
    SPACES_AVAILABLE = False
    print("[!] spaces module not available, running without GPU decorator")

import soundfile as sf
import traceback
import huggingface_hub
from huggingface_hub import hf_hub_download

# Patch for older diffusers compatibility with newer huggingface_hub:
# `cached_download` was removed upstream, so alias it to `hf_hub_download`
# when missing so older dependency code keeps working.
if not hasattr(huggingface_hub, "cached_download"):
    huggingface_hub.cached_download = hf_hub_download

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from matcha.models.matcha_tts import MatchaTTS
from matcha.hifigan.models import Generator as HiFiGAN
from matcha.hifigan.config import v1
from matcha.hifigan.env import AttrDict
from matcha.text import text_to_sequence
from matcha.utils.utils import intersperse

# Optional auth token for private/gated HF repos (None if unset).
HF_TOKEN = os.getenv("HF_TOKEN")
# All models/tensors are placed on this device.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Repo hosting the Matcha-TTS Kashmiri checkpoint.
MODEL_REPO = "GAASH-Lab/Matcha-TTS-Kashmiri"
def load_models():
    """Download and initialise the Matcha-TTS model and HiFi-GAN vocoder.

    Returns:
        tuple: ``(model, vocoder)`` — the Matcha-TTS acoustic model and the
        HiFi-GAN vocoder, both in eval mode on ``DEVICE``.
    """
    print("[*] Downloading GAASH-Lab checkpoint...")
    ckpt = hf_hub_download(repo_id=MODEL_REPO, filename="model.ckpt", token=HF_TOKEN)
    # weights_only=False: Lightning checkpoints carry pickled hyperparameters,
    # not just tensors, so a tensors-only load would fail.
    model = MatchaTTS.load_from_checkpoint(ckpt, map_location=DEVICE, weights_only=False)
    model.eval()

    print("[*] Loading HiFi-GAN vocoder...")
    # The file 'generator_v1' is what the code calls 'hifigan_T2_v1'.
    # We download it from the official GitHub release if not found locally.
    vocoder_path = Path("hifigan_T2_v1")
    if not vocoder_path.exists():
        url = "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/generator_v1"
        # Download to a temporary name and rename on success. Writing straight
        # to vocoder_path (as before) meant an interrupted download left a
        # truncated file that passed the exists() check on the next startup.
        tmp_path = vocoder_path.with_suffix(".tmp")
        try:
            urllib.request.urlretrieve(url, tmp_path)
            tmp_path.replace(vocoder_path)
        except Exception:
            # Never leave a partial download behind; re-raise the original error.
            tmp_path.unlink(missing_ok=True)
            raise

    vocoder = HiFiGAN(AttrDict(v1)).to(DEVICE)
    state_dict = torch.load(vocoder_path, map_location=DEVICE)
    vocoder.load_state_dict(state_dict['generator'])
    vocoder.eval()
    # Weight norm is only needed during training; removing it speeds up inference.
    vocoder.remove_weight_norm()
    return model, vocoder
# Translation Config
# Base seq2seq-style causal LM and the Kashmiri LoRA adapter applied on top.
TRANSLATION_BASE_MODEL = "sarvamai/sarvam-translate"
TRANSLATION_ADAPTER = "GAASH-Lab/Sarvam-Kashmiri-finetuned"

# Global cache for translation model (loaded lazily when GPU is available).
# Keys: "tokenizer", "model" (merged PEFT model), "loaded" (load-completed flag).
_trans_cache = {"tokenizer": None, "model": None, "loaded": False}
def load_translation_models():
    """Load translation model lazily on first use (CPU deployment).

    Returns ``(tokenizer, model)`` from the module cache, or ``(None, None)``
    when loading fails for any reason.
    """
    # Fast path: already loaded once in this process.
    if _trans_cache["loaded"]:
        return _trans_cache["tokenizer"], _trans_cache["model"]

    print("[*] Loading Sarvam Translate Adapter (CPU mode)...")
    try:
        # Causal LMs require left padding for generation.
        tok = AutoTokenizer.from_pretrained(TRANSLATION_BASE_MODEL, trust_remote_code=True)
        tok.padding_side = "left"
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        # bfloat16 halves memory versus fp32 and is better supported on CPU
        # than float16.
        print("[*] Loading base model on CPU (bfloat16)...")
        base = AutoModelForCausalLM.from_pretrained(
            TRANSLATION_BASE_MODEL,
            torch_dtype=torch.bfloat16,
            device_map="cpu",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

        print("[*] Loading LoRA adapter...")
        peft_model = PeftModel.from_pretrained(base, TRANSLATION_ADAPTER)

        # Folding LoRA weights into the base model removes adapter overhead
        # from every generation step.
        print("[*] Merging LoRA weights for faster inference...")
        merged = peft_model.merge_and_unload()
        merged.eval()
        print(f"[+] Translation model loaded and merged successfully on CPU.")

        _trans_cache.update(tokenizer=tok, model=merged, loaded=True)
        return tok, merged
    except Exception as e:
        print(f"[-] Error loading translation model: {e}")
        traceback.print_exc()
        return None, None
# Load TTS models at startup (they're smaller).
# The translation model is deliberately NOT loaded here — it is fetched
# lazily on the first translate() call (see load_translation_models).
model, vocoder = load_models()
def _translate_impl(text):
    """Internal translation implementation - matching evaluate_model.py approach."""
    # Fetch the (cached) tokenizer/model pair; may trigger the one-off load.
    trans_tokenizer, trans_model = load_translation_models()
    if trans_model is None:
        return "Translation model unavailable."

    # Chat-style prompt, identical to the one used in evaluate_model.py.
    messages = [
        {"role": "system", "content": "Translate the text below to Kashmiri."},
        {"role": "user", "content": text},
    ]

    try:
        prompt = trans_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        encoded = trans_tokenizer(
            prompt, padding=True, truncation=True, max_length=512, return_tensors="pt"
        )
        # Keep every tensor on whatever device the model lives on.
        encoded = {name: t.to(trans_model.device) for name, t in encoded.items()}
        print(f"[DEBUG] Input tokens: {encoded['input_ids'].shape[1]}")
    except Exception as e:
        print(f"Chat template error: {e}")
        traceback.print_exc()
        return "Error in translation template."

    try:
        import time

        t0 = time.time()
        print("[DEBUG] Starting generation...")
        # Greedy decoding with a single beam is the cheapest option on CPU and
        # matches the near-greedy temp=0.01 quality used previously.
        with torch.no_grad():
            generated = trans_model.generate(
                **encoded,
                max_new_tokens=256,
                do_sample=False,
                num_beams=1,
            )
        elapsed = time.time() - t0
        print(f"[DEBUG] Generation completed in {elapsed:.2f}s")

        # Strip the prompt tokens; only the continuation is the translation.
        prompt_len = encoded['input_ids'].shape[1]
        new_tokens = generated[0][prompt_len:]
        decoded = trans_tokenizer.decode(new_tokens, skip_special_tokens=True).replace("\n", "")
        print(f"[DEBUG] New tokens: {len(new_tokens)}")
        print(f"[DEBUG] Decoded: '{decoded}'")
        return decoded.strip()
    except Exception as e:
        print(f"Generation error: {e}")
        traceback.print_exc()
        return "Error during translation generation."
# Simple wrapper function for CPU deployment
def translate(text):
    """Translate English text to Kashmiri.

    Thin wrapper around _translate_impl so the public name stays stable if
    the deployment later adds a GPU decorator. Returns the decoded Kashmiri
    string, or an error-message string when loading/generation fails.
    """
    return _translate_impl(text)
# --- Update the function signature to accept two arguments ---
@torch.inference_mode()
def process(text, speaker_id, n_timesteps=10):
    """Synthesise Kashmiri speech for *text*; returns the path of the wav file."""
    # 1. Kashmiri script normalisation: map Arabic-presentation variants to
    #    their Kashmiri forms and drop the full stop.
    for src, dst in (("ي", "ی"), ("ك", "ک"), ("۔", "")):
        text = text.replace(src, dst)
    text = text.strip()

    # 2. Text -> token-id sequence.
    sequence, _ = text_to_sequence(text, ["basic_cleaners"])
    # Keep only integer ids: characters outside the TTS vocabulary come back
    # as non-int entries and would crash tensor construction.
    clean_ids = [tok for tok in sequence if isinstance(tok, int)]
    x = torch.tensor(intersperse(clean_ids, 0), dtype=torch.long, device=DEVICE)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)

    # 3. Speaker id tensor — required by the model even with a single voice.
    spks = torch.tensor([int(speaker_id)], device=DEVICE, dtype=torch.long)

    # 4. Mel-spectrogram from Matcha-TTS.
    output = model.synthesise(
        x,
        x_lengths,
        n_timesteps=n_timesteps,
        temperature=0.667,
        spks=spks,
        length_scale=1.0,
    )

    # 5. Waveform via HiFi-GAN, clipped to the valid [-1, 1] range.
    audio = vocoder(output['mel']).clamp(-1, 1).cpu().squeeze().numpy()
    output_path = "out.wav"
    sf.write(output_path, audio, 22050)
    return output_path
# --- Gradio UI with Translation Option ---
with gr.Blocks(title="GAASH-Lab: Kashmiri TTS & Translation") as demo:
    gr.Markdown("# GAASH-Lab: Kashmiri TTS & Translation")
    gr.Markdown("Enter text in English (check the box) or Kashmiri directly.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", placeholder="Type here...")
            is_english = gr.Checkbox(label="Input is English (Translate first)", value=False)
            speaker_radio = gr.Radio(choices=["Male", "Female"], value="Male", label="Speaker Voice")
            quality_radio = gr.Radio(
                choices=["Low (fast)", "Medium", "High"],
                value="Low (fast)",
                label="Quality"
            )
            gen_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            trans_view = gr.Textbox(label="Processed/Translated Kashmiri Text", interactive=False)
            audio_output = gr.Audio(label="Audio", type="filepath")

    def pipeline(text, is_eng, spk_voice, quality):
        """Optionally translate, then synthesise; returns (kashmiri_text, wav_path)."""
        # Speaker ids 422/423 select the male/female voices in the checkpoint.
        spk_id = 422 if spk_voice == "Male" else 423
        # Map the quality label onto the number of synthesis steps.
        steps = 10 if "Low" in quality else 50 if "Medium" in quality else 500

        kashmiri = text
        if is_eng:
            print(f"Translating input: {text}")
            kashmiri = translate(text)

        print(f"Synthesizing for: {kashmiri}")
        return kashmiri, process(kashmiri, spk_id, steps)

    gen_btn.click(
        pipeline,
        inputs=[input_text, is_english, speaker_radio, quality_radio],
        outputs=[trans_view, audio_output]
    )

demo.launch(ssr_mode=False)