Spaces:
Running
Running
| # app.py | |
| # Malayalam TTS (Free) – Multi-style, Prosody (rate & pitch), Batch paragraphs, WAV+MP3 | |
| # Model: AI4Bharat VITS (supports Malayalam among 13 Indian languages) | |
| import gradio as gr | |
| import soundfile as sf | |
| import tempfile | |
| import torch | |
| from transformers import AutoModel, AutoTokenizer | |
| import numpy as np | |
| import os | |
# Optional MP3 conversion.
# pydub is treated as an optional dependency: _HAS_PYDUB records whether the
# import succeeded so the MP3 export path can be skipped when it did not.
try:
    from pydub import AudioSegment
except Exception:
    _HAS_PYDUB = False
else:
    _HAS_PYDUB = True
# Hugging Face Hub id of the checkpoint loaded below.
MODEL_ID = "ai4bharat/vits_rasa_13"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# trust_remote_code=True allows transformers to execute the model/tokenizer
# code that is published alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(device)

DEFAULT_SPEAKER = 11  # MAL_F
# Text pre-filled in the input box. It holds two paragraphs separated by a
# blank line so that paragraph (batch) splitting has something to act on.
# The second paragraph previously contained Armenian letters spliced into the
# Malayalam word for "paragraph" (an encoding accident); it is spelled in
# Malayalam script throughout now.
DEFAULT_TEXT = (
    "മലയാളം ടെക്സ്റ്റ് ശബ്ദമായി മാറ്റാൻ ഇതുപയോഗിക്കുക. താഴെ ഒരു ഉദാഹരണം നൽകുന്നു.\n\n"
    "ഇത് ഒരു രണ്ടാം പാരഗ്രാഫ് ആണ്."
)
# Style ids offered in the UI paired with their display labels. The id is what
# gets passed to the model as ``emotion_id``. The ids are not contiguous:
# 9, 11 and 13 have no entry in this table.
_STYLE_TABLE = (
    (0, "ALEXA"),
    (1, "ANGER"),
    (2, "BB"),
    (3, "BOOK"),
    (4, "CONV"),
    (5, "DIGI"),
    (6, "DISGUST"),
    (7, "FEAR"),
    (8, "HAPPY"),
    (10, "NEWS"),
    (12, "SAD"),
    (14, "SURPRISE"),
    (15, "UMANG"),
    (16, "WIKI"),
)
STYLE_LABELS = dict(_STYLE_TABLE)
def split_paragraphs(text: str):
    """Split *text* into paragraphs separated by one or more blank lines.

    A line counts as blank when it is empty or contains only whitespace, so a
    separator line holding stray spaces or tabs still splits paragraphs.
    Carriage returns are dropped first so Windows line endings behave like
    plain newlines.

    Args:
        text: Raw input text, possibly spanning several paragraphs.

    Returns:
        A list of non-empty paragraph strings, each stripped of leading and
        trailing whitespace, in input order. Single newlines inside a
        paragraph are kept. The list is empty when *text* has no
        non-whitespace content.
    """
    paragraphs = []
    current = []
    for line in text.replace('\r', '').split('\n'):
        if line.strip():
            current.append(line)
        elif current:
            # A blank line closes the paragraph collected so far.
            paragraphs.append('\n'.join(current).strip())
            current = []
    if current:
        paragraphs.append('\n'.join(current).strip())
    return paragraphs
def time_scale(wav: np.ndarray, rate: float) -> np.ndarray:
    """Naive time scaling by linear interpolation. rate>1 -> faster (shorter).

    Args:
        wav: Sample array to resample. It is expected to be one-dimensional,
            because ``np.interp`` only accepts 1-D data.
        rate: Speed factor. Non-positive values are treated as 1.0.

    Returns:
        *wav* itself when it is empty or when *rate* is (effectively) 1.0;
        otherwise a new array of ``max(1, int(len(wav) / rate))`` samples with
        the same dtype as *wav*.
    """
    if rate <= 0:
        rate = 1.0
    n = len(wav)
    # An empty signal has nothing to interpolate (np.interp rejects empty
    # sample points), and a unit rate is a no-op.
    if n == 0 or abs(rate - 1.0) < 1e-6:
        return wav
    new_len = max(1, int(n / rate))
    # Both grids live on [0, 1) so the new samples are spread across the
    # original signal's normalised time axis.
    x_old = np.linspace(0.0, 1.0, n, endpoint=False)
    x_new = np.linspace(0.0, 1.0, new_len, endpoint=False)
    return np.interp(x_new, x_old, wav).astype(wav.dtype)
def apply_prosody(wav: np.ndarray, sr: int, rate: float, pitch_semitones: float):
    """Approximate rate and pitch control using only resampling.

    Pitch is shifted by declaring a different output sample rate: the rate is
    multiplied by ``2 ** (pitch_semitones / 12)``. Playing at that rate also
    speeds playback up by the same factor, so the samples are first
    time-scaled by ``rate / factor`` (clamped to [0.25, 4.0]) to keep the
    perceived speaking rate near the requested one.

    Args:
        wav: Input samples.
        sr: Sample rate of *wav*.
        rate: Requested speaking-rate multiplier.
        pitch_semitones: Requested pitch shift in semitones.

    Returns:
        A ``(samples, sample_rate)`` tuple: the time-scaled samples and the
        integer sample rate they should be written/played at.
    """
    pitch_factor = 2.0 ** (pitch_semitones / 12.0)
    # Guard the division against a vanishing factor.
    requested = rate / max(pitch_factor, 1e-6)
    capped = min(4.0, requested)
    pre_rate = max(0.25, capped)
    scaled = time_scale(wav, pre_rate)
    return scaled, int(sr * pitch_factor)
def synthesize_once(text: str, speaker_id: int, style_id: int):
    """Run one TTS forward pass for *text* and return the audio.

    Args:
        text: Text to synthesise.
        speaker_id: Speaker index, passed to the model as ``speaker_id``.
        style_id: Style index, passed to the model as ``emotion_id``.

    Returns:
        A ``(wav, sr)`` tuple: the model's waveform as a NumPy array on the
        CPU (after ``squeeze()``), and ``model.config.sampling_rate``.
    """
    inputs = tokenizer(text=text, return_tensors="pt").to(device)
    # Pure inference: skip autograd bookkeeping so no graph is built and kept
    # alive for the duration of the forward pass.
    with torch.no_grad():
        outputs = model(inputs['input_ids'], speaker_id=int(speaker_id), emotion_id=int(style_id))
    wav = outputs.waveform.squeeze().detach().cpu().numpy()
    sr = model.config.sampling_rate
    return wav, sr
def save_audio_pair(wav: np.ndarray, sr: int, base_name: str, make_mp3: bool):
    """Write *wav* to ``base_name + ".wav"`` and optionally to an MP3 as well.

    MP3 export is best-effort: it is attempted only when *make_mp3* is true
    and pydub was importable. A failure is logged and does not raise, so the
    caller still receives the WAV file.

    Args:
        wav: Audio samples to write.
        sr: Sample rate to write them at.
        base_name: Output path without extension.
        make_mp3: Whether to also try producing ``base_name + ".mp3"``.

    Returns:
        A list of the paths actually produced: always the WAV path first,
        followed by the MP3 path when the export succeeded.
    """
    import logging

    # Save WAV
    wav_path = base_name + ".wav"
    sf.write(wav_path, wav, sr)
    out_files = [wav_path]

    # Optionally save MP3 via pydub/ffmpeg
    if make_mp3 and _HAS_PYDUB:
        mp3_path = base_name + ".mp3"
        try:
            AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")
        except Exception:
            # Deliberately broad: MP3 is optional, but leave a trace instead
            # of failing silently.
            logging.getLogger(__name__).warning(
                "MP3 export failed for %s; keeping WAV only", wav_path, exc_info=True
            )
            # The export may have created the file before failing; do not
            # leave a partial MP3 behind.
            if os.path.exists(mp3_path):
                try:
                    os.remove(mp3_path)
                except OSError:
                    pass
        else:
            out_files.append(mp3_path)
    return out_files
def parse_style(choice: str) -> int:
    """Extract the numeric style id from an ``"<id>:<label>"`` choice string.

    Only the text before the first colon is parsed. Anything that cannot be
    turned into an integer (including a non-string *choice*) yields 0.
    """
    try:
        id_text, _sep, _label = choice.partition(":")
        style_id = int(id_text)
    except Exception:
        # Unparseable choice: fall back to style 0.
        style_id = 0
    return style_id
# Gradio UI: input text, speaker/style selection, prosody controls, and the
# callback that produces one audio file per (paragraph, style) pair.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # മലയാളം Text → AI Voice (Free)
        Open‑source Malayalam TTS powered by **AI4Bharat VITS**.
        Now supports **multiple voice styles**, **prosody (rate & pitch)**, **batch paragraphs**, and **WAV + MP3** output.
        """
    )
    with gr.Row():
        txt = gr.Textbox(label="Malayalam Text (single or multiple paragraphs)", value=DEFAULT_TEXT, lines=8, placeholder="ഒരു അല്ലെങ്കിൽ നിരവധി പാരഗ്രാഫുകൾ ഇവിടെ പേസ്റ്റ് ചെയ്യുക… രണ്ട് newline ഉപയോഗിച്ച് വേർതിരിക്കുക.")
    with gr.Row():
        speaker = gr.Slider(0, 19, value=DEFAULT_SPEAKER, step=1, label="Speaker ID (MAL_F = 11)")
        styles = gr.CheckboxGroup(
            choices=[f"{k}:{v}" for k, v in STYLE_LABELS.items()],
            value=["0:ALEXA", "10:NEWS", "3:BOOK"],
            label="Voice styles (select one or more)"
        )
    with gr.Row():
        rate = gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speaking rate (0.5–1.5)")
        pitch = gr.Slider(minimum=-4, maximum=+4, value=0, step=1, label="Pitch (semitones, -4 to +4)")
        batch = gr.Checkbox(value=True, label="Batch: split by blank lines (paragraphs)")
        make_mp3 = gr.Checkbox(value=True, label="Also export MP3 (needs ffmpeg)")
    with gr.Row():
        btn = gr.Button("Generate", variant="primary")
    audio = gr.Audio(label="Preview (first file)", type="filepath")
    files_out = gr.Files(label="All generated files")
    note = gr.Markdown()

    def run(text, speaker_id, style_choices, rate, pitch, batch, make_mp3):
        """Synthesise every (paragraph, style) combination and collect the files.

        Args:
            text: Raw text from the textbox.
            speaker_id: Value of the speaker slider.
            style_choices: Selected ``"<id>:<label>"`` strings; when none is
                selected, ``"0:ALEXA"`` is used.
            rate: Speaking-rate multiplier from the slider.
            pitch: Pitch shift in semitones from the slider.
            batch: When true, split *text* into paragraphs on blank lines.
            make_mp3: When true, also try to export an MP3 for each WAV.

        Returns:
            ``(preview, all_files, summary)``: the first generated file path,
            the list of every generated file path, and a Markdown summary.

        Raises:
            gr.Error: If *text* is empty or more than 30 outputs are requested.
        """
        text = (text or "").strip()
        if not text:
            raise gr.Error("ദയവായി മലയാളത്തിൽ ഒരു വാചകം/പാരഗ്രാഫ് നൽകുക.")
        paras = split_paragraphs(text) if batch else [text]
        if not style_choices:
            style_choices = ["0:ALEXA"]
        total = len(paras) * len(style_choices)
        if total > 30:
            raise gr.Error(f"താങ്കൾ വളരെ കൂടുതൽ ഔട്ട്പുട്ടുകൾ ആവശ്യപ്പെടുന്നു ({total}). ദയവായി കുറച്ച് പാരഗ്രാഫുകൾ/സ്റ്റൈലുകൾ തിരഞ്ഞെടുക്കുക (<= 30 files).")

        # These do not depend on the paragraph, so resolve them once.
        speaker_idx = int(speaker_id)
        parsed_styles = [(sc, parse_style(sc)) for sc in style_choices]

        # One fresh directory per request keeps the outputs together without
        # leaving open temp-file handles or empty placeholder files around.
        out_dir = tempfile.mkdtemp(prefix="mal_tts_")

        all_files = []
        preview = None
        details = []
        for pi, para in enumerate(paras, start=1):
            for sc, stid in parsed_styles:
                # Exactly one synthesis per (paragraph, style) pair.
                wav, sr = synthesize_once(para, speaker_idx, stid)
                # Apply prosody approximation
                wav2, sr2 = apply_prosody(wav, sr, float(rate), float(pitch))
                # The running file number keeps names unique even if two
                # choices resolve to the same style id.
                file_no = len(details) + 1
                stem = f"{file_no:02d}_p{pi:02d}_style-{stid}_{STYLE_LABELS.get(stid, 'STYLE')}"
                outs = save_audio_pair(wav2, sr2, os.path.join(out_dir, stem), bool(make_mp3))
                all_files.extend(outs)
                if preview is None:
                    preview = outs[0]
                details.append(f"• P{pi} – {STYLE_LABELS.get(stid, sc)} → {os.path.basename(outs[0])}{' (+MP3)' if len(outs)>1 else ''}")

        summary = (
            f"Generated **{len(all_files)}** files for {len(paras)} paragraph(s) × {len(style_choices)} style(s).\n\n"
            + "\n".join(details)
            + "\n\n**Note:** Rate & pitch are approximations using resampling; for studio-grade SSML prosody use a managed TTS like Azure."
        )
        return preview, all_files, summary

    btn.click(run, inputs=[txt, speaker, styles, rate, pitch, batch, make_mp3], outputs=[audio, files_out, note])
    gr.Markdown(
        """
        **Prosody controls**
        *Speaking rate* slows/speeds audio; *Pitch* raises/lowers tone (in semitones). These are **approximate** controls based on resampling. For high‑fidelity prosody, consider SSML in Azure TTS.
        **Batch mode**
        Split input into paragraphs using a blank line. The app creates one file per **paragraph × style**.
        **MP3 output**
        Requires `ffmpeg` (available on Hugging Face Spaces). If unavailable, only WAV will be produced.
        """
    )

if __name__ == "__main__":
    demo.launch()