time optimization
app.py
CHANGED

```diff
@@ -87,9 +87,14 @@ def load_translation_models():
     # Load the LoRA adapter
     print("[*] Loading LoRA adapter...")
     model = PeftModel.from_pretrained(base_model, TRANSLATION_ADAPTER)
+
+    # Merge LoRA weights into base model for faster inference
+    # This eliminates adapter overhead during generation
+    print("[*] Merging LoRA weights for faster inference...")
+    model = model.merge_and_unload()
     model.eval()
 
-    print(f"[+] Translation model loaded successfully on CPU.")
+    print(f"[+] Translation model loaded and merged successfully on CPU.")
     _trans_cache["tokenizer"] = tokenizer
     _trans_cache["model"] = model
     _trans_cache["loaded"] = True
```
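Why this helps: PEFT keeps a LoRA adapter as separate low-rank matrices and adds their output to the base layer's output on every forward pass; `merge_and_unload()` folds that update into the base weights once at load time, so generation afterwards runs on a plain `transformers` model with no adapter overhead. A minimal standalone sketch of the same pattern (the checkpoint and adapter IDs below are placeholders, and a seq2seq base model is assumed):

```python
# Sketch of load-then-merge; model/adapter IDs are hypothetical placeholders.
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

BASE_MODEL = "some-org/base-translation-model"   # placeholder
TRANSLATION_ADAPTER = "some-org/lora-adapter"    # placeholder

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)

# Attach the LoRA adapter, then fold the scaled low-rank update into the
# base weights; merge_and_unload() returns the underlying transformers model.
model = PeftModel.from_pretrained(base_model, TRANSLATION_ADAPTER)
model = model.merge_and_unload()
model.eval()
```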
```diff
@@ -137,13 +142,15 @@ def _translate_impl(text):
     start_time = time.time()
     print("[DEBUG] Starting generation...")
 
-    # Generation settings
+    # Generation settings optimized for CPU inference
+    # - Greedy decoding (do_sample=False) is faster than sampling
+    # - Same quality as temp=0.01 which was near-greedy anyway
     with torch.no_grad():
         generated = trans_model.generate(
             **inputs,
-            max_new_tokens=256,
-            do_sample=True,
-            temperature=0.01,
+            max_new_tokens=256,  # Keep full length for long texts
+            do_sample=False,     # Greedy decoding for speed
+            num_beams=1,         # No beam search overhead
         )
 
     elapsed = time.time() - start_time
```
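With `do_sample=False` and `num_beams=1`, `generate()` takes the plain greedy path: an argmax over the logits at each step, with no multinomial sampling and no beam-search bookkeeping. Since a temperature of 0.01 made the old sampling distribution almost a point mass, the output should be near-identical. A rough way to check the speedup, sketched with a hypothetical checkpoint in place of the app's cached model:

```python
# Hypothetical timing comparison of the old (near-greedy sampling) and new
# (pure greedy) settings; MODEL_ID is a placeholder, not the app's model.
import time

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_ID = "some-org/base-translation-model"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID).eval()
inputs = tokenizer("Example sentence to translate.", return_tensors="pt")

def timed_generate(**gen_kwargs):
    """Run generate() once and return elapsed wall-clock seconds."""
    start = time.time()
    with torch.no_grad():
        model.generate(**inputs, max_new_tokens=256, **gen_kwargs)
    return time.time() - start

t_old = timed_generate(do_sample=True, temperature=0.01)  # previous settings
t_new = timed_generate(do_sample=False, num_beams=1)      # this commit
print(f"sampling: {t_old:.2f}s, greedy: {t_new:.2f}s")
```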
```diff
@@ -182,13 +189,7 @@ def process(text, speaker_id):
     # Filter out any non-integer values (unknown characters not in vocabulary)
     # This happens when text contains characters not supported by the TTS model
     filtered_sequence = [s for s in sequence if isinstance(s, int)]
-
-    if not filtered_sequence:
-        raise ValueError("No valid characters found in input text for TTS model.")
-
-    if len(filtered_sequence) != len(sequence):
-        print(f"[WARN] Filtered out {len(sequence) - len(filtered_sequence)} unknown characters from TTS input")
-
+
     x = torch.tensor(intersperse(filtered_sequence, 0), dtype=torch.long, device=DEVICE)[None]
     x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
 
```
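Note that the error handling around the filter is removed: an input made entirely of unsupported characters now reaches the model as an empty sequence instead of raising early, and dropped characters are no longer logged. For reference, here is how the surviving filter-and-intersperse path behaves, assuming the app's `intersperse` helper has the usual VITS/Matcha-style definition (a blank token between and around every symbol); the toy `sequence` is made up:

```python
import torch

def intersperse(lst, item):
    """Assumed VITS-style helper: put `item` between (and around) all symbols."""
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result

# Toy text-to-sequence output: unknown characters show up as non-int entries.
sequence = [12, 45, None, 7, "?", 33]
filtered_sequence = [s for s in sequence if isinstance(s, int)]  # [12, 45, 7, 33]

x = torch.tensor(intersperse(filtered_sequence, 0), dtype=torch.long)[None]
x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long)
print(x)          # tensor([[ 0, 12,  0, 45,  0,  7,  0, 33,  0]])
print(x_lengths)  # tensor([9])
```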