Spaces:

HumeAI
/

tada

Running on Zero

sharath25 committed 29 days ago

Commit

024f3d7

1 Parent(s): 7a6b47b

fix alignment

Files changed (1) hide show

app.py CHANGED Viewed

@@ -31,6 +31,7 @@ except ImportError:
 from tada.modules.encoder import Encoder, EncoderOutput  # noqa: E402
 from tada.modules.tada import InferenceOptions, TadaForCausalLM  # noqa: E402
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -358,21 +359,17 @@ def generate_speech(
         audio_duration = wav.shape[-1] / 24_000
-        # Extract only user-text step_logs, reconstructing any prefilled (missing) entries
         all_logs = output.step_logs or []
         if _model is not None and text and output.input_text_ids is not None:
             input_ids = output.input_text_ids[0]
             seq_len = input_ids.shape[0]
             n_eos = _model.config.shift_acoustic
-            # Find text boundary: last <|end_header_id|> token marks end of assistant header
-            end_header_id = _model.tokenizer.convert_tokens_to_ids("<|end_header_id|>")
-            # Scan backwards for the last end_header token
-            text_start = 0
-            for i in range(seq_len - 1, -1, -1):
-                if input_ids[i].item() == end_header_id:
-                    text_start = i + 1
-                    break
             text_end = seq_len - n_eos
             # Build a step -> log lookup from existing step_logs
             log_by_step = {e["step"]: e for e in all_logs}

 from tada.modules.encoder import Encoder, EncoderOutput  # noqa: E402
 from tada.modules.tada import InferenceOptions, TadaForCausalLM  # noqa: E402
+from tada.utils.text import normalize_text as normalize_text_fn  # noqa: E402
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
         audio_duration = wav.shape[-1] / 24_000
+        # Extract only text-to-speak step_logs, reconstructing any prefilled (missing) entries
         all_logs = output.step_logs or []
         if _model is not None and text and output.input_text_ids is not None:
             input_ids = output.input_text_ids[0]
             seq_len = input_ids.shape[0]
             n_eos = _model.config.shift_acoustic
+            # Count text-to-speak tokens (same logic as generate())
+            normalized = normalize_text_fn(text) if normalize_text else text
+            n_text_tokens = len(_model.tokenizer.encode(normalized, add_special_tokens=False))
             text_end = seq_len - n_eos
+            text_start = text_end - n_text_tokens
             # Build a step -> log lookup from existing step_logs
             log_by_step = {e["step"]: e for e in all_logs}