Spaces:

iahad
/

shaml

Sleeping

Cursor Agent AHAD committed on May 21

Commit

2400aac

unverified ·

1 Parent(s): 53a9529

Use text branch for text-only inference on text+ASR models

Pass inputs_asr=None instead of mirroring input as pseudo-ASR, which produced
undiacritized output. Implement direct encode/forward/decode to avoid Diac
predict_text empty-list .to() bug.

Co-authored-by: AHAD <ahad-m@users.noreply.github.com>

Files changed (2) hide show

README.md +1 -1
backend/app/services/inference.py +133 -21

README.md CHANGED Viewed

@@ -10,7 +10,7 @@
 **النموذج الافتراضي:** `rufaelfekadu/diac-transformer-text-asr-tashkeela-clartts`
-> **نموذج text+ASR:** عند إرسال نص فقط (بدون مخرجات Whisper)، يستخدم النظام النص نفسه كمدخل ASR بديلًا حتى يعمل الاستنتاج.
 ## المتطلبات

 **النموذج الافتراضي:** `rufaelfekadu/diac-transformer-text-asr-tashkeela-clartts`
+> **نموذج text+ASR:** عند إرسال نص فقط (بدون مخرجات Whisper)، يُستخدم فرع النص في النموذج (`inputs_asr=None`). لنتائج أفضل على الكلام، مرّر نص ASR من Whisper في حقل ASR (قريبًا) أو استخدم نموذج `text-only`.
 ## المتطلبات

backend/app/services/inference.py CHANGED Viewed

@@ -4,6 +4,8 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
     from diac.models import DiacritizationModule
@@ -16,42 +18,152 @@ def _max_length(model: DiacritizationModule) -> int:
     return int(getattr(model.config.INFERENCE, "MAX_LENGTH", 500))
-def _asr_inputs_for_texts(model: DiacritizationModule, texts: list[str]) -> list[str]:
-    """
-    text+ASR checkpoints require ASR tensors during predict_text().
-    When callers provide text only, mirror the undiacritized input as pseudo-ASR
-    so encode_batch returns a tensor instead of an empty list (which breaks .to()).
-    """
-    if not _uses_asr(model):
-        return []
-    return list(texts)
-def predict_diacritized(model: DiacritizationModule, texts: list[str]) -> list[str]:
-    """Run diacritization for one or more strings."""
     cleaned = [t.strip() for t in texts]
     if not cleaned:
         return []
     max_len = _max_length(model)
-    asr_texts = _asr_inputs_for_texts(model, cleaned)
     if all(len(text) <= max_len for text in cleaned):
-        outputs = model.predict_text(cleaned, asr_text=asr_texts)
-        return _normalize_outputs(outputs, len(cleaned))
     results: list[str] = []
-    for text, asr in zip(cleaned, asr_texts or [""] * len(cleaned), strict=True):
-        asr_arg = asr if _uses_asr(model) else []
-        window_out = model.predict_sliding_window(text, asr_text=asr_arg)
-        if isinstance(window_out, list):
-            results.append(window_out[0] if window_out else "")
-        else:
-            results.append(str(window_out))
     return results
 def _normalize_outputs(outputs: Any, expected: int) -> list[str]:
     if isinstance(outputs, list):
         if len(outputs) != expected:

 from typing import TYPE_CHECKING, Any
+import torch
 if TYPE_CHECKING:
     from diac.models import DiacritizationModule
     return int(getattr(model.config.INFERENCE, "MAX_LENGTH", 500))
+def _window_size(model: DiacritizationModule) -> int:
+    return int(getattr(model.config.INFERENCE, "WINDOW_SIZE", 50))
+def _buffer_size(model: DiacritizationModule) -> int:
+    return int(getattr(model.config.INFERENCE, "BUFFER_SIZE", 25))
+def predict_diacritized(
+    model: DiacritizationModule,
+    texts: list[str],
+    asr_texts: list[str] | None = None,
+) -> list[str]:
+    """
+    Run diacritization for one or more strings.
+    When no ASR text is supplied, the model's text branch is used (inputs_asr=None)
+    even for text+ASR checkpoints. This matches TransformerModel.forward and avoids
+    Diac predict_text bugs with empty ASR tensors.
+    """
     cleaned = [t.strip() for t in texts]
     if not cleaned:
         return []
+    asr = _normalize_asr_inputs(cleaned, asr_texts)
     max_len = _max_length(model)
     if all(len(text) <= max_len for text in cleaned):
+        return _predict_batch(model, cleaned, asr)
     results: list[str] = []
+    for text, asr_line in zip(cleaned, asr or [None] * len(cleaned), strict=True):
+        results.append(_predict_sliding_window(model, text, asr_line))
     return results
+def _normalize_asr_inputs(
+    texts: list[str], asr_texts: list[str] | None
+) -> list[str] | None:
+    if not asr_texts:
+        return None
+    if len(asr_texts) != len(texts):
+        raise ValueError("asr_texts length must match texts length")
+    cleaned_asr = [t.strip() for t in asr_texts]
+    if not all(cleaned_asr):
+        return None
+    return cleaned_asr
+def _predict_batch(
+    model: DiacritizationModule,
+    texts: list[str],
+    asr_texts: list[str] | None,
+) -> list[str]:
+    model.model.eval()
+    use_asr = _uses_asr(model) and asr_texts is not None
+    encoded_text, encoded_asr, _ = model.tokenizer.encode_batch(
+        texts,
+        asr_texts if use_asr else [],
+        padding=True,
+    )
+    encoded_text = encoded_text.to(model.device)
+    encoded_asr = _prepare_asr_tensor(model, encoded_asr, use_asr)
+    with torch.no_grad():
+        outputs = model.model(encoded_text, inputs_asr=encoded_asr)
+        predictions = outputs.argmax(dim=-1).cpu().tolist()
+    decoded = model.tokenizer.decode_batch(predictions, texts)
+    return _normalize_outputs(decoded, len(texts))
+def _predict_sliding_window(
+    model: DiacritizationModule,
+    text: str,
+    asr_text: str | None,
+) -> str:
+    from diac.utils.text import remove_diacritics
+    model.model.eval()
+    text = remove_diacritics(text).strip()
+    if not text:
+        return ""
+    asr_text = asr_text or ""
+    max_len = _max_length(model)
+    if len(text) <= max_len:
+        batch = _predict_batch(model, [text], [asr_text] if asr_text else None)
+        return batch[0]
+    window_size = _window_size(model)
+    buffer_size = _buffer_size(model)
+    ratio = len(asr_text) / len(text) if asr_text else 1.0
+    start_idx = 0
+    end_idx = window_size
+    output = ""
+    while end_idx <= len(text):
+        start = max(0, start_idx - buffer_size)
+        end = min(len(text), end_idx + buffer_size)
+        end_idx = min(len(text), start_idx + window_size)
+        chunk = text[start:end]
+        chunk_asr = asr_text[int(start * ratio) : int(end * ratio)] if asr_text else ""
+        encoded_chunk, encoded_asr_chunk, _ = model.tokenizer.encode(
+            chunk,
+            chunk_asr or None,
+            return_tensor=True,
+        )
+        encoded_chunk = encoded_chunk.to(model.device)
+        encoded_asr_chunk = _prepare_asr_tensor(
+            model,
+            encoded_asr_chunk,
+            _uses_asr(model) and bool(chunk_asr),
+        )
+        with torch.no_grad():
+            outputs = model.model(encoded_chunk, inputs_asr=encoded_asr_chunk).squeeze(0)
+            predictions = outputs.argmax(dim=-1).cpu().tolist()
+        decoded_chunk = model.tokenizer.decode(
+            predictions[start_idx - start : end_idx - start],
+            chunk[start_idx - start : end_idx - start],
+        )
+        output += decoded_chunk
+        start_idx = end_idx
+    return output
+def _prepare_asr_tensor(
+    model: DiacritizationModule,
+    encoded_asr: Any,
+    use_asr: bool,
+) -> torch.Tensor | None:
+    if not use_asr:
+        return None
+    if isinstance(encoded_asr, torch.Tensor):
+        return encoded_asr.to(model.device)
+    return None
 def _normalize_outputs(outputs: Any, expected: int) -> list[str]:
     if isinstance(outputs, list):
         if len(outputs) != expected: