Spaces:
Running on Zero
Running on Zero
lulavc committed on
Commit ·
ae3213a
1
Parent(s): 56ced0e
fix: wav shape, float dtype check, extract_audio cleanup, NaN duration, HF token for InferenceClient
Browse files
- app.py +4 -1
- dubbing.py +14 -4
app.py
CHANGED
|
@@ -129,7 +129,7 @@ def _coerce_frames(frames):
|
|
| 129 |
arr = frame.cpu().float().numpy()
|
| 130 |
if arr.ndim == 3 and arr.shape[0] in (1, 3, 4):
|
| 131 |
arr = arr.transpose(1, 2, 0)
|
| 132 |
-
if arr.max() <= 1.0:
|
| 133 |
arr = (arr * 255).clip(0, 255)
|
| 134 |
arr = arr.astype(np.uint8)
|
| 135 |
else:
|
|
@@ -188,6 +188,9 @@ def _run_tts(text: str, voice_ref: str | None, emotion: float, language: str = "
|
|
| 188 |
audio_prompt_path=voice_ref if voice_ref else None,
|
| 189 |
exaggeration=float(emotion),
|
| 190 |
)
|
|
|
|
|
|
|
|
|
|
| 191 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 192 |
out_path = f.name
|
| 193 |
torchaudio.save(out_path, wav, model.sr)
|
|
|
|
| 129 |
arr = frame.cpu().float().numpy()
|
| 130 |
if arr.ndim == 3 and arr.shape[0] in (1, 3, 4):
|
| 131 |
arr = arr.transpose(1, 2, 0)
|
| 132 |
+
if arr.dtype.kind == 'f' and arr.max() <= 1.0:
|
| 133 |
arr = (arr * 255).clip(0, 255)
|
| 134 |
arr = arr.astype(np.uint8)
|
| 135 |
else:
|
|
|
|
| 188 |
audio_prompt_path=voice_ref if voice_ref else None,
|
| 189 |
exaggeration=float(emotion),
|
| 190 |
)
|
| 191 |
+
# torchaudio.save requires 2-D tensor [channels, samples]
|
| 192 |
+
if wav.ndim == 1:
|
| 193 |
+
wav = wav.unsqueeze(0)
|
| 194 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 195 |
out_path = f.name
|
| 196 |
torchaudio.save(out_path, wav, model.sr)
|
dubbing.py
CHANGED
|
@@ -69,7 +69,15 @@ def extract_audio(video_path: str) -> str:
|
|
| 69 |
"-ar", "16000", "-ac", "1",
|
| 70 |
out_path,
|
| 71 |
]
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
return out_path
|
| 74 |
|
| 75 |
|
|
@@ -118,8 +126,9 @@ def translate(text: str, source_lang: str, target_lang: str) -> str:
|
|
| 118 |
src_code = get_nllb_code(source_lang)
|
| 119 |
tgt_code = get_nllb_code(target_lang)
|
| 120 |
|
| 121 |
-
# Client instantiated once outside the retry loop
|
| 122 |
-
|
|
|
|
| 123 |
last_exc: Optional[Exception] = None
|
| 124 |
for attempt in range(3):
|
| 125 |
try:
|
|
@@ -190,6 +199,7 @@ def get_video_duration(video_path: str) -> float:
|
|
| 190 |
duration = float(raw)
|
| 191 |
except (ValueError, TypeError) as exc:
|
| 192 |
raise ValueError(f"ffprobe returned invalid duration: {raw!r}") from exc
|
| 193 |
-
|
|
|
|
| 194 |
raise ValueError(f"ffprobe returned unusable duration: {duration}")
|
| 195 |
return duration
|
|
|
|
| 69 |
"-ar", "16000", "-ac", "1",
|
| 70 |
out_path,
|
| 71 |
]
|
| 72 |
+
try:
|
| 73 |
+
subprocess.run(cmd, check=True, timeout=60)
|
| 74 |
+
except Exception:
|
| 75 |
+
if os.path.exists(out_path):
|
| 76 |
+
try:
|
| 77 |
+
os.unlink(out_path)
|
| 78 |
+
except OSError:
|
| 79 |
+
pass
|
| 80 |
+
raise
|
| 81 |
return out_path
|
| 82 |
|
| 83 |
|
|
|
|
| 126 |
src_code = get_nllb_code(source_lang)
|
| 127 |
tgt_code = get_nllb_code(target_lang)
|
| 128 |
|
| 129 |
+
# Client instantiated once outside the retry loop; use HF token if available
|
| 130 |
+
_hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
|
| 131 |
+
client = InferenceClient(token=_hf_token if _hf_token else None)
|
| 132 |
last_exc: Optional[Exception] = None
|
| 133 |
for attempt in range(3):
|
| 134 |
try:
|
|
|
|
| 199 |
duration = float(raw)
|
| 200 |
except (ValueError, TypeError) as exc:
|
| 201 |
raise ValueError(f"ffprobe returned invalid duration: {raw!r}") from exc
|
| 202 |
+
import math
|
| 203 |
+
if not math.isfinite(duration) or duration <= 0:
|
| 204 |
raise ValueError(f"ffprobe returned unusable duration: {duration}")
|
| 205 |
return duration
|