Update custom model files, README, and requirements
Browse files — alignment.py: +12 additions, −3 deletions
alignment.py
CHANGED
|
@@ -197,7 +197,8 @@ class ForcedAligner:
|
|
| 197 |
import torchaudio
|
| 198 |
|
| 199 |
device = _get_device()
|
| 200 |
-
model,
|
|
|
|
| 201 |
|
| 202 |
# Convert audio to tensor (copy to ensure array is writable)
|
| 203 |
if isinstance(audio, np.ndarray):
|
|
@@ -259,7 +260,11 @@ class ForcedAligner:
|
|
| 259 |
|
| 260 |
for token_id, start_frame, end_frame in alignment_path:
|
| 261 |
if token_id == separator_id: # Word separator
|
| 262 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 264 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 265 |
word_timestamps.append(
|
|
@@ -278,7 +283,11 @@ class ForcedAligner:
|
|
| 278 |
current_word_end = end_frame
|
| 279 |
|
| 280 |
# Don't forget the last word
|
| 281 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 283 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 284 |
word_timestamps.append(
|
|
|
|
| 197 |
import torchaudio
|
| 198 |
|
| 199 |
device = _get_device()
|
| 200 |
+
model, _labels, dictionary = cls.get_instance(device)
|
| 201 |
+
assert cls._bundle is not None and dictionary is not None # Initialized by get_instance
|
| 202 |
|
| 203 |
# Convert audio to tensor (copy to ensure array is writable)
|
| 204 |
if isinstance(audio, np.ndarray):
|
|
|
|
| 260 |
|
| 261 |
for token_id, start_frame, end_frame in alignment_path:
|
| 262 |
if token_id == separator_id: # Word separator
|
| 263 |
+
if (
|
| 264 |
+
current_word_start is not None
|
| 265 |
+
and current_word_end is not None
|
| 266 |
+
and word_idx < len(words)
|
| 267 |
+
):
|
| 268 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 269 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 270 |
word_timestamps.append(
|
|
|
|
| 283 |
current_word_end = end_frame
|
| 284 |
|
| 285 |
# Don't forget the last word
|
| 286 |
+
if (
|
| 287 |
+
current_word_start is not None
|
| 288 |
+
and current_word_end is not None
|
| 289 |
+
and word_idx < len(words)
|
| 290 |
+
):
|
| 291 |
start_time = max(0.0, current_word_start * frame_duration - start_offset)
|
| 292 |
end_time = max(0.0, current_word_end * frame_duration - end_offset)
|
| 293 |
word_timestamps.append(
|