# aligner.py
import os
import re
import tempfile
from typing import Dict, Any

# These imports are from your original script and are installed by your setup.sh
from aeneas.executetask import ExecuteTask
from aeneas.task import Task


def setup_aligner():
    """Announce that the aeneas aligner is ready.

    Aeneas does not require a model to be loaded, so this function performs
    no setup work. It exists to keep the structure of main.py consistent.
    """
    print("✅ Aeneas aligner is ready (no setup required).")


def _remove_temp_file(path):
    """Delete *path* if it was created; a file that is already gone is fine."""
    if path is None:
        return
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass


def align_words(audio_bytes: bytes, transcript: str) -> Dict[str, Any]:
    """Align the words of *transcript* to the audio using aeneas.

    The audio bytes and the extracted words are written to temporary files
    because the aeneas task is configured through file paths. Both files are
    removed before returning, whether alignment succeeds or fails.

    Args:
        audio_bytes: Raw audio content; written to a temporary ".wav" file.
        transcript: Text to align. Only runs of ASCII letters and apostrophes
            are kept as words; everything else is discarded.

    Returns:
        A dict with two parallel lists: "word" (the non-empty fragment texts)
        and "startTime" (each fragment's ``begin`` value as a float). Both
        lists are empty when the transcript contains no words or when the
        task produced no sync map.
    """
    config = (
        "task_language=eng|"
        "is_text_type=plain|"
        "os_task_file_format=json|"
        "task_adjust_boundary_algorithm=percent|"
        "task_adjust_boundary_percent_value=30"
    )

    words_only = re.findall(r"\b[a-zA-Z']+\b", transcript)
    if not words_only:
        # Nothing to align; skip creating temp files and invoking aeneas.
        return {"word": [], "startTime": []}

    audio_path = None
    text_path = None
    try:
        # delete=False keeps the files on disk after they are closed so that
        # aeneas can open them by path. The paths are recorded before any
        # write so the finally block can remove them even if a write fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as af:
            audio_path = af.name
            af.write(audio_bytes)

        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as tf:
            text_path = tf.name
            # One word per line, so each word is its own line of plain text.
            tf.write("\n".join(words_only))

        # Both files are closed (and therefore flushed) at this point.
        task = Task(config_string=config)
        task.audio_file_path_absolute = audio_path
        task.text_file_path_absolute = text_path
        ExecuteTask(task).execute()

        # Extract the aligned words and start times, skipping blank fragments.
        words = []
        start_times = []
        if task.sync_map is not None:
            for fragment in task.sync_map.fragments:
                word = fragment.text.strip()
                if word:
                    words.append(word)
                    start_times.append(float(fragment.begin))

        return {"word": words, "startTime": start_times}
    finally:
        # Nested try/finally so a failure removing the audio file cannot
        # prevent the text file from being removed as well.
        try:
            _remove_temp_file(audio_path)
        finally:
            _remove_temp_file(text_path)