"""Speech extraction helpers: pull the audio track out of a video with ffmpeg
and run it through the Replicate ``cjwbw/seamless_communication`` model."""

import logging
import os
import subprocess
import uuid
from typing import Dict, Any

import boto3
import replicate
from dotenv import load_dotenv

# from helpers_function.helpers import upload_to_r2

load_dotenv()

logger = logging.getLogger(__name__)

replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))

# Pinned model version passed to ``replicate_client.run``.
_SEAMLESS_MODEL = (
    "cjwbw/seamless_communication:"
    "668a4fec05a887143e5fe8d45df25ec4c794dd43169b9a11562309b2d45873b0"
)


def _remove_file_quietly(path: str) -> None:
    """Best-effort delete of *path*; never raises.

    A missing file is ignored; any other OS-level failure is logged so that
    leftover temporary files do not go unnoticed.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError:
        logger.warning("Could not remove temporary file %s", path, exc_info=True)


def video_to_audio(video_path: str) -> str:
    """Extract audio from video using ffmpeg and return audio filename.

    The audio is written as ``<uuid4>.mp3`` relative to the current working
    directory; the caller is responsible for deleting it.

    Args:
        video_path: Path of the input video handed to ffmpeg via ``-i``.

    Returns:
        The name of the generated MP3 file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    audio_filename = f"{uuid.uuid4()}.mp3"
    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "error",
        "-i", video_path,
        "-vn",  # drop the video stream
        "-acodec", "libmp3lame",
        "-y",  # overwrite the output without prompting
        audio_filename,
    ]
    try:
        subprocess.run(command, check=True)
    except Exception:
        # ffmpeg may have created a partial output before failing; do not
        # leave it behind in the working directory.
        _remove_file_quietly(audio_filename)
        raise
    return audio_filename


def extract_text_from_video(video_path: str, max_duration: int = 60) -> Dict[str, Any]:
    """Run the speech in a video through the Replicate seamless model.

    Steps:
    1. Extract audio from video (``video_to_audio``).
    2. Send the audio file to the Replicate model.
    3. Delete the temporary audio file, whether or not the model call
       succeeded.

    Args:
        video_path: Path of the input video.
        max_duration: Forwarded to the model as ``max_input_audio_length``
            (presumably seconds -- confirm against the model schema).

    Returns:
        Whatever the model returns when that value is truthy, otherwise
        ``{"error": "No output"}``.

    Raises:
        subprocess.CalledProcessError: If audio extraction fails.
        Exception: Errors raised by the Replicate client propagate unchanged.
    """
    audio_file = video_to_audio(video_path)
    # NOTE: the R2 upload step is currently disabled; the local file handle
    # is passed to Replicate directly instead.
    # audio_url = upload_to_r2(audio_file, f"audio/{os.path.basename(audio_file)}")
    try:
        with open(audio_file, "rb") as f:
            # NOTE(review): the task is speech-to-speech although this
            # function's name suggests text extraction -- confirm whether a
            # speech-to-text task was intended.
            result = replicate_client.run(
                _SEAMLESS_MODEL,
                input={
                    "task_name": "S2ST (Speech to Speech translation)",
                    "input_audio": f,
                    "input_text_language": "None",
                    "max_input_audio_length": max_duration,
                    "target_language_text_only": "English",
                    "target_language_with_speech": "English",
                },
            )
    finally:
        # The extracted audio is only a temporary artifact.
        _remove_file_quietly(audio_file)
    return result if result else {"error": "No output"}