Spaces:
Running
Running
| import os, uuid, subprocess, boto3, replicate | |
| from typing import Dict, Any | |
| from dotenv import load_dotenv | |
| # from helpers_function.helpers import upload_to_r2 | |
| load_dotenv() | |
| replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY")) | |
def video_to_audio(video_path: str) -> str:
    """Extract the audio track of a video to an MP3 file using ffmpeg.

    The output is written to the current working directory under a freshly
    generated UUID-based name, so repeated or concurrent calls do not collide.

    Args:
        video_path: Path of the input video handed to ffmpeg via ``-i``.

    Returns:
        The filename (relative to the working directory) of the MP3 created.
        The caller is responsible for deleting it when done.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: the ffmpeg executable could not be started.
    """
    audio_filename = f"{uuid.uuid4()}.mp3"
    command = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-i", video_path,
        "-vn",                    # drop the video stream
        "-acodec", "libmp3lame",  # encode audio as MP3
        "-y",                     # overwrite the output without prompting
        audio_filename,
    ]
    try:
        subprocess.run(command, check=True)
    except BaseException:
        # ffmpeg may already have created a partial output file. The caller
        # never learns its name on failure, so remove it here before
        # propagating the original error unchanged.
        try:
            os.remove(audio_filename)
        except OSError:
            pass  # nothing was written, or it is already gone
        raise
    return audio_filename
def extract_text_from_video(video_path: str, max_duration: int = 60) -> Dict[str, Any]:
    """
    Run the Replicate seamless_communication model on a video's audio track.

    Steps:
    1. Extract the audio from the video into an MP3 file (``video_to_audio``).
    2. Pass the opened audio file to the pinned Replicate model version.
    3. Delete the MP3 file, whether or not the model call succeeded.

    Args:
        video_path: Path of the video file to process.
        max_duration: Forwarded as the model's ``max_input_audio_length``
            input (presumably seconds -- confirm against the model schema).

    Returns:
        The value returned by the Replicate run, or ``{"error": "No output"}``
        when that value is empty/falsy.

    Raises:
        Any exception raised by the audio extraction or by the Replicate call
        propagates to the caller; the MP3 file is still removed first.
    """
    audio_file = video_to_audio(video_path)
    try:
        # The audio is handed to Replicate as an open binary file handle;
        # no intermediate upload to object storage is performed.
        with open(audio_file, "rb") as f:
            # Call Replicate model
            # NOTE(review): the task is S2ST (speech-to-speech) although this
            # function's stated goal is text -- confirm whether a text-only
            # task of the model is what is actually intended.
            result = replicate_client.run(
                "cjwbw/seamless_communication:668a4fec05a887143e5fe8d45df25ec4c794dd43169b9a11562309b2d45873b0",
                input={
                    "task_name": "S2ST (Speech to Speech translation)",
                    "input_audio": f,
                    "input_text_language": "None",
                    "max_input_audio_length": max_duration,
                    "target_language_text_only": "English",
                    "target_language_with_speech": "English",
                },
            )
    finally:
        # Best-effort cleanup: a failure to delete the file must not mask
        # the model result or the original exception.
        try:
            os.remove(audio_file)
        except OSError:
            pass
    return result if result else {"error": "No output"}