StoryLens / audio_extractor.py
Marek4321's picture
Upload 13 files
6bdfadc verified
import os
from typing import List, Dict
from moviepy.editor import VideoFileClip
from openai import OpenAI
class AudioExtractor:
    """Extract the audio track of a video and transcribe it with Whisper.

    Audio extraction uses moviepy's ``VideoFileClip``; transcription uses the
    OpenAI ``whisper-1`` model and is only available when an API key was
    supplied at construction time.
    """

    def __init__(self, openai_api_key: str = None, **kwargs):
        """
        Create an extractor.

        Args:
            openai_api_key: OpenAI API key. When omitted (or empty), no client
                is created and ``transcribe`` returns an empty list.
            **kwargs: Accepted and ignored, so callers may pass extra
                configuration without breaking construction.
        """
        self.openai_api_key = openai_api_key
        self.client = None
        if openai_api_key:
            self.client = OpenAI(api_key=openai_api_key)

    @staticmethod
    def _default_output_path(video_path: str) -> str:
        """Return ``video_path`` with its file extension replaced by ``.mp3``."""
        # os.path.splitext only considers a dot in the final path component,
        # so "/tmp/my.dir/video" maps to "/tmp/my.dir/video.mp3" rather than
        # being cut at the dot inside the directory name.
        return os.path.splitext(video_path)[0] + '.mp3'

    @staticmethod
    def _field(obj, name: str, default=None):
        """Read ``name`` from ``obj`` whether it is a dict or an attribute object."""
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    def extract_audio(self, video_path: str, output_path: str = None) -> str:
        """
        Extract audio track from video.

        Args:
            video_path: Path to the source video file.
            output_path: Destination for the MP3 file. Defaults to the video
                path with its extension replaced by ``.mp3``.

        Returns:
            Path to extracted MP3 file (better for Whisper API)

        Raises:
            ValueError: If the video has no audio track.
        """
        if output_path is None:
            output_path = self._default_output_path(video_path)

        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                raise ValueError(f"No audio track found in video: {video_path}")
            video.audio.write_audiofile(output_path, codec='mp3', verbose=False, logger=None)
        finally:
            # Always release the clip, even when encoding fails.
            video.close()
        return output_path

    def transcribe(self, audio_path: str) -> List[Dict]:
        """
        Transcribe audio with timestamps using OpenAI Whisper API.

        This is best-effort: when no client is configured, or when any error
        occurs while reading the file, calling the API or parsing the
        response, a message is printed and an empty list is returned.

        Args:
            audio_path: Path to the audio file to upload.

        Returns:
            List of segments: [
                {"start": 0.0, "end": 3.2, "text": "Tired of everyday exhaustion?"},
                {"start": 3.2, "end": 7.1, "text": "Meet the new SuperVit..."},
                ...
            ]
            If the response has text but no segments, a single segment with
            ``start`` and ``end`` both 0.0 holding the full text is returned.
        """
        if not self.client:
            print("OpenAI API key not configured")
            return []

        try:
            with open(audio_path, "rb") as audio_file:
                # Use whisper-1 model with verbose_json for timestamps
                response = self.client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="verbose_json",
                    timestamp_granularities=["segment"],
                )

            raw_segments = self._field(response, 'segments')
            if raw_segments:
                # Segments may be plain dicts or attribute objects; a missing
                # or None text becomes "" so that one odd segment does not
                # discard the whole transcript.
                return [
                    {
                        "start": self._field(segment, 'start', 0),
                        "end": self._field(segment, 'end', 0),
                        "text": (self._field(segment, 'text', '') or '').strip(),
                    }
                    for segment in raw_segments
                ]

            full_text = self._field(response, 'text')
            if full_text:
                # Fallback if no segments
                return [{"start": 0.0, "end": 0.0, "text": full_text.strip()}]

            return []
        except Exception as e:
            print(f"Transcription error: {e}")
            return []