File size: 2,785 Bytes
6bdfadc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
from typing import List, Dict

from moviepy.editor import VideoFileClip
from openai import OpenAI


class AudioExtractor:
    """Extract audio tracks from video files and transcribe them with the
    OpenAI Whisper API.

    Transcription is best-effort: any failure (missing key, network error,
    unreadable file) is reported via ``print`` and yields an empty list rather
    than raising, so callers in a pipeline are never interrupted.
    """

    def __init__(self, openai_api_key: str = None, **kwargs):
        """Create an extractor.

        Args:
            openai_api_key: OpenAI API key. If omitted/empty, ``transcribe``
                is disabled and returns ``[]``.
            **kwargs: Accepted for interface compatibility; currently unused.
        """
        self.openai_api_key = openai_api_key
        self.client = None
        if openai_api_key:
            self.client = OpenAI(api_key=openai_api_key)

    def extract_audio(self, video_path: str, output_path: str = None) -> str:
        """
        Extract the audio track from a video as an MP3 file.

        Args:
            video_path: Path to the source video.
            output_path: Destination for the MP3. Defaults to the video path
                with its extension replaced by ``.mp3`` (MP3 keeps uploads
                small, which suits the Whisper API's file-size limit).

        Returns:
            Path to the extracted MP3 file.

        Raises:
            ValueError: If the video contains no audio track.
        """
        if output_path is None:
            output_path = video_path.rsplit('.', 1)[0] + '.mp3'

        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                # Silent videos previously produced an opaque AttributeError
                # from `None.write_audiofile`; fail with a clear message.
                raise ValueError(f"No audio track found in video: {video_path}")
            video.audio.write_audiofile(output_path, codec='mp3', verbose=False, logger=None)
        finally:
            # Always release the underlying ffmpeg reader processes, even if
            # the write fails — otherwise the subprocess/file handles leak.
            video.close()

        return output_path

    @staticmethod
    def _segment_field(segment, field, default):
        """Read `field` from a Whisper segment that may arrive as a dict or
        as an attribute-style object, falling back to `default`."""
        if isinstance(segment, dict):
            return segment.get(field, default)
        return getattr(segment, field, default)

    def transcribe(self, audio_path: str) -> List[Dict]:
        """
        Transcribe audio with timestamps using the OpenAI Whisper API.

        Args:
            audio_path: Path to the audio file to transcribe.

        Returns:
            List of segments: [
                {"start": 0.0, "end": 3.2, "text": "Tired of everyday exhaustion?"},
                {"start": 3.2, "end": 7.1, "text": "Meet the new SuperVit..."},
                ...
            ]
            Empty list if no API key is configured or transcription fails.
        """
        if not self.client:
            print("OpenAI API key not configured")
            return []

        try:
            with open(audio_path, "rb") as audio_file:
                # verbose_json + segment granularity yields per-segment
                # start/end timestamps rather than plain text.
                response = self.client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="verbose_json",
                    timestamp_granularities=["segment"]
                )

            segments = []

            # The SDK may surface segments as dicts or typed objects depending
            # on version; _segment_field handles both shapes.
            if hasattr(response, 'segments') and response.segments:
                for segment in response.segments:
                    segments.append({
                        "start": self._segment_field(segment, 'start', 0),
                        "end": self._segment_field(segment, 'end', 0),
                        "text": self._segment_field(segment, 'text', '').strip(),
                    })
            elif hasattr(response, 'text') and response.text:
                # Fallback: no segment data, so emit the whole transcript as
                # a single zero-timestamped entry.
                segments.append({
                    "start": 0.0,
                    "end": 0.0,
                    "text": response.text.strip()
                })

            return segments

        except Exception as e:
            # Best-effort by design: degrade to an empty result instead of
            # propagating (network errors, bad files, auth failures, ...).
            print(f"Transcription error: {e}")
            return []