File size: 7,900 Bytes
2342bb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3616c3
 
2342bb7
 
 
 
9ffb7e0
d3616c3
2342bb7
 
 
 
9ffb7e0
d3616c3
2342bb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import re
from typing import Optional, Dict, Any, List
from youtube_transcript_api import YouTubeTranscriptApi
from smolagents import Tool


class YouTubeTranscriptTool(Tool):
    """
    A tool to fetch transcripts from YouTube videos.

    The tool resolves a video ID from a URL (or accepts a bare ID), picks a
    transcript language, downloads the captions through
    ``youtube_transcript_api`` and returns them as plain text, optionally
    cleaned and/or prefixed with ``[MM:SS]`` timestamps.

    Failures are reported as ``"Error: ..."`` strings rather than raised, so
    an agent calling the tool always receives a string.
    """

    name = "youtube_transcript"
    description = """
    Fetches the transcript/captions from a YouTube video.

    Input: YouTube URL or video ID
    Output: Clean transcript text with optional timestamps

    Supports:
    - Auto-generated and manual captions
    - Multiple languages
    - Timestamp formatting options
    - Text cleaning and formatting
    """

    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True,
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True,
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True,
        }
    }

    output_type = "string"

    def __init__(self):
        super().__init__()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _normalize_entries(self, transcript: Any) -> List[Dict[str, Any]]:
        """Return *transcript* as a list of ``{'text', 'start', ...}`` dicts.

        Older library releases hand back a list of dicts, newer ones return a
        fetched-transcript object whose items expose ``text``/``start``
        attributes. Normalizing once lets the formatters stay dict-based.
        """
        if not transcript:
            return []

        to_raw_data = getattr(transcript, "to_raw_data", None)
        if callable(to_raw_data):
            return list(to_raw_data())

        entries = []
        for entry in transcript:
            if isinstance(entry, dict):
                entries.append(entry)
            else:
                entries.append({
                    "text": getattr(entry, "text", ""),
                    "start": getattr(entry, "start", 0.0),
                    "duration": getattr(entry, "duration", 0.0),
                })
        return entries

    def _strip_annotations(self, text: str) -> str:
        """Remove ``[...]``/``(...)`` caption annotations and tidy whitespace."""
        # Collapse first so an annotation that spans a line break is still
        # matched by the (non-DOTALL) patterns below.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\[.*?\]', '', text)  # [Music], [Applause], ...
        text = re.sub(r'\(.*?\)', '', text)  # (inaudible), ...
        # Removing an annotation can leave double or edge spaces behind.
        return re.sub(r'\s+', ' ', text).strip()

    def _list_transcripts(self, video_id: str) -> Any:
        """Return the library's transcript listing for *video_id*."""
        legacy_list = getattr(YouTubeTranscriptApi, "list_transcripts", None)
        if legacy_list is not None:
            return legacy_list(video_id)
        # Releases without the static helper expose an instance-level API.
        return YouTubeTranscriptApi().list(video_id)

    def _fetch_entries(self, video_id: str, language: str) -> List[Dict[str, Any]]:
        """Download the transcript for *language* as normalized entries."""
        legacy_fetch = getattr(YouTubeTranscriptApi, "get_transcript", None)
        if legacy_fetch is not None:
            fetched = legacy_fetch(video_id, languages=[language])
        else:
            fetched = YouTubeTranscriptApi().fetch(video_id, languages=[language])
        return self._normalize_entries(fetched)

    def _fetch_english_fallback(self, video_id: str) -> Optional[tuple]:
        """Try an English transcript (auto-generated first, then manual).

        Returns:
            ``(entries, language_code)`` on success, ``None`` when neither
            variant could be fetched.
        """
        try:
            listing = self._list_transcripts(video_id)
        except Exception:
            return None

        finders = (
            listing.find_generated_transcript,
            listing.find_manually_created_transcript,
        )
        for find in finders:
            try:
                found = find(['en'])
                return self._normalize_entries(found.fetch()), found.language_code
            except Exception:
                # Best effort: move on to the next transcript variant.
                continue
        return None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the 11-character video ID from a URL or bare ID.

        Accepts a bare ID (letters, digits, ``-`` and ``_``) as well as
        ``watch?v=``, ``youtu.be/``, ``/embed/``, ``/shorts/``, ``/live/``
        and ``/v/`` style URLs. Surrounding whitespace is ignored.

        Args:
            url: YouTube URL or video ID.

        Returns:
            The video ID, or ``None`` when nothing ID-like is found.
        """
        if not isinstance(url, str):
            return None

        candidate = url.strip()
        if not candidate:
            return None

        # Bare ID: IDs may contain '-' and '_', which str.isalnum() rejects.
        if re.fullmatch(r'[A-Za-z0-9_-]{11}', candidate):
            return candidate

        pattern = (
            r'(?:youtube(?:-nocookie)?\.com/'
            r'(?:watch\?(?:[^#\s]*[&;])?v=|embed/|shorts/|live/|v/)'
            r'|youtu\.be/)'
            r'([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'
        )
        match = re.search(pattern, candidate)
        return match.group(1) if match else None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """Join transcript entries into one cleaned block of text.

        Caption annotations in square brackets or parentheses are removed,
        whitespace is collapsed, and spacing around punctuation is fixed.

        Args:
            transcript: Transcript entries, each providing a ``text`` field.

        Returns:
            The cleaned transcript as a single string.
        """
        cleaned_parts = (
            self._strip_annotations(str(entry.get('text') or ''))
            for entry in self._normalize_entries(transcript)
        )
        full_text = ' '.join(part for part in cleaned_parts if part)

        # No space before punctuation, one space after sentence punctuation.
        # NOTE: the second rule also splits dotted lowercase tokens such as
        # "e.g."; it is kept as-is to preserve the existing output.
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)

        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict], clean: bool = False) -> str:
        """Format transcript entries as ``[MM:SS] text`` lines.

        Whitespace inside an entry (including line breaks) is collapsed so
        every entry occupies exactly one output line. Entries with no text
        are skipped.

        Args:
            transcript: Transcript entries providing ``text`` and ``start``
                (seconds from the beginning of the video).
            clean: When True, also strip ``[...]``/``(...)`` annotations.

        Returns:
            One line per entry, joined with newlines.
        """
        lines = []

        for entry in self._normalize_entries(transcript):
            raw_text = str(entry.get('text') or '')
            if clean:
                text = self._strip_annotations(raw_text)
            else:
                text = re.sub(r'\s+', ' ', raw_text).strip()
            if not text:
                continue

            minutes, seconds = divmod(int(float(entry.get('start') or 0)), 60)
            lines.append(f"[{minutes:02d}:{seconds:02d}] {text}")

        return '\n'.join(lines)

    def get_available_languages(self, video_id: str) -> List[str]:
        """Get the language codes of the transcripts available for a video.

        Args:
            video_id: YouTube video ID.

        Returns:
            Language codes in the order the library lists them, or an empty
            list when the listing cannot be retrieved (best effort).
        """
        try:
            return [
                transcript.language_code
                for transcript in self._list_transcripts(video_id)
            ]
        except Exception:
            return []

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID
            language: Language code for transcript (default: auto-detect).
                ``None`` or an empty string is treated as ``"auto"``.
            include_timestamps: Whether to include timestamps
                (``None`` is treated as False)
            clean_text: Whether to clean and format the text
                (``None`` is treated as True); also applied to the
                timestamped output

        Returns:
            A metadata header followed by the formatted transcript, or an
            ``"Error..."`` message when the transcript cannot be produced.
        """
        try:
            # Extract video ID
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            # The optional inputs are declared nullable, so a caller may pass
            # None; fall back to the documented defaults in that case.
            language = str(language).strip() if language is not None else ""
            if not language or language.lower() == "auto":
                language = "auto"
            include_timestamps = bool(include_timestamps)
            clean_text = True if clean_text is None else bool(clean_text)

            # Get available languages if auto-detect is requested
            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."

                # Prefer English, then first available
                language = 'en' if 'en' in available_languages else available_languages[0]

            # Fetch transcript, falling back to an English one on failure
            try:
                entries = self._fetch_entries(video_id, language)
            except Exception as fetch_error:
                fallback = self._fetch_english_fallback(video_id)
                if fallback is None:
                    return f"Error: Could not fetch transcript. {str(fetch_error)}"
                # Report the language that was actually retrieved.
                entries, language = fallback

            if not entries:
                return "Error: No transcript content found."

            # Format output based on options
            if include_timestamps:
                result = self.format_with_timestamps(entries, clean=clean_text)
            elif clean_text:
                result = self.clean_transcript_text(entries)
            else:
                result = ' '.join(str(entry.get('text') or '') for entry in entries)

            # Add metadata
            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"

            return metadata + result

        except Exception as e:
            # Tool boundary: always answer with a string instead of raising.
            return f"Error fetching transcript: {str(e)}"


# Manual smoke test, executed only when this module is run as a script
if __name__ == "__main__":
    divider = "=" * 50
    test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    transcript_tool = YouTubeTranscriptTool()

    print("Testing YouTube Transcript Tool...")
    print(divider)

    # Default options; show at most the first 500 characters
    result = transcript_tool.forward(test_url)
    print("Basic transcript:")
    if len(result) > 500:
        print(result[:500] + "...")
    else:
        print(result)
    print("\n" + divider + "\n")

    # Same video again, this time with timestamps requested
    result_with_timestamps = transcript_tool.forward(test_url, include_timestamps=True)
    print("With timestamps:")
    if len(result_with_timestamps) > 500:
        print(result_with_timestamps[:500] + "...")
    else:
        print(result_with_timestamps)

# Installation requirements:
# pip install youtube-transcript-api smolagents