vlapparov committed on
Commit
2342bb7
·
verified ·
1 Parent(s): 989f30a

Upload youtube_utils.py

Browse files
Files changed (1) hide show
  1. youtube_utils.py +224 -0
youtube_utils.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Optional, Dict, Any, List
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ from smolagents import Tool
5
+
6
+
7
class YouTubeTranscriptTool(Tool):
    """
    A tool to fetch transcripts from YouTube videos.

    Given a YouTube URL or a bare 11-character video ID, the tool resolves
    the video ID, picks a transcript language, downloads the transcript via
    ``YouTubeTranscriptApi`` and returns it as a single string prefixed with
    a short metadata header (video ID, language, length).

    Failures are reported as strings beginning with ``"Error"`` rather than
    raised, so a calling agent always receives text.
    """

    name = "youtube_transcript"
    description = """
    Fetches the transcript/captions from a YouTube video.

    Input: YouTube URL or video ID
    Output: Clean transcript text with optional timestamps

    Supports:
    - Auto-generated and manual captions
    - Multiple languages
    - Timestamp formatting options
    - Text cleaning and formatting
    """

    # NOTE(review): smolagents appears to require "nullable": True on every
    # input whose forward() parameter has a default value, otherwise argument
    # validation fails when the tool is instantiated - confirm against the
    # installed smolagents version.
    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True
        }
    }

    output_type = "string"

    def __init__(self):
        super().__init__()

    @staticmethod
    def _entry_field(entry: Any, key: str) -> Any:
        """Read ``key`` from a transcript entry given as a dict or an object.

        NOTE(review): newer youtube_transcript_api releases seem to return
        snippet objects with attributes from ``Transcript.fetch()`` instead of
        dicts - confirm; dict entries behave exactly as before.
        """
        if isinstance(entry, dict):
            return entry[key]
        return getattr(entry, key)

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the 11-character video ID from a URL or bare ID.

        Args:
            url: A YouTube URL (watch, youtu.be, embed, shorts, live, /v/)
                or a bare video ID.

        Returns:
            The video ID, or None when ``url`` is not a string or no ID can
            be found in it.
        """
        if not isinstance(url, str):
            return None
        candidate = url.strip()

        # Bare ID: exactly 11 characters from the ID alphabet. '-' and '_'
        # are part of that alphabet, so isalnum() would wrongly reject them.
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
            return candidate

        patterns = [
            # Path-style URLs: youtu.be/<id>, /embed/<id>, /shorts/<id>, ...
            r'(?:youtu\.be\/|youtube(?:-nocookie)?\.com\/(?:embed|shorts|live|v)\/)([a-zA-Z0-9_-]{11})',
            # Watch URLs, with v= in any position of the query string.
            r'youtube\.com\/watch\?(?:.*&)?v=([a-zA-Z0-9_-]{11})',
        ]

        for pattern in patterns:
            match = re.search(pattern, candidate)
            if match:
                return match.group(1)

        return None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """Join transcript entries into one cleaned-up string.

        Bracketed caption artifacts such as ``[Music]`` or ``(inaudible)``
        are removed, whitespace is collapsed and punctuation spacing is
        normalised.

        Args:
            transcript: Iterable of entries exposing a ``text`` field.

        Returns:
            The cleaned transcript text (possibly empty).
        """
        text_parts = []

        for entry in transcript:
            # Collapse whitespace first so artifacts that span a line break
            # still match the non-greedy '.' patterns below.
            text = re.sub(r'\s+', ' ', self._entry_field(entry, 'text'))
            text = re.sub(r'\[.*?\]', '', text)  # Remove [Music], [Applause], etc.
            text = re.sub(r'\(.*?\)', '', text)  # Remove (inaudible), etc.
            # Removing an artifact can leave doubled or dangling spaces;
            # collapse again so the joined text has single spaces only.
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        full_text = ' '.join(text_parts)
        # No space before punctuation ...
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        # ... and exactly one space after sentence-ending punctuation that is
        # followed by a lowercase letter. This also splits tokens such as
        # "example.com"; kept as in the original heuristic.
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)

        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict]) -> str:
        """Render one ``[MM:SS] text`` line per non-empty transcript entry.

        Minutes are not wrapped at 60, so an entry starting at 3700 seconds
        is rendered as ``[61:40]``.

        Args:
            transcript: Iterable of entries exposing ``start`` (seconds) and
                ``text`` fields.

        Returns:
            The formatted lines joined with newlines.
        """
        formatted_parts = []

        for entry in transcript:
            start_time = self._entry_field(entry, 'start')
            minutes = int(start_time // 60)
            seconds = int(start_time % 60)

            text = self._entry_field(entry, 'text').strip()
            if text:
                formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")

        return '\n'.join(formatted_parts)

    def get_available_languages(self, video_id: str) -> List[str]:
        """Return the language codes of all transcripts listed for a video.

        Any failure while listing transcripts yields an empty list; callers
        treat that as "no transcripts available".
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            return [transcript.language_code for transcript in transcript_list]
        except Exception:
            return []

    def _fetch_english_fallback(self, video_id: str) -> Optional[tuple]:
        """Try an English transcript: auto-generated first, then manual.

        Returns:
            ``(transcript, language_code)`` for the first transcript that can
            be found and fetched, or None when neither attempt succeeds.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        except Exception:
            return None

        finders = (
            transcript_list.find_generated_transcript,
            transcript_list.find_manually_created_transcript,
        )
        for find in finders:
            try:
                found = find(['en'])
                return found.fetch(), getattr(found, 'language_code', 'en')
            except Exception:
                # Best effort: move on to the next kind of transcript.
                continue
        return None

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID
            language: Language code for transcript. "auto" (or None/empty)
                prefers English and otherwise uses the first listed language.
            include_timestamps: Whether to include timestamps. When true the
                output is one "[MM:SS] text" line per entry and ``clean_text``
                is not applied.
            clean_text: Whether to clean and format the text (None is treated
                as the default, True).

        Returns:
            A metadata header followed by the transcript, or a string
            starting with "Error" when the transcript cannot be produced.
            Exceptions are converted to such strings, not raised.
        """
        try:
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            # Optional inputs may arrive as None; fall back to the defaults.
            if not language:
                language = "auto"
            if clean_text is None:
                clean_text = True

            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."

                # Prefer English, then first available
                language = 'en' if 'en' in available_languages else available_languages[0]

            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            except Exception as e:
                fallback = self._fetch_english_fallback(video_id)
                if fallback is None:
                    return f"Error: Could not fetch transcript. {str(e)}"
                # Report the language that was actually fetched, not the one
                # that was requested and failed.
                transcript, language = fallback

            if not transcript:
                return "Error: No transcript content found."

            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join(
                    [self._entry_field(entry, 'text') for entry in transcript])

            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"

            return metadata + result

        except Exception as e:
            return f"Error fetching transcript: {str(e)}"
195
+
196
+
197
# Manual smoke test: run this module directly to fetch a sample transcript
# (requires network access) and print a preview of each output mode.
if __name__ == "__main__":
    transcript_tool = YouTubeTranscriptTool()

    test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    divider = "=" * 50
    preview_chars = 500

    print("Testing YouTube Transcript Tool...")
    print(divider)

    # Default options: cleaned text, no timestamps.
    result = transcript_tool.forward(test_url)
    print("Basic transcript:")
    if len(result) > preview_chars:
        print(result[:preview_chars] + "...")
    else:
        print(result)
    print(f"\n{divider}\n")

    # Same video, one "[MM:SS] text" line per caption entry.
    result_with_timestamps = transcript_tool.forward(test_url, include_timestamps=True)
    print("With timestamps:")
    if len(result_with_timestamps) > preview_chars:
        print(result_with_timestamps[:preview_chars] + "...")
    else:
        print(result_with_timestamps)
222
+
223
+ # Installation requirements:
224
+ # pip install youtube-transcript-api smolagents