import re
from typing import Optional, Dict, Any, List
from youtube_transcript_api import YouTubeTranscriptApi
from smolagents import Tool
class YouTubeTranscriptTool(Tool):
    """
    A tool to fetch transcripts from YouTube videos.

    The tool resolves a video ID from a URL (or accepts a bare ID), downloads
    a transcript through ``YouTubeTranscriptApi`` and returns it as a single
    string, either as cleaned running text or as one timestamped line per
    caption. Failures are reported as ``"Error: ..."`` strings rather than
    raised, so the caller always receives a string.
    """

    name = "youtube_transcript"
    # Built from explicit pieces so the runtime text does not depend on how
    # this source file happens to be indented.
    description = (
        "\n"
        "Fetches the transcript/captions from a YouTube video.\n"
        "Input: YouTube URL or video ID\n"
        "Output: Clean transcript text with optional timestamps\n"
        "Supports:\n"
        "- Auto-generated and manual captions\n"
        "- Multiple languages\n"
        "- Timestamp formatting options\n"
        "- Text cleaning and formatting\n"
    )
    # Every optional input is declared nullable; forward() maps None back to
    # the documented default.
    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True,
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True,
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True,
        }
    }
    output_type = "string"

    def __init__(self):
        super().__init__()

    def extract_video_id(self, url: str) -> Optional[str]:
        """
        Extract the 11-character video ID from a YouTube URL or bare ID.

        Recognised forms: a bare ID, ``youtube.com/watch?...v=ID``,
        ``youtu.be/ID``, ``youtube.com/embed/ID``, ``youtube.com/shorts/ID``
        and ``youtube.com/live/ID``.

        Args:
            url: YouTube URL or video ID.

        Returns:
            The video ID, or None when nothing recognisable is found
            (including empty or non-string input).
        """
        if not url or not isinstance(url, str):
            return None
        candidate = url.strip()

        # Handle direct video ID. IDs may contain '-' and '_', which
        # str.isalnum() rejects, so match the full ID alphabet instead.
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
            return candidate

        # Regular expression patterns for different YouTube URL formats
        patterns = [
            r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([a-zA-Z0-9_-]{11})',
            r'youtube\.com\/watch\?.*v=([a-zA-Z0-9_-]{11})',
            r'youtube\.com\/(?:shorts|live)\/([a-zA-Z0-9_-]{11})',
        ]
        for pattern in patterns:
            match = re.search(pattern, candidate)
            if match:
                return match.group(1)
        return None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """
        Join transcript entries into one cleaned block of text.

        Bracketed ``[...]`` and parenthesised ``(...)`` spans are removed,
        whitespace is collapsed, and spacing around punctuation is normalised.

        Args:
            transcript: Entries, each a mapping with a ``'text'`` key.

        Returns:
            The cleaned text; an empty string when nothing remains.
        """
        text_parts = []
        for entry in transcript:
            # Collapse whitespace first so an artifact that spans a caption
            # line break sits on one line for the non-greedy matches below.
            text = re.sub(r'\s+', ' ', entry['text'].strip())
            # Fix common caption artifacts
            text = re.sub(r'\[.*?\]', '', text)  # Remove [Music], [Applause], etc.
            text = re.sub(r'\(.*?\)', '', text)  # Remove (inaudible), etc.
            # Removing an artifact leaves its surrounding spaces behind;
            # collapse again so no leading or doubled spaces survive.
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        # Join and clean up the full text
        full_text = ' '.join(text_parts)
        # Fix punctuation spacing
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        # NOTE: this also inserts a space inside lowercase abbreviations and
        # domain names ("e.g." -> "e. g."); kept as-is for compatibility.
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)
        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict]) -> str:
        """
        Format transcript entries as ``[MM:SS] text`` lines.

        Minutes are not wrapped into hours, so entries past the first hour
        show a minute count of 60 or more.

        Args:
            transcript: Entries with ``'start'`` (seconds) and ``'text'`` keys.

        Returns:
            One line per non-empty entry, joined with newlines.
        """
        formatted_parts = []
        for entry in transcript:
            minutes, seconds = divmod(int(entry['start']), 60)
            text = entry['text'].strip()
            if text:
                formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")
        return '\n'.join(formatted_parts)

    def get_available_languages(self, video_id: str) -> List[str]:
        """
        Get list of available transcript languages for a video.

        Best-effort: any failure while listing transcripts yields an empty
        list, which forward() reports as "no transcripts available".

        Args:
            video_id: The YouTube video ID.

        Returns:
            Language codes of the listed transcripts, possibly empty.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            return [transcript.language_code for transcript in transcript_list]
        except Exception:
            return []

    def _fetch_fallback_transcript(self, video_id: str):
        """
        Fetch any usable transcript after the direct request has failed.

        Tries an auto-generated English transcript, then a manually created
        English one, then every listed transcript in order.

        Returns:
            A ``(transcript, language_code)`` tuple for the first transcript
            that could be fetched, or None when none could.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        except Exception:
            # Nothing to fall back on when the listing itself is unavailable.
            return None

        finders = (
            transcript_list.find_generated_transcript,
            transcript_list.find_manually_created_transcript,
        )
        for find in finders:
            try:
                candidate = find(['en'])
                return candidate.fetch(), candidate.language_code
            except Exception:
                continue

        for candidate in transcript_list:
            try:
                return candidate.fetch(), candidate.language_code
            except Exception:
                continue
        return None

    def _as_entries(self, transcript):
        """
        Normalise a fetched transcript to a list of dict entries.

        Results exposing ``to_raw_data()`` (object-style transcripts returned
        by newer ``youtube-transcript-api`` releases) are converted; anything
        else is returned unchanged.
        """
        to_raw_data = getattr(transcript, "to_raw_data", None)
        if callable(to_raw_data):
            return to_raw_data()
        return transcript

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID
            language: Language code for transcript (default: auto-detect)
            include_timestamps: Whether to include timestamps
            clean_text: Whether to clean and format the text

        Returns:
            A metadata header (video ID, language, length) followed by the
            transcript, or a string starting with "Error" when the transcript
            could not be obtained. When include_timestamps is true the
            clean_text option is not applied.
        """
        # The optional inputs are declared nullable, so an explicit None must
        # behave like the documented default.
        if language is None:
            language = "auto"
        if include_timestamps is None:
            include_timestamps = False
        if clean_text is None:
            clean_text = True

        try:
            # Extract video ID
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            # Get available languages if auto-detect is requested
            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."
                # Prefer English, then first available
                language = 'en' if 'en' in available_languages else available_languages[0]

            # Fetch transcript
            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            except Exception as e:
                fallback = self._fetch_fallback_transcript(video_id)
                if fallback is None:
                    return f"Error: Could not fetch transcript. {str(e)}"
                # Report the language actually fetched, not the one requested.
                transcript, language = fallback

            transcript = self._as_entries(transcript)
            if not transcript:
                return "Error: No transcript content found."

            # Format output based on options
            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join([entry['text'] for entry in transcript])

            # Add metadata
            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"
            return metadata + result
        except Exception as e:
            # Tool boundary: surface unexpected failures as text so the
            # calling agent receives a string instead of an exception.
            return f"Error fetching transcript: {str(e)}"
# Manual smoke test: runs only when the file is executed as a script.
if __name__ == "__main__":
    def _preview(text, limit=500):
        """Return text unchanged, or its first `limit` characters plus "..." when longer."""
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    rule = "=" * 50
    tool = YouTubeTranscriptTool()
    sample_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    print("Testing YouTube Transcript Tool...")
    print(rule)

    # Plain transcript with the default options
    plain_output = tool.forward(sample_url)
    print("Basic transcript:")
    print(_preview(plain_output))
    print("\n" + rule + "\n")

    # Same video, one timestamped line per caption
    timestamped_output = tool.forward(sample_url, include_timestamps=True)
    print("With timestamps:")
    print(_preview(timestamped_output))
# Installation requirements:
# pip install youtube-transcript-api smolagents