Agents_Course_Final_Assignment

Sleeping

App Files Files Community

vlapparov committed on May 28, 2025

Commit

2342bb7

verified ·

1 Parent(s): 989f30a

Upload youtube_utils.py

Browse files

Files changed (1) hide show

youtube_utils.py +224 -0

youtube_utils.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import re
+from typing import Optional, Dict, Any, List
+from youtube_transcript_api import YouTubeTranscriptApi
+from smolagents import Tool
class YouTubeTranscriptTool(Tool):
    """
    A smolagents tool that fetches the transcript (captions) of a YouTube video.

    The tool accepts a YouTube URL or a bare 11-character video ID, picks a
    transcript language (English is preferred when the language is "auto"),
    and returns the transcript as a single string prefixed by a short metadata
    header. Failures are reported as strings starting with "Error" rather than
    raised, so the calling agent always receives text.
    """

    name = "youtube_transcript"
    description = """
    Fetches the transcript/captions from a YouTube video.
    Input: YouTube URL or video ID
    Output: Clean transcript text with optional timestamps
    Supports:
    - Auto-generated and manual captions
    - Multiple languages
    - Timestamp formatting options
    - Text cleaning and formatting
    """
    # Every argument that has a default in ``forward`` is marked nullable:
    # smolagents validates ``inputs`` against the ``forward`` signature and
    # expects optional parameters to be declared this way.
    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True
        }
    }
    output_type = "string"

    def __init__(self):
        super().__init__()

    def extract_video_id(self, url: str) -> Optional[str]:
        """
        Extract the 11-character video ID from a YouTube URL or bare ID.

        Args:
            url: A watch / youtu.be / embed / shorts / live / v URL, or the
                bare video ID itself. Surrounding whitespace is ignored.

        Returns:
            The video ID, or None when no ID can be recognised (including
            empty or None input).
        """
        if not url:
            return None
        url = url.strip()

        # Bare video ID. IDs may contain '-' and '_', which str.isalnum()
        # rejects, so match the same character class the URL patterns use.
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
            return url

        patterns = (
            r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
            # "v=" preceded by other query parameters
            r'youtube\.com/watch\?.*v=([a-zA-Z0-9_-]{11})',
            r'youtube\.com/(?:shorts|live|v)/([a-zA-Z0-9_-]{11})',
        )
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """
        Merge transcript entries into one cleaned-up string.

        Bracketed and parenthesised cues such as "[Music]" or "(inaudible)"
        are removed, runs of whitespace are collapsed, and spacing around
        punctuation is normalised.

        Args:
            transcript: Entries shaped like ``{"text": ...}``; only the
                ``"text"`` key is read.

        Returns:
            The cleaned transcript text (possibly empty).
        """
        text_parts = []
        for entry in transcript:
            # Collapse whitespace first so a cue split across lines still
            # matches the single-line patterns below.
            text = re.sub(r'\s+', ' ', entry['text'].strip())
            text = re.sub(r'\[.*?\]', '', text)  # Remove [Music], [Applause], etc.
            text = re.sub(r'\(.*?\)', '', text)  # Remove (inaudible), etc.
            # Removing a cue from the middle of a caption leaves a double
            # space (or a leading/trailing one); tidy up again.
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        full_text = ' '.join(text_parts)
        # No whitespace before punctuation ...
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        # ... and exactly one space between sentence-ending punctuation and a
        # following lowercase letter (heuristic; also affects "e.g."-style text).
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)
        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict]) -> str:
        """
        Render each non-empty entry as "[MM:SS] text", one entry per line.

        Args:
            transcript: Entries with ``"start"`` (seconds) and ``"text"`` keys.

        Returns:
            The formatted lines joined with newlines.
        """
        formatted_parts = []
        for entry in transcript:
            text = entry['text'].strip()
            if not text:
                continue
            start_time = entry['start']
            minutes = int(start_time // 60)
            seconds = int(start_time % 60)
            formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")
        return '\n'.join(formatted_parts)

    def get_available_languages(self, video_id: str) -> List[str]:
        """
        List the language codes of the transcripts available for a video.

        This is a best-effort lookup: any failure while listing transcripts
        yields an empty list instead of an exception.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            return [transcript.language_code for transcript in transcript_list]
        except Exception:
            return []

    @staticmethod
    def _to_raw_entries(transcript: Any) -> Any:
        """Return list-of-dict entries, converting objects that expose ``to_raw_data()``."""
        to_raw = getattr(transcript, "to_raw_data", None)
        return to_raw() if callable(to_raw) else transcript

    def _fetch_english_fallback(self, video_id: str) -> Optional[tuple]:
        """
        Try an auto-generated, then a manually created, English transcript.

        Returns:
            ``(transcript, language_code)`` for the first one that can be
            fetched, or None when neither is available.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        except Exception:
            return None
        finders = (
            transcript_list.find_generated_transcript,
            transcript_list.find_manually_created_transcript,
        )
        for finder in finders:
            try:
                found = finder(['en'])
                return found.fetch(), found.language_code
            except Exception:
                continue
        return None

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID
            language: Language code for transcript (default: auto-detect;
                None or an empty string is treated as "auto")
            include_timestamps: Whether to include timestamps (takes
                precedence over ``clean_text``)
            clean_text: Whether to clean and format the text

        Returns:
            A metadata header followed by the transcript text, or a message
            starting with "Error" when the transcript cannot be produced.
        """
        try:
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            # Optional arguments may arrive as None; fall back to the defaults.
            if not language:
                language = "auto"
            if include_timestamps is None:
                include_timestamps = False
            if clean_text is None:
                clean_text = True

            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."
                # Prefer English, then first available
                language = 'en' if 'en' in available_languages else available_languages[0]

            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            except Exception as e:
                fallback = self._fetch_english_fallback(video_id)
                if fallback is None:
                    return f"Error: Could not fetch transcript. {str(e)}"
                # Report the language actually returned, not the one requested.
                transcript, language = fallback

            transcript = self._to_raw_entries(transcript)
            if not transcript:
                return "Error: No transcript content found."

            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join([entry['text'] for entry in transcript])

            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"
            return metadata + result
        except Exception as e:
            # Last-resort boundary: the agent expects a string, never an exception.
            return f"Error fetching transcript: {str(e)}"
# Manual smoke test: run this module directly to exercise the tool
if __name__ == "__main__":
    def _preview(text, limit=500):
        """Return text cut to `limit` characters, with "..." appended when truncated."""
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    separator = "=" * 50

    # Build the tool and pick a sample video
    transcript_tool = YouTubeTranscriptTool()
    test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    print("Testing YouTube Transcript Tool...")
    print(separator)

    # Plain transcript with default options
    result = transcript_tool.forward(test_url)
    print("Basic transcript:")
    print(_preview(result))
    print("\n" + separator + "\n")

    # Same video, this time with per-entry timestamps
    result_with_timestamps = transcript_tool.forward(test_url, include_timestamps=True)
    print("With timestamps:")
    print(_preview(result_with_timestamps))
+# Installation requirements:
+# pip install youtube-transcript-api smolagents