# Agents_Course_Final_Assignment
#
# Sleeping
#
# File size: 7,900 Bytes

import re
from typing import Optional, Dict, Any, List
from youtube_transcript_api import YouTubeTranscriptApi
from smolagents import Tool


class YouTubeTranscriptTool(Tool):
    """
    A tool to fetch transcripts from YouTube videos.

    This tool can extract transcripts in various languages and formats,
    providing clean text output for further processing by AI agents.

    Failures inside ``forward`` are reported as strings beginning with
    ``"Error"`` instead of being raised, so the agent always receives text.
    """

    name = "youtube_transcript"
    description = """
    Fetches the transcript/captions from a YouTube video.

    Input: YouTube URL or video ID
    Output: Clean transcript text with optional timestamps

    Supports:
    - Auto-generated and manual captions
    - Multiple languages
    - Timestamp formatting options
    - Text cleaning and formatting
    """

    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True,
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True,
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True,
        }
    }

    output_type = "string"

    def __init__(self):
        super().__init__()

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the 11-character video ID from a URL or a bare ID.

        Recognised forms: a bare ID, ``youtube.com/watch?...v=ID``,
        ``youtu.be/ID`` and ``youtube.com/{embed,shorts,live,v}/ID``
        (including ``youtube-nocookie.com``). Surrounding whitespace is
        ignored.

        Args:
            url: YouTube URL or video ID.

        Returns:
            The video ID, or ``None`` when nothing recognisable is found
            (including when ``url`` is not a string).
        """
        if not isinstance(url, str):
            return None
        candidate = url.strip()

        # Bare ID: the URL patterns below accept '-' and '_' inside IDs, so a
        # bare ID must accept them too (str.isalnum() would reject both).
        if re.fullmatch(r'[A-Za-z0-9_-]{11}', candidate):
            return candidate

        patterns = (
            r'youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/([A-Za-z0-9_-]{11})',
            r'youtu\.be/([A-Za-z0-9_-]{11})',
            # `v` may be any query parameter, not only the first one.
            r'youtube\.com/watch\?.*?\bv=([A-Za-z0-9_-]{11})',
        )
        for pattern in patterns:
            match = re.search(pattern, candidate)
            if match:
                return match.group(1)

        return None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """Join transcript entries into one cleaned-up paragraph.

        Bracketed and parenthesised caption artifacts such as ``[Music]`` or
        ``(inaudible)`` are dropped, runs of whitespace are collapsed, and
        spacing around punctuation is normalised.

        Args:
            transcript: Entries as dicts carrying a ``'text'`` key.

        Returns:
            The cleaned text, stripped of leading/trailing whitespace.
        """
        text_parts = []

        for entry in transcript:
            # Drop the artifacts *before* collapsing whitespace; doing it the
            # other way round leaves double spaces or blank fragments behind.
            text = re.sub(r'\[.*?\]|\(.*?\)', ' ', entry['text'])
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        full_text = ' '.join(text_parts)
        # No space before punctuation...
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        # ...and exactly one space between sentence punctuation and a
        # following lowercase letter.
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)

        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict]) -> str:
        """Format each non-empty entry as ``[MM:SS] text`` on its own line.

        Args:
            transcript: Entries as dicts carrying ``'start'`` (seconds) and
                ``'text'`` keys.

        Returns:
            The formatted entries joined with newlines. Minutes are not
            wrapped into hours.
        """
        formatted_parts = []

        for entry in transcript:
            text = entry['text'].strip()
            if not text:
                continue
            start_time = entry['start']
            minutes = int(start_time // 60)
            seconds = int(start_time % 60)
            formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")

        return '\n'.join(formatted_parts)

    def _list_transcripts(self, video_id: str):
        """Return the library's transcript listing for ``video_id``.

        The static ``list_transcripts`` helper is used when the installed
        youtube-transcript-api still provides it; otherwise the instance
        method ``list`` is used.
        """
        legacy_list = getattr(YouTubeTranscriptApi, "list_transcripts", None)
        if legacy_list is not None:
            return legacy_list(video_id)
        return YouTubeTranscriptApi().list(video_id)

    def _as_entries(self, fetched) -> List[Dict[str, Any]]:
        """Normalise a fetched transcript to a list of dict entries.

        Objects exposing ``to_raw_data()`` are converted through it; anything
        else is assumed to already be an iterable of dicts.
        """
        to_raw_data = getattr(fetched, "to_raw_data", None)
        if callable(to_raw_data):
            return to_raw_data()
        return list(fetched)

    def _fetch_with_fallback(self, transcript_list, language: str) -> tuple:
        """Fetch a transcript in ``language``, falling back to English.

        Tries, in order: any transcript in ``language``, a generated English
        transcript, a manually created English transcript.

        Returns:
            ``(entries, language_code)`` for the first attempt that succeeds.

        Raises:
            Exception: The error from the first (requested-language) attempt
                when every attempt fails.
        """
        lookups = (
            ("find_transcript", [language]),
            ("find_generated_transcript", ['en']),
            ("find_manually_created_transcript", ['en']),
        )
        first_error = None
        for finder_name, codes in lookups:
            try:
                chosen = getattr(transcript_list, finder_name)(codes)
                return self._as_entries(chosen.fetch()), chosen.language_code
            except Exception as error:
                # Keep the error for the language the caller asked for; it is
                # the most relevant one to report.
                if first_error is None:
                    first_error = error
        raise first_error

    def get_available_languages(self, video_id: str) -> List[str]:
        """Get list of available transcript language codes for a video.

        Best-effort: any failure while listing is reported as an empty list.
        """
        try:
            return [
                transcript.language_code
                for transcript in self._list_transcripts(video_id)
            ]
        except Exception:
            return []

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID
            language: Language code for transcript (default: auto-detect,
                which prefers English, then the first listed language)
            include_timestamps: Whether to include timestamps
            clean_text: Whether to clean and format the text; ignored when
                include_timestamps is true

        Returns:
            A metadata header (video ID, language of the transcript actually
            used, length) followed by the transcript text, or a string
            starting with "Error" when something goes wrong.
        """
        # The inputs are declared nullable, so an explicit None must behave
        # like the documented default.
        language = language or "auto"
        include_timestamps = bool(include_timestamps)
        if clean_text is None:
            clean_text = True

        try:
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            # List once and reuse the listing for language detection and for
            # every fetch attempt.
            try:
                transcript_list = self._list_transcripts(video_id)
            except Exception as e:
                return f"Error: Could not fetch transcript. {str(e)}"

            if language == "auto":
                available_languages = [t.language_code for t in transcript_list]
                if not available_languages:
                    return "Error: No transcripts available for this video."

                # Prefer English, then first available
                language = 'en' if 'en' in available_languages else available_languages[0]

            try:
                # `language` becomes the code of the transcript actually used,
                # so the metadata stays truthful after an English fallback.
                transcript, language = self._fetch_with_fallback(transcript_list, language)
            except Exception as e:
                return f"Error: Could not fetch transcript. {str(e)}"

            if not transcript:
                return "Error: No transcript content found."

            # Format output based on options
            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join(entry['text'] for entry in transcript)

            # Add metadata
            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"

            return metadata + result

        except Exception as e:
            return f"Error fetching transcript: {str(e)}"


# Manual smoke test: run this module directly to fetch a sample transcript
if __name__ == "__main__":
    def _preview(text, limit=500):
        """Return text unchanged if short, else its first `limit` chars plus '...'."""
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    divider = "=" * 50
    test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    # Build the tool under test
    transcript_tool = YouTubeTranscriptTool()

    print("Testing YouTube Transcript Tool...")
    print(divider)

    # Default options: no timestamps
    result = transcript_tool.forward(test_url)
    print("Basic transcript:")
    print(_preview(result))
    print(f"\n{divider}\n")

    # Same video, one "[MM:SS] text" line per caption entry
    result_with_timestamps = transcript_tool.forward(test_url, include_timestamps=True)
    print("With timestamps:")
    print(_preview(result_with_timestamps))

# Installation requirements:
# pip install youtube-transcript-api smolagents