Spaces:
Sleeping
Sleeping
| from urllib.parse import parse_qs, urlparse | |
| from llama_index.core.tools import FunctionTool | |
| from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable | |
# Single YouTubeTranscriptApi instance shared by the functions in this module
# that call ``yt_ap.fetch(...)``.
yt_ap = YouTubeTranscriptApi()
def extract_video_id(url: str) -> str:
    """
    Extracts the video ID from a YouTube URL.

    Recognised forms:
    - youtube.com/watch?v=VIDEO_ID (``v`` may appear anywhere in the query)
    - youtube.com/embed/VIDEO_ID and youtube.com/shorts/VIDEO_ID
    - youtu.be/VIDEO_ID

    NOTE: this file defines a second ``extract_video_id`` further down. Being
    the later module-level definition, that one is what other functions in
    this module resolve at call time.

    Args:
        url (str): The full YouTube video URL.
    Returns:
        str: The extracted 11-character video ID.
    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    # Local import: ``re`` is not among the imports visible at the top of this
    # file, so relying on a module-level name would raise NameError.
    import re

    patterns = (
        # Lazy optional group: prefer ``v`` directly after ``?``, otherwise
        # skip over earlier ``key=value&`` pairs to reach it.
        r"youtube\.com/watch\?(?:[^#\s]*?&)??v=([a-zA-Z0-9_-]{11})",
        r"youtube\.com/(?:embed|shorts)/([a-zA-Z0-9_-]{11})",
        r"youtu\.be/([a-zA-Z0-9_-]{11})",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    raise ValueError("Invalid YouTube URL or unable to extract video ID.")
def get_youtube_transcript(url: str) -> str:
    """
    Fetches the transcript text for a given YouTube video.

    Args:
        url (str): The YouTube video URL.
    Returns:
        str: Combined transcript text, truncated to 2000 characters, or a
            human-readable error message when the transcript cannot be
            retrieved (this function does not raise).
    """
    try:
        # Kept inside the ``try`` so a failure to parse the URL is reported as
        # an error string rather than propagating to the caller.
        video_id = extract_video_id(url)
        # Use the shared instance API (``yt_ap.fetch``) like the other fetchers
        # in this module, instead of the legacy static ``get_transcript``.
        transcript = yt_ap.fetch(video_id)
        full_text = " ".join(snippet.text for snippet in transcript)
        return full_text.strip()[:2000]  # Truncate to 2000 chars to prevent token overflow
    except TranscriptsDisabled:
        return "This video has transcripts disabled."
    except NoTranscriptFound:
        return "No transcript was found for this video."
    except VideoUnavailable:
        return "This video is unavailable."
    except Exception as e:
        return f"Transcript error: {str(e)}"
# Wrap the truncated-transcript fetcher as a LlamaIndex tool.
youtube_tool = FunctionTool.from_defaults(fn=get_youtube_transcript)
def extract_video_id(url: str) -> str:
    """
    Extract the video ID from a YouTube URL, or pass a bare ID through.

    Handles typical YouTube URLs:
    - https://www.youtube.com/watch?v=VIDEO_ID (with extra query params)
    - the same on youtube.com, m.youtube.com and music.youtube.com
    - https://www.youtube.com/shorts/VIDEO_ID, /embed/VIDEO_ID, /live/VIDEO_ID, /v/VIDEO_ID
    - https://youtu.be/VIDEO_ID (with or without a trailing slash)
    - a bare VIDEO_ID

    Args:
        url (str): A YouTube URL or a raw video ID.
    Returns:
        str: The extracted video ID. The result is not validated: input that
            matches none of the forms above yields its URL path with the
            surrounding slashes removed (possibly an empty string).
    """
    parsed = urlparse(url.strip())
    youtube_hosts = {
        "www.youtube.com",
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
    }
    if parsed.hostname in youtube_hosts:
        # parse_qs omits blank values, so a present "v" key is non-empty.
        video_ids = parse_qs(parsed.query).get("v")
        if video_ids:
            return video_ids[0]
        # Path-style links carry the ID as the segment after the prefix.
        segments = [part for part in parsed.path.split("/") if part]
        if len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
            return segments[1]
    # fallback for youtu.be or raw IDs; strip both ends so a trailing slash
    # does not leak into the ID
    return parsed.path.strip("/")
def fetch_youtube_transcript(video_url: str) -> str:
    """
    Fetch the English transcript of a YouTube video as a single string.

    The video ID is derived from ``video_url`` with ``extract_video_id``; the
    text of every transcript snippet is then joined with single spaces.

    Args:
        video_url (str): URL of the YouTube video.
    Returns:
        str: The joined transcript text, or a message starting with
            "Error fetching video details:" if fetching fails.
    """
    video_id = extract_video_id(video_url)
    try:
        # Called on the shared ``yt_ap`` instance. Only English is requested;
        # more language codes can be listed here (the original note suggests
        # ``yt_ap.list(video_id)`` to discover what a video offers).
        fetched = yt_ap.fetch(video_id=video_id, languages=["en"])
        # Iterating the fetched transcript yields snippets; only ``.text`` is used.
        return " ".join(snippet.text for snippet in fetched)
    except Exception as e:
        return f"Error fetching video details: {str(e)}"
def fetch_youtube_transcript_snippets(video_url: str) -> str:
    """
    Fetch English YouTube transcript snippets with timing for the given URL.

    Each snippet is rendered with its text, its duration and its start time,
    terminated by an ``<End>`` marker; the rendered snippets are joined with
    single spaces.

    Args:
        video_url (str): URL of the YouTube video.
    Returns:
        str: The rendered snippets, or a message starting with
            "Error fetching video details:" if fetching fails.
    """
    video_id = extract_video_id(video_url)
    try:
        # Called on the shared ``yt_ap`` instance. Only English is requested;
        # more language codes can be listed here (the original note suggests
        # ``yt_ap.list(video_id)`` to discover what a video offers).
        fetched = yt_ap.fetch(video_id=video_id, languages=["en"])
        return " ".join(
            f"Text: {snippet.text} Duration: {snippet.duration} StartTime: {snippet.start} <End>"
            for snippet in fetched
        )
    except Exception as e:
        return f"Error fetching video details: {str(e)}"
# Expose both transcript fetchers as LlamaIndex tools: plain text, and text
# with per-snippet timing.
youtube_transcript_tool = FunctionTool.from_defaults(fn=fetch_youtube_transcript)
youtube_transcript_snippet_tool = FunctionTool.from_defaults(
    fn=fetch_youtube_transcript_snippets
)