Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from langchain_core.tools import tool | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from youtube_transcript_api._errors import ( | |
| NoTranscriptFound, | |
| TranscriptsDisabled, | |
| VideoUnavailable, | |
| ) | |
| # ---- Config ----------------------------------------------------------------- | |
# Default `timeout` handed to requests.get by _fetch_html.
DEFAULT_TIMEOUT = 30

# Headers sent with every page fetch. A desktop-browser User-Agent helps avoid
# consent/anti-bot interstitials on some sites.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}
| # ---- Small helpers ---------------------------------------------------------- | |
def _fetch_html(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> str:
    """Fetch ``url`` with the module's default headers and return the body text.

    Args:
        url: Address to GET.
        timeout: Passed straight through to ``requests.get`` as ``timeout``.

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the server
            answers with an error status.
    """
    response = requests.get(url, timeout=timeout, headers=DEFAULT_HEADERS)
    # Surface HTTP error statuses before handing the body back.
    response.raise_for_status()
    return response.text
def parse_title_description(raw: str) -> tuple[str, str]:
    """Split a "Title: ... / Description: ..." string into its two parts.

    This is the inverse of the text produced by
    ``get_youtube_title_description``: a ``Title:`` label followed by the
    title, then a ``Description:`` label followed by the description.

    Args:
        raw: Text containing a ``Description:`` marker, optionally starting
            with a ``Title:`` label.

    Returns:
        ``(title, description)``, each stripped of surrounding whitespace.
        Everything after the first ``Description:`` marker is the description,
        so later occurrences of the marker stay inside the description.

    Raises:
        ValueError: If ``raw`` contains no ``Description:`` marker.
    """
    head, marker, description = raw.partition("Description:")
    if not marker:
        raise ValueError(
            "Expected text of the form 'Title: ...\\nDescription: ...' "
            "but found no 'Description:' marker"
        )

    title = head.strip()
    # Remove only the leading label. A blanket str.replace would also delete
    # any "Title:" that is part of the title text itself.
    if title.startswith("Title:"):
        title = title[len("Title:"):]
    return title.strip(), description.strip()
| # ---- Pure functions (safe to call directly in Python) ----------------------- | |
def get_youtube_transcript(video_id: str) -> str:
    """Fetch a video's transcript and return it as newline-separated text.

    Each transcript chunk becomes one line of the result; the timing fields
    of the raw data are dropped.

    Args:
        video_id: The YouTube video ID, i.e. the ``v`` query parameter of the
            watch URL. For ``https://www.youtube.com/watch?v=1htKBjuUWec``
            the video ID is ``1htKBjuUWec``.

    Returns:
        The transcript text, one chunk per line.

    Raises:
        RuntimeError: If transcripts are disabled, no transcript is found, or
            the video is unavailable. The library exception is chained as the
            cause.
    """
    try:
        chunks = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        # Raw data is a list of dicts such as
        # {"text": "Hey there", "start": 0.0, "duration": 1.54};
        # only the "text" values are kept, one per line.
        return "\n".join(chunk["text"] for chunk in chunks)
    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable) as exc:
        raise RuntimeError(f"Transcript unavailable: {exc}") from exc
def get_youtube_title_description(video_url: str) -> str:
    """Return the title and description of a YouTube page as labelled text.

    The page at ``video_url`` (e.g.
    ``https://www.youtube.com/watch?v=1htKBjuUWec``) is downloaded and its
    ``<meta>`` tags are inspected. Open Graph tags (``og:title``,
    ``og:description``) are preferred; the standard ``<meta name="title">`` /
    ``<meta name="description">`` tags are the fallback.

    Args:
        video_url: Full URL of the video page.

    Returns:
        A two-line string ``"Title: <title>\\nDescription: <description>"``.
        A missing or empty value is replaced by ``"No title found"`` /
        ``"No description found"``.
    """
    soup = BeautifulSoup(_fetch_html(video_url), "html.parser")

    def meta_content(og_property, meta_name):
        # Prefer the Open Graph tag, then the plain named <meta> tag.
        tag = soup.find("meta", property=og_property) or soup.find(
            "meta", attrs={"name": meta_name}
        )
        return tag.get("content") if tag else None

    title = meta_content("og:title", "title") or "No title found"
    description = (
        meta_content("og:description", "description") or "No description found"
    )
    return f"Title: {title}\nDescription: {description}"
| # ---- LangChain tool wrappers (for agents; call with .invoke) ---------------- | |
# Registered with LangChain's `@tool` so agents can call it via `.invoke(...)`.
# For a plain Python call, use `get_youtube_transcript` directly.
@tool
def get_youtube_transcript_tool(video_id: str) -> str:
    """Return the transcript text of a YouTube video, one line per chunk.

    Args:
        video_id: The YouTube video ID (the ``v`` parameter of the watch URL).

    Returns:
        The transcript as newline-separated text.
    """
    return get_youtube_transcript(video_id)
# Registered with LangChain's `@tool` so agents can call it via `.invoke(...)`.
# For a plain Python call, use `get_youtube_title_description` directly.
@tool
def get_youtube_title_description_tool(video_url: str) -> str:
    """Return the title and description of a YouTube video page.

    Args:
        video_url: Full URL of the video page, e.g.
            ``https://www.youtube.com/watch?v=1htKBjuUWec``.

    Returns:
        Text of the form ``"Title: <title>\\nDescription: <description>"``.
    """
    return get_youtube_title_description(video_url)
| # ---- Minimal demo ----------------------------------------------------------- | |
if __name__ == "__main__":
    # Demo: print the title/description of one video, then a short preview of
    # its transcript. A transcript failure is reported instead of crashing.
    demo_video_id = "1htKBjuUWec"
    watch_url = f"https://www.youtube.com/watch?v={demo_video_id}"
    print(get_youtube_title_description(watch_url))

    try:
        print("\n--- Transcript (first 500 chars) ---")
        transcript = get_youtube_transcript(demo_video_id)
        preview = transcript[:500]
        if len(transcript) > 500:
            # Mark that the transcript was cut off.
            preview += "..."
        print(preview)
    except Exception as err:
        print(f"Transcript error: {err}")