Spaces:
Running
Running
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from youtube_transcript_api.proxies import GenericProxyConfig | |
| import re | |
| import os | |
| from agents import function_tool | |
| def fetch_video_transcript(url: str) -> str: | |
| """ | |
| Extract transcript with timestamps from a YouTube video URL and format it for LLM consumption | |
| Args: | |
| url (str): YouTube video URL | |
| Returns: | |
| str: Formatted transcript with timestamps, where each entry is on a new line | |
| in the format: "[MM:SS] Text" | |
| """ | |
| # Extract video ID from URL | |
| video_id_pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*' | |
| video_id_match = re.search(video_id_pattern, url) | |
| if not video_id_match: | |
| raise ValueError("Invalid YouTube URL") | |
| video_id = video_id_match.group(1) | |
| def format_transcript(transcript): | |
| """Format transcript entries with timestamps""" | |
| formatted_entries = [] | |
| for entry in transcript: | |
| # Convert seconds to MM:SS format | |
| minutes = int(entry.start // 60) | |
| seconds = int(entry.start % 60) | |
| timestamp = f"[{minutes:02d}:{seconds:02d}]" | |
| formatted_entry = f"{timestamp} {entry.text}" | |
| formatted_entries.append(formatted_entry) | |
| # Join all entries with newlines | |
| return "\n".join(formatted_entries) | |
| # First attempt: Try without proxy | |
| try: | |
| ytt_api = YouTubeTranscriptApi() | |
| transcript = ytt_api.fetch(video_id) | |
| return format_transcript(transcript) | |
| except Exception as e: | |
| # Second attempt: Try with proxy if credentials are available | |
| proxy_username = os.getenv("PROXY_USERNAME") | |
| proxy_password = os.getenv("PROXY_PASSWORD") | |
| proxy_url_base = os.getenv("PROXY_URL") | |
| if proxy_username and proxy_password: | |
| try: | |
| # Construct proxy URLs with credentials | |
| http_proxy_url = f"http://{proxy_username}:{proxy_password}@{proxy_url_base}" | |
| https_proxy_url = f"https://{proxy_username}:{proxy_password}@{proxy_url_base}" | |
| proxy_config = GenericProxyConfig( | |
| http_url=http_proxy_url, | |
| https_url=https_proxy_url | |
| ) | |
| ytt_api_with_proxy = YouTubeTranscriptApi(proxy_config=proxy_config) | |
| transcript = ytt_api_with_proxy.fetch(video_id) | |
| return format_transcript(transcript) | |
| except Exception as proxy_error: | |
| raise Exception(f"Error fetching transcript (tried with and without proxy): {str(proxy_error)}") | |
| else: | |
| raise Exception(f"Error fetching transcript: {str(e)}. Proxy credentials not available.") | |
| def fetch_intstructions(prompt_name: str) -> str: | |
| """ | |
| Fetch instructions for a given prompt name from the prompts/ directory | |
| Args: | |
| prompt_name (str): Name of the prompt to fetch instructions for | |
| Available prompts: | |
| - write_blog_post | |
| - write_social_post | |
| - write_video_chapters | |
| Returns: | |
| str: Instructions for the given prompt | |
| """ | |
| import os | |
| script_dir = os.path.dirname(__file__) | |
| prompt_path = os.path.join(script_dir, "prompts", f"{prompt_name}.md") | |
| with open(prompt_path, "r") as f: | |
| return f.read() |