"""
Service for fetching YouTube transcripts directly without downloading audio.

Uses youtube-transcript-api for efficient text extraction.
"""

import re
from typing import List, Optional

from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

from src.utils.logger import setup_logger

logger = setup_logger(__name__)

# Language codes tried, in priority order, when the caller supplies none.
# Kept as an immutable tuple so the default can never be mutated between calls.
_DEFAULT_LANGUAGES = ("en", "ar")

# Compiled once at import time. Recognises watch, youtu.be, embed and /v/ URLs
# and captures the 11-character video ID that follows.
_VIDEO_ID_PATTERN = re.compile(
    r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/watch\?.*v=)([a-zA-Z0-9_-]{11})"
)


class TranscriptFetcher:
    """Handles fetching transcripts from YouTube videos."""

    @staticmethod
    def extract_video_id(url: str) -> Optional[str]:
        """Extract the 11-character video ID from a YouTube URL.

        Args:
            url: A YouTube URL in watch, youtu.be, embed or /v/ form.

        Returns:
            The video ID, or None if ``url`` is not a string or no ID is found.
        """
        # Guard against None / non-string input so callers get None instead
        # of a TypeError from the regex engine.
        if not isinstance(url, str):
            return None
        match = _VIDEO_ID_PATTERN.search(url)
        return match.group(1) if match else None

    @staticmethod
    def _select_transcript(transcript_list, languages):
        """Pick the best transcript: manual first, then auto-generated."""
        finders = (
            transcript_list.find_manually_created_transcript,
            transcript_list.find_generated_transcript,
        )
        for find in finders:
            try:
                return find(languages)
            except NoTranscriptFound:
                continue
        # Last resort: the library's combined lookup. It is still limited to
        # the requested languages; if it fails too, its error propagates to
        # the caller, which logs it.
        return transcript_list.find_transcript(languages)

    @staticmethod
    def _snippet_text(entry) -> str:
        """Return the text of one transcript snippet.

        Accepts dict-style snippets (``entry["text"]``) as well as
        attribute-style snippet objects (``entry.text``).
        NOTE(review): newer youtube-transcript-api releases appear to return
        snippet objects rather than dicts -- confirm against the pinned
        version.
        """
        if isinstance(entry, dict):
            return entry["text"]
        return entry.text

    def fetch_transcript(
        self, video_url: str, languages: Optional[List[str]] = None
    ) -> Optional[str]:
        """
        Attempt to fetch the transcript for a video.

        Args:
            video_url: URL of the YouTube video.
            languages: Language codes in priority order. Defaults to
                ``["en", "ar"]`` when omitted or None.

        Returns:
            The snippet texts joined with single spaces if successful,
            None otherwise (invalid URL, no matching transcript, or any
            error raised while fetching).
        """
        video_id = self.extract_video_id(video_url)
        if not video_id:
            logger.error(f"Could not extract video ID from URL: {video_url}")
            return None

        # Build a fresh list per call instead of sharing a mutable default.
        language_codes = (
            languages if languages is not None else list(_DEFAULT_LANGUAGES)
        )

        # Broad catch is deliberate: this is a best-effort boundary, and any
        # failure (network, disabled transcripts, parsing) means "no text".
        try:
            logger.info(f"Attempting to fetch transcript for video: {video_id}")

            # NOTE(review): the static ``list_transcripts`` entry point may be
            # deprecated in newer library releases -- confirm before upgrading.
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            transcript = self._select_transcript(transcript_list, language_codes)

            data = transcript.fetch()
            text = " ".join(self._snippet_text(entry) for entry in data)

            logger.info(
                f"Successfully fetched transcript for {video_id} ({len(text)} chars)"
            )
            return text

        except Exception as e:
            logger.warning(f"Could not fetch transcript for {video_id}: {e}")
            return None