Spaces:
Running
Running
| """ | |
| Service for fetching YouTube transcripts directly without downloading audio. | |
| Uses youtube-transcript-api for efficient text extraction. | |
| """ | |
| from typing import Optional, List | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from src.utils.logger import setup_logger | |
| import re | |
| logger = setup_logger(__name__) | |
class TranscriptFetcher:
    """Fetch the text of YouTube video transcripts via youtube-transcript-api."""

    # Languages tried when the caller passes none. Stored as a tuple so the
    # default can never be mutated and leak between calls.
    _DEFAULT_LANGUAGES = ("en", "ar")

    # Compiled once at class creation instead of on every lookup. Matches the
    # watch, youtu.be, embed and /v/ URL forms and captures the 11-character ID.
    _VIDEO_ID_PATTERN = re.compile(
        r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/watch\?.*v=)([a-zA-Z0-9_-]{11})"
    )

    @staticmethod
    def extract_video_id(url: str) -> Optional[str]:
        """Extract the 11-character video ID from a YouTube URL.

        Declared as a static method so that it works both as
        ``TranscriptFetcher.extract_video_id(url)`` and as
        ``self.extract_video_id(url)``; without the decorator the instance
        call would pass ``self`` as an unexpected extra argument.

        Args:
            url: The URL (or any text containing one) to inspect.

        Returns:
            The video ID, or None when ``url`` is not a string or contains no
            recognised YouTube URL form.
        """
        if not isinstance(url, str):
            return None
        match = TranscriptFetcher._VIDEO_ID_PATTERN.search(url)
        return match.group(1) if match else None

    @staticmethod
    def _select_transcript(transcript_list, languages):
        """Pick a transcript: manually created first, then auto-generated."""
        for finder_name in (
            "find_manually_created_transcript",
            "find_generated_transcript",
        ):
            try:
                return getattr(transcript_list, finder_name)(languages)
            except Exception:
                # This kind of transcript is unavailable; try the next one.
                continue
        # Last resort: any transcript type, still limited to the requested
        # languages. A failure here propagates to the caller.
        return transcript_list.find_transcript(languages)

    @staticmethod
    def _entry_text(entry) -> str:
        """Return the text of one transcript entry.

        Entries are read as ``entry["text"]`` unless they expose a ``text``
        attribute. NOTE(review): the attribute form is presumably what newer
        youtube-transcript-api releases return -- confirm against the pinned
        version.
        """
        return entry.text if hasattr(entry, "text") else entry["text"]

    def fetch_transcript(
        self, video_url: str, languages: Optional[List[str]] = None
    ) -> Optional[str]:
        """Attempt to fetch the transcript for a video.

        Args:
            video_url: URL of the YouTube video.
            languages: Language codes to look for, in priority order.
                Defaults to ``["en", "ar"]`` when omitted or None.

        Returns:
            The transcript entries joined with single spaces, or None when the
            video ID cannot be extracted or the transcript cannot be fetched.
            Failures are logged rather than raised.
        """
        if languages is None:
            # Build a fresh list per call instead of sharing a mutable default.
            languages = list(self._DEFAULT_LANGUAGES)

        video_id = self.extract_video_id(video_url)
        if not video_id:
            logger.error(f"Could not extract video ID from URL: {video_url}")
            return None

        try:
            logger.info(f"Attempting to fetch transcript for video: {video_id}")
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            transcript = self._select_transcript(transcript_list, languages)
            data = transcript.fetch()
            text = " ".join(self._entry_text(entry) for entry in data)
            logger.info(
                f"Successfully fetched transcript for {video_id} ({len(text)} chars)"
            )
            return text
        except Exception as e:
            # Best-effort service boundary: report the failure and return None
            # rather than propagating the error to the caller.
            logger.warning(f"Could not fetch transcript for {video_id}: {e}")
            return None