AIdea-Server / src /transcription /transcript_fetcher.py
Ali Hashhash
chore: remove unused Dict import
b696f33
"""
Service for fetching YouTube transcripts directly without downloading audio.
Uses youtube-transcript-api for efficient text extraction.
"""
from typing import Optional, List
from youtube_transcript_api import YouTubeTranscriptApi
from src.utils.logger import setup_logger
import re
# Module-level logger obtained from the project's setup_logger helper,
# requested under this module's __name__.
logger = setup_logger(__name__)
class TranscriptFetcher:
    """Handles fetching transcripts from YouTube videos."""

    # Languages tried, in priority order, when the caller does not pass any.
    # Kept as an immutable tuple so no list object is shared between calls.
    DEFAULT_LANGUAGES = ("en", "ar")

    # Compiled once at class creation instead of on every call.
    # Recognized URL shapes: watch?v=, youtu.be/, /embed/, /v/, /shorts/,
    # /live/, and watch? URLs where v= is not the first query parameter.
    _VIDEO_ID_RE = re.compile(
        r"(?:youtube\.com/watch\?v="
        r"|youtu\.be/"
        r"|youtube\.com/embed/"
        r"|youtube\.com/v/"
        r"|youtube\.com/shorts/"
        r"|youtube\.com/live/"
        r"|youtube\.com/watch\?.*v=)"
        r"([a-zA-Z0-9_-]{11})"
    )

    @staticmethod
    def extract_video_id(url: str) -> Optional[str]:
        """Extract the 11-character video ID from a YouTube URL.

        Args:
            url: A YouTube URL in one of the shapes listed on
                ``_VIDEO_ID_RE``.

        Returns:
            The video ID, or None when ``url`` is empty/None or does not
            match any recognized shape.
        """
        if not url:
            # Empty string or None: nothing to search (avoids a TypeError).
            return None
        match = TranscriptFetcher._VIDEO_ID_RE.search(url)
        return match.group(1) if match else None

    @staticmethod
    def _list_transcripts(video_id: str):
        """Return the library's transcript listing for ``video_id``.

        Uses the static ``list_transcripts`` entry point when the installed
        youtube-transcript-api still provides it, otherwise the instance
        ``list`` method.
        NOTE(review): the fallback assumes the 1.x instance API -- confirm
        against the version pinned by the project.
        """
        legacy_list = getattr(YouTubeTranscriptApi, "list_transcripts", None)
        if legacy_list is not None:
            return legacy_list(video_id)
        return YouTubeTranscriptApi().list(video_id)

    @staticmethod
    def _select_transcript(transcript_list, languages):
        """Pick a transcript in ``languages``, manual before auto-generated.

        Raises whatever ``find_transcript`` raises when nothing matches.
        """
        for finder_name in (
            "find_manually_created_transcript",
            "find_generated_transcript",
        ):
            try:
                return getattr(transcript_list, finder_name)(languages)
            except Exception:
                # Best-effort lookup: try the next kind of transcript.
                continue
        # Last attempt across both kinds. This is still limited to the
        # requested languages; when it fails, its exception propagates to
        # the caller, which logs it.
        return transcript_list.find_transcript(languages)

    @staticmethod
    def _join_entries(entries) -> str:
        """Join the text of fetched transcript entries with single spaces.

        Accepts both dict entries (``entry["text"]``) and snippet objects
        exposing a ``text`` attribute, so either return shape of
        ``transcript.fetch()`` is handled.
        """
        return " ".join(
            entry["text"] if isinstance(entry, dict) else entry.text
            for entry in entries
        )

    def fetch_transcript(
        self, video_url: str, languages: Optional[List[str]] = None
    ) -> Optional[str]:
        """
        Attempt to fetch the transcript for a video.

        Args:
            video_url: YouTube URL of the video.
            languages: Language codes in priority order. Defaults to
                ``DEFAULT_LANGUAGES`` ("en", then "ar") when None.

        Returns:
            The combined transcript text if successful, None otherwise
            (unrecognized URL, no matching transcript, or any error raised
            while fetching -- failures are logged, not raised).
        """
        if languages is None:
            # Fresh list per call; avoids the shared mutable default pitfall.
            languages = list(self.DEFAULT_LANGUAGES)

        video_id = self.extract_video_id(video_url)
        if not video_id:
            logger.error(f"Could not extract video ID from URL: {video_url}")
            return None

        try:
            logger.info(f"Attempting to fetch transcript for video: {video_id}")
            transcript_list = self._list_transcripts(video_id)
            transcript = self._select_transcript(transcript_list, languages)
            text = self._join_entries(transcript.fetch())
            logger.info(
                f"Successfully fetched transcript for {video_id} ({len(text)} chars)"
            )
            return text
        except Exception as e:
            # Deliberate catch-all: a missing transcript must not crash the
            # caller, so the failure is logged and reported as None.
            logger.warning(f"Could not fetch transcript for {video_id}: {e}")
            return None