def fetch_youtube_transcripts(max_videos=None):
    """Scrape Arabic subtitles from the ParlementMa YouTube streams channel.

    Walks the channel's stream list (newest first), keeps videos uploaded
    between START_DATE and END_DATE whose title does not contain an excluded
    Amazigh-language keyword, downloads their Arabic (auto or manual) VTT
    subtitles via yt-dlp, cleans them to plain text, and writes the result
    to the RAW_CSV path as a UTF-8-sig CSV.

    Parameters
    ----------
    max_videos : int | None
        Stop after this many videos with a usable transcript have been
        collected; None means no limit.

    Side effects: network access through yt-dlp, temporary subtitle files
    (removed after use), and the CSV written to RAW_CSV. Returns None.
    """
    import glob
    import os
    import re
    import tempfile
    from datetime import datetime

    import pandas as pd
    import yt_dlp

    from src.configs.config import RAW_CSV

    # --- Configuration -----------------------------------------------------
    CHANNEL_URL = "https://www.youtube.com/@ParlementMa/streams"
    START_DATE = datetime(2025, 7, 21)
    END_DATE = datetime.today()
    # Titles containing these words are streams in Amazigh variants, not Arabic.
    SKIP_WORDS = ['تشلحيت', 'تمزيغت', 'تريفيت']
    LANGUAGE = 'ar'

    # Make sure the CSV's parent directory exists before writing.
    output_dir = os.path.dirname(RAW_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def get_video_list(channel_url):
        """Return the flat list of video entries for the channel's stream tab."""
        print(f"-> Extraction des vidéos depuis : {channel_url}")
        ydl_opts = {
            "quiet": True,
            # Flat extraction: list entries only, no per-video metadata fetch.
            "extract_flat": True,
            "force_generic_extractor": True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(channel_url, download=False)
        entries = info.get("entries", [])
        print(f"-> {len(entries)} vidéos extraites du flux.")
        return entries

    def clean_caption(text):
        """Strip VTT headers, cue timings, and inline tags; collapse whitespace."""
        text = re.sub(r'<[^>]+>', '', text)                     # inline <c>/<i> tags
        text = re.sub(r'Kind:.*\n?', '', text)                  # VTT header lines
        text = re.sub(r'Language:.*\n?', '', text)
        text = re.sub(r'WEBVTT.*\n?', '', text)
        text = re.sub(r'\d+:\d+:\d+\.\d+ --> .*', '', text)     # hh:mm:ss cues
        text = re.sub(r'\d+:\d+\.\d+ --> .*', '', text)         # mm:ss cues
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def get_transcript_text(video_id):
        """Download the Arabic subtitle track for *video_id*; return cleaned text or None."""
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        # Portability fix: use the platform temp dir instead of hard-coded /tmp.
        tmp_dir = tempfile.gettempdir()
        output_path = os.path.join(tmp_dir, f"{video_id}.%(ext)s")
        ydl_opts = {
            'writesubtitles': True,       # manual subtitles, if present
            'writeautomaticsub': True,    # fall back to auto-generated ones
            'subtitleslangs': [LANGUAGE],
            'skip_download': True,        # subtitles only, never the video
            'outtmpl': output_path,
            'quiet': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([video_url])
            except Exception as e:
                print(f"[!] yt-dlp download error: {e}")
                return None

        # Any Arabic VTT file counts, whether auto-generated or manual.
        vtt_files = glob.glob(os.path.join(tmp_dir, f"{video_id}.*.vtt"))
        if not vtt_files:
            print(f"[x] No Arabic subtitle found for: {video_id}")
            return None
        try:
            with open(vtt_files[0], 'r', encoding='utf-8') as f:
                raw_text = f.read()
            return clean_caption(raw_text)
        except Exception as e:
            print(f"[!] Failed to read/clean subtitle: {e}")
            return None
        finally:
            # Don't litter the temp dir: remove the downloaded subtitle files.
            for path in vtt_files:
                try:
                    os.remove(path)
                except OSError:
                    pass

    def should_skip_title(title):
        """True when the title mentions an excluded (non-Arabic) language."""
        if any(word in title for word in SKIP_WORDS):
            print(f"-> Ignorée (langue exclue) : {title}")
            return True
        return False

    print("\n==== LANCEMENT DU SCRIPT ====\n")
    videos = get_video_list(CHANNEL_URL)
    print(f"\n-> Début du traitement des vidéos dans la plage "
          f"{START_DATE.date()} ➜ {END_DATE.date()}")

    data = []
    video_count = 1  # sequential ID; only incremented on a kept transcript

    for idx, vid in enumerate(videos, start=1):
        if max_videos is not None and video_count > max_videos:
            print(f"\n✅ Limite de {max_videos} vidéos atteinte.")
            break

        video_id = vid.get("id")
        title = vid.get("title") or ""
        url = f"https://www.youtube.com/watch?v={video_id}"
        print(f"\n[{idx}/{len(videos)}] Traitement de la vidéo : {title}")

        if should_skip_title(title):
            continue

        try:
            # Flat extraction has no upload date; fetch full metadata per video.
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                full_info = ydl.extract_info(url, download=False)

            upload_date_str = full_info.get("upload_date")
            if not upload_date_str:
                print(f"    [!] Date non trouvée pour : {title}")
                continue

            upload_date = datetime.strptime(upload_date_str, "%Y%m%d")
            if not (START_DATE <= upload_date <= END_DATE):
                # Stream list is newest-first, so the first out-of-range date
                # means everything after it is older too — stop entirely.
                print(f"    [<-] Vidéo hors période ➜ {upload_date.date()} — arrêt du traitement.")
                break

            transcript = get_transcript_text(video_id)
            if transcript:
                data.append({
                    "id": video_count,
                    "titre": title,
                    "date": upload_date.strftime("%Y-%m-%d"),
                    "langue": LANGUAGE,
                    "sous-titre": transcript,
                    "lien": url,
                })
                print(f"    [v] Transcription ajoutée (ID: {video_count})")
                video_count += 1
            else:
                print(f"    [x] Transcription non disponible ou vide.")
        except Exception as e:
            # Best-effort scraper: log and move on to the next video.
            print(f"    [!] Erreur pendant le traitement de la vidéo {video_id}: {e}")

    print(f"\n-> Génération du fichier CSV: {RAW_CSV}")
    df = pd.DataFrame(data)
    # utf-8-sig so Excel opens the Arabic text correctly.
    df.to_csv(RAW_CSV, index=False, encoding='utf-8-sig')
    print(f"\n Enregistré : {len(df)} vidéos avec transcription dans '{RAW_CSV}'")
    print("\n==== FIN DU SCRIPT ====\n")