Spaces:
Sleeping
Sleeping
def fetch_youtube_transcripts(max_videos=None):
    """Collect Arabic subtitles from the configured YouTube channel into a CSV.

    The function lists the entries of ``CHANNEL_URL``, skips titles that
    contain one of ``SKIP_WORDS``, fetches each remaining video's metadata
    to read its upload date, downloads the Arabic subtitle track (manual or
    automatic) with yt-dlp, strips the WebVTT markup and finally writes one
    row per transcribed video to ``RAW_CSV``.

    Args:
        max_videos: Maximum number of videos *with a transcript* to store.
            ``None`` (default) means no limit.

    Returns:
        None. The result is written to ``RAW_CSV`` (columns: id, titre,
        date, langue, sous-titre, lien). The header row is written even
        when no transcript was collected.

    Raises:
        Whatever yt-dlp raises while listing the channel; per-video errors
        are logged and do not abort the run.
    """
    # Imports are kept function-local, as in the original.
    import glob
    import os
    import re
    import tempfile
    from datetime import datetime

    import pandas as pd
    import yt_dlp

    from src.configs.config import RAW_CSV

    # Configuration
    CHANNEL_URL = "https://www.youtube.com/@ParlementMa/streams"
    START_DATE = datetime(2025, 7, 21)
    END_DATE = datetime.today()
    SKIP_WORDS = ['تشلحيت', 'تمزيغت', 'تريفيت']
    LANGUAGE = 'ar'
    CSV_COLUMNS = ["id", "titre", "date", "langue", "sous-titre", "lien"]

    # Compiled once instead of on every caption file.
    TAG_RE = re.compile(r'<[^>]+>')
    # Anchored to line start so caption text that merely contains one of
    # these words is not truncated.
    HEADER_RE = re.compile(r'^(?:WEBVTT|Kind:|Language:).*\n?', re.MULTILINE)
    TIMING_RE = re.compile(r'(?:\d+:)?\d+:\d+\.\d+ --> .*')
    WHITESPACE_RE = re.compile(r'\s+')

    # Create output directory if needed
    output_dir = os.path.dirname(RAW_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def get_video_list(channel_url):
        """Return the flat list of entries yt-dlp reports for ``channel_url``."""
        print(f"-> Extraction des vidéos depuis : {channel_url}")
        ydl_opts = {
            "quiet": True,
            "extract_flat": True,
            "force_generic_extractor": True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(channel_url, download=False)
        # Tolerate a missing/None result and drop empty entries.
        entries = [entry for entry in ((info or {}).get("entries") or []) if entry]
        print(f"-> {len(entries)} vidéos extraites du flux.")
        return entries

    def clean_caption(text):
        """Strip WebVTT tags, header lines and cue timings; collapse whitespace."""
        text = TAG_RE.sub('', text)
        text = HEADER_RE.sub('', text)
        text = TIMING_RE.sub('', text)
        text = WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def get_transcript_text(video_id):
        """Download the subtitle track of ``video_id`` and return it as plain text.

        Returns ``None`` when the download fails, no ``.vtt`` file is
        produced, or the file cannot be read.
        """
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        # A private temporary directory guarantees that (a) the subtitle file
        # is removed afterwards and (b) a stale file left in /tmp by an
        # earlier run can never be mistaken for this download.
        with tempfile.TemporaryDirectory(prefix="yt_subs_") as tmp_dir:
            ydl_opts = {
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': [LANGUAGE],
                'subtitlesformat': 'vtt',
                'skip_download': True,
                # '%' is the output-template escape character in yt-dlp.
                'outtmpl': os.path.join(
                    tmp_dir.replace('%', '%%'), f"{video_id}.%(ext)s"
                ),
                'quiet': True,
            }
            try:
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([video_url])
            except Exception as e:
                print(f"[!] yt-dlp download error: {e}")
                return None

            # Look for any Arabic VTT subtitle file (auto or manual)
            vtt_files = sorted(
                glob.glob(os.path.join(glob.escape(tmp_dir), "*.vtt"))
            )
            if not vtt_files:
                print(f"[x] No Arabic subtitle found for: {video_id}")
                return None

            try:
                # utf-8-sig transparently drops a BOM if one is present.
                with open(vtt_files[0], 'r', encoding='utf-8-sig') as f:
                    raw_text = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"[!] Failed to read/clean subtitle: {e}")
                return None

        return clean_caption(raw_text)

    def should_skip_title(title):
        """Return True (and log it) when ``title`` contains one of SKIP_WORDS."""
        if any(word in title for word in SKIP_WORDS):
            print(f"-> Ignorée (langue exclue) : {title}")
            return True
        return False

    print("\n==== LANCEMENT DU SCRIPT ====\n")
    videos = get_video_list(CHANNEL_URL)
    print(f"\n-> Début du traitement des vidéos dans la plage {START_DATE.date()} ➜ {END_DATE.date()}")

    data = []
    video_count = 1  # 1-based id of the next stored transcript

    for idx, vid in enumerate(videos, start=1):
        if max_videos is not None and video_count > max_videos:
            print(f"\n✅ Limite de {max_videos} vidéos atteinte.")
            break

        video_id = vid.get("id")
        title = vid.get("title") or ""
        print(f"\n[{idx}/{len(videos)}] Traitement de la vidéo : {title}")

        if not video_id:
            # Without an id the watch URL would be ".../watch?v=None".
            print(f" [!] Entrée sans identifiant ignorée : {title}")
            continue

        url = f"https://www.youtube.com/watch?v={video_id}"

        if should_skip_title(title):
            continue

        # Per-video boundary: one failing video must not abort the whole run.
        try:
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                full_info = ydl.extract_info(url, download=False)

            upload_date_str = full_info.get("upload_date")
            if not upload_date_str:
                print(f" [!] Date non trouvée pour : {title}")
                continue

            upload_date = datetime.strptime(upload_date_str, "%Y%m%d")

            if upload_date < START_DATE:
                # Stop at the first video older than the window (the loop
                # assumes the listing is newest-first, as the original did).
                print(f" [<-] Vidéo hors période ➜ {upload_date.date()} — arrêt du traitement.")
                break
            if upload_date > END_DATE:
                # A video dated after END_DATE must be skipped, not treated
                # as the end of the window, otherwise the scan stops early.
                print(f" [->] Vidéo postérieure à la période ➜ {upload_date.date()} — ignorée.")
                continue

            transcript = get_transcript_text(video_id)
            if transcript:
                data.append({
                    "id": video_count,
                    "titre": title,
                    "date": upload_date.strftime("%Y-%m-%d"),
                    "langue": LANGUAGE,
                    "sous-titre": transcript,
                    "lien": url,
                })
                print(f" [v] Transcription ajoutée (ID: {video_count})")
                video_count += 1
            else:
                print(" [x] Transcription non disponible ou vide.")
        except Exception as e:
            print(f" [!] Erreur pendant le traitement de la vidéo {video_id}: {e}")

    print(f"\n-> Génération du fichier CSV: {RAW_CSV}")
    # Explicit columns keep the header row even when no transcript was found.
    df = pd.DataFrame(data, columns=CSV_COLUMNS)
    df.to_csv(RAW_CSV, index=False, encoding='utf-8-sig')
    print(f"\n Enregistré : {len(df)} vidéos avec transcription dans '{RAW_CSV}'")
    print("\n==== FIN DU SCRIPT ====\n")