Spaces:
Sleeping
Sleeping
File size: 5,399 Bytes
3107242 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
def fetch_youtube_transcripts(max_videos=None):
    """Collect Arabic subtitles from the channel's live streams into a CSV.

    The function lists the entries of ``CHANNEL_URL``, skips titles that
    contain one of ``SKIP_WORDS``, fetches the full metadata of each
    remaining video to obtain its upload date, keeps the videos whose date
    falls inside ``[START_DATE, END_DATE]``, downloads their subtitle track
    with yt-dlp, strips the WebVTT markup and finally writes one row per
    transcribed video to ``RAW_CSV``.

    Args:
        max_videos: Maximum number of videos *with a transcript* to keep.
            ``None`` (the default) means no limit.

    Returns:
        None. The result is written to ``RAW_CSV`` (UTF-8 with BOM) with the
        columns ``id``, ``titre``, ``date``, ``langue``, ``sous-titre`` and
        ``lien``.
    """
    # Imports are kept function-local, as in the original implementation.
    import glob
    import os
    import re
    import tempfile
    from datetime import datetime

    import pandas as pd
    import yt_dlp

    from src.configs.config import RAW_CSV

    # Configuration
    CHANNEL_URL = "https://www.youtube.com/@ParlementMa/streams"
    START_DATE = datetime(2025, 7, 21)
    END_DATE = datetime.today()
    # Titles containing any of these words are skipped (logged as
    # "langue exclue", i.e. excluded language).
    SKIP_WORDS = ['تشلحيت', 'تمزيغت', 'تريفيت']
    LANGUAGE = 'ar'
    # Fixed column order so the CSV always has a header, even with no rows.
    CSV_COLUMNS = ["id", "titre", "date", "langue", "sous-titre", "lien"]

    # Create output directory if needed
    output_dir = os.path.dirname(RAW_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def get_video_list(channel_url):
        """Return the flat list of playlist entries found at *channel_url*."""
        print(f"-> Extraction des vidéos depuis : {channel_url}")
        ydl_opts = {
            "quiet": True,
            "extract_flat": True,
            "force_generic_extractor": True
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(channel_url, download=False)
        # Tolerate a missing result or a lazy iterable: len() is needed below.
        entries = list((info or {}).get("entries") or [])
        print(f"-> {len(entries)} vidéos extraites du flux.")
        return entries

    def clean_caption(text):
        """Strip WebVTT headers, cue timings and inline tags from *text*."""
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'Kind:.*\n?', '', text)
        text = re.sub(r'Language:.*\n?', '', text)
        text = re.sub(r'WEBVTT.*\n?', '', text)
        text = re.sub(r'\d+:\d+:\d+\.\d+ --> .*', '', text)
        text = re.sub(r'\d+:\d+\.\d+ --> .*', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def get_transcript_text(video_id):
        """Download the subtitle track of *video_id* and return it as text.

        Returns ``None`` when the download fails, when no ``.vtt`` file is
        produced, or when the file cannot be read.
        """
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        # A private temporary directory guarantees the subtitle files are
        # removed afterwards and that a stale file from a previous run is
        # never mistaken for a fresh download.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # '%' is the template marker in yt-dlp output templates, so a
            # literal '%' in the directory path must be doubled.
            safe_dir = tmp_dir.replace("%", "%%")
            ydl_opts = {
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': [LANGUAGE],
                'skip_download': True,
                'outtmpl': os.path.join(safe_dir, f"{video_id}.%(ext)s"),
                'quiet': True,
            }
            try:
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([video_url])
            except Exception as e:
                print(f"[!] yt-dlp download error: {e}")
                return None

            # Look for any VTT subtitle file (auto or manual); sorted so the
            # choice is deterministic if several are present.
            vtt_files = sorted(
                glob.glob(os.path.join(glob.escape(tmp_dir), "*.vtt"))
            )
            if not vtt_files:
                print(f"[x] No Arabic subtitle found for: {video_id}")
                return None

            try:
                with open(vtt_files[0], 'r', encoding='utf-8') as f:
                    raw_text = f.read()
                return clean_caption(raw_text)
            except Exception as e:
                print(f"[!] Failed to read/clean subtitle: {e}")
                return None

    def should_skip_title(title):
        """Return True when *title* contains one of ``SKIP_WORDS``."""
        if any(word in title for word in SKIP_WORDS):
            print(f"-> Ignorée (langue exclue) : {title}")
            return True
        return False

    print("\n==== LANCEMENT DU SCRIPT ====\n")
    videos = get_video_list(CHANNEL_URL)
    print(f"\n-> Début du traitement des vidéos dans la plage {START_DATE.date()} ➜ {END_DATE.date()}")

    data = []
    video_count = 1  # 1-based id of the next transcript to be stored

    for idx, vid in enumerate(videos, start=1):
        if max_videos is not None and video_count > max_videos:
            print(f"\n✅ Limite de {max_videos} vidéos atteinte.")
            break

        video_id = vid.get("id") if vid else None
        if not video_id:
            # Without an id no valid watch URL can be built.
            print(f"\n[{idx}/{len(videos)}] [!] Entrée sans identifiant vidéo — ignorée.")
            continue

        title = vid.get("title") or ""
        url = f"https://www.youtube.com/watch?v={video_id}"
        print(f"\n[{idx}/{len(videos)}] Traitement de la vidéo : {title}")

        if should_skip_title(title):
            continue

        # Best-effort per video: any failure is logged and the scan goes on.
        try:
            # The flat listing carries no upload date, so the full metadata
            # of the video has to be fetched.
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                full_info = ydl.extract_info(url, download=False)

            upload_date_str = full_info.get("upload_date")
            if not upload_date_str:
                print(f" [!] Date non trouvée pour : {title}")
                continue

            upload_date = datetime.strptime(upload_date_str, "%Y%m%d")

            if upload_date > END_DATE:
                # Newer than the window: skip it but keep scanning, older
                # (in-range) videos may still follow.
                print(f" [->] Vidéo postérieure à la période ➜ {upload_date.date()} — ignorée.")
                continue
            if upload_date < START_DATE:
                # NOTE(review): stopping here assumes the listing is ordered
                # newest-first, as the original implementation did.
                print(f" [<-] Vidéo hors période ➜ {upload_date.date()} — arrêt du traitement.")
                break

            transcript = get_transcript_text(video_id)
            if transcript:
                data.append({
                    "id": video_count,
                    "titre": title,
                    "date": upload_date.strftime("%Y-%m-%d"),
                    "langue": LANGUAGE,
                    "sous-titre": transcript,
                    "lien": url
                })
                print(f" [v] Transcription ajoutée (ID: {video_count})")
                video_count += 1
            else:
                print(" [x] Transcription non disponible ou vide.")
        except Exception as e:
            print(f" [!] Erreur pendant le traitement de la vidéo {video_id}: {e}")

    print(f"\n-> Génération du fichier CSV: {RAW_CSV}")
    # Explicit columns keep the header row even when no transcript was found.
    df = pd.DataFrame(data, columns=CSV_COLUMNS)
    df.to_csv(RAW_CSV, index=False, encoding='utf-8-sig')
    print(f"\n Enregistré : {len(df)} vidéos avec transcription dans '{RAW_CSV}'")
    print("\n==== FIN DU SCRIPT ====\n")
|