File size: 5,399 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def fetch_youtube_transcripts(max_videos=None):
    """Scrape Arabic subtitles from the channel's streams tab into a CSV file.

    The function lists the entries of ``CHANNEL_URL`` with yt-dlp, skips
    titles containing one of ``SKIP_WORDS``, resolves each video's upload
    date, downloads its Arabic subtitles (manual or automatic) as WebVTT,
    strips the markup and timing lines, and finally writes every collected
    transcript to ``RAW_CSV`` (UTF-8 with BOM).

    Args:
        max_videos: Maximum number of transcripts to collect. ``None``
            (the default) means no limit.

    Returns:
        None. The result is the CSV written to ``RAW_CSV``; progress is
        reported on stdout.
    """
    import glob
    import os
    import re
    import tempfile
    from datetime import datetime

    import pandas as pd
    import yt_dlp

    from src.configs.config import RAW_CSV

    # Configuration
    CHANNEL_URL = "https://www.youtube.com/@ParlementMa/streams"
    START_DATE = datetime(2025, 7, 21)
    END_DATE = datetime.today()
    SKIP_WORDS = ['تشلحيت', 'تمزيغت', 'تريفيت']
    LANGUAGE = 'ar'
    # Explicit column order so that the CSV keeps its header row even when
    # no transcript could be collected.
    CSV_COLUMNS = ["id", "titre", "date", "langue", "sous-titre", "lien"]

    # WebVTT noise removed from the raw subtitle file, applied in order:
    # inline tags, header fields, then both cue-timing formats.
    CAPTION_NOISE = tuple(re.compile(pattern) for pattern in (
        r'<[^>]+>',
        r'Kind:.*\n?',
        r'Language:.*\n?',
        r'WEBVTT.*\n?',
        r'\d+:\d+:\d+\.\d+ --> .*',
        r'\d+:\d+\.\d+ --> .*',
    ))
    WHITESPACE_RUN = re.compile(r'\s+')

    # Create output directory if needed
    output_dir = os.path.dirname(RAW_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def get_video_list(channel_url):
        """Return the flat list of entries yt-dlp reports for the channel."""
        print(f"-> Extraction des vidéos depuis : {channel_url}")
        ydl_opts = {
            "quiet": True,
            "extract_flat": True,
            "force_generic_extractor": True
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(channel_url, download=False)
        # extract_info may yield nothing usable; treat that as "no videos"
        # instead of crashing on None.
        entries = [entry for entry in ((info or {}).get("entries") or []) if entry]
        print(f"-> {len(entries)} vidéos extraites du flux.")
        return entries

    def clean_caption(text):
        """Strip WebVTT markup/timings and collapse whitespace."""
        for pattern in CAPTION_NOISE:
            text = pattern.sub('', text)
        return WHITESPACE_RUN.sub(' ', text).strip()

    def get_transcript_text(video_id):
        """Download the subtitles of one video and return them as plain text.

        Returns None when the download fails, when no ``.vtt`` file was
        produced, or when the file cannot be read.
        """
        video_url = f"https://www.youtube.com/watch?v={video_id}"

        # A private, self-cleaning directory: portable (no hard-coded /tmp),
        # leaves nothing behind, and can never pick up a stale subtitle file
        # written by a previous run.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # '%' is the template escape character in yt-dlp output templates.
            safe_dir = tmp_dir.replace('%', '%%')
            ydl_opts = {
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': [LANGUAGE],
                'skip_download': True,
                'outtmpl': os.path.join(safe_dir, f"{video_id}.%(ext)s"),
                'quiet': True,
            }

            try:
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([video_url])
            except Exception as e:  # best-effort: one bad video must not stop the run
                print(f"[!] yt-dlp download error: {e}")
                return None

            # Look for any Arabic VTT subtitle file (auto or manual)
            vtt_files = sorted(glob.glob(os.path.join(glob.escape(tmp_dir), "*.vtt")))
            if not vtt_files:
                print(f"[x] No Arabic subtitle found for: {video_id}")
                return None

            try:
                with open(vtt_files[0], 'r', encoding='utf-8') as f:
                    raw_text = f.read()
            except (OSError, UnicodeError) as e:
                print(f"[!] Failed to read/clean subtitle: {e}")
                return None

        return clean_caption(raw_text)

    def should_skip_title(title):
        """Return True (and log it) when the title contains an excluded keyword."""
        if any(word in title for word in SKIP_WORDS):
            print(f"-> Ignorée (langue exclue) : {title}")
            return True
        return False

    print("\n==== LANCEMENT DU SCRIPT ====\n")
    videos = get_video_list(CHANNEL_URL)
    print(f"\n-> Début du traitement des vidéos dans la plage {START_DATE.date()} ➜ {END_DATE.date()}")

    data = []
    video_count = 1  # 1-based id of the NEXT transcript to be stored

    # One metadata extractor for the whole run instead of one per video.
    with yt_dlp.YoutubeDL({'quiet': True}) as meta_ydl:
        for idx, vid in enumerate(videos, start=1):
            if max_videos is not None and video_count > max_videos:
                print(f"\n✅ Limite de {max_videos} vidéos atteinte.")
                break

            video_id = vid.get("id")
            title = vid.get("title") or ""

            print(f"\n[{idx}/{len(videos)}] Traitement de la vidéo : {title}")

            if not video_id:
                # Without an id the watch URL would be ".../watch?v=None".
                print(f"   [!] Identifiant manquant pour : {title}")
                continue

            url = f"https://www.youtube.com/watch?v={video_id}"

            if should_skip_title(title):
                continue

            try:
                full_info = meta_ydl.extract_info(url, download=False)

                upload_date_str = (full_info or {}).get("upload_date")
                if not upload_date_str:
                    print(f"   [!] Date non trouvée pour : {title}")
                    continue

                upload_date = datetime.strptime(upload_date_str, "%Y%m%d")

                if upload_date > END_DATE:
                    # A date ahead of the local clock (e.g. timezone offset
                    # between the reported date and this machine) is not a
                    # reason to abort: older, valid videos may still follow.
                    print(f"   [->] Vidéo postérieure à la période ➜ {upload_date.date()} — ignorée.")
                    continue

                if upload_date < START_DATE:
                    # NOTE(review): stopping here assumes the listing is
                    # ordered newest-first — confirm against the channel feed.
                    print(f"   [<-] Vidéo hors période ➜ {upload_date.date()} — arrêt du traitement.")
                    break

                transcript = get_transcript_text(video_id)
                if transcript:
                    data.append({
                        "id": video_count,
                        "titre": title,
                        "date": upload_date.strftime("%Y-%m-%d"),
                        "langue": LANGUAGE,
                        "sous-titre": transcript,
                        "lien": url
                    })
                    print(f"   [v] Transcription ajoutée (ID: {video_count})")
                    video_count += 1
                else:
                    print(f"   [x] Transcription non disponible ou vide.")

            except Exception as e:  # best-effort: log and move on to the next video
                print(f"   [!] Erreur pendant le traitement de la vidéo {video_id}: {e}")

    print(f"\n-> Génération du fichier CSV: {RAW_CSV}")
    df = pd.DataFrame(data, columns=CSV_COLUMNS)
    df.to_csv(RAW_CSV, index=False, encoding='utf-8-sig')
    print(f"\n Enregistré : {len(df)} vidéos avec transcription dans '{RAW_CSV}'")
    print("\n==== FIN DU SCRIPT ====\n")