# Spaces: HOLOKIATEAM / RAG_APP (status: Sleeping)
# File size: 5,399 Bytes


def fetch_youtube_transcripts(max_videos=None):
    """Collect Arabic subtitles of the channel's streams and write them to RAW_CSV.

    The function lists the videos of ``CHANNEL_URL``, skips the ones whose
    title contains one of ``SKIP_WORDS``, keeps those uploaded between
    ``START_DATE`` and today, downloads their Arabic subtitles (manual or
    automatic) with yt-dlp, strips the WebVTT markup and stores one row per
    video in the CSV file designated by ``RAW_CSV``.

    Args:
        max_videos: Maximum number of videos *with a transcript* to keep.
            ``None`` (default) means no limit.

    Returns:
        None. The result is written to ``RAW_CSV``; the header row is written
        even when no transcript was collected.
    """
    import os
    import re
    import tempfile
    from datetime import datetime

    import pandas as pd
    import yt_dlp

    from src.configs.config import RAW_CSV

    # Configuration
    CHANNEL_URL = "https://www.youtube.com/@ParlementMa/streams"
    START_DATE = datetime(2025, 7, 21)
    END_DATE = datetime.today()
    SKIP_WORDS = ['تشلحيت', 'تمزيغت', 'تريفيت']
    LANGUAGE = 'ar'
    # Fixed column order so that an empty run still produces a valid header.
    CSV_COLUMNS = ["id", "titre", "date", "langue", "sous-titre", "lien"]

    # Create output directory if needed
    output_dir = os.path.dirname(RAW_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def get_video_list(channel_url):
        """Return the flat list of video entries exposed by ``channel_url``."""
        print(f"-> Extraction des vidéos depuis : {channel_url}")
        # NOTE(review): "force_generic_extractor" is kept from the original
        # options; confirm it is really wanted for a YouTube channel URL.
        ydl_opts = {
            "quiet": True,
            "extract_flat": True,
            "force_generic_extractor": True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(channel_url, download=False)
        # Guard against a missing result, a missing "entries" key and empty
        # entries, so the main loop can call .get() on every element.
        entries = [entry for entry in ((info or {}).get("entries") or []) if entry]
        print(f"-> {len(entries)} vidéos extraites du flux.")
        return entries

    def clean_caption(text):
        """Strip WebVTT headers, inline tags and cue timings from ``text``."""
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'Kind:.*\n?', '', text)
        text = re.sub(r'Language:.*\n?', '', text)
        text = re.sub(r'WEBVTT.*\n?', '', text)
        text = re.sub(r'\d+:\d+:\d+\.\d+ --> .*', '', text)
        text = re.sub(r'\d+:\d+\.\d+ --> .*', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def get_transcript_text(video_id):
        """Download the subtitles of ``video_id`` and return them as plain text.

        Returns ``None`` when the download fails, when no ``.vtt`` file was
        produced, or when the file cannot be read.
        """
        video_url = f"https://www.youtube.com/watch?v={video_id}"

        # A private temporary directory avoids picking up stale subtitle files
        # from a previous run and guarantees the files are removed afterwards.
        with tempfile.TemporaryDirectory(prefix="yt_subs_") as tmp_dir:
            ydl_opts = {
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': [LANGUAGE],
                'skip_download': True,
                'outtmpl': os.path.join(tmp_dir, f"{video_id}.%(ext)s"),
                'quiet': True,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl.download([video_url])
                except Exception as e:  # best effort: one bad video must not stop the run
                    print(f"[!] yt-dlp download error: {e}")
                    return None

            # Look for any Arabic VTT subtitle file (auto or manual)
            vtt_files = sorted(
                name for name in os.listdir(tmp_dir) if name.endswith(".vtt")
            )
            if not vtt_files:
                print(f"[x] No Arabic subtitle found for: {video_id}")
                return None

            try:
                with open(os.path.join(tmp_dir, vtt_files[0]), 'r', encoding='utf-8') as f:
                    raw_text = f.read()
            except (OSError, UnicodeError) as e:
                print(f"[!] Failed to read/clean subtitle: {e}")
                return None

        return clean_caption(raw_text)

    def should_skip_title(title):
        """Tell whether ``title`` contains one of the excluded-language keywords."""
        if any(word in title for word in SKIP_WORDS):
            print(f"-> Ignorée (langue exclue) : {title}")
            return True
        return False

    print("\n==== LANCEMENT DU SCRIPT ====\n")
    videos = get_video_list(CHANNEL_URL)
    print(f"\n-> Début du traitement des vidéos dans la plage {START_DATE.date()} ➜ {END_DATE.date()}")

    data = []
    video_count = 1  # 1-based id of the next row to be written

    for idx, vid in enumerate(videos, start=1):
        if max_videos is not None and video_count > max_videos:
            print(f"\n✅ Limite de {max_videos} vidéos atteinte.")
            break
        video_id = vid.get("id")
        title = vid.get("title") or ""

        print(f"\n[{idx}/{len(videos)}] Traitement de la vidéo : {title}")

        if not video_id:
            # Without an id no valid watch URL can be built.
            print("   [!] Identifiant manquant — vidéo ignorée.")
            continue
        url = f"https://www.youtube.com/watch?v={video_id}"

        if should_skip_title(title):
            continue

        try:
            # The flat listing carries no upload date, so fetch full metadata.
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                full_info = ydl.extract_info(url, download=False)

            upload_date_str = (full_info or {}).get("upload_date")
            if not upload_date_str:
                print(f"   [!] Date non trouvée pour : {title}")
                continue

            upload_date = datetime.strptime(upload_date_str, "%Y%m%d")

            if upload_date > END_DATE:
                # A video newer than the window (e.g. UTC date ahead of the
                # local date) must not stop the scan: older ones may follow.
                print(f"   [->] Vidéo postérieure à la période ➜ {upload_date.date()} — ignorée.")
                continue

            if upload_date < START_DATE:
                # Stops at the first video older than the window; this relies
                # on the listing being ordered newest first — TODO confirm.
                print(f"   [<-] Vidéo hors période ➜ {upload_date.date()} — arrêt du traitement.")
                break

            transcript = get_transcript_text(video_id)
            if transcript:
                data.append({
                    "id": video_count,
                    "titre": title,
                    "date": upload_date.strftime("%Y-%m-%d"),
                    "langue": LANGUAGE,
                    "sous-titre": transcript,
                    "lien": url,
                })
                print(f"   [v] Transcription ajoutée (ID: {video_count})")
                video_count += 1
            else:
                print("   [x] Transcription non disponible ou vide.")

        except Exception as e:  # keep going: a failing video must not abort the batch
            print(f"   [!] Erreur pendant le traitement de la vidéo {video_id}: {e}")

    print(f"\n-> Génération du fichier CSV: {RAW_CSV}")
    df = pd.DataFrame(data, columns=CSV_COLUMNS)
    df.to_csv(RAW_CSV, index=False, encoding='utf-8-sig')
    print(f"\n Enregistré : {len(df)} vidéos avec transcription dans '{RAW_CSV}'")
    print("\n==== FIN DU SCRIPT ====\n")