# Source: Hugging Face Space "atoye1/518_yt_monitor" (Space status: Sleeping).
# File size: 6,261 bytes.

import json
import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List

from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    YouTubeTranscriptApi,
)

# Logging setup: module-level logger plus root configuration
# (INFO level, "timestamp - level - message" record layout).
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


def load_existing_transcripts(file_path: Path) -> Dict:
    """Load previously collected transcript data from a JSON file.

    Args:
        file_path: Location of the JSON file. A plain string path is
            accepted as well and converted to ``Path``.

    Returns:
        The parsed top-level JSON object. A fresh ``{"transcripts": []}``
        is returned instead when the file does not exist, cannot be read or
        parsed, or when its top-level value is not a JSON object.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        return {"transcripts": []}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        # Best effort: an unreadable cache must not abort the caller.
        logger.error(f"트랜스크립트 파일 로드 실패: {e}")
        return {"transcripts": []}
    if not isinstance(data, dict):
        # Valid JSON but the wrong shape (e.g. a bare list); callers use
        # dict methods on the result, so fall back to the empty structure.
        logger.error(
            f"트랜스크립트 파일 로드 실패: 최상위 JSON 값이 객체가 아닙니다 ({type(data).__name__})"
        )
        return {"transcripts": []}
    return data


def load_video_info(videos_file: str) -> List[Dict]:
    """Read the video list from a JSON file.

    Args:
        videos_file: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``"videos"`` key, or ``[]``
        when that key is absent. Any exception raised while opening,
        parsing or querying the file is logged and also yields ``[]``.
    """
    try:
        with open(videos_file, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        videos = payload.get("videos", [])
    except Exception as exc:
        logger.error(f"비디오 파일 로드 실패: {str(exc)}")
        videos = []
    return videos


def fetch_transcript(video_id: str, max_retries: int, retry_delay: int) -> Dict:
    """Fetch the transcript of a single video, retrying on unexpected errors.

    Calls ``YouTubeTranscriptApi.get_transcript`` with the language codes
    ``["ko", "en"]``.

    Args:
        video_id: Identifier of the video passed to the transcript API.
        max_retries: Total number of attempts. Values below 1 are treated
            as 1 so the API is always called at least once.
        retry_delay: Base wait in seconds; the wait before the next attempt
            is ``retry_delay * attempt_number`` (linear back-off). Negative
            values are treated as 0.

    Returns:
        Always a dict with two keys:
        ``"transcript_segments"`` - the value returned by the API, or
        ``None`` on failure; ``"error"`` - ``None`` on success, otherwise a
        message describing why no transcript was obtained.
    """
    # Guarantee at least one attempt; otherwise the loop body would never
    # run and the function would fall through returning None.
    attempts = max(1, max_retries)
    last_error = None

    for attempt in range(attempts):
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(
                video_id, languages=["ko", "en"]
            )
            return {"transcript_segments": transcript_list, "error": None}
        except (TranscriptsDisabled, NoTranscriptFound) as e:
            # Permanent condition: retrying cannot produce a transcript.
            return {
                "transcript_segments": None,
                "error": f"트랜스크립트 없음: {str(e)}",
            }
        except Exception as e:
            last_error = e
            if attempt < attempts - 1:
                # time.sleep rejects negative values, so clamp the wait.
                wait_time = max(0, retry_delay * (attempt + 1))
                logger.warning(
                    f"오류 발생 (재시도 {attempt + 1}/{max_retries}), {wait_time}초 후 재시도..."
                )
                time.sleep(wait_time)

    # Every attempt failed with an unexpected error.
    return {
        "transcript_segments": None,
        "error": f"최대 재시도 횟수 초과: {str(last_error)}",
    }


def save_transcripts_to_file(transcripts: Dict, output_file: Path):
    """Write ``transcripts`` to ``output_file`` as indented UTF-8 JSON.

    The data is first serialized to a sibling ``<name>.tmp`` file and then
    moved over the target, so a failure during serialization or writing
    leaves an existing ``output_file`` untouched instead of truncated.

    Args:
        transcripts: JSON-serializable payload. Callers in this file pass a
            dict, but anything ``json.dump`` accepts will work.
        output_file: Destination path; a plain string path is also accepted.

    Returns:
        None. Errors are logged rather than raised (best-effort save).
    """
    target = Path(output_file)
    # Keep the temp file next to the target so the final rename stays on
    # the same filesystem.
    tmp_file = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(transcripts, f, ensure_ascii=False, indent=2)
        tmp_file.replace(target)
        logger.info(f"\n결과 저장 완료: {output_file}")
    except Exception as e:
        logger.error(f"결과 파일 저장 중 오류 발생: {str(e)}")
        # Best-effort cleanup of the partial temp file; it may not exist
        # if opening it was what failed.
        try:
            tmp_file.unlink()
        except OSError:
            pass


def collect_video_transcripts(
    max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
) -> List[Dict]:
    """
    Collect transcripts for the videos listed in ``videos_file``.

    Previously collected transcripts are read from
    ``data/transcripts_cache.json`` and videos whose id is already present
    are skipped. New transcripts are appended to that list; the cache file
    is rewritten every 50 input entries and once more when the loop ends
    (also when it ends through an exception), so progress is not lost. A
    timestamped summary ``data/transcripts_<YYYYmmdd_HHMMSS>.json`` with
    all transcripts and all failures is written at the end.

    Args:
        max_retries: Maximum number of attempts per video when the API
            call fails.
        retry_delay: Base wait between attempts, in seconds.
        videos_file: Path of the JSON file holding the video information.

    Returns:
        The list of all transcript records (cached and newly collected).
    """
    output_dir = Path("data")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "transcripts_cache.json"

    # Tolerate a cache whose shape is not {"transcripts": [...]}.
    cached = load_existing_transcripts(output_file)
    all_transcripts = cached.get("transcripts") if isinstance(cached, dict) else None
    if not isinstance(all_transcripts, list):
        all_transcripts = []

    # Load the video information ("videos": null would otherwise break len()).
    videos = load_video_info(videos_file) or []

    # Failures of this run, reported in the summary file and in the log.
    failed_videos = []

    # Ids that already have a transcript; malformed cache entries are ignored.
    collected_video_ids = {
        entry["video_id"]
        for entry in all_transcripts
        if isinstance(entry, dict) and "video_id" in entry
    }

    total_videos = len(videos)
    newly_collected = 0
    try:
        for idx, video in enumerate(videos, 1):
            video_id = video["video_id"]

            # Skip videos that were collected earlier (or earlier in this run).
            if video_id in collected_video_ids:
                logger.info(f"\n[{idx}/{total_videos}] 이미 수집된 비디오: {video_id} - {video['title']}")
                continue

            logger.info(
                f"\n[{idx}/{total_videos}] 트랜스크립트 수집 시도: {video_id} - {video['title']}"
            )

            result = fetch_transcript(video_id, max_retries, retry_delay)
            transcript_segments = result["transcript_segments"]
            error_message = result["error"]

            if transcript_segments:
                transcript_info = {
                    "video_id": video_id,
                    "channel_id": video["channel_id"],
                    "channel_handle": video["channel_handle"],
                    "title": video["title"],
                    "transcript_segments": transcript_segments,
                    "collected_at": datetime.now().isoformat(),
                }
                all_transcripts.append(transcript_info)
                # Remember the id so a duplicate entry later in the input
                # is not fetched and stored a second time.
                collected_video_ids.add(video_id)
                newly_collected += 1
                logger.info("트랜스크립트 수집 성공")
            else:
                failed_videos.append(
                    {
                        "video_id": video_id,
                        "channel_handle": video["channel_handle"],
                        "title": video["title"],
                        "error": error_message,
                    }
                )
                logger.warning(f"트랜스크립트 수집 실패: {error_message}")

            # Intermediate save every 50 entries.
            if idx % 50 == 0:
                save_transcripts_to_file({"transcripts": all_transcripts}, output_file)

            # Short pause between API calls to avoid hammering the service.
            time.sleep(0.2)
    finally:
        # Persist the cache for the tail of the run (and for runs shorter
        # than 50 entries), even if the loop was interrupted.
        if newly_collected:
            save_transcripts_to_file({"transcripts": all_transcripts}, output_file)

    # Write the final timestamped summary.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_output_file = output_dir / f"transcripts_{timestamp}.json"
    summary = {
        "collected_at": datetime.now().isoformat(),
        "total_videos": total_videos,
        "successful_videos": len(all_transcripts),
        "failed_videos": len(failed_videos),
        "transcripts": all_transcripts,
        "failures": failed_videos,
    }
    save_transcripts_to_file(summary, final_output_file)

    if failed_videos:
        logger.warning("\n실패한 비디오들:")
        for fail in failed_videos:
            logger.warning(
                f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
            )

    return all_transcripts


# Script entry point: run one collection pass with the default arguments.
if __name__ == "__main__":
    collect_video_transcripts()