# Source: Hugging Face Space "atoye1/518_yt_monitor" (status: Sleeping)
# File size: 4,589 bytes
# Revision: b9cc1a2

import json
import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

# Logging setup: emit INFO and above as "timestamp - level - message".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

def _load_videos(videos_file: str) -> List[Dict]:
    """Read the video list stored under the ``videos`` key of a JSON file.

    Args:
        videos_file: Path to a UTF-8 JSON file holding an object with a
            ``videos`` array.

    Returns:
        The ``videos`` array, or an empty list when the key is absent.

    Raises:
        OSError: The file cannot be opened or read.
        ValueError: The content is not valid JSON, the top-level value is not
            an object, or ``videos`` is not a list.
    """
    with open(videos_file, 'r', encoding='utf-8') as f:
        videos_data = json.load(f)

    if not isinstance(videos_data, dict):
        raise ValueError("최상위 JSON 값이 객체가 아닙니다")
    videos = videos_data.get('videos', [])
    if not isinstance(videos, list):
        raise ValueError("'videos' 항목이 리스트가 아닙니다")
    return videos


def _fetch_transcript(video_id: str, max_retries: int, retry_delay: int) -> tuple:
    """Fetch one video's transcript, retrying on unexpected errors.

    Args:
        video_id: YouTube video id to fetch.
        max_retries: Maximum number of attempts; values below 1 still result
            in a single attempt.
        retry_delay: Base delay in seconds; the wait grows linearly with the
            attempt number.

    Returns:
        A ``(segments, error_message)`` pair. Exactly one of the two is
        ``None``: ``segments`` on failure, ``error_message`` on success.
    """
    attempts = max(1, max_retries)
    last_error = None

    for attempt in range(attempts):
        try:
            segments = YouTubeTranscriptApi.get_transcript(
                video_id,
                languages=['ko', 'en']
            )
        except (TranscriptsDisabled, NoTranscriptFound) as e:
            # The video has no usable transcript; retrying cannot help.
            return None, f"트랜스크립트 없음: {str(e)}"
        except Exception as e:
            # Best-effort boundary: any other failure is retried.
            last_error = e
            if attempt < attempts - 1:
                wait_time = retry_delay * (attempt + 1)
                logger.warning(
                    f"오류 발생 (재시도 {attempt + 1}/{max_retries}), "
                    f"{wait_time}초 후 재시도... ({str(e)})"
                )
                time.sleep(wait_time)
        else:
            if segments:
                return segments, None
            # An empty result carries no content; report it with a real reason.
            return None, "트랜스크립트가 비어 있음"

    return None, f"최대 재시도 횟수 초과: {str(last_error)}"


def collect_video_transcripts(
    max_retries: int = 3,
    retry_delay: int = 5,
    videos_file: str = "data/videos.json",
    output_dir: str = "data"
) -> List[Dict]:
    """Collect transcripts for the videos listed in a JSON file.

    Each video is fetched in Korean or English (in that preference order).
    All successes and failures are written to
    ``<output_dir>/transcripts_<YYYYmmdd_HHMMSS>.json``.

    Args:
        max_retries: Maximum number of API attempts per video.
        retry_delay: Base wait in seconds between attempts.
        videos_file: Path to the JSON file holding the ``videos`` list.
        output_dir: Directory for the result file; created if missing.

    Returns:
        The successfully collected transcript records. An empty list is
        returned when the videos file cannot be loaded (no result file is
        written in that case).
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load the video list; any load problem ends the run with an empty result.
    try:
        videos = _load_videos(videos_file)
    except Exception as e:
        logger.error(f"비디오 파일 로드 실패: {str(e)}")
        return []

    all_transcripts = []
    failed_videos = []
    total_videos = len(videos)

    for idx, video in enumerate(videos, 1):
        # Tolerate malformed entries instead of aborting the whole run.
        entry = video if isinstance(video, dict) else {}
        video_id = entry.get('video_id')
        title = entry.get('title')
        channel_handle = entry.get('channel_handle')

        if not video_id:
            error_message = "video_id 누락"
            failed_videos.append({
                'video_id': video_id,
                'channel_handle': channel_handle,
                'title': title,
                'error': error_message
            })
            logger.warning(f"\n[{idx}/{total_videos}] 트랜스크립트 수집 실패: {error_message}")
            # No API call was made, so no throttling pause is needed.
            continue

        logger.info(f"\n[{idx}/{total_videos}] 트랜스크립트 수집 시도: {video_id} - {title}")
        transcript_segments, error_message = _fetch_transcript(
            video_id, max_retries, retry_delay
        )

        if transcript_segments is not None:
            all_transcripts.append({
                'video_id': video_id,
                'channel_id': entry.get('channel_id'),
                'channel_handle': channel_handle,
                'title': title,
                'transcript_segments': transcript_segments,
                'collected_at': datetime.now().isoformat()
            })
            logger.info("트랜스크립트 수집 성공")
        else:
            failed_videos.append({
                'video_id': video_id,
                'channel_handle': channel_handle,
                'title': title,
                'error': error_message
            })
            logger.warning(f"트랜스크립트 수집 실패: {error_message}")

        # Throttle between API calls; pointless after the final video.
        if idx < total_videos:
            time.sleep(1)

    # One clock reading keeps the file name and the payload timestamp in sync.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    result = {
        "collected_at": now.isoformat(),
        "total_videos": total_videos,
        "successful_videos": len(all_transcripts),
        "failed_videos": len(failed_videos),
        "transcripts": all_transcripts,
        "failures": failed_videos
    }

    output_file = output_path / f"transcripts_{timestamp}.json"
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
    except Exception as e:
        # A failed save must not discard what was collected in memory.
        logger.error(f"결과 파일 저장 중 오류 발생: {str(e)}")
        return all_transcripts

    logger.info(f"\n결과 저장 완료: {output_file}")
    logger.info(f"총 {len(all_transcripts)}개 트랜스크립트 수집 완료 (실패: {len(failed_videos)}개)")

    if failed_videos:
        logger.warning("\n실패한 비디오들:")
        for fail in failed_videos:
            logger.warning(f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}")

    return all_transcripts

# Script entry point: run the collection with all default arguments.
if __name__ == "__main__":
    collect_video_transcripts()