# Page-capture residue (not part of the program): "Spaces: Sleeping / Sleeping"
import json
import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
# Logging setup: configure the root logger once at import time (INFO level,
# "timestamp - level - message" records) and create this module's logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
def _fetch_transcript(video_id: str, languages: List[str]) -> List[Dict]:
    """Fetch the raw transcript segments for one video.

    Uses the static ``get_transcript`` call the file was written against when
    the installed library still provides it.
    NOTE(review): newer youtube-transcript-api releases (1.x) are understood to
    replace it with an instance-level ``fetch()`` whose result is converted
    with ``to_raw_data()`` -- confirm against the pinned library version.
    """
    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        return YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    return YouTubeTranscriptApi().fetch(video_id, languages=languages).to_raw_data()


def _fetch_with_retries(
    video_id: str,
    max_retries: int,
    retry_delay: int,
) -> Tuple[Optional[List[Dict]], Optional[str]]:
    """Try to fetch a transcript, retrying on unexpected errors.

    Returns:
        ``(segments, None)`` on success, or ``(None, error_message)`` when the
        video has no transcript, the transcript is empty, or every attempt
        failed.
    """
    # Always make at least one attempt, even if the caller passes 0 or less.
    attempts = max(1, max_retries)
    error_message = None

    for attempt in range(attempts):
        try:
            segments = _fetch_transcript(video_id, ['ko', 'en'])
        except (TranscriptsDisabled, NoTranscriptFound) as e:
            # Retrying cannot help when the video simply has no transcript.
            return None, f"ํธ๋์คํฌ๋ฆฝํธ ์์: {e}"
        except Exception as e:
            if attempt < attempts - 1:
                # Linear back-off: the wait grows with each failed attempt.
                wait_time = retry_delay * (attempt + 1)
                logger.warning(f"์ค๋ฅ ๋ฐ์ (์ฌ์๋ {attempt + 1}/{attempts}), {wait_time}์ด ํ ์ฌ์๋...")
                time.sleep(wait_time)
            else:
                error_message = f"์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ: {e}"
        else:
            if segments:
                return segments, None
            # An empty result is still reported as a failure, but with a reason.
            return None, "empty transcript returned"

    return None, error_message


def collect_video_transcripts(
    max_retries: int = 3,
    retry_delay: int = 5,
    videos_file: str = "data/videos.json",
    *,
    output_dir: str = "data",
    request_delay: float = 1,
) -> List[Dict]:
    """Collect transcripts for the videos listed in a JSON file and save a report.

    The videos file is read as a JSON object whose ``videos`` key holds the
    list of video entries. For every entry a transcript is requested in Korean
    or English; successes and failures are written together to
    ``transcripts_<timestamp>.json`` inside ``output_dir``.

    Args:
        max_retries: Maximum number of fetch attempts per video (at least one
            attempt is always made).
        retry_delay: Base wait in seconds between attempts; multiplied by the
            attempt number.
        videos_file: Path of the JSON file describing the videos.
        output_dir: Directory the report file is written to (created if
            missing).
        request_delay: Pause in seconds between consecutive videos.

    Returns:
        The transcript records of all successfully collected videos. An empty
        list is returned when the videos file cannot be loaded.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load the video list; any load problem is logged and ends the run early.
    try:
        with open(videos_file, 'r', encoding='utf-8') as f:
            videos_data = json.load(f)
        videos = list(videos_data.get('videos') or [])
    except Exception as e:
        logger.error(f"๋น๋์ค ํ์ผ ๋ก๋ ์คํจ: {e}")
        return []

    all_transcripts = []
    failed_videos = []
    total_videos = len(videos)

    for idx, video in enumerate(videos, 1):
        # Tolerate malformed entries instead of aborting the whole run and
        # losing the transcripts collected so far.
        meta = video if isinstance(video, dict) else {}
        video_id = meta.get('video_id')
        title = meta.get('title')
        channel_handle = meta.get('channel_handle')

        logger.info(f"\n[{idx}/{total_videos}] ํธ๋์คํฌ๋ฆฝํธ ์์ง ์๋: {video_id} - {title}")

        if video_id:
            segments, error_message = _fetch_with_retries(video_id, max_retries, retry_delay)
        else:
            segments, error_message = None, "missing 'video_id' in video entry"

        if segments is not None:
            all_transcripts.append({
                'video_id': video_id,
                'channel_id': meta.get('channel_id'),
                'channel_handle': channel_handle,
                'title': title,
                'transcript_segments': segments,
                'collected_at': datetime.now().isoformat(),
            })
            logger.info(f"ํธ๋์คํฌ๋ฆฝํธ ์์ง ์ฑ๊ณต")
        else:
            failed_videos.append({
                'video_id': video_id,
                'channel_handle': channel_handle,
                'title': title,
                'error': error_message,
            })
            logger.warning(f"ํธ๋์คํฌ๋ฆฝํธ ์์ง ์คํจ: {error_message}")

        # Throttle only between real API requests; no pause after the last video.
        if video_id and idx < total_videos and request_delay > 0:
            time.sleep(request_delay)

    # Use a single clock reading so the file name and the report agree.
    now = datetime.now()
    result = {
        "collected_at": now.isoformat(),
        "total_videos": total_videos,
        "successful_videos": len(all_transcripts),
        "failed_videos": len(failed_videos),
        "transcripts": all_transcripts,
        "failures": failed_videos,
    }

    output_file = out_dir / f"transcripts_{now.strftime('%Y%m%d_%H%M%S')}.json"
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
    except Exception as e:
        # Best effort: a failed save is logged, the collected data is still returned.
        logger.error(f"๊ฒฐ๊ณผ ํ์ผ ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์: {e}")
        return all_transcripts

    logger.info(f"\n๊ฒฐ๊ณผ ์ ์ฅ ์๋ฃ: {output_file}")
    logger.info(f"์ด {len(all_transcripts)}๊ฐ ํธ๋์คํฌ๋ฆฝํธ ์์ง ์๋ฃ (์คํจ: {len(failed_videos)}๊ฐ)")
    if failed_videos:
        logger.warning("\n์คํจํ ๋น๋์ค๋ค:")
        for fail in failed_videos:
            logger.warning(f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}")

    return all_transcripts
# Script entry point: run one collection pass with the default settings.
if __name__ == "__main__":
    collect_video_transcripts()