Spaces:
Sleeping
Sleeping
| import json | |
| import logging | |
| import time | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Dict, List | |
| from youtube_transcript_api import ( | |
| NoTranscriptFound, | |
| TranscriptsDisabled, | |
| YouTubeTranscriptApi, | |
| ) | |
# Logging configuration: INFO level, timestamped single-line format.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def load_existing_transcripts(file_path: Path) -> Dict:
    """Load previously collected transcript data from a JSON cache file.

    Args:
        file_path: Path to the JSON cache file.

    Returns:
        The parsed JSON object, or ``{"transcripts": []}`` when the file
        is missing or cannot be read/parsed.
    """
    if not file_path.exists():
        return {"transcripts": []}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    # Narrowed from a bare `except Exception`: only I/O and JSON-parse
    # failures are expected here; anything else should surface as a bug.
    except (OSError, json.JSONDecodeError) as e:
        logger.error(f"ํธ๋์คํฌ๋ฆฝํธ ํ์ผ ๋ก๋ ์คํจ: {e}")
        return {"transcripts": []}
def load_video_info(videos_file: str) -> List[Dict]:
    """Load the video metadata list from a JSON file.

    Args:
        videos_file: Path to a JSON file with a top-level ``"videos"`` list.

    Returns:
        The ``"videos"`` list, or ``[]`` when the file is missing or invalid.
    """
    try:
        with open(videos_file, "r", encoding="utf-8") as f:
            videos_data = json.load(f)
        return videos_data.get("videos", [])
    # Narrowed from a bare `except Exception`: expected failures are file
    # I/O errors and malformed JSON only.
    except (OSError, json.JSONDecodeError) as e:
        logger.error(f"๋น๋์ค ํ์ผ ๋ก๋ ์คํจ: {str(e)}")
        return []
def fetch_transcript(video_id: str, max_retries: int, retry_delay: int) -> Dict:
    """Fetch one video's transcript via the YouTube transcript API.

    Retries transient failures with linearly increasing backoff
    (``retry_delay * attempt``). Disabled/missing transcripts return
    immediately — retrying cannot help there.

    Args:
        video_id: YouTube video ID.
        max_retries: Maximum number of API attempts.
        retry_delay: Base wait in seconds between retries.

    Returns:
        Dict with ``"transcript_segments"`` (list of segment dicts, or
        ``None`` on failure) and ``"error"`` (``None`` on success,
        otherwise an error message).
    """
    for attempt in range(max_retries):
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(
                video_id, languages=["ko", "en"]
            )
            return {"transcript_segments": transcript_list, "error": None}
        except (TranscriptsDisabled, NoTranscriptFound) as e:
            # Permanent condition for this video — do not retry.
            return {
                "transcript_segments": None,
                "error": f"ํธ๋์คํฌ๋ฆฝํธ ์์: {str(e)}",
            }
        except Exception as e:
            if attempt < max_retries - 1:
                wait_time = retry_delay * (attempt + 1)
                logger.warning(
                    f"์ค๋ฅ ๋ฐ์ (์ฌ์๋ {attempt + 1}/{max_retries}), {wait_time}์ด ํ ์ฌ์๋..."
                )
                time.sleep(wait_time)
            else:
                return {
                    "transcript_segments": None,
                    "error": f"์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ: {str(e)}",
                }
    # BUGFIX: with max_retries <= 0 the loop body never runs and the
    # original implicitly returned None; return the documented shape.
    return {"transcript_segments": None, "error": "์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ"}
def save_transcripts_to_file(transcripts: Dict, output_file: Path):
    """Serialize a transcript payload to a JSON file (UTF-8, pretty-printed).

    Args:
        transcripts: JSON-serializable payload. BUGFIX: the annotation was
            ``List[Dict]``, but both call sites pass a dict
            (``{"transcripts": [...]}`` / the final result dict).
        output_file: Destination path; overwritten if it exists.
    """
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps Korean text readable in the file.
            json.dump(transcripts, f, ensure_ascii=False, indent=2)
        logger.info(f"\n๊ฒฐ๊ณผ ์ ์ฅ ์๋ฃ: {output_file}")
    # Narrowed from a bare `except Exception`: I/O errors plus TypeError
    # for non-serializable payloads are the expected failure modes.
    except (OSError, TypeError) as e:
        logger.error(f"๊ฒฐ๊ณผ ํ์ผ ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
def collect_video_transcripts(
    max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
) -> List[Dict]:
    """Collect transcripts for every video listed in *videos_file*.

    Resumes from ``data/transcripts_cache.json`` (skipping already-collected
    video IDs), checkpoints the cache every 50 videos, and writes a
    timestamped final report including failures.

    Args:
        max_retries: Maximum retry count per failing API call.
        retry_delay: Base wait between retries, in seconds.
        videos_file: Path to the JSON file containing video metadata.

    Returns:
        The full list of transcript records (cached + newly collected).
    """
    output_dir = Path("data")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "transcripts_cache.json"
    all_transcripts = load_existing_transcripts(output_file).get("transcripts", [])
    # Load video metadata
    videos = load_video_info(videos_file)
    # Videos whose transcript could not be fetched
    failed_videos = []
    # IDs already present in the cache — these are skipped below
    collected_video_ids = {transcript["video_id"] for transcript in all_transcripts}
    total_videos = len(videos)
    for idx, video in enumerate(videos, 1):
        video_id = video["video_id"]
        # Skip videos already collected in a previous run
        if video_id in collected_video_ids:
            logger.info(f"\n[{idx}/{total_videos}] ์ด๋ฏธ ์์ง๋ ๋น๋์ค: {video_id} - {video['title']}")
            continue
        logger.info(
            f"\n[{idx}/{total_videos}] ํธ๋์คํฌ๋ฆฝํธ ์์ง ์๋: {video_id} - {video['title']}"
        )
        result = fetch_transcript(video_id, max_retries, retry_delay)
        transcript_segments = result["transcript_segments"]
        error_message = result["error"]
        if transcript_segments:
            transcript_info = {
                "video_id": video_id,
                "channel_id": video["channel_id"],
                "channel_handle": video["channel_handle"],
                "title": video["title"],
                "transcript_segments": transcript_segments,
                "collected_at": datetime.now().isoformat(),
            }
            all_transcripts.append(transcript_info)
            logger.info("ํธ๋์คํฌ๋ฆฝํธ ์์ง ์ฑ๊ณต")
        else:
            failed_videos.append(
                {
                    "video_id": video_id,
                    "channel_handle": video["channel_handle"],
                    "title": video["title"],
                    "error": error_message,
                }
            )
            logger.warning(f"ํธ๋์คํฌ๋ฆฝํธ ์์ง ์คํจ: {error_message}")
        # Checkpoint the cache every 50 videos
        if idx % 50 == 0:
            save_transcripts_to_file({"transcripts": all_transcripts}, output_file)
        # Small delay to stay within the API quota
        time.sleep(0.2)
    # BUGFIX: persist the cache once more at the end. The original only
    # checkpointed at multiples of 50, so up to 49 freshly collected
    # records were absent from the resume cache after a completed run.
    save_transcripts_to_file({"transcripts": all_transcripts}, output_file)
    # Final report: timestamped file with stats, transcripts, and failures
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_output_file = output_dir / f"transcripts_{timestamp}.json"
    result = {
        "collected_at": datetime.now().isoformat(),
        "total_videos": total_videos,
        "successful_videos": len(all_transcripts),
        "failed_videos": len(failed_videos),
        "transcripts": all_transcripts,
        "failures": failed_videos,
    }
    save_transcripts_to_file(result, final_output_file)
    if failed_videos:
        logger.warning("\n์คํจํ ๋น๋์ค๋ค:")
        for fail in failed_videos:
            logger.warning(
                f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
            )
    return all_transcripts
if __name__ == "__main__":
    # Script entry point: run the collection with default settings.
    collect_video_transcripts()