# 518_yt_monitor / src / scripts / collect_transcript.py
# (snapshot metadata: atoye1 — "adding updated datafiles", commit 591c7e2)
import json
import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from youtube_transcript_api import (
NoTranscriptFound,
TranscriptsDisabled,
YouTubeTranscriptApi,
)
# Logging setup: module-wide INFO-level logging with timestamped messages.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def load_existing_transcripts(file_path: Path) -> Dict:
    """Load previously collected transcripts from a JSON cache file.

    Args:
        file_path: Path to the JSON cache file.

    Returns:
        The parsed JSON object, or ``{"transcripts": []}`` when the file
        is missing, unreadable, or contains invalid JSON.
    """
    if not file_path.exists():
        return {"transcripts": []}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    # Catch only the expected failures (I/O trouble or corrupt JSON)
    # instead of a blanket Exception that would also hide real bugs.
    except (OSError, json.JSONDecodeError) as e:
        logger.error(f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {e}")
        return {"transcripts": []}
def load_video_info(videos_file: str) -> List[Dict]:
    """Load video metadata records from a JSON file.

    Args:
        videos_file: Path of a JSON file with a top-level ``"videos"`` list.

    Returns:
        The list under the ``"videos"`` key, or an empty list when the file
        cannot be read, is invalid JSON, or lacks that key.
    """
    try:
        with open(videos_file, "r", encoding="utf-8") as f:
            videos_data = json.load(f)
        return videos_data.get("videos", [])
    # Narrowed from a blanket Exception: only I/O and JSON-decoding
    # failures are expected and recoverable here.
    except (OSError, json.JSONDecodeError) as e:
        logger.error(f"๋น„๋””์˜ค ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {str(e)}")
        return []
def fetch_transcript(video_id: str, max_retries: int, retry_delay: int) -> Dict:
    """Fetch one video's transcript via the YouTube transcript API, with retries.

    Args:
        video_id: YouTube video ID to fetch.
        max_retries: Maximum number of API attempts.
        retry_delay: Base wait time in seconds; backoff grows linearly.

    Returns:
        A dict with keys ``"transcript_segments"`` (list of segment dicts,
        or None on failure) and ``"error"`` (None on success, else a message).
    """
    for attempt in range(max_retries):
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(
                video_id, languages=["ko", "en"]
            )
            return {"transcript_segments": transcript_list, "error": None}
        except (TranscriptsDisabled, NoTranscriptFound) as e:
            # Permanent conditions — retrying cannot help, so give up at once.
            return {
                "transcript_segments": None,
                "error": f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์—†์Œ: {str(e)}",
            }
        except Exception as e:
            if attempt < max_retries - 1:
                # Linear backoff: retry_delay, 2*retry_delay, 3*retry_delay, ...
                wait_time = retry_delay * (attempt + 1)
                logger.warning(
                    f"์˜ค๋ฅ˜ ๋ฐœ์ƒ (์žฌ์‹œ๋„ {attempt + 1}/{max_retries}), {wait_time}์ดˆ ํ›„ ์žฌ์‹œ๋„..."
                )
                time.sleep(wait_time)
            else:
                return {
                    "transcript_segments": None,
                    "error": f"์ตœ๋Œ€ ์žฌ์‹œ๋„ ํšŸ์ˆ˜ ์ดˆ๊ณผ: {str(e)}",
                }
    # Bug fix: with max_retries <= 0 the loop body never executed and the
    # function implicitly returned None, crashing callers that index the
    # result dict. Always return the documented shape.
    return {
        "transcript_segments": None,
        "error": f"์žฌ์‹œ๋„ ํšŸ์ˆ˜๊ฐ€ 0 ์ดํ•˜์ž…๋‹ˆ๋‹ค: max_retries={max_retries}",
    }
def save_transcripts_to_file(transcripts: Dict, output_file: Path) -> None:
    """Write a transcript payload to a JSON file (UTF-8, pretty-printed).

    Args:
        transcripts: JSON-serializable payload. Note: both call sites pass a
            dict (e.g. ``{"transcripts": [...]}``); the original annotation
            of ``List[Dict]`` was incorrect and has been fixed.
        output_file: Destination path for the JSON file.
    """
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps Korean text readable in the output.
            json.dump(transcripts, f, ensure_ascii=False, indent=2)
        logger.info(f"\n๊ฒฐ๊ณผ ์ €์žฅ ์™„๋ฃŒ: {output_file}")
    # OSError covers file-system failures; TypeError/ValueError cover
    # non-serializable payloads — no need for a blanket Exception.
    except (OSError, TypeError, ValueError) as e:
        logger.error(f"๊ฒฐ๊ณผ ํŒŒ์ผ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
def collect_video_transcripts(
    max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
) -> List[Dict]:
    """Collect transcripts for every video listed in *videos_file*.

    Videos already present in the cache (data/transcripts_cache.json) are
    skipped. Progress is checkpointed to the cache every 50 videos, and a
    full run summary is written to a timestamped JSON file at the end.

    Args:
        max_retries: Maximum number of attempts per failing API call.
        retry_delay: Base wait time in seconds between retries.
        videos_file: Path of the JSON file holding video metadata.

    Returns:
        The accumulated list of transcript records (cached + newly fetched).
    """
    output_dir = Path("data")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "transcripts_cache.json"
    all_transcripts = load_existing_transcripts(output_file).get("transcripts", [])
    # Load the video metadata to iterate over.
    videos = load_video_info(videos_file)
    # Videos whose transcript could not be fetched, for the final summary.
    failed_videos = []
    # IDs already in the cache — used to skip redundant API calls below.
    collected_video_ids = {transcript["video_id"] for transcript in all_transcripts}
    total_videos = len(videos)
    for idx, video in enumerate(videos, 1):
        video_id = video["video_id"]
        # Skip videos whose transcript was collected on a previous run.
        if video_id in collected_video_ids:
            logger.info(f"\n[{idx}/{total_videos}] ์ด๋ฏธ ์ˆ˜์ง‘๋œ ๋น„๋””์˜ค: {video_id} - {video['title']}")
            continue
        logger.info(
            f"\n[{idx}/{total_videos}] ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์‹œ๋„: {video_id} - {video['title']}"
        )
        result = fetch_transcript(video_id, max_retries, retry_delay)
        transcript_segments = result["transcript_segments"]
        error_message = result["error"]
        if transcript_segments:
            transcript_info = {
                "video_id": video_id,
                "channel_id": video["channel_id"],
                "channel_handle": video["channel_handle"],
                "title": video["title"],
                "transcript_segments": transcript_segments,
                "collected_at": datetime.now().isoformat(),
            }
            all_transcripts.append(transcript_info)
            logger.info("ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์„ฑ๊ณต")
        else:
            failed_videos.append(
                {
                    "video_id": video_id,
                    "channel_handle": video["channel_handle"],
                    "title": video["title"],
                    "error": error_message,
                }
            )
            logger.warning(f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์‹คํŒจ: {error_message}")
        # Checkpoint the cache every 50 videos so a crash loses little work.
        if idx % 50 == 0:
            save_transcripts_to_file({"transcripts": all_transcripts}, output_file)
        # Throttle requests to protect the API quota.
        time.sleep(0.2)
    # Bug fix: persist the final cache state. The original only wrote the
    # cache at 50-video checkpoints, so transcripts collected after the last
    # checkpoint were never cached and would be refetched on the next run.
    save_transcripts_to_file({"transcripts": all_transcripts}, output_file)
    # Write the full run summary to a timestamped file.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_output_file = output_dir / f"transcripts_{timestamp}.json"
    result = {
        "collected_at": datetime.now().isoformat(),
        "total_videos": total_videos,
        "successful_videos": len(all_transcripts),
        "failed_videos": len(failed_videos),
        "transcripts": all_transcripts,
        "failures": failed_videos,
    }
    save_transcripts_to_file(result, final_output_file)
    if failed_videos:
        logger.warning("\n์‹คํŒจํ•œ ๋น„๋””์˜ค๋“ค:")
        for fail in failed_videos:
            logger.warning(
                f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
            )
    return all_transcripts
if __name__ == "__main__":
    # Script entry point: run one full collection pass with default settings.
    collect_video_transcripts()