# 518_yt_monitor/src/core/analyzer.py
# Provenance: author atoye1, commit b9cc1a2 ("major commit with data files")
from datetime import datetime, timezone
from typing import Optional, List

from youtube_transcript_api import YouTubeTranscriptApi

from ..models.schemas import VideoAnalysis, TranscriptSegment
from ..utils.cache import CacheManager
from .youtube_api import YouTubeAPI
class ContentAnalyzer:
    """Analyzes YouTube videos for hate-speech keywords in their transcripts.

    Transcripts are fetched via ``youtube_transcript_api``; finished analyses
    are cached through a ``CacheManager`` so each video is analyzed only once.
    """

    def __init__(self, youtube_api: YouTubeAPI, cache_manager: CacheManager):
        self.youtube_api = youtube_api
        self.cache_manager = cache_manager
        # Hate-speech keyword lists, grouped by category:
        # "์ฐจ๋ณ„" = discrimination, "๋น„ํ•˜" = disparagement, "์ •์น˜" = political.
        self.hate_speech_keywords = {
            "์ฐจ๋ณ„": ["์งฑ๊นจ", "ํ‘ํ˜•", "ํ‹€๋”ฑ", "๊น€์น˜๋…€"],
            "๋น„ํ•˜": ["๋ณ‘์‹ ", "์”จ๋ฐœ", "๋ฏธ์นœ"],
            "์ •์น˜": ["ํญ๋™", "ํญ๋„", "์ข…๋ถ"],
        }

    def analyze_video(self, video_id: str, channel_id: str) -> Optional[VideoAnalysis]:
        """Analyze a single video: fetch its transcript and flag hate speech.

        Args:
            video_id: YouTube video ID.
            channel_id: ID of the channel the video belongs to.

        Returns:
            A (possibly cached) ``VideoAnalysis``, or ``None`` when the video
            details or transcript cannot be retrieved.
        """
        # Return a previously computed analysis if one is cached.
        cached_analysis = self.cache_manager.get_video_analysis(video_id)
        if cached_analysis:
            return cached_analysis

        # Fetch video metadata (title, publish date, ...).
        video_info = self.youtube_api.get_video_details(video_id)
        if not video_info:
            return None

        try:
            # Korean transcript preferred, English as fallback.
            transcript_list = YouTubeTranscriptApi.get_transcript(
                video_id,
                languages=['ko', 'en']
            )
            transcript_segments = [
                TranscriptSegment(
                    start=t['start'],
                    text=t['text'],
                    duration=t['duration']
                ) for t in transcript_list
            ]

            # Scan the transcript for hate-speech keywords.
            hate_speech_instances = self._detect_hate_speech(transcript_segments)

            analysis = VideoAnalysis(
                video_id=video_id,
                title=video_info['title'],
                channel_id=channel_id,
                # 'publishedAt' is ISO-8601 with a 'Z' suffix, which
                # datetime.fromisoformat() cannot parse before Python 3.11.
                publish_date=datetime.fromisoformat(video_info['publishedAt'].replace('Z', '+00:00')),
                # Aware UTC timestamp: publish_date above is timezone-aware,
                # and comparing a naive datetime.now() with it would raise
                # TypeError downstream.
                last_analyzed=datetime.now(timezone.utc),
                transcript_segments=transcript_segments,
                hate_speech_instances=hate_speech_instances
            )

            # Persist so subsequent calls hit the cache.
            self.cache_manager.save_video_analysis(analysis)
            return analysis
        except Exception as e:
            # Broad catch is deliberate best-effort: transcript fetching can
            # fail many ways (transcripts disabled, language unavailable,
            # network errors) and a single bad video must not abort a batch.
            print(f"Error analyzing video {video_id}: {str(e)}")
            return None

    def _detect_hate_speech(self, segments: List[TranscriptSegment]) -> List[dict]:
        """Scan transcript segments for hate-speech keywords.

        Returns one record per (segment, keyword) hit, carrying the segment's
        start timestamp, its full text, the keyword's category, and the
        keyword itself.

        NOTE(review): plain substring matching — short keywords can match
        inside longer, innocuous words; consider word-boundary matching.
        """
        instances = []
        for segment in segments:
            for category, keywords in self.hate_speech_keywords.items():
                for keyword in keywords:
                    if keyword in segment.text:
                        instances.append({
                            'timestamp': segment.start,
                            'text': segment.text,
                            'category': category,
                            'keyword': keyword
                        })
        return instances