# 518_yt_monitor/src/core/analyzer.py
# Provenance: author atoye1, commit b9cc1a2 ("major commit with data files")
from datetime import datetime, timezone
from typing import Optional, List

from youtube_transcript_api import YouTubeTranscriptApi

from ..models.schemas import VideoAnalysis, TranscriptSegment
from ..utils.cache import CacheManager
from .youtube_api import YouTubeAPI
class ContentAnalyzer:
    """Analyzes YouTube videos for hate-speech keywords in their transcripts.

    Transcripts are fetched via ``youtube_transcript_api``; finished analyses
    are cached through a ``CacheManager`` so each video is analyzed only once.
    """

    def __init__(self, youtube_api: YouTubeAPI, cache_manager: CacheManager):
        self.youtube_api = youtube_api
        self.cache_manager = cache_manager
        # Hate-speech keyword lists, grouped by category:
        # "์ฐจ๋ณ„" = discrimination, "๋น„ํ•˜" = disparagement, "์ •์น˜" = political.
        self.hate_speech_keywords = {
            "์ฐจ๋ณ„": ["์งฑ๊นจ", "ํ‘ํ˜•", "ํ‹€๋”ฑ", "๊น€์น˜๋…€"],
            "๋น„ํ•˜": ["๋ณ‘์‹ ", "์”จ๋ฐœ", "๋ฏธ์นœ"],
            "์ •์น˜": ["ํญ๋™", "ํญ๋„", "์ข…๋ถ"],
        }

    def analyze_video(self, video_id: str, channel_id: str) -> Optional[VideoAnalysis]:
        """Analyze a single video: fetch its transcript and flag hate speech.

        Args:
            video_id: YouTube video ID.
            channel_id: ID of the channel the video belongs to.

        Returns:
            A (possibly cached) ``VideoAnalysis``, or ``None`` when the video
            details or transcript cannot be retrieved.
        """
        # Return a previously computed analysis if one is cached.
        cached_analysis = self.cache_manager.get_video_analysis(video_id)
        if cached_analysis:
            return cached_analysis

        # Fetch video metadata (title, publish date, ...).
        video_info = self.youtube_api.get_video_details(video_id)
        if not video_info:
            return None

        try:
            # Korean transcript preferred, English as fallback.
            transcript_list = YouTubeTranscriptApi.get_transcript(
                video_id,
                languages=['ko', 'en']
            )
            transcript_segments = [
                TranscriptSegment(
                    start=t['start'],
                    text=t['text'],
                    duration=t['duration']
                ) for t in transcript_list
            ]

            # Scan the transcript for hate-speech keywords.
            hate_speech_instances = self._detect_hate_speech(transcript_segments)

            analysis = VideoAnalysis(
                video_id=video_id,
                title=video_info['title'],
                channel_id=channel_id,
                # 'publishedAt' is ISO-8601 with a 'Z' suffix, which
                # datetime.fromisoformat() cannot parse before Python 3.11.
                publish_date=datetime.fromisoformat(video_info['publishedAt'].replace('Z', '+00:00')),
                # Aware UTC timestamp: publish_date above is timezone-aware,
                # and comparing a naive datetime.now() with it would raise
                # TypeError downstream.
                last_analyzed=datetime.now(timezone.utc),
                transcript_segments=transcript_segments,
                hate_speech_instances=hate_speech_instances
            )

            # Persist so subsequent calls hit the cache.
            self.cache_manager.save_video_analysis(analysis)
            return analysis
        except Exception as e:
            # Broad catch is deliberate best-effort: transcript fetching can
            # fail many ways (transcripts disabled, language unavailable,
            # network errors) and a single bad video must not abort a batch.
            print(f"Error analyzing video {video_id}: {str(e)}")
            return None

    def _detect_hate_speech(self, segments: List[TranscriptSegment]) -> List[dict]:
        """Scan transcript segments for hate-speech keywords.

        Returns one record per (segment, keyword) hit, carrying the segment's
        start timestamp, its full text, the keyword's category, and the
        keyword itself.

        NOTE(review): plain substring matching — short keywords can match
        inside longer, innocuous words; consider word-boundary matching.
        """
        instances = []
        for segment in segments:
            for category, keywords in self.hate_speech_keywords.items():
                for keyword in keywords:
                    if keyword in segment.text:
                        instances.append({
                            'timestamp': segment.start,
                            'text': segment.text,
                            'category': category,
                            'keyword': keyword
                        })
        return instances