Spaces:

atoye1
/

518_yt_monitor

Sleeping

App Files Files Community

atoye1 committed on Dec 2, 2024

Commit

591c7e2

1 Parent(s): db5a958

adding updated datafiles

Browse files

Files changed (7) hide show

data/.DS_Store +0 -0
src/app.py +2 -0
src/pages/video_detail.py +15 -16
src/scripts/collect_channel_info.py +3 -3
src/scripts/collect_transcript.py +91 -53
src/scripts/collect_videos_info.py +4 -5
src/scripts/process_all_transcripts.py +154 -0

data/.DS_Store CHANGED Viewed

Binary files a/data/.DS_Store and b/data/.DS_Store differ

src/app.py CHANGED Viewed

@@ -29,6 +29,8 @@ if "data_manager" not in st.session_state:
 if "toxicity_classifier" not in st.session_state:
     toxicity_classifier = load_inference_model()
     st.session_state["toxicity_classifier"] = toxicity_classifier
 pg = st.navigation(

 if "toxicity_classifier" not in st.session_state:
     toxicity_classifier = load_inference_model()
     st.session_state["toxicity_classifier"] = toxicity_classifier
+if "analyzed_transcripts" not in st.session_state:
+    st.session_state["analyzed_transcripts"] = {}
 pg = st.navigation(

src/pages/video_detail.py CHANGED Viewed

@@ -116,36 +116,35 @@ def analyze():
         )
         if analyzed_transcript is None:
             analyzed_transcript = gen_analyzed_transcript()
-            if analyzed_transcript:
-                data_manager.set_analyzed_transcript(analyzed_transcript)
-                st.success("분석이 완료되었습니다!")
-            else:
-                st.error("분석 중 오류가 발생했습니다.")
-            st.rerun()  # 결과를 즉시 표시하기 위한 페이지 리로드
-        return analyzed_transcript
 analyzed_result = st.button("분석", icon="🔄", on_click=analyze)
 # 분석 결과 표시
-if "analyzed_transcript" not in st.session_state:
-    st.session_state.analyzed_transcript = (
-        data_manager.get_analyzed_transcript_by_video_id(current_video.video_id)
-    )
 if st.session_state.analyzed_transcript:
     st.subheader("트랜스크립트 분석 결과")
-    # 결과를 표 형식으로 표시
-    print(st.session_state.analyzed_transcript)
-    print(type(st.session_state.analyzed_transcript))
-    print("=" * 30)
     for i, segment in enumerate(
         st.session_state.analyzed_transcript.get("chunked_segments", [])
     ):
         with st.expander(
-            f"청크 {i+1} ({segment['start']:.1f}s - {segment['end']:.1f}s)"
         ):
             st.text(segment["transcript"])
             st.progress(float(segment["toxicity_score"]))
             st.text(f"유해도 점수: {segment['toxicity_score']:.3f}")

         )
         if analyzed_transcript is None:
             analyzed_transcript = gen_analyzed_transcript()
+            data_manager.set_analyzed_transcript(analyzed_transcript)
+            st.success("분석이 완료되었습니다!")
+        st.session_state.analyzed_transcript = analyzed_transcript
 analyzed_result = st.button("분석", icon="🔄", on_click=analyze)
 # 분석 결과 표시
+st.session_state.analyzed_transcript = data_manager.get_analyzed_transcript_by_video_id(
+    current_video.video_id
+)
 if st.session_state.analyzed_transcript:
     st.subheader("트랜스크립트 분석 결과")
     for i, segment in enumerate(
         st.session_state.analyzed_transcript.get("chunked_segments", [])
     ):
+        color = "green"
+        if segment["toxicity_score"] > 0.7:
+            color = "red"
+        elif segment["toxicity_score"] > 0.5:
+            color = "orange"
         with st.expander(
+            f"청크 {i+1} ({segment['start']:.1f}s - {segment['end']:.1f}s) - 유해도 점수: {segment['toxicity_score']:.3f}",
+            expanded=True,
         ):
             st.text(segment["transcript"])
             st.progress(float(segment["toxicity_score"]))
             st.text(f"유해도 점수: {segment['toxicity_score']:.3f}")
+            st.markdown("</div>", unsafe_allow_html=True)

src/scripts/collect_channel_info.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import Dict, List
 from google.oauth2 import service_account
 from googleapiclient.errors import HttpError
 from core.youtube_api import YouTubeAPI
 # 로깅 설정
@@ -43,14 +44,13 @@ def collect_channel_info(max_retries: int = 3, retry_delay: int = 5) -> List[Dic
     youtube_api = YouTubeAPI(credentials)
     # 채널 목록
-    from core.config import target_channel_handles
     # 전체 결과를 저장할 리스트
     all_channels = []
     failed_channels = []
     # 각 채널 정보 수집
-    for handle in target_channel_ids:
         clean_handle = handle.replace("@", "")
         logger.info(f"\n채널 정보 수집 시도: {handle}")
@@ -91,7 +91,7 @@ def collect_channel_info(max_retries: int = 3, retry_delay: int = 5) -> List[Dic
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     result = {
         "collected_at": datetime.now().isoformat(),
-        "total_channels": len(target_channel_ids),
         "successful_channels": len(all_channels),
         "failed_channels": len(failed_channels),
         "channels": all_channels,

 from google.oauth2 import service_account
 from googleapiclient.errors import HttpError
+from core.config import target_channel_handles
 from core.youtube_api import YouTubeAPI
 # 로깅 설정
     youtube_api = YouTubeAPI(credentials)
     # 채널 목록
     # 전체 결과를 저장할 리스트
     all_channels = []
     failed_channels = []
     # 각 채널 정보 수집
+    for handle in target_channel_handles:
         clean_handle = handle.replace("@", "")
         logger.info(f"\n채널 정보 수집 시도: {handle}")
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     result = {
         "collected_at": datetime.now().isoformat(),
+        "total_channels": len(target_channel_handles),
         "successful_channels": len(all_channels),
         "failed_channels": len(failed_channels),
         "channels": all_channels,

src/scripts/collect_transcript.py CHANGED Viewed

@@ -18,6 +18,67 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 def collect_video_transcripts(
     max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
 ) -> List[Dict]:
@@ -31,51 +92,35 @@ def collect_video_transcripts(
     """
     output_dir = Path("data")
     output_dir.mkdir(parents=True, exist_ok=True)
     # 비디오 정보 로드
-    try:
-        with open(videos_file, "r", encoding="utf-8") as f:
-            videos_data = json.load(f)
-            videos = videos_data.get("videos", [])
-    except Exception as e:
-        logger.error(f"비디오 파일 로드 실패: {str(e)}")
-        return []
     # 결과 저장용 리스트
-    all_transcripts = []
     failed_videos = []
     # 각 비디오의 트랜스크립트 수집
     total_videos = len(videos)
     for idx, video in enumerate(videos, 1):
         video_id = video["video_id"]
         logger.info(
             f"\n[{idx}/{total_videos}] 트랜스크립트 수집 시도: {video_id} - {video['title']}"
         )
-        # 재시도 로직
-        transcript_segments = None
-        error_message = None
-        for attempt in range(max_retries):
-            try:
-                transcript_list = YouTubeTranscriptApi.get_transcript(
-                    video_id, languages=["ko", "en"]
-                )
-                transcript_segments = transcript_list
-                break
-            except (TranscriptsDisabled, NoTranscriptFound) as e:
-                error_message = f"트랜스크립트 없음: {str(e)}"
-                break
-            except Exception as e:
-                if attempt < max_retries - 1:
-                    wait_time = retry_delay * (attempt + 1)
-                    logger.warning(
-                        f"오류 발생 (재시도 {attempt + 1}/{max_retries}), {wait_time}초 후 재시도..."
-                    )
-                    time.sleep(wait_time)
-                else:
-                    error_message = f"최대 재시도 횟수 초과: {str(e)}"
         if transcript_segments:
             transcript_info = {
@@ -87,7 +132,7 @@ def collect_video_transcripts(
                 "collected_at": datetime.now().isoformat(),
             }
             all_transcripts.append(transcript_info)
-            logger.info(f"트랜스크립트 수집 성공")
         else:
             failed_videos.append(
                 {
@@ -99,11 +144,16 @@ def collect_video_transcripts(
             )
             logger.warning(f"트랜스크립트 수집 실패: {error_message}")
         # API 할당량 보호를 위한 대기
-        time.sleep(1)
-    # 결과 저장
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     result = {
         "collected_at": datetime.now().isoformat(),
         "total_videos": total_videos,
@@ -112,26 +162,14 @@ def collect_video_transcripts(
         "transcripts": all_transcripts,
         "failures": failed_videos,
     }
-    # 결과 파일 저장
-    output_file = output_dir / f"transcripts_{timestamp}.json"
-    try:
-        with open(output_file, "w", encoding="utf-8") as f:
-            json.dump(result, f, ensure_ascii=False, indent=2)
-        logger.info(f"\n결과 저장 완료: {output_file}")
-        logger.info(
-            f"총 {len(all_transcripts)}개 트랜스크립트 수집 완료 (실패: {len(failed_videos)}개)"
-        )
-        if failed_videos:
-            logger.warning("\n실패한 비디오들:")
-            for fail in failed_videos:
-                logger.warning(
-                    f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
-                )
-    except Exception as e:
-        logger.error(f"결과 파일 저장 중 오류 발생: {str(e)}")
-        return all_transcripts
     return all_transcripts

 logger = logging.getLogger(__name__)
+def load_existing_transcripts(file_path: Path) -> Dict:
+    """기존 트랜스크립트 데이터 로드"""
+    if not file_path.exists():
+        return {"transcripts": []}
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+            return data
+    except Exception as e:
+        logger.error(f"트랜스크립트 파일 로드 실패: {e}")
+        return {"transcripts": []}
+def load_video_info(videos_file: str) -> List[Dict]:
+    """비디오 정보를 로드하는 함수"""
+    try:
+        with open(videos_file, "r", encoding="utf-8") as f:
+            videos_data = json.load(f)
+            return videos_data.get("videos", [])
+    except Exception as e:
+        logger.error(f"비디오 파일 로드 실패: {str(e)}")
+        return []
+def fetch_transcript(video_id: str, max_retries: int, retry_delay: int) -> Dict:
+    """개별 비디오의 트랜스크립트를 API로 호출하는 함수"""
+    for attempt in range(max_retries):
+        try:
+            transcript_list = YouTubeTranscriptApi.get_transcript(
+                video_id, languages=["ko", "en"]
+            )
+            return {"transcript_segments": transcript_list, "error": None}
+        except (TranscriptsDisabled, NoTranscriptFound) as e:
+            return {
+                "transcript_segments": None,
+                "error": f"트랜스크립트 없음: {str(e)}",
+            }
+        except Exception as e:
+            if attempt < max_retries - 1:
+                wait_time = retry_delay * (attempt + 1)
+                logger.warning(
+                    f"오류 발생 (재시도 {attempt + 1}/{max_retries}), {wait_time}초 후 재시도..."
+                )
+                time.sleep(wait_time)
+            else:
+                return {
+                    "transcript_segments": None,
+                    "error": f"최대 재시도 횟수 초과: {str(e)}",
+                }
+def save_transcripts_to_file(transcripts: List[Dict], output_file: Path):
+    """트랜스크립트를 파일에 저장하는 함수"""
+    try:
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(transcripts, f, ensure_ascii=False, indent=2)
+        logger.info(f"\n결과 저장 완료: {output_file}")
+    except Exception as e:
+        logger.error(f"결과 파일 저장 중 오류 발생: {str(e)}")
 def collect_video_transcripts(
     max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
 ) -> List[Dict]:
     """
     output_dir = Path("data")
     output_dir.mkdir(parents=True, exist_ok=True)
+    output_file = output_dir / "transcripts_cache.json"
+    all_transcripts = load_existing_transcripts(output_file).get("transcripts", [])
     # 비디오 정보 로드
+    videos = load_video_info(videos_file)
     # 결과 저장용 리스트
     failed_videos = []
+    # 이미 수집된 비디오 아이디 목록
+    collected_video_ids = {transcript["video_id"] for transcript in all_transcripts}
     # 각 비디오의 트랜스크립트 수집
     total_videos = len(videos)
     for idx, video in enumerate(videos, 1):
         video_id = video["video_id"]
+        # 이미 수집된 비디오인 경우 패스
+        if video_id in collected_video_ids:
+            logger.info(f"\n[{idx}/{total_videos}] 이미 수집된 비디오: {video_id} - {video['title']}")
+            continue
         logger.info(
             f"\n[{idx}/{total_videos}] 트랜스크립트 수집 시도: {video_id} - {video['title']}"
         )
+        result = fetch_transcript(video_id, max_retries, retry_delay)
+        transcript_segments = result["transcript_segments"]
+        error_message = result["error"]
         if transcript_segments:
             transcript_info = {
                 "collected_at": datetime.now().isoformat(),
             }
             all_transcripts.append(transcript_info)
+            logger.info("트랜스크립트 수집 성공")
         else:
             failed_videos.append(
                 {
             )
             logger.warning(f"트랜스크립트 수집 실패: {error_message}")
+        # 50개마다 중간 저장
+        if idx % 50 == 0:
+            save_transcripts_to_file({"transcripts": all_transcripts}, output_file)
         # API 할당량 보호를 위한 대기
+        time.sleep(0.2)
+    # 최종 결과 저장
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    final_output_file = output_dir / f"transcripts_{timestamp}.json"
     result = {
         "collected_at": datetime.now().isoformat(),
         "total_videos": total_videos,
         "transcripts": all_transcripts,
         "failures": failed_videos,
     }
+    save_transcripts_to_file(result, final_output_file)
+    if failed_videos:
+        logger.warning("\n실패한 비디오들:")
+        for fail in failed_videos:
+            logger.warning(
+                f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
+            )
     return all_transcripts

src/scripts/collect_videos_info.py CHANGED Viewed

@@ -6,8 +6,8 @@ from pathlib import Path
 from typing import Dict, List
 from google.oauth2 import service_account
-from googleapiclient.errors import HttpError
 from core.youtube_api import YouTubeAPI
 # 로깅 설정
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 def collect_videos_info(
-    max_retries: int = 3, retry_delay: int = 5, videos_per_channel: int = 100
 ) -> List[Dict]:
     """
     각 채널의 최신 동영상 정보 수집 함수
@@ -46,7 +46,6 @@ def collect_videos_info(
     youtube_api = YouTubeAPI(credentials)
     # 채널 목록
-    from core.config import target_channel_handles
     # 전체 결과를 저장할 리스트
     all_videos = []
@@ -54,7 +53,7 @@ def collect_videos_info(
     failed_videos = []
     # 각 채널의 동영상 정보 수집
-    for handle in target_channel_ids:
         clean_handle = handle.replace("@", "")
         logger.info(f"\n채널 동영상 수집 시도: {handle}")
@@ -128,7 +127,7 @@ def collect_videos_info(
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     result = {
         "collected_at": datetime.now().isoformat(),
-        "total_channels": len(target_channel_ids),
         "total_videos": len(all_videos),
         "failed_channels": len(failed_channels),
         "failed_videos": len(failed_videos),

 from typing import Dict, List
 from google.oauth2 import service_account
+from core.config import target_channel_handles
 from core.youtube_api import YouTubeAPI
 # 로깅 설정
 def collect_videos_info(
+    max_retries: int = 3, retry_delay: int = 5, videos_per_channel: int = 50
 ) -> List[Dict]:
     """
     각 채널의 최신 동영상 정보 수집 함수
     youtube_api = YouTubeAPI(credentials)
     # 채널 목록
     # 전체 결과를 저장할 리스트
     all_videos = []
     failed_videos = []
     # 각 채널의 동영상 정보 수집
+    for handle in target_channel_handles:
         clean_handle = handle.replace("@", "")
         logger.info(f"\n채널 동영상 수집 시도: {handle}")
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     result = {
         "collected_at": datetime.now().isoformat(),
+        "total_channels": len(target_channel_handles),
         "total_videos": len(all_videos),
         "failed_channels": len(failed_channels),
         "failed_videos": len(failed_videos),

src/scripts/process_all_transcripts.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import json
+from typing import Dict
+import tqdm
+from ai.classifier import ToxcitiyClassifier
+from core.data_manager import DataManager
+from models.schemas import AnalyzedTranscript, ChunkedSegment
+def batch_analyze_transcripts(
+    data_manager: DataManager,
+    classifier: ToxcitiyClassifier,
+    chunk_size: int = 60,
+    overlap: int = 10,
+) -> Dict[str, AnalyzedTranscript]:
+    """모든 트랜스크립트를 분석하고 결과를 저장"""
+    # 기존 분석 결과 로드
+    try:
+        with open("./data/analyzed_transcripts.json", "r", encoding="utf-8") as f:
+            existing_data = json.load(f)
+            analyzed_transcripts = existing_data.get("analyzed_transcripts", {})
+    except FileNotFoundError:
+        analyzed_transcripts = {}
+    # 모든 트랜스크립트 처리
+    all_transcripts = data_manager.transcript_data
+    print(f"총 {len(all_transcripts)}개의 트랜스크립트 처리 시작...")
+    for transcript_data in tqdm.tqdm(all_transcripts):
+        video_id = transcript_data.get("video_id")
+        # 이미 분석된 트랜스크립트는 건너뛰기
+        if video_id in analyzed_transcripts:
+            print(f"Video {video_id}: 이미 분석됨, 건너뛰기")
+            continue
+        # 트랜스크립트 데이터 준비
+        transcript = data_manager.get_transcript_by_video_id(video_id)
+        if transcript is None:
+            print(f"Video {video_id}: 트랜스크립트를 찾을 수 없음")
+            continue
+        try:
+            # 전체 영상 길이 계산
+            total_duration = max(
+                segment["start"] + segment["duration"]
+                for segment in transcript.transcript_segments
+            )
+            # 청크 처리
+            num_chunks = math.ceil(total_duration / chunk_size)
+            chunked_segments = []
+            is_toxic = False
+            max_toxicity = 0.0
+            for i in range(num_chunks):
+                chunk_start = i * chunk_size
+                chunk_end = (i + 1) * chunk_size
+                # 보간 범위 설정
+                overlap_start = max(0, chunk_start - overlap)
+                overlap_end = min(total_duration, chunk_end + overlap)
+                # 해당 청크에 포함될 트랜스크립트 수집
+                chunk_text = []
+                for segment in transcript.transcript_segments:
+                    segment_start = segment["start"]
+                    segment_end = segment_start + segment["duration"]
+                    if not (segment_end < overlap_start or segment_start > overlap_end):
+                        chunk_text.append(segment["text"])
+                # 청크 텍스트 생성
+                chunk_transcript = " ".join(chunk_text)
+                # toxicity inference 수행
+                if chunk_transcript.strip():  # 빈 텍스트가 아닌 경우만 분석
+                    toxicity_score = classifier.infer(chunk_transcript)
+                    max_toxicity = max(max_toxicity, toxicity_score)
+                else:
+                    toxicity_score = 0.0
+                # 청크 세그먼트 생성
+                chunk = ChunkedSegment(
+                    start=overlap_start,
+                    end=overlap_end,
+                    transcript=chunk_transcript,
+                    toxicity_score=float(toxicity_score),
+                )
+                chunked_segments.append(chunk)
+            # 유해성 판단 (임계값 0.5 적용)
+            is_toxic = max_toxicity > 0.5
+            # AnalyzedTranscript 객체 생성
+            analyzed_transcript = AnalyzedTranscript(
+                video_id=video_id,
+                chunk_count=len(chunked_segments),
+                chunked_segments=chunked_segments,
+                is_toxic=is_toxic,
+            )
+            # 결과 저장
+            analyzed_transcripts[video_id] = analyzed_transcript
+            # 중간 저장 (매 영상 분석 후)
+            with open("./data/analyzed_transcripts.json", "w", encoding="utf-8") as f:
+                json.dump(
+                    {
+                        "analyzed_transcripts": {
+                            vid: asdict(transcript)
+                            for vid, transcript in analyzed_transcripts.items()
+                        }
+                    },
+                    f,
+                    ensure_ascii=False,
+                    indent=2,
+                )
+            print(
+                f"Video {video_id}: 분석 완료 (유해성: {is_toxic}, 최대 점수: {max_toxicity:.3f})"
+            )
+        except Exception as e:
+            print(f"Video {video_id} 처리 중 오류 발생: {str(e)}")
+            continue
+    return analyzed_transcripts
+if __name__ == "__main__":
+    import math
+    from dataclasses import asdict
+    # 데이터 매니저와 분류기 초기화
+    data_manager = DataManager()
+    classifier = ToxcitiyClassifier()
+    # 배치 ���리 실행
+    results = batch_analyze_transcripts(data_manager, classifier)
+    # 최종 통계
+    total_analyzed = len(results)
+    total_toxic = sum(1 for transcript in results.values() if transcript.is_toxic)
+    print("\n분석 완료 통계:")
+    print(f"총 처리된 영상: {total_analyzed}")
+    print(f"유해 판정 영상: {total_toxic}")
+    print(f"유해 비율: {(total_toxic/total_analyzed)*100:.1f}%")