# Hugging Face Spaces page residue (status header "Spaces: Sleeping") —
# kept as a comment so this file remains valid Python.
import json
import math
from dataclasses import asdict, is_dataclass
from typing import Dict

import tqdm

from ai.classifier import ToxcitiyClassifier
from core.data_manager import DataManager
from models.schemas import AnalyzedTranscript, ChunkedSegment
def _load_checkpoint(path: str) -> dict:
    """Load previously saved analysis results, or an empty dict if none exist.

    NOTE: values restored here are plain JSON dicts, not AnalyzedTranscript
    instances; they are used for skip-detection and re-serialization only.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f).get("analyzed_transcripts", {})
    except FileNotFoundError:
        return {}


def _save_checkpoint(path: str, analyzed_transcripts: dict) -> None:
    """Persist every result to *path* as JSON.

    Fix: entries loaded from a previous run are plain dicts, and calling
    dataclasses.asdict() on a dict raises TypeError — which the caller's
    broad except then misreported as a per-video analysis failure (losing
    the new result). Dicts are written as-is; only dataclass instances are
    converted.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "analyzed_transcripts": {
                    vid: asdict(t) if is_dataclass(t) else t
                    for vid, t in analyzed_transcripts.items()
                }
            },
            f,
            ensure_ascii=False,
            indent=2,
        )


def _chunk_and_score(transcript, classifier, chunk_size: int, overlap: int):
    """Split a transcript into overlapping time windows and score each one.

    Returns (chunked_segments, max_toxicity). Assumes each entry of
    transcript.transcript_segments has "start", "duration" and "text" keys,
    as the original implementation did — TODO confirm against the schema.
    """
    segments = transcript.transcript_segments
    # Total video length = end time of the latest-ending segment.
    total_duration = max(s["start"] + s["duration"] for s in segments)
    chunked_segments = []
    max_toxicity = 0.0
    for i in range(math.ceil(total_duration / chunk_size)):
        # Window = the chunk plus `overlap` seconds of context on each side,
        # clamped to the video bounds.
        window_start = max(0, i * chunk_size - overlap)
        window_end = min(total_duration, (i + 1) * chunk_size + overlap)
        # Collect the text of every segment that intersects the window.
        texts = [
            s["text"]
            for s in segments
            if not (
                s["start"] + s["duration"] < window_start
                or s["start"] > window_end
            )
        ]
        chunk_text = " ".join(texts)
        if chunk_text.strip():  # only run inference on non-empty text
            score = float(classifier.infer(chunk_text))
            max_toxicity = max(max_toxicity, score)
        else:
            score = 0.0
        chunked_segments.append(
            ChunkedSegment(
                start=window_start,
                end=window_end,
                transcript=chunk_text,
                toxicity_score=score,
            )
        )
    return chunked_segments, max_toxicity


def batch_analyze_transcripts(
    data_manager: DataManager,
    classifier: ToxcitiyClassifier,
    chunk_size: int = 60,
    overlap: int = 10,
) -> Dict[str, AnalyzedTranscript]:
    """Analyze every transcript for toxicity and checkpoint results to disk.

    Args:
        data_manager: provides `transcript_data` and per-video transcript lookup.
        classifier: toxicity model exposing `infer(text) -> score`.
        chunk_size: chunk length in seconds.
        overlap: extra context on both sides of each chunk, in seconds.

    Returns:
        Mapping of video_id to its analysis result. Freshly analyzed entries
        are AnalyzedTranscript instances; entries restored from an earlier
        run remain the plain dicts loaded from JSON.
    """
    checkpoint_path = "./data/analyzed_transcripts.json"
    analyzed_transcripts = _load_checkpoint(checkpoint_path)

    all_transcripts = data_manager.transcript_data
    print(f"์ด {len(all_transcripts)}๊ฐ์ ํธ๋์คํฌ๋ฆฝํธ ์ฒ๋ฆฌ ์์...")
    for transcript_data in tqdm.tqdm(all_transcripts):
        video_id = transcript_data.get("video_id")
        # Skip videos already present in the checkpoint.
        if video_id in analyzed_transcripts:
            print(f"Video {video_id}: ์ด๋ฏธ ๋ถ์๋จ, ๊ฑด๋๋ฐ๊ธฐ")
            continue
        transcript = data_manager.get_transcript_by_video_id(video_id)
        if transcript is None:
            print(f"Video {video_id}: ํธ๋์คํฌ๋ฆฝํธ๋ฅผ ์ฐพ์ ์ ์์")
            continue
        try:
            chunked_segments, max_toxicity = _chunk_and_score(
                transcript, classifier, chunk_size, overlap
            )
            # A video is toxic if any chunk exceeds the 0.5 threshold.
            is_toxic = max_toxicity > 0.5
            analyzed_transcripts[video_id] = AnalyzedTranscript(
                video_id=video_id,
                chunk_count=len(chunked_segments),
                chunked_segments=chunked_segments,
                is_toxic=is_toxic,
            )
            # Checkpoint after every video so progress survives a crash.
            _save_checkpoint(checkpoint_path, analyzed_transcripts)
            print(
                f"Video {video_id}: ๋ถ์ ์๋ฃ (์ ํด์ฑ: {is_toxic}, ์ต๋ ์ ์: {max_toxicity:.3f})"
            )
        except Exception as e:
            # Best-effort batch: log the failure and continue with the rest.
            print(f"Video {video_id} ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
            continue
    return analyzed_transcripts
if __name__ == "__main__":
    # Initialize the data manager and the toxicity classifier.
    data_manager = DataManager()
    classifier = ToxcitiyClassifier()

    # Run the batch analysis (skips videos already saved in the checkpoint).
    results = batch_analyze_transcripts(data_manager, classifier)

    # Final statistics. Entries restored from a previous run's JSON are plain
    # dicts while fresh ones are dataclass instances, so read the toxicity
    # flag in a form that works for both (the original `t.is_toxic` raised
    # AttributeError on resumed runs).
    total_analyzed = len(results)
    total_toxic = sum(
        1
        for t in results.values()
        if (t["is_toxic"] if isinstance(t, dict) else t.is_toxic)
    )
    print("\n๋ถ์ ์๋ฃ ํต๊ณ:")
    print(f"์ด ์ฒ๋ฆฌ๋ ์์: {total_analyzed}")
    print(f"์ ํด ํ์ ์์: {total_toxic}")
    # Avoid ZeroDivisionError when no transcripts were processed at all.
    if total_analyzed:
        print(f"์ ํด ๋น์จ: {(total_toxic/total_analyzed)*100:.1f}%")
    else:
        print("์ ํด ๋น์จ: N/A")