518_yt_monitor/src/scripts/process_all_transcripts.py
import json
import math
from dataclasses import asdict
from typing import Dict

import tqdm

from ai.classifier import ToxcitiyClassifier
from core.data_manager import DataManager
from models.schemas import AnalyzedTranscript, ChunkedSegment
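
# Note: ChunkedSegment and AnalyzedTranscript are treated as dataclasses below
# (dataclasses.asdict is used for serialization). A minimal sketch of the
# assumed shapes, for reference only -- not the actual models.schemas code:
#
#     @dataclass
#     class ChunkedSegment:
#         start: float
#         end: float
#         transcript: str
#         toxicity_score: float
#
#     @dataclass
#     class AnalyzedTranscript:
#         video_id: str
#         chunk_count: int
#         chunked_segments: List[ChunkedSegment]
#         is_toxic: bool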


def batch_analyze_transcripts(
    data_manager: DataManager,
    classifier: ToxcitiyClassifier,
    chunk_size: int = 60,
    overlap: int = 10,
) -> Dict[str, AnalyzedTranscript]:
    """Analyze every transcript and save the results."""
    # Load previously analyzed results so finished videos can be skipped;
    # rebuild dataclass instances so asdict() works when re-serializing
    try:
        with open("./data/analyzed_transcripts.json", "r", encoding="utf-8") as f:
            existing_data = json.load(f)
        analyzed_transcripts = {
            vid: AnalyzedTranscript(
                video_id=entry["video_id"],
                chunk_count=entry["chunk_count"],
                chunked_segments=[
                    ChunkedSegment(**chunk) for chunk in entry["chunked_segments"]
                ],
                is_toxic=entry["is_toxic"],
            )
            for vid, entry in existing_data.get("analyzed_transcripts", {}).items()
        }
    except FileNotFoundError:
        analyzed_transcripts = {}
# ๋ชจ๋“  ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ฒ˜๋ฆฌ
all_transcripts = data_manager.transcript_data
print(f"์ด {len(all_transcripts)}๊ฐœ์˜ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ฒ˜๋ฆฌ ์‹œ์ž‘...")
for transcript_data in tqdm.tqdm(all_transcripts):
video_id = transcript_data.get("video_id")
# ์ด๋ฏธ ๋ถ„์„๋œ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ๋Š” ๊ฑด๋„ˆ๋›ฐ๊ธฐ
if video_id in analyzed_transcripts:
print(f"Video {video_id}: ์ด๋ฏธ ๋ถ„์„๋จ, ๊ฑด๋„ˆ๋›ฐ๊ธฐ")
continue
# ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ๋ฐ์ดํ„ฐ ์ค€๋น„
transcript = data_manager.get_transcript_by_video_id(video_id)
if transcript is None:
print(f"Video {video_id}: ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Œ")
continue
        try:
            # Compute the total video duration from the last-ending segment
            total_duration = max(
                segment["start"] + segment["duration"]
                for segment in transcript.transcript_segments
            )

            # Split the video into fixed-size chunks
            num_chunks = math.ceil(total_duration / chunk_size)
            chunked_segments = []
            is_toxic = False
            max_toxicity = 0.0

            for i in range(num_chunks):
                chunk_start = i * chunk_size
                chunk_end = (i + 1) * chunk_size

                # Widen the chunk by the overlap on both sides
                overlap_start = max(0, chunk_start - overlap)
                overlap_end = min(total_duration, chunk_end + overlap)
# ํ•ด๋‹น ์ฒญํฌ์— ํฌํ•จ๋  ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘
chunk_text = []
for segment in transcript.transcript_segments:
segment_start = segment["start"]
segment_end = segment_start + segment["duration"]
if not (segment_end < overlap_start or segment_start > overlap_end):
chunk_text.append(segment["text"])
# ์ฒญํฌ ํ…์ŠคํŠธ ์ƒ์„ฑ
chunk_transcript = " ".join(chunk_text)
# toxicity inference ์ˆ˜ํ–‰
if chunk_transcript.strip(): # ๋นˆ ํ…์ŠคํŠธ๊ฐ€ ์•„๋‹Œ ๊ฒฝ์šฐ๋งŒ ๋ถ„์„
toxicity_score = classifier.infer(chunk_transcript)
max_toxicity = max(max_toxicity, toxicity_score)
else:
toxicity_score = 0.0
# ์ฒญํฌ ์„ธ๊ทธ๋จผํŠธ ์ƒ์„ฑ
chunk = ChunkedSegment(
start=overlap_start,
end=overlap_end,
transcript=chunk_transcript,
toxicity_score=float(toxicity_score),
)
chunked_segments.append(chunk)
# ์œ ํ•ด์„ฑ ํŒ๋‹จ (์ž„๊ณ„๊ฐ’ 0.5 ์ ์šฉ)
is_toxic = max_toxicity > 0.5
# AnalyzedTranscript ๊ฐ์ฒด ์ƒ์„ฑ
analyzed_transcript = AnalyzedTranscript(
video_id=video_id,
chunk_count=len(chunked_segments),
chunked_segments=chunked_segments,
is_toxic=is_toxic,
)
# ๊ฒฐ๊ณผ ์ €์žฅ
analyzed_transcripts[video_id] = analyzed_transcript
# ์ค‘๊ฐ„ ์ €์žฅ (๋งค ์˜์ƒ ๋ถ„์„ ํ›„)
with open("./data/analyzed_transcripts.json", "w", encoding="utf-8") as f:
json.dump(
{
"analyzed_transcripts": {
vid: asdict(transcript)
for vid, transcript in analyzed_transcripts.items()
}
},
f,
ensure_ascii=False,
indent=2,
)
print(
f"Video {video_id}: ๋ถ„์„ ์™„๋ฃŒ (์œ ํ•ด์„ฑ: {is_toxic}, ์ตœ๋Œ€ ์ ์ˆ˜: {max_toxicity:.3f})"
)
except Exception as e:
print(f"Video {video_id} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
continue
return analyzed_transcripts
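
# Example (illustrative, not part of the pipeline): looking up a single
# video's verdict from the returned mapping might look like this; the
# video id below is a placeholder.
#
#     results = batch_analyze_transcripts(DataManager(), ToxcitiyClassifier())
#     report = results.get("some_video_id")
#     if report is not None and report.is_toxic:
#         worst = max(report.chunked_segments, key=lambda c: c.toxicity_score)
#         print(worst.start, worst.end, worst.toxicity_score)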


if __name__ == "__main__":
    # Initialize the data manager and classifier
    data_manager = DataManager()
    classifier = ToxcitiyClassifier()

    # Run the batch analysis
    results = batch_analyze_transcripts(data_manager, classifier)

    # Final statistics
    total_analyzed = len(results)
    total_toxic = sum(1 for transcript in results.values() if transcript.is_toxic)

    print("\nAnalysis summary:")
    print(f"Total videos processed: {total_analyzed}")
    print(f"Videos flagged as toxic: {total_toxic}")
    if total_analyzed:
        print(f"Toxic ratio: {(total_toxic / total_analyzed) * 100:.1f}%")