"""Batch toxicity analysis of video transcripts: chunk, score, and checkpoint results to JSON."""
import json
import math
from dataclasses import asdict, is_dataclass
from typing import Dict

import tqdm

from ai.classifier import ToxcitiyClassifier
from core.data_manager import DataManager
from models.schemas import AnalyzedTranscript, ChunkedSegment
def _load_existing_results(output_path: str) -> dict:
    """Load previously saved analysis results, or return an empty dict if none exist.

    NOTE: values loaded here are plain dicts (deserialized JSON), not
    AnalyzedTranscript instances — downstream code must tolerate both.
    """
    try:
        with open(output_path, "r", encoding="utf-8") as f:
            return json.load(f).get("analyzed_transcripts", {})
    except FileNotFoundError:
        return {}


def _save_results(output_path: str, analyzed_transcripts: dict) -> None:
    """Persist all results to *output_path* as JSON (checkpoint after each video).

    Values may be AnalyzedTranscript dataclasses (freshly analyzed) or plain
    dicts (resumed from a previous run); only dataclasses go through asdict().
    The original code called asdict() unconditionally, which raised TypeError
    on the first save of any resumed run.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "analyzed_transcripts": {
                    vid: asdict(entry) if is_dataclass(entry) else entry
                    for vid, entry in analyzed_transcripts.items()
                }
            },
            f,
            ensure_ascii=False,
            indent=2,
        )


def _chunk_and_score(
    transcript,
    classifier: ToxcitiyClassifier,
    chunk_size: int,
    overlap: int,
):
    """Split one transcript into overlapping time chunks and score each chunk.

    Returns:
        (chunked_segments, max_toxicity): list of ChunkedSegment plus the
        highest toxicity score seen across all chunks.
    """
    # Total video length = end time of the latest-ending segment.
    # Caller guarantees transcript_segments is non-empty.
    total_duration = max(
        seg["start"] + seg["duration"]
        for seg in transcript.transcript_segments
    )
    num_chunks = math.ceil(total_duration / chunk_size)
    chunked_segments = []
    max_toxicity = 0.0
    for i in range(num_chunks):
        chunk_start = i * chunk_size
        chunk_end = (i + 1) * chunk_size
        # Widen the window on both sides so segments straddling a chunk
        # boundary are not lost.
        overlap_start = max(0, chunk_start - overlap)
        overlap_end = min(total_duration, chunk_end + overlap)
        # Collect text from every segment intersecting the widened window.
        texts = [
            seg["text"]
            for seg in transcript.transcript_segments
            if not (
                seg["start"] + seg["duration"] < overlap_start
                or seg["start"] > overlap_end
            )
        ]
        chunk_transcript = " ".join(texts)
        # Only run inference on non-empty text; empty chunks score 0.0.
        if chunk_transcript.strip():
            toxicity_score = classifier.infer(chunk_transcript)
            max_toxicity = max(max_toxicity, toxicity_score)
        else:
            toxicity_score = 0.0
        chunked_segments.append(
            ChunkedSegment(
                start=overlap_start,
                end=overlap_end,
                transcript=chunk_transcript,
                toxicity_score=float(toxicity_score),
            )
        )
    return chunked_segments, max_toxicity


def batch_analyze_transcripts(
    data_manager: DataManager,
    classifier: ToxcitiyClassifier,
    chunk_size: int = 60,
    overlap: int = 10,
    output_path: str = "./data/analyzed_transcripts.json",
) -> Dict[str, AnalyzedTranscript]:
    """Analyze every transcript for toxicity and checkpoint results to disk.

    Each transcript is split into fixed-size time chunks (with symmetric
    overlap so boundary-straddling segments are kept), every chunk is scored
    by the classifier, and the cumulative results are written to
    *output_path* after each video so progress survives interruption.
    Videos already present in the results file are skipped on resume.

    Args:
        data_manager: Provides raw transcript data and per-video lookup.
        classifier: Toxicity model exposing ``infer(text) -> float``.
        chunk_size: Chunk length in seconds.
        overlap: Seconds of extra context added on both sides of each chunk.
        output_path: JSON file used both to resume and to store results.

    Returns:
        Mapping of video_id to analysis result. Freshly analyzed entries are
        AnalyzedTranscript instances; entries resumed from disk are dicts.
    """
    analyzed_transcripts = _load_existing_results(output_path)

    all_transcripts = data_manager.transcript_data
    print(f"์ด {len(all_transcripts)}๊ฐ์ ํธ๋์คํฌ๋ฆฝํธ ์ฒ๋ฆฌ ์์...")
    for transcript_data in tqdm.tqdm(all_transcripts):
        video_id = transcript_data.get("video_id")
        # Skip videos already analyzed in a previous (or this) run.
        if video_id in analyzed_transcripts:
            print(f"Video {video_id}: ์ด๋ฏธ ๋ถ์๋จ, ๊ฑด๋๋ฐ๊ธฐ")
            continue
        transcript = data_manager.get_transcript_by_video_id(video_id)
        if transcript is None:
            print(f"Video {video_id}: ํธ๋์คํฌ๋ฆฝํธ๋ฅผ ์ฐพ์ ์ ์์")
            continue
        # Guard: max() over zero segments would raise a cryptic ValueError.
        if not transcript.transcript_segments:
            print(f"Video {video_id}: no transcript segments, skipping")
            continue
        try:
            chunked_segments, max_toxicity = _chunk_and_score(
                transcript, classifier, chunk_size, overlap
            )
            # Toxicity verdict uses a fixed 0.5 threshold on the max score.
            is_toxic = max_toxicity > 0.5
            analyzed_transcripts[video_id] = AnalyzedTranscript(
                video_id=video_id,
                chunk_count=len(chunked_segments),
                chunked_segments=chunked_segments,
                is_toxic=is_toxic,
            )
            # Checkpoint after every video so a crash loses at most one.
            _save_results(output_path, analyzed_transcripts)
            print(
                f"Video {video_id}: ๋ถ์ ์๋ฃ (์ ํด์ฑ: {is_toxic}, ์ต๋ ์ ์: {max_toxicity:.3f})"
            )
        except Exception as e:
            # Best-effort batch: log and move on to the next video.
            print(f"Video {video_id} ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
            continue
    return analyzed_transcripts
if __name__ == "__main__":
    # Initialize the data manager and the toxicity classifier.
    data_manager = DataManager()
    classifier = ToxcitiyClassifier()

    # Run the batch analysis (resumes from any existing results file).
    results = batch_analyze_transcripts(data_manager, classifier)

    # Final statistics. Resumed entries are plain dicts while fresh ones are
    # dataclasses, so read is_toxic from either shape.
    total_analyzed = len(results)
    total_toxic = sum(
        1
        for entry in results.values()
        if (entry.get("is_toxic") if isinstance(entry, dict) else entry.is_toxic)
    )
    print("\n๋ถ์ ์๋ฃ ํต๊ณ:")
    print(f"์ด ์ฒ๋ฆฌ๋ ์์: {total_analyzed}")
    print(f"์ ํด ํ์ ์์: {total_toxic}")
    # Avoid ZeroDivisionError when nothing was analyzed.
    if total_analyzed:
        print(f"์ ํด ๋น์จ: {(total_toxic/total_analyzed)*100:.1f}%")