atoye1 commited on
Commit
591c7e2
·
1 Parent(s): db5a958

adding updated datafiles

Browse files
data/.DS_Store CHANGED
Binary files a/data/.DS_Store and b/data/.DS_Store differ
 
src/app.py CHANGED
@@ -29,6 +29,8 @@ if "data_manager" not in st.session_state:
29
  if "toxicity_classifier" not in st.session_state:
30
  toxicity_classifier = load_inference_model()
31
  st.session_state["toxicity_classifier"] = toxicity_classifier
 
 
32
 
33
 
34
  pg = st.navigation(
 
29
  if "toxicity_classifier" not in st.session_state:
30
  toxicity_classifier = load_inference_model()
31
  st.session_state["toxicity_classifier"] = toxicity_classifier
32
+ if "analyzed_transcripts" not in st.session_state:
33
+ st.session_state["analyzed_transcripts"] = {}
34
 
35
 
36
  pg = st.navigation(
src/pages/video_detail.py CHANGED
@@ -116,36 +116,35 @@ def analyze():
116
  )
117
  if analyzed_transcript is None:
118
  analyzed_transcript = gen_analyzed_transcript()
119
- if analyzed_transcript:
120
- data_manager.set_analyzed_transcript(analyzed_transcript)
121
- st.success("๋ถ„์„์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!")
122
- else:
123
- st.error("๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค.")
124
- st.rerun() # ๊ฒฐ๊ณผ๋ฅผ ์ฆ‰์‹œ ํ‘œ์‹œํ•˜๊ธฐ ์œ„ํ•œ ํŽ˜์ด์ง€ ๋ฆฌ๋กœ๋“œ
125
- return analyzed_transcript
126
 
127
 
128
  analyzed_result = st.button("๋ถ„์„", icon="๐Ÿ”„", on_click=analyze)
129
 
130
  # ๋ถ„์„ ๊ฒฐ๊ณผ ํ‘œ์‹œ
131
- if "analyzed_transcript" not in st.session_state:
132
- st.session_state.analyzed_transcript = (
133
- data_manager.get_analyzed_transcript_by_video_id(current_video.video_id)
134
- )
135
 
136
  if st.session_state.analyzed_transcript:
137
  st.subheader("ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ๋ถ„์„ ๊ฒฐ๊ณผ")
138
 
139
- # ๊ฒฐ๊ณผ๋ฅผ ํ‘œ ํ˜•์‹์œผ๋กœ ํ‘œ์‹œ
140
- print(st.session_state.analyzed_transcript)
141
- print(type(st.session_state.analyzed_transcript))
142
- print("=" * 30)
143
  for i, segment in enumerate(
144
  st.session_state.analyzed_transcript.get("chunked_segments", [])
145
  ):
 
 
 
 
 
 
146
  with st.expander(
147
- f"์ฒญํฌ {i+1} ({segment['start']:.1f}s - {segment['end']:.1f}s)"
 
148
  ):
149
  st.text(segment["transcript"])
150
  st.progress(float(segment["toxicity_score"]))
151
  st.text(f"์œ ํ•ด๋„ ์ ์ˆ˜: {segment['toxicity_score']:.3f}")
 
 
116
  )
117
  if analyzed_transcript is None:
118
  analyzed_transcript = gen_analyzed_transcript()
119
+ data_manager.set_analyzed_transcript(analyzed_transcript)
120
+ st.success("๋ถ„์„์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!")
121
+ st.session_state.analyzed_transcript = analyzed_transcript
 
 
 
 
122
 
123
 
124
  analyzed_result = st.button("๋ถ„์„", icon="๐Ÿ”„", on_click=analyze)
125
 
126
  # ๋ถ„์„ ๊ฒฐ๊ณผ ํ‘œ์‹œ
127
+ st.session_state.analyzed_transcript = data_manager.get_analyzed_transcript_by_video_id(
128
+ current_video.video_id
129
+ )
 
130
 
131
  if st.session_state.analyzed_transcript:
132
  st.subheader("ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ๋ถ„์„ ๊ฒฐ๊ณผ")
133
 
 
 
 
 
134
  for i, segment in enumerate(
135
  st.session_state.analyzed_transcript.get("chunked_segments", [])
136
  ):
137
+ color = "green"
138
+ if segment["toxicity_score"] > 0.7:
139
+ color = "red"
140
+ elif segment["toxicity_score"] > 0.5:
141
+ color = "orange"
142
+
143
  with st.expander(
144
+ f"์ฒญํฌ {i+1} ({segment['start']:.1f}s - {segment['end']:.1f}s) - ์œ ํ•ด๋„ ์ ์ˆ˜: {segment['toxicity_score']:.3f}",
145
+ expanded=True,
146
  ):
147
  st.text(segment["transcript"])
148
  st.progress(float(segment["toxicity_score"]))
149
  st.text(f"์œ ํ•ด๋„ ์ ์ˆ˜: {segment['toxicity_score']:.3f}")
150
+ st.markdown("</div>", unsafe_allow_html=True)
src/scripts/collect_channel_info.py CHANGED
@@ -8,6 +8,7 @@ from typing import Dict, List
8
  from google.oauth2 import service_account
9
  from googleapiclient.errors import HttpError
10
 
 
11
  from core.youtube_api import YouTubeAPI
12
 
13
  # ๋กœ๊น… ์„ค์ •
@@ -43,14 +44,13 @@ def collect_channel_info(max_retries: int = 3, retry_delay: int = 5) -> List[Dic
43
  youtube_api = YouTubeAPI(credentials)
44
 
45
  # ์ฑ„๋„ ๋ชฉ๋ก
46
- from core.config import target_channel_handles
47
 
48
  # ์ „์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ €์žฅํ•  ๋ฆฌ์ŠคํŠธ
49
  all_channels = []
50
  failed_channels = []
51
 
52
  # ๊ฐ ์ฑ„๋„ ์ •๋ณด ์ˆ˜์ง‘
53
- for handle in target_channel_ids:
54
  clean_handle = handle.replace("@", "")
55
  logger.info(f"\n์ฑ„๋„ ์ •๋ณด ์ˆ˜์ง‘ ์‹œ๋„: {handle}")
56
 
@@ -91,7 +91,7 @@ def collect_channel_info(max_retries: int = 3, retry_delay: int = 5) -> List[Dic
91
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
92
  result = {
93
  "collected_at": datetime.now().isoformat(),
94
- "total_channels": len(target_channel_ids),
95
  "successful_channels": len(all_channels),
96
  "failed_channels": len(failed_channels),
97
  "channels": all_channels,
 
8
  from google.oauth2 import service_account
9
  from googleapiclient.errors import HttpError
10
 
11
+ from core.config import target_channel_handles
12
  from core.youtube_api import YouTubeAPI
13
 
14
  # ๋กœ๊น… ์„ค์ •
 
44
  youtube_api = YouTubeAPI(credentials)
45
 
46
  # ์ฑ„๋„ ๋ชฉ๋ก
 
47
 
48
  # ์ „์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ €์žฅํ•  ๋ฆฌ์ŠคํŠธ
49
  all_channels = []
50
  failed_channels = []
51
 
52
  # ๊ฐ ์ฑ„๋„ ์ •๋ณด ์ˆ˜์ง‘
53
+ for handle in target_channel_handles:
54
  clean_handle = handle.replace("@", "")
55
  logger.info(f"\n์ฑ„๋„ ์ •๋ณด ์ˆ˜์ง‘ ์‹œ๋„: {handle}")
56
 
 
91
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
92
  result = {
93
  "collected_at": datetime.now().isoformat(),
94
+ "total_channels": len(target_channel_handles),
95
  "successful_channels": len(all_channels),
96
  "failed_channels": len(failed_channels),
97
  "channels": all_channels,
src/scripts/collect_transcript.py CHANGED
@@ -18,6 +18,67 @@ logging.basicConfig(
18
  logger = logging.getLogger(__name__)
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def collect_video_transcripts(
22
  max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
23
  ) -> List[Dict]:
@@ -31,51 +92,35 @@ def collect_video_transcripts(
31
  """
32
  output_dir = Path("data")
33
  output_dir.mkdir(parents=True, exist_ok=True)
 
 
34
 
35
  # ๋น„๋””์˜ค ์ •๋ณด ๋กœ๋“œ
36
- try:
37
- with open(videos_file, "r", encoding="utf-8") as f:
38
- videos_data = json.load(f)
39
- videos = videos_data.get("videos", [])
40
- except Exception as e:
41
- logger.error(f"๋น„๋””์˜ค ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {str(e)}")
42
- return []
43
 
44
  # ๊ฒฐ๊ณผ ์ €์žฅ์šฉ ๋ฆฌ์ŠคํŠธ
45
- all_transcripts = []
46
  failed_videos = []
47
 
 
 
 
48
  # ๊ฐ ๋น„๋””์˜ค์˜ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘
49
  total_videos = len(videos)
50
  for idx, video in enumerate(videos, 1):
51
  video_id = video["video_id"]
 
 
 
 
 
 
52
  logger.info(
53
  f"\n[{idx}/{total_videos}] ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์‹œ๋„: {video_id} - {video['title']}"
54
  )
55
 
56
- # ์žฌ์‹œ๋„ ๋กœ์ง
57
- transcript_segments = None
58
- error_message = None
59
-
60
- for attempt in range(max_retries):
61
- try:
62
- transcript_list = YouTubeTranscriptApi.get_transcript(
63
- video_id, languages=["ko", "en"]
64
- )
65
- transcript_segments = transcript_list
66
- break
67
- except (TranscriptsDisabled, NoTranscriptFound) as e:
68
- error_message = f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์—†์Œ: {str(e)}"
69
- break
70
- except Exception as e:
71
- if attempt < max_retries - 1:
72
- wait_time = retry_delay * (attempt + 1)
73
- logger.warning(
74
- f"์˜ค๋ฅ˜ ๋ฐœ์ƒ (์žฌ์‹œ๋„ {attempt + 1}/{max_retries}), {wait_time}์ดˆ ํ›„ ์žฌ์‹œ๋„..."
75
- )
76
- time.sleep(wait_time)
77
- else:
78
- error_message = f"์ตœ๋Œ€ ์žฌ์‹œ๋„ ํšŸ์ˆ˜ ์ดˆ๊ณผ: {str(e)}"
79
 
80
  if transcript_segments:
81
  transcript_info = {
@@ -87,7 +132,7 @@ def collect_video_transcripts(
87
  "collected_at": datetime.now().isoformat(),
88
  }
89
  all_transcripts.append(transcript_info)
90
- logger.info(f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์„ฑ๊ณต")
91
  else:
92
  failed_videos.append(
93
  {
@@ -99,11 +144,16 @@ def collect_video_transcripts(
99
  )
100
  logger.warning(f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์‹คํŒจ: {error_message}")
101
 
 
 
 
 
102
  # API ํ• ๋‹น๋Ÿ‰ ๋ณดํ˜ธ๋ฅผ ์œ„ํ•œ ๋Œ€๊ธฐ
103
- time.sleep(1)
104
 
105
- # ๊ฒฐ๊ณผ ์ €์žฅ
106
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
107
  result = {
108
  "collected_at": datetime.now().isoformat(),
109
  "total_videos": total_videos,
@@ -112,26 +162,14 @@ def collect_video_transcripts(
112
  "transcripts": all_transcripts,
113
  "failures": failed_videos,
114
  }
 
115
 
116
- # ๊ฒฐ๊ณผ ํŒŒ์ผ ์ €์žฅ
117
- output_file = output_dir / f"transcripts_{timestamp}.json"
118
- try:
119
- with open(output_file, "w", encoding="utf-8") as f:
120
- json.dump(result, f, ensure_ascii=False, indent=2)
121
- logger.info(f"\n๊ฒฐ๊ณผ ์ €์žฅ ์™„๋ฃŒ: {output_file}")
122
- logger.info(
123
- f"์ด {len(all_transcripts)}๊ฐœ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์™„๋ฃŒ (์‹คํŒจ: {len(failed_videos)}๊ฐœ)"
124
- )
125
-
126
- if failed_videos:
127
- logger.warning("\n์‹คํŒจํ•œ ๋น„๋””์˜ค๋“ค:")
128
- for fail in failed_videos:
129
- logger.warning(
130
- f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
131
- )
132
- except Exception as e:
133
- logger.error(f"๊ฒฐ๊ณผ ํŒŒ์ผ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
134
- return all_transcripts
135
 
136
  return all_transcripts
137
 
 
18
  logger = logging.getLogger(__name__)
19
 
20
 
21
+ def load_existing_transcripts(file_path: Path) -> Dict:
22
+ """๊ธฐ์กด ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ๋ฐ์ดํ„ฐ ๋กœ๋“œ"""
23
+ if not file_path.exists():
24
+ return {"transcripts": []}
25
+ try:
26
+ with open(file_path, "r", encoding="utf-8") as f:
27
+ data = json.load(f)
28
+ return data
29
+ except Exception as e:
30
+ logger.error(f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {e}")
31
+ return {"transcripts": []}
32
+
33
+
34
+ def load_video_info(videos_file: str) -> List[Dict]:
35
+ """๋น„๋””์˜ค ์ •๋ณด๋ฅผ ๋กœ๋“œํ•˜๋Š” ํ•จ์ˆ˜"""
36
+ try:
37
+ with open(videos_file, "r", encoding="utf-8") as f:
38
+ videos_data = json.load(f)
39
+ return videos_data.get("videos", [])
40
+ except Exception as e:
41
+ logger.error(f"๋น„๋””์˜ค ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {str(e)}")
42
+ return []
43
+
44
+
45
+ def fetch_transcript(video_id: str, max_retries: int, retry_delay: int) -> Dict:
46
+ """๊ฐœ๋ณ„ ๋น„๋””์˜ค์˜ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ๋ฅผ API๋กœ ํ˜ธ์ถœํ•˜๋Š” ํ•จ์ˆ˜"""
47
+ for attempt in range(max_retries):
48
+ try:
49
+ transcript_list = YouTubeTranscriptApi.get_transcript(
50
+ video_id, languages=["ko", "en"]
51
+ )
52
+ return {"transcript_segments": transcript_list, "error": None}
53
+ except (TranscriptsDisabled, NoTranscriptFound) as e:
54
+ return {
55
+ "transcript_segments": None,
56
+ "error": f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์—†์Œ: {str(e)}",
57
+ }
58
+ except Exception as e:
59
+ if attempt < max_retries - 1:
60
+ wait_time = retry_delay * (attempt + 1)
61
+ logger.warning(
62
+ f"์˜ค๋ฅ˜ ๋ฐœ์ƒ (์žฌ์‹œ๋„ {attempt + 1}/{max_retries}), {wait_time}์ดˆ ํ›„ ์žฌ์‹œ๋„..."
63
+ )
64
+ time.sleep(wait_time)
65
+ else:
66
+ return {
67
+ "transcript_segments": None,
68
+ "error": f"์ตœ๋Œ€ ์žฌ์‹œ๋„ ํšŸ์ˆ˜ ์ดˆ๊ณผ: {str(e)}",
69
+ }
70
+
71
+
72
+ def save_transcripts_to_file(transcripts: List[Dict], output_file: Path):
73
+ """ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ๋ฅผ ํŒŒ์ผ์— ์ €์žฅํ•˜๋Š” ํ•จ์ˆ˜"""
74
+ try:
75
+ with open(output_file, "w", encoding="utf-8") as f:
76
+ json.dump(transcripts, f, ensure_ascii=False, indent=2)
77
+ logger.info(f"\n๊ฒฐ๊ณผ ์ €์žฅ ์™„๋ฃŒ: {output_file}")
78
+ except Exception as e:
79
+ logger.error(f"๊ฒฐ๊ณผ ํŒŒ์ผ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
80
+
81
+
82
  def collect_video_transcripts(
83
  max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
84
  ) -> List[Dict]:
 
92
  """
93
  output_dir = Path("data")
94
  output_dir.mkdir(parents=True, exist_ok=True)
95
+ output_file = output_dir / "transcripts_cache.json"
96
+ all_transcripts = load_existing_transcripts(output_file).get("transcripts", [])
97
 
98
  # ๋น„๋””์˜ค ์ •๋ณด ๋กœ๋“œ
99
+ videos = load_video_info(videos_file)
 
 
 
 
 
 
100
 
101
  # ๊ฒฐ๊ณผ ์ €์žฅ์šฉ ๋ฆฌ์ŠคํŠธ
 
102
  failed_videos = []
103
 
104
+ # ์ด๋ฏธ ์ˆ˜์ง‘๋œ ๋น„๋””์˜ค ์•„์ด๋”” ๋ชฉ๋ก
105
+ collected_video_ids = {transcript["video_id"] for transcript in all_transcripts}
106
+
107
  # ๊ฐ ๋น„๋””์˜ค์˜ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘
108
  total_videos = len(videos)
109
  for idx, video in enumerate(videos, 1):
110
  video_id = video["video_id"]
111
+
112
+ # ์ด๋ฏธ ์ˆ˜์ง‘๋œ ๋น„๋””์˜ค์ธ ๊ฒฝ์šฐ ํŒจ์Šค
113
+ if video_id in collected_video_ids:
114
+ logger.info(f"\n[{idx}/{total_videos}] ์ด๋ฏธ ์ˆ˜์ง‘๋œ ๋น„๋””์˜ค: {video_id} - {video['title']}")
115
+ continue
116
+
117
  logger.info(
118
  f"\n[{idx}/{total_videos}] ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์‹œ๋„: {video_id} - {video['title']}"
119
  )
120
 
121
+ result = fetch_transcript(video_id, max_retries, retry_delay)
122
+ transcript_segments = result["transcript_segments"]
123
+ error_message = result["error"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  if transcript_segments:
126
  transcript_info = {
 
132
  "collected_at": datetime.now().isoformat(),
133
  }
134
  all_transcripts.append(transcript_info)
135
+ logger.info("ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์„ฑ๊ณต")
136
  else:
137
  failed_videos.append(
138
  {
 
144
  )
145
  logger.warning(f"ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘ ์‹คํŒจ: {error_message}")
146
 
147
+ # 50๊ฐœ๋งˆ๋‹ค ์ค‘๊ฐ„ ์ €์žฅ
148
+ if idx % 50 == 0:
149
+ save_transcripts_to_file({"transcripts": all_transcripts}, output_file)
150
+
151
  # API ํ• ๋‹น๋Ÿ‰ ๋ณดํ˜ธ๋ฅผ ์œ„ํ•œ ๋Œ€๊ธฐ
152
+ time.sleep(0.2)
153
 
154
+ # ์ตœ์ข… ๊ฒฐ๊ณผ ์ €์žฅ
155
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
156
+ final_output_file = output_dir / f"transcripts_{timestamp}.json"
157
  result = {
158
  "collected_at": datetime.now().isoformat(),
159
  "total_videos": total_videos,
 
162
  "transcripts": all_transcripts,
163
  "failures": failed_videos,
164
  }
165
+ save_transcripts_to_file(result, final_output_file)
166
 
167
+ if failed_videos:
168
+ logger.warning("\n์‹คํŒจํ•œ ๋น„๋””์˜ค๋“ค:")
169
+ for fail in failed_videos:
170
+ logger.warning(
171
+ f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
172
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  return all_transcripts
175
 
src/scripts/collect_videos_info.py CHANGED
@@ -6,8 +6,8 @@ from pathlib import Path
6
  from typing import Dict, List
7
 
8
  from google.oauth2 import service_account
9
- from googleapiclient.errors import HttpError
10
 
 
11
  from core.youtube_api import YouTubeAPI
12
 
13
  # ๋กœ๊น… ์„ค์ •
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
18
 
19
 
20
  def collect_videos_info(
21
- max_retries: int = 3, retry_delay: int = 5, videos_per_channel: int = 100
22
  ) -> List[Dict]:
23
  """
24
  ๊ฐ ์ฑ„๋„์˜ ์ตœ์‹  ๋™์˜์ƒ ์ •๋ณด ์ˆ˜์ง‘ ํ•จ์ˆ˜
@@ -46,7 +46,6 @@ def collect_videos_info(
46
  youtube_api = YouTubeAPI(credentials)
47
 
48
  # ์ฑ„๋„ ๋ชฉ๋ก
49
- from core.config import target_channel_handles
50
 
51
  # ์ „์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ €์žฅํ•  ๋ฆฌ์ŠคํŠธ
52
  all_videos = []
@@ -54,7 +53,7 @@ def collect_videos_info(
54
  failed_videos = []
55
 
56
  # ๊ฐ ์ฑ„๋„์˜ ๋™์˜์ƒ ์ •๋ณด ์ˆ˜์ง‘
57
- for handle in target_channel_ids:
58
  clean_handle = handle.replace("@", "")
59
  logger.info(f"\n์ฑ„๋„ ๋™์˜์ƒ ์ˆ˜์ง‘ ์‹œ๋„: {handle}")
60
 
@@ -128,7 +127,7 @@ def collect_videos_info(
128
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
129
  result = {
130
  "collected_at": datetime.now().isoformat(),
131
- "total_channels": len(target_channel_ids),
132
  "total_videos": len(all_videos),
133
  "failed_channels": len(failed_channels),
134
  "failed_videos": len(failed_videos),
 
6
  from typing import Dict, List
7
 
8
  from google.oauth2 import service_account
 
9
 
10
+ from core.config import target_channel_handles
11
  from core.youtube_api import YouTubeAPI
12
 
13
  # ๋กœ๊น… ์„ค์ •
 
18
 
19
 
20
  def collect_videos_info(
21
+ max_retries: int = 3, retry_delay: int = 5, videos_per_channel: int = 50
22
  ) -> List[Dict]:
23
  """
24
  ๊ฐ ์ฑ„๋„์˜ ์ตœ์‹  ๋™์˜์ƒ ์ •๋ณด ์ˆ˜์ง‘ ํ•จ์ˆ˜
 
46
  youtube_api = YouTubeAPI(credentials)
47
 
48
  # ์ฑ„๋„ ๋ชฉ๋ก
 
49
 
50
  # ์ „์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ €์žฅํ•  ๋ฆฌ์ŠคํŠธ
51
  all_videos = []
 
53
  failed_videos = []
54
 
55
  # ๊ฐ ์ฑ„๋„์˜ ๋™์˜์ƒ ์ •๋ณด ์ˆ˜์ง‘
56
+ for handle in target_channel_handles:
57
  clean_handle = handle.replace("@", "")
58
  logger.info(f"\n์ฑ„๋„ ๋™์˜์ƒ ์ˆ˜์ง‘ ์‹œ๋„: {handle}")
59
 
 
127
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
128
  result = {
129
  "collected_at": datetime.now().isoformat(),
130
+ "total_channels": len(target_channel_handles),
131
  "total_videos": len(all_videos),
132
  "failed_channels": len(failed_channels),
133
  "failed_videos": len(failed_videos),
src/scripts/process_all_transcripts.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Dict
3
+
4
+ import tqdm
5
+
6
+ from ai.classifier import ToxcitiyClassifier
7
+ from core.data_manager import DataManager
8
+ from models.schemas import AnalyzedTranscript, ChunkedSegment
9
+
10
+
11
+ def batch_analyze_transcripts(
12
+ data_manager: DataManager,
13
+ classifier: ToxcitiyClassifier,
14
+ chunk_size: int = 60,
15
+ overlap: int = 10,
16
+ ) -> Dict[str, AnalyzedTranscript]:
17
+ """๋ชจ๋“  ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ๊ฒฐ๊ณผ๋ฅผ ์ €์žฅ"""
18
+
19
+ # ๊ธฐ์กด ๋ถ„์„ ๊ฒฐ๊ณผ ๋กœ๋“œ
20
+ try:
21
+ with open("./data/analyzed_transcripts.json", "r", encoding="utf-8") as f:
22
+ existing_data = json.load(f)
23
+ analyzed_transcripts = existing_data.get("analyzed_transcripts", {})
24
+ except FileNotFoundError:
25
+ analyzed_transcripts = {}
26
+
27
+ # ๋ชจ๋“  ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ฒ˜๋ฆฌ
28
+ all_transcripts = data_manager.transcript_data
29
+
30
+ print(f"์ด {len(all_transcripts)}๊ฐœ์˜ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ฒ˜๋ฆฌ ์‹œ์ž‘...")
31
+
32
+ for transcript_data in tqdm.tqdm(all_transcripts):
33
+ video_id = transcript_data.get("video_id")
34
+
35
+ # ์ด๋ฏธ ๋ถ„์„๋œ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ๋Š” ๊ฑด๋„ˆ๋›ฐ๊ธฐ
36
+ if video_id in analyzed_transcripts:
37
+ print(f"Video {video_id}: ์ด๋ฏธ ๋ถ„์„๋จ, ๊ฑด๋„ˆ๋›ฐ๊ธฐ")
38
+ continue
39
+
40
+ # ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ๋ฐ์ดํ„ฐ ์ค€๋น„
41
+ transcript = data_manager.get_transcript_by_video_id(video_id)
42
+ if transcript is None:
43
+ print(f"Video {video_id}: ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Œ")
44
+ continue
45
+
46
+ try:
47
+ # ์ „์ฒด ์˜์ƒ ๊ธธ์ด ๊ณ„์‚ฐ
48
+ total_duration = max(
49
+ segment["start"] + segment["duration"]
50
+ for segment in transcript.transcript_segments
51
+ )
52
+
53
+ # ์ฒญํฌ ์ฒ˜๋ฆฌ
54
+ num_chunks = math.ceil(total_duration / chunk_size)
55
+ chunked_segments = []
56
+ is_toxic = False
57
+ max_toxicity = 0.0
58
+
59
+ for i in range(num_chunks):
60
+ chunk_start = i * chunk_size
61
+ chunk_end = (i + 1) * chunk_size
62
+
63
+ # ๋ณด๊ฐ„ ๋ฒ”์œ„ ์„ค์ •
64
+ overlap_start = max(0, chunk_start - overlap)
65
+ overlap_end = min(total_duration, chunk_end + overlap)
66
+
67
+ # ํ•ด๋‹น ์ฒญํฌ์— ํฌํ•จ๋  ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ˆ˜์ง‘
68
+ chunk_text = []
69
+
70
+ for segment in transcript.transcript_segments:
71
+ segment_start = segment["start"]
72
+ segment_end = segment_start + segment["duration"]
73
+
74
+ if not (segment_end < overlap_start or segment_start > overlap_end):
75
+ chunk_text.append(segment["text"])
76
+
77
+ # ์ฒญํฌ ํ…์ŠคํŠธ ์ƒ์„ฑ
78
+ chunk_transcript = " ".join(chunk_text)
79
+
80
+ # toxicity inference ์ˆ˜ํ–‰
81
+ if chunk_transcript.strip(): # ๋นˆ ํ…์ŠคํŠธ๊ฐ€ ์•„๋‹Œ ๊ฒฝ์šฐ๋งŒ ๋ถ„์„
82
+ toxicity_score = classifier.infer(chunk_transcript)
83
+ max_toxicity = max(max_toxicity, toxicity_score)
84
+ else:
85
+ toxicity_score = 0.0
86
+
87
+ # ์ฒญํฌ ์„ธ๊ทธ๋จผํŠธ ์ƒ์„ฑ
88
+ chunk = ChunkedSegment(
89
+ start=overlap_start,
90
+ end=overlap_end,
91
+ transcript=chunk_transcript,
92
+ toxicity_score=float(toxicity_score),
93
+ )
94
+
95
+ chunked_segments.append(chunk)
96
+
97
+ # ์œ ํ•ด์„ฑ ํŒ๋‹จ (์ž„๊ณ„๊ฐ’ 0.5 ์ ์šฉ)
98
+ is_toxic = max_toxicity > 0.5
99
+
100
+ # AnalyzedTranscript ๊ฐ์ฒด ์ƒ์„ฑ
101
+ analyzed_transcript = AnalyzedTranscript(
102
+ video_id=video_id,
103
+ chunk_count=len(chunked_segments),
104
+ chunked_segments=chunked_segments,
105
+ is_toxic=is_toxic,
106
+ )
107
+
108
+ # ๊ฒฐ๊ณผ ์ €์žฅ
109
+ analyzed_transcripts[video_id] = analyzed_transcript
110
+
111
+ # ์ค‘๊ฐ„ ์ €์žฅ (๋งค ์˜์ƒ ๋ถ„์„ ํ›„)
112
+ with open("./data/analyzed_transcripts.json", "w", encoding="utf-8") as f:
113
+ json.dump(
114
+ {
115
+ "analyzed_transcripts": {
116
+ vid: asdict(transcript)
117
+ for vid, transcript in analyzed_transcripts.items()
118
+ }
119
+ },
120
+ f,
121
+ ensure_ascii=False,
122
+ indent=2,
123
+ )
124
+
125
+ print(
126
+ f"Video {video_id}: ๋ถ„์„ ์™„๋ฃŒ (์œ ํ•ด์„ฑ: {is_toxic}, ์ตœ๋Œ€ ์ ์ˆ˜: {max_toxicity:.3f})"
127
+ )
128
+
129
+ except Exception as e:
130
+ print(f"Video {video_id} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
131
+ continue
132
+
133
+ return analyzed_transcripts
134
+
135
+
136
+ if __name__ == "__main__":
137
+ import math
138
+ from dataclasses import asdict
139
+
140
+ # ๋ฐ์ดํ„ฐ ๋งค๋‹ˆ์ €์™€ ๋ถ„๋ฅ˜๊ธฐ ์ดˆ๊ธฐํ™”
141
+ data_manager = DataManager()
142
+ classifier = ToxcitiyClassifier()
143
+
144
+ # ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ ์‹คํ–‰
145
+ results = batch_analyze_transcripts(data_manager, classifier)
146
+
147
+ # ์ตœ์ข… ํ†ต๊ณ„
148
+ total_analyzed = len(results)
149
+ total_toxic = sum(1 for transcript in results.values() if transcript.is_toxic)
150
+
151
+ print("\n๋ถ„์„ ์™„๋ฃŒ ํ†ต๊ณ„:")
152
+ print(f"์ด ์ฒ˜๋ฆฌ๋œ ์˜์ƒ: {total_analyzed}")
153
+ print(f"์œ ํ•ด ํŒ์ • ์˜์ƒ: {total_toxic}")
154
+ print(f"์œ ํ•ด ๋น„์œจ: {(total_toxic/total_analyzed)*100:.1f}%")