Spaces:
Sleeping
Sleeping
adding updated datafiles
Browse files
- data/.DS_Store +0 -0
- src/app.py +2 -0
- src/pages/video_detail.py +15 -16
- src/scripts/collect_channel_info.py +3 -3
- src/scripts/collect_transcript.py +91 -53
- src/scripts/collect_videos_info.py +4 -5
- src/scripts/process_all_transcripts.py +154 -0
data/.DS_Store
CHANGED
|
Binary files a/data/.DS_Store and b/data/.DS_Store differ
|
|
|
src/app.py
CHANGED
|
@@ -29,6 +29,8 @@ if "data_manager" not in st.session_state:
|
|
| 29 |
if "toxicity_classifier" not in st.session_state:
|
| 30 |
toxicity_classifier = load_inference_model()
|
| 31 |
st.session_state["toxicity_classifier"] = toxicity_classifier
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
pg = st.navigation(
|
|
|
|
| 29 |
if "toxicity_classifier" not in st.session_state:
|
| 30 |
toxicity_classifier = load_inference_model()
|
| 31 |
st.session_state["toxicity_classifier"] = toxicity_classifier
|
| 32 |
+
if "analyzed_transcripts" not in st.session_state:
|
| 33 |
+
st.session_state["analyzed_transcripts"] = {}
|
| 34 |
|
| 35 |
|
| 36 |
pg = st.navigation(
|
src/pages/video_detail.py
CHANGED
|
@@ -116,36 +116,35 @@ def analyze():
|
|
| 116 |
)
|
| 117 |
if analyzed_transcript is None:
|
| 118 |
analyzed_transcript = gen_analyzed_transcript()
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
else:
|
| 123 |
-
st.error("๋ถ์ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค.")
|
| 124 |
-
st.rerun() # ๊ฒฐ๊ณผ๋ฅผ ์ฆ์ ํ์ํ๊ธฐ ์ํ ํ์ด์ง ๋ฆฌ๋ก๋
|
| 125 |
-
return analyzed_transcript
|
| 126 |
|
| 127 |
|
| 128 |
analyzed_result = st.button("๋ถ์", icon="๐", on_click=analyze)
|
| 129 |
|
| 130 |
# ๋ถ์ ๊ฒฐ๊ณผ ํ์
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
)
|
| 135 |
|
| 136 |
if st.session_state.analyzed_transcript:
|
| 137 |
st.subheader("ํธ๋์คํฌ๋ฆฝํธ ๋ถ์ ๊ฒฐ๊ณผ")
|
| 138 |
|
| 139 |
-
# ๊ฒฐ๊ณผ๋ฅผ ํ ํ์์ผ๋ก ํ์
|
| 140 |
-
print(st.session_state.analyzed_transcript)
|
| 141 |
-
print(type(st.session_state.analyzed_transcript))
|
| 142 |
-
print("=" * 30)
|
| 143 |
for i, segment in enumerate(
|
| 144 |
st.session_state.analyzed_transcript.get("chunked_segments", [])
|
| 145 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
with st.expander(
|
| 147 |
-
f"์ฒญํฌ {i+1} ({segment['start']:.1f}s - {segment['end']:.1f}s)"
|
|
|
|
| 148 |
):
|
| 149 |
st.text(segment["transcript"])
|
| 150 |
st.progress(float(segment["toxicity_score"]))
|
| 151 |
st.text(f"์ ํด๋ ์ ์: {segment['toxicity_score']:.3f}")
|
|
|
|
|
|
| 116 |
)
|
| 117 |
if analyzed_transcript is None:
|
| 118 |
analyzed_transcript = gen_analyzed_transcript()
|
| 119 |
+
data_manager.set_analyzed_transcript(analyzed_transcript)
|
| 120 |
+
st.success("๋ถ์์ด ์๋ฃ๋์์ต๋๋ค!")
|
| 121 |
+
st.session_state.analyzed_transcript = analyzed_transcript
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
|
| 124 |
analyzed_result = st.button("๋ถ์", icon="๐", on_click=analyze)
|
| 125 |
|
| 126 |
# ๋ถ์ ๊ฒฐ๊ณผ ํ์
|
| 127 |
+
st.session_state.analyzed_transcript = data_manager.get_analyzed_transcript_by_video_id(
|
| 128 |
+
current_video.video_id
|
| 129 |
+
)
|
|
|
|
| 130 |
|
| 131 |
if st.session_state.analyzed_transcript:
|
| 132 |
st.subheader("ํธ๋์คํฌ๋ฆฝํธ ๋ถ์ ๊ฒฐ๊ณผ")
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
for i, segment in enumerate(
|
| 135 |
st.session_state.analyzed_transcript.get("chunked_segments", [])
|
| 136 |
):
|
| 137 |
+
color = "green"
|
| 138 |
+
if segment["toxicity_score"] > 0.7:
|
| 139 |
+
color = "red"
|
| 140 |
+
elif segment["toxicity_score"] > 0.5:
|
| 141 |
+
color = "orange"
|
| 142 |
+
|
| 143 |
with st.expander(
|
| 144 |
+
f"์ฒญํฌ {i+1} ({segment['start']:.1f}s - {segment['end']:.1f}s) - ์ ํด๋ ์ ์: {segment['toxicity_score']:.3f}",
|
| 145 |
+
expanded=True,
|
| 146 |
):
|
| 147 |
st.text(segment["transcript"])
|
| 148 |
st.progress(float(segment["toxicity_score"]))
|
| 149 |
st.text(f"์ ํด๋ ์ ์: {segment['toxicity_score']:.3f}")
|
| 150 |
+
st.markdown("</div>", unsafe_allow_html=True)
|
src/scripts/collect_channel_info.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Dict, List
|
|
| 8 |
from google.oauth2 import service_account
|
| 9 |
from googleapiclient.errors import HttpError
|
| 10 |
|
|
|
|
| 11 |
from core.youtube_api import YouTubeAPI
|
| 12 |
|
| 13 |
# 로깅 설정
|
|
@@ -43,14 +44,13 @@ def collect_channel_info(max_retries: int = 3, retry_delay: int = 5) -> List[Dic
|
|
| 43 |
youtube_api = YouTubeAPI(credentials)
|
| 44 |
|
| 45 |
# ์ฑ๋ ๋ชฉ๋ก
|
| 46 |
-
from core.config import target_channel_handles
|
| 47 |
|
| 48 |
# ์ ์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ ์ฅํ ๋ฆฌ์คํธ
|
| 49 |
all_channels = []
|
| 50 |
failed_channels = []
|
| 51 |
|
| 52 |
# ๊ฐ ์ฑ๋ ์ ๋ณด ์์ง
|
| 53 |
-
for handle in
|
| 54 |
clean_handle = handle.replace("@", "")
|
| 55 |
logger.info(f"\n์ฑ๋ ์ ๋ณด ์์ง ์๋: {handle}")
|
| 56 |
|
|
@@ -91,7 +91,7 @@ def collect_channel_info(max_retries: int = 3, retry_delay: int = 5) -> List[Dic
|
|
| 91 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 92 |
result = {
|
| 93 |
"collected_at": datetime.now().isoformat(),
|
| 94 |
-
"total_channels": len(
|
| 95 |
"successful_channels": len(all_channels),
|
| 96 |
"failed_channels": len(failed_channels),
|
| 97 |
"channels": all_channels,
|
|
|
|
| 8 |
from google.oauth2 import service_account
|
| 9 |
from googleapiclient.errors import HttpError
|
| 10 |
|
| 11 |
+
from core.config import target_channel_handles
|
| 12 |
from core.youtube_api import YouTubeAPI
|
| 13 |
|
| 14 |
# 로깅 설정
|
|
|
|
| 44 |
youtube_api = YouTubeAPI(credentials)
|
| 45 |
|
| 46 |
# ์ฑ๋ ๋ชฉ๋ก
|
|
|
|
| 47 |
|
| 48 |
# ์ ์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ ์ฅํ ๋ฆฌ์คํธ
|
| 49 |
all_channels = []
|
| 50 |
failed_channels = []
|
| 51 |
|
| 52 |
# ๊ฐ ์ฑ๋ ์ ๋ณด ์์ง
|
| 53 |
+
for handle in target_channel_handles:
|
| 54 |
clean_handle = handle.replace("@", "")
|
| 55 |
logger.info(f"\n์ฑ๋ ์ ๋ณด ์์ง ์๋: {handle}")
|
| 56 |
|
|
|
|
| 91 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 92 |
result = {
|
| 93 |
"collected_at": datetime.now().isoformat(),
|
| 94 |
+
"total_channels": len(target_channel_handles),
|
| 95 |
"successful_channels": len(all_channels),
|
| 96 |
"failed_channels": len(failed_channels),
|
| 97 |
"channels": all_channels,
|
src/scripts/collect_transcript.py
CHANGED
|
@@ -18,6 +18,67 @@ logging.basicConfig(
|
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def collect_video_transcripts(
|
| 22 |
max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
|
| 23 |
) -> List[Dict]:
|
|
@@ -31,51 +92,35 @@ def collect_video_transcripts(
|
|
| 31 |
"""
|
| 32 |
output_dir = Path("data")
|
| 33 |
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# ๋น๋์ค ์ ๋ณด ๋ก๋
|
| 36 |
-
|
| 37 |
-
with open(videos_file, "r", encoding="utf-8") as f:
|
| 38 |
-
videos_data = json.load(f)
|
| 39 |
-
videos = videos_data.get("videos", [])
|
| 40 |
-
except Exception as e:
|
| 41 |
-
logger.error(f"๋น๋์ค ํ์ผ ๋ก๋ ์คํจ: {str(e)}")
|
| 42 |
-
return []
|
| 43 |
|
| 44 |
# ๊ฒฐ๊ณผ ์ ์ฅ์ฉ ๋ฆฌ์คํธ
|
| 45 |
-
all_transcripts = []
|
| 46 |
failed_videos = []
|
| 47 |
|
|
|
|
|
|
|
|
|
|
| 48 |
# ๊ฐ ๋น๋์ค์ ํธ๋์คํฌ๋ฆฝํธ ์์ง
|
| 49 |
total_videos = len(videos)
|
| 50 |
for idx, video in enumerate(videos, 1):
|
| 51 |
video_id = video["video_id"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
logger.info(
|
| 53 |
f"\n[{idx}/{total_videos}] ํธ๋์คํฌ๋ฆฝํธ ์์ง ์๋: {video_id} - {video['title']}"
|
| 54 |
)
|
| 55 |
|
| 56 |
-
|
| 57 |
-
transcript_segments =
|
| 58 |
-
error_message =
|
| 59 |
-
|
| 60 |
-
for attempt in range(max_retries):
|
| 61 |
-
try:
|
| 62 |
-
transcript_list = YouTubeTranscriptApi.get_transcript(
|
| 63 |
-
video_id, languages=["ko", "en"]
|
| 64 |
-
)
|
| 65 |
-
transcript_segments = transcript_list
|
| 66 |
-
break
|
| 67 |
-
except (TranscriptsDisabled, NoTranscriptFound) as e:
|
| 68 |
-
error_message = f"ํธ๋์คํฌ๋ฆฝํธ ์์: {str(e)}"
|
| 69 |
-
break
|
| 70 |
-
except Exception as e:
|
| 71 |
-
if attempt < max_retries - 1:
|
| 72 |
-
wait_time = retry_delay * (attempt + 1)
|
| 73 |
-
logger.warning(
|
| 74 |
-
f"์ค๋ฅ ๋ฐ์ (์ฌ์๋ {attempt + 1}/{max_retries}), {wait_time}์ด ํ ์ฌ์๋..."
|
| 75 |
-
)
|
| 76 |
-
time.sleep(wait_time)
|
| 77 |
-
else:
|
| 78 |
-
error_message = f"์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ: {str(e)}"
|
| 79 |
|
| 80 |
if transcript_segments:
|
| 81 |
transcript_info = {
|
|
@@ -87,7 +132,7 @@ def collect_video_transcripts(
|
|
| 87 |
"collected_at": datetime.now().isoformat(),
|
| 88 |
}
|
| 89 |
all_transcripts.append(transcript_info)
|
| 90 |
-
logger.info(
|
| 91 |
else:
|
| 92 |
failed_videos.append(
|
| 93 |
{
|
|
@@ -99,11 +144,16 @@ def collect_video_transcripts(
|
|
| 99 |
)
|
| 100 |
logger.warning(f"ํธ๋์คํฌ๋ฆฝํธ ์์ง ์คํจ: {error_message}")
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
# API ํ ๋น๋ ๋ณดํธ๋ฅผ ์ํ ๋๊ธฐ
|
| 103 |
-
time.sleep(
|
| 104 |
|
| 105 |
-
# ๊ฒฐ๊ณผ ์ ์ฅ
|
| 106 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
|
|
| 107 |
result = {
|
| 108 |
"collected_at": datetime.now().isoformat(),
|
| 109 |
"total_videos": total_videos,
|
|
@@ -112,26 +162,14 @@ def collect_video_transcripts(
|
|
| 112 |
"transcripts": all_transcripts,
|
| 113 |
"failures": failed_videos,
|
| 114 |
}
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
logger.info(
|
| 123 |
-
f"์ด {len(all_transcripts)}๊ฐ ํธ๋์คํฌ๋ฆฝํธ ์์ง ์๋ฃ (์คํจ: {len(failed_videos)}๊ฐ)"
|
| 124 |
-
)
|
| 125 |
-
|
| 126 |
-
if failed_videos:
|
| 127 |
-
logger.warning("\n์คํจํ ๋น๋์ค๋ค:")
|
| 128 |
-
for fail in failed_videos:
|
| 129 |
-
logger.warning(
|
| 130 |
-
f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
|
| 131 |
-
)
|
| 132 |
-
except Exception as e:
|
| 133 |
-
logger.error(f"๊ฒฐ๊ณผ ํ์ผ ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
|
| 134 |
-
return all_transcripts
|
| 135 |
|
| 136 |
return all_transcripts
|
| 137 |
|
|
|
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
|
| 21 |
+
def load_existing_transcripts(file_path: Path) -> Dict:
    """Load previously collected transcript data from a JSON cache file.

    Args:
        file_path: Location of the JSON cache file.

    Returns:
        The parsed JSON object. ``{"transcripts": []}`` is returned instead
        when the file does not exist, cannot be read or parsed, or holds
        valid JSON that is not an object, so callers can always call
        ``.get("transcripts", [])`` on the result.
    """
    if not file_path.exists():
        return {"transcripts": []}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # A list/str/number parses fine but would break the caller's
        # ``.get`` call; route it through the same fallback as a bad file.
        if not isinstance(data, dict):
            raise TypeError(
                f"expected a JSON object, got {type(data).__name__}"
            )
        return data
    except Exception as e:
        logger.error(f"ํธ๋์คํฌ๋ฆฝํธ ํ์ผ ๋ก๋ ์คํจ: {e}")
        return {"transcripts": []}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_video_info(videos_file: str) -> List[Dict]:
    """Load the list of video records from a JSON file.

    Args:
        videos_file: Path of a JSON file whose top-level object carries the
            records under the ``"videos"`` key.

    Returns:
        The list stored under ``"videos"`` (empty when the key is absent).
        An empty list is also returned, after logging the error, when the
        file cannot be read or parsed or when ``"videos"`` is not a list.
    """
    try:
        with open(videos_file, "r", encoding="utf-8") as f:
            videos_data = json.load(f)
        videos = videos_data.get("videos", [])
        # ``"videos": null`` (or any non-list) would otherwise be handed to
        # callers that immediately call len()/enumerate() on the result.
        if not isinstance(videos, list):
            raise TypeError(
                f'expected "videos" to be a list, got {type(videos).__name__}'
            )
        return videos
    except Exception as e:
        logger.error(f"๋น๋์ค ํ์ผ ๋ก๋ ์คํจ: {str(e)}")
        return []
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def fetch_transcript(video_id: str, max_retries: int, retry_delay: int) -> Dict:
    """Fetch the transcript of a single video, retrying on unexpected errors.

    Args:
        video_id: ID of the video to fetch.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so the function always returns a result dict.
        retry_delay: Base delay in seconds; the wait before the next attempt
            is ``retry_delay * (attempt + 1)``.

    Returns:
        A dict with two keys: ``"transcript_segments"`` (the value returned
        by ``YouTubeTranscriptApi.get_transcript`` on success, else ``None``)
        and ``"error"`` (``None`` on success, else a message string).
    """
    # Guarantee one attempt: with a non-positive ``max_retries`` the loop
    # would otherwise be skipped and the function would return ``None``.
    total_attempts = max(1, max_retries)

    for attempt in range(total_attempts):
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(
                video_id, languages=["ko", "en"]
            )
            return {"transcript_segments": transcript_list, "error": None}
        except (TranscriptsDisabled, NoTranscriptFound) as e:
            # These are returned immediately rather than retried.
            return {
                "transcript_segments": None,
                "error": f"ํธ๋์คํฌ๋ฆฝํธ ์์: {str(e)}",
            }
        except Exception as e:
            if attempt < total_attempts - 1:
                wait_time = retry_delay * (attempt + 1)
                logger.warning(
                    f"์ค๋ฅ ๋ฐ์ (์ฌ์๋ {attempt + 1}/{max_retries}), {wait_time}์ด ํ ์ฌ์๋..."
                )
                time.sleep(wait_time)
            else:
                return {
                    "transcript_segments": None,
                    "error": f"์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ: {str(e)}",
                }
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def save_transcripts_to_file(transcripts: List[Dict], output_file: Path):
    """Serialize transcript data to ``output_file`` as UTF-8 JSON.

    The data is written to a sibling ``<name>.tmp`` file first and then moved
    over ``output_file``. A failure while serializing therefore leaves any
    existing file untouched instead of truncating it.

    Args:
        transcripts: JSON-serializable payload. The callers visible in this
            file pass a dict such as ``{"transcripts": [...]}``.
        output_file: Destination path.

    Errors are logged and not re-raised (best-effort save).
    """
    target = Path(output_file)
    tmp_file = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(transcripts, f, ensure_ascii=False, indent=2)
        # Swap the finished file into place only after a complete write.
        tmp_file.replace(target)
        logger.info(f"\n๊ฒฐ๊ณผ ์ ์ฅ ์๋ฃ: {output_file}")
    except Exception as e:
        logger.error(f"๊ฒฐ๊ณผ ํ์ผ ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
        # Best-effort cleanup of the partial temp file; the save itself has
        # already been reported as failed, so a cleanup error is not fatal.
        try:
            if tmp_file.exists():
                tmp_file.unlink()
        except OSError as cleanup_error:
            logger.error(f"{tmp_file}: {cleanup_error}")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
def collect_video_transcripts(
|
| 83 |
max_retries: int = 3, retry_delay: int = 5, videos_file: str = "data/videos.json"
|
| 84 |
) -> List[Dict]:
|
|
|
|
| 92 |
"""
|
| 93 |
output_dir = Path("data")
|
| 94 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 95 |
+
output_file = output_dir / "transcripts_cache.json"
|
| 96 |
+
all_transcripts = load_existing_transcripts(output_file).get("transcripts", [])
|
| 97 |
|
| 98 |
# ๋น๋์ค ์ ๋ณด ๋ก๋
|
| 99 |
+
videos = load_video_info(videos_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
# ๊ฒฐ๊ณผ ์ ์ฅ์ฉ ๋ฆฌ์คํธ
|
|
|
|
| 102 |
failed_videos = []
|
| 103 |
|
| 104 |
+
# ์ด๋ฏธ ์์ง๋ ๋น๋์ค ์์ด๋ ๋ชฉ๋ก
|
| 105 |
+
collected_video_ids = {transcript["video_id"] for transcript in all_transcripts}
|
| 106 |
+
|
| 107 |
# ๊ฐ ๋น๋์ค์ ํธ๋์คํฌ๋ฆฝํธ ์์ง
|
| 108 |
total_videos = len(videos)
|
| 109 |
for idx, video in enumerate(videos, 1):
|
| 110 |
video_id = video["video_id"]
|
| 111 |
+
|
| 112 |
+
# ์ด๋ฏธ ์์ง๋ ๋น๋์ค์ธ ๊ฒฝ์ฐ ํจ์ค
|
| 113 |
+
if video_id in collected_video_ids:
|
| 114 |
+
logger.info(f"\n[{idx}/{total_videos}] ์ด๋ฏธ ์์ง๋ ๋น๋์ค: {video_id} - {video['title']}")
|
| 115 |
+
continue
|
| 116 |
+
|
| 117 |
logger.info(
|
| 118 |
f"\n[{idx}/{total_videos}] ํธ๋์คํฌ๋ฆฝํธ ์์ง ์๋: {video_id} - {video['title']}"
|
| 119 |
)
|
| 120 |
|
| 121 |
+
result = fetch_transcript(video_id, max_retries, retry_delay)
|
| 122 |
+
transcript_segments = result["transcript_segments"]
|
| 123 |
+
error_message = result["error"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
if transcript_segments:
|
| 126 |
transcript_info = {
|
|
|
|
| 132 |
"collected_at": datetime.now().isoformat(),
|
| 133 |
}
|
| 134 |
all_transcripts.append(transcript_info)
|
| 135 |
+
logger.info("ํธ๋์คํฌ๋ฆฝํธ ์์ง ์ฑ๊ณต")
|
| 136 |
else:
|
| 137 |
failed_videos.append(
|
| 138 |
{
|
|
|
|
| 144 |
)
|
| 145 |
logger.warning(f"ํธ๋์คํฌ๋ฆฝํธ ์์ง ์คํจ: {error_message}")
|
| 146 |
|
| 147 |
+
# 50๊ฐ๋ง๋ค ์ค๊ฐ ์ ์ฅ
|
| 148 |
+
if idx % 50 == 0:
|
| 149 |
+
save_transcripts_to_file({"transcripts": all_transcripts}, output_file)
|
| 150 |
+
|
| 151 |
# API ํ ๋น๋ ๋ณดํธ๋ฅผ ์ํ ๋๊ธฐ
|
| 152 |
+
time.sleep(0.2)
|
| 153 |
|
| 154 |
+
# ์ต์ข
๊ฒฐ๊ณผ ์ ์ฅ
|
| 155 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 156 |
+
final_output_file = output_dir / f"transcripts_{timestamp}.json"
|
| 157 |
result = {
|
| 158 |
"collected_at": datetime.now().isoformat(),
|
| 159 |
"total_videos": total_videos,
|
|
|
|
| 162 |
"transcripts": all_transcripts,
|
| 163 |
"failures": failed_videos,
|
| 164 |
}
|
| 165 |
+
save_transcripts_to_file(result, final_output_file)
|
| 166 |
|
| 167 |
+
if failed_videos:
|
| 168 |
+
logger.warning("\n์คํจํ ๋น๋์ค๋ค:")
|
| 169 |
+
for fail in failed_videos:
|
| 170 |
+
logger.warning(
|
| 171 |
+
f"- [{fail['channel_handle']}] {fail['title']}: {fail['error']}"
|
| 172 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
return all_transcripts
|
| 175 |
|
src/scripts/collect_videos_info.py
CHANGED
|
@@ -6,8 +6,8 @@ from pathlib import Path
|
|
| 6 |
from typing import Dict, List
|
| 7 |
|
| 8 |
from google.oauth2 import service_account
|
| 9 |
-
from googleapiclient.errors import HttpError
|
| 10 |
|
|
|
|
| 11 |
from core.youtube_api import YouTubeAPI
|
| 12 |
|
| 13 |
# 로깅 설정
|
|
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
|
|
| 18 |
|
| 19 |
|
| 20 |
def collect_videos_info(
|
| 21 |
-
max_retries: int = 3, retry_delay: int = 5, videos_per_channel: int =
|
| 22 |
) -> List[Dict]:
|
| 23 |
"""
|
| 24 |
๊ฐ ์ฑ๋์ ์ต์ ๋์์ ์ ๋ณด ์์ง ํจ์
|
|
@@ -46,7 +46,6 @@ def collect_videos_info(
|
|
| 46 |
youtube_api = YouTubeAPI(credentials)
|
| 47 |
|
| 48 |
# ์ฑ๋ ๋ชฉ๋ก
|
| 49 |
-
from core.config import target_channel_handles
|
| 50 |
|
| 51 |
# ์ ์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ ์ฅํ ๋ฆฌ์คํธ
|
| 52 |
all_videos = []
|
|
@@ -54,7 +53,7 @@ def collect_videos_info(
|
|
| 54 |
failed_videos = []
|
| 55 |
|
| 56 |
# ๊ฐ ์ฑ๋์ ๋์์ ์ ๋ณด ์์ง
|
| 57 |
-
for handle in
|
| 58 |
clean_handle = handle.replace("@", "")
|
| 59 |
logger.info(f"\n์ฑ๋ ๋์์ ์์ง ์๋: {handle}")
|
| 60 |
|
|
@@ -128,7 +127,7 @@ def collect_videos_info(
|
|
| 128 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 129 |
result = {
|
| 130 |
"collected_at": datetime.now().isoformat(),
|
| 131 |
-
"total_channels": len(
|
| 132 |
"total_videos": len(all_videos),
|
| 133 |
"failed_channels": len(failed_channels),
|
| 134 |
"failed_videos": len(failed_videos),
|
|
|
|
| 6 |
from typing import Dict, List
|
| 7 |
|
| 8 |
from google.oauth2 import service_account
|
|
|
|
| 9 |
|
| 10 |
+
from core.config import target_channel_handles
|
| 11 |
from core.youtube_api import YouTubeAPI
|
| 12 |
|
| 13 |
# 로깅 설정
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def collect_videos_info(
|
| 21 |
+
max_retries: int = 3, retry_delay: int = 5, videos_per_channel: int = 50
|
| 22 |
) -> List[Dict]:
|
| 23 |
"""
|
| 24 |
๊ฐ ์ฑ๋์ ์ต์ ๋์์ ์ ๋ณด ์์ง ํจ์
|
|
|
|
| 46 |
youtube_api = YouTubeAPI(credentials)
|
| 47 |
|
| 48 |
# ์ฑ๋ ๋ชฉ๋ก
|
|
|
|
| 49 |
|
| 50 |
# ์ ์ฒด ๊ฒฐ๊ณผ๋ฅผ ์ ์ฅํ ๋ฆฌ์คํธ
|
| 51 |
all_videos = []
|
|
|
|
| 53 |
failed_videos = []
|
| 54 |
|
| 55 |
# ๊ฐ ์ฑ๋์ ๋์์ ์ ๋ณด ์์ง
|
| 56 |
+
for handle in target_channel_handles:
|
| 57 |
clean_handle = handle.replace("@", "")
|
| 58 |
logger.info(f"\n์ฑ๋ ๋์์ ์์ง ์๋: {handle}")
|
| 59 |
|
|
|
|
| 127 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 128 |
result = {
|
| 129 |
"collected_at": datetime.now().isoformat(),
|
| 130 |
+
"total_channels": len(target_channel_handles),
|
| 131 |
"total_videos": len(all_videos),
|
| 132 |
"failed_channels": len(failed_channels),
|
| 133 |
"failed_videos": len(failed_videos),
|
src/scripts/process_all_transcripts.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Dict
|
| 3 |
+
|
| 4 |
+
import tqdm
|
| 5 |
+
|
| 6 |
+
from ai.classifier import ToxcitiyClassifier
|
| 7 |
+
from core.data_manager import DataManager
|
| 8 |
+
from models.schemas import AnalyzedTranscript, ChunkedSegment
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def batch_analyze_transcripts(
    data_manager: DataManager,
    classifier: ToxcitiyClassifier,
    chunk_size: int = 60,
    overlap: int = 10,
) -> Dict[str, AnalyzedTranscript]:
    """Score every transcript for toxicity in time-based chunks and save the results.

    Each video's timeline is cut into windows of ``chunk_size``; every window
    is widened by ``overlap`` on both sides and clamped to
    ``[0, total_duration]``. The text of all segments intersecting the widened
    window is joined and passed to ``classifier.infer``; a window with no
    text gets a score of 0.0. A video is flagged toxic when its highest
    window score exceeds 0.5.

    Results are read from and written back to
    ``./data/analyzed_transcripts.json``; the file is rewritten after every
    analysed video, and videos already present in it are skipped.

    Args:
        data_manager: Supplies ``transcript_data`` and
            ``get_transcript_by_video_id``.
        classifier: Object whose ``infer(text)`` returns a toxicity score.
        chunk_size: Window length, in the same unit as the segments'
            ``start``/``duration`` values.
        overlap: Amount by which each window is extended on both sides.

    Returns:
        Mapping of video id to result. Entries produced in this run are
        ``AnalyzedTranscript`` instances; entries restored from the JSON file
        are kept exactly as loaded.
    """
    # Imported locally so the function also works when this module is
    # imported rather than executed as a script.
    import math
    from dataclasses import asdict, is_dataclass

    results_path = "./data/analyzed_transcripts.json"

    # Resume from earlier results when the file exists.
    try:
        with open(results_path, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
        analyzed_transcripts = existing_data.get("analyzed_transcripts", {})
    except FileNotFoundError:
        analyzed_transcripts = {}

    all_transcripts = data_manager.transcript_data

    print(f"์ด {len(all_transcripts)}๊ฐ์ ํธ๋์คํฌ๋ฆฝํธ ์ฒ๋ฆฌ ์์...")

    for transcript_data in tqdm.tqdm(all_transcripts):
        video_id = transcript_data.get("video_id")

        # Skip videos that already have a stored result.
        if video_id in analyzed_transcripts:
            print(f"Video {video_id}: ์ด๋ฏธ ๋ถ์๋จ, ๊ฑด๋๋ฐ๊ธฐ")
            continue

        transcript = data_manager.get_transcript_by_video_id(video_id)
        if transcript is None:
            print(f"Video {video_id}: ํธ๋์คํฌ๋ฆฝํธ๋ฅผ ์ฐพ์ ์ ์์")
            continue

        try:
            segments = transcript.transcript_segments

            # Total length = latest segment end time.
            total_duration = max(
                segment["start"] + segment["duration"] for segment in segments
            )

            num_chunks = math.ceil(total_duration / chunk_size)
            chunked_segments = []
            max_toxicity = 0.0

            for i in range(num_chunks):
                chunk_start = i * chunk_size
                chunk_end = (i + 1) * chunk_size

                # Widen the window by ``overlap`` and clamp to the video.
                overlap_start = max(0, chunk_start - overlap)
                overlap_end = min(total_duration, chunk_end + overlap)

                # Keep every segment that intersects the widened window.
                chunk_text = []
                for segment in segments:
                    segment_start = segment["start"]
                    segment_end = segment_start + segment["duration"]
                    if not (
                        segment_end < overlap_start or segment_start > overlap_end
                    ):
                        chunk_text.append(segment["text"])

                chunk_transcript = " ".join(chunk_text)

                # Only run inference on non-blank text.
                if chunk_transcript.strip():
                    toxicity_score = classifier.infer(chunk_transcript)
                    max_toxicity = max(max_toxicity, toxicity_score)
                else:
                    toxicity_score = 0.0

                chunked_segments.append(
                    ChunkedSegment(
                        start=overlap_start,
                        end=overlap_end,
                        transcript=chunk_transcript,
                        toxicity_score=float(toxicity_score),
                    )
                )

            # Toxic when the worst window exceeds the 0.5 threshold.
            is_toxic = max_toxicity > 0.5

            analyzed_transcript = AnalyzedTranscript(
                video_id=video_id,
                chunk_count=len(chunked_segments),
                chunked_segments=chunked_segments,
                is_toxic=is_toxic,
            )
            analyzed_transcripts[video_id] = analyzed_transcript

            # Checkpoint after every video. Entries restored from the JSON
            # file are already plain data and must not go through asdict(),
            # which only accepts dataclass instances.
            with open(results_path, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "analyzed_transcripts": {
                            vid: asdict(item) if is_dataclass(item) else item
                            for vid, item in analyzed_transcripts.items()
                        }
                    },
                    f,
                    ensure_ascii=False,
                    indent=2,
                )

            print(
                f"Video {video_id}: ๋ถ์ ์๋ฃ (์ ํด์ฑ: {is_toxic}, ์ต๋ ์ ์: {max_toxicity:.3f})"
            )

        except Exception as e:
            # One failing video must not abort the whole batch.
            print(f"Video {video_id} ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
            continue

    return analyzed_transcripts
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
if __name__ == "__main__":
    # batch_analyze_transcripts looks up ``math`` and ``asdict`` as module
    # globals, so they have to be bound here before it is called.
    import math
    from dataclasses import asdict

    # Set up the data manager and the classifier.
    data_manager = DataManager()
    classifier = ToxcitiyClassifier()

    # Run the batch analysis.
    results = batch_analyze_transcripts(data_manager, classifier)

    # Final statistics. Entries restored from the JSON results file are plain
    # dicts, while entries analysed in this run are AnalyzedTranscript
    # objects, so read ``is_toxic`` in whichever form is present.
    total_analyzed = len(results)
    total_toxic = sum(
        1
        for transcript in results.values()
        if (
            transcript.get("is_toxic")
            if isinstance(transcript, dict)
            else transcript.is_toxic
        )
    )
    # Avoid dividing by zero when there was nothing to analyse.
    toxic_ratio = (total_toxic / total_analyzed) * 100 if total_analyzed else 0.0

    print("\n๋ถ์ ์๋ฃ ํต๊ณ:")
    print(f"์ด ์ฒ๋ฆฌ๋ ์์: {total_analyzed}")
    print(f"์ ํด ํ์ ์์: {total_toxic}")
    print(f"์ ํด ๋น์จ: {toxic_ratio:.1f}%")
|