jonghhhh commited on
Commit
914398e
·
verified ·
1 Parent(s): 8ebd22f

Upload youtube_segmenter.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. youtube_segmenter.py +461 -0
youtube_segmenter.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import ruptures as rpt
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from youtube_transcript_api import YouTubeTranscriptApi as YTApi
6
+ from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
7
+ import re
8
+ import torch
9
+ import os
10
+ import json
11
+ from dotenv import load_dotenv
12
+ import time
13
+ import socket
14
+
15
# Load variables from a local .env file (e.g. YOUTUBE_API_KEY) into the environment.
load_dotenv()
17
+
18
+ # DNS ์„ค์ • ํ™•์ธ (๋””๋ฒ„๊น…์šฉ)
19
def check_dns():
    """Return True if www.youtube.com resolves via DNS, False otherwise."""
    try:
        socket.gethostbyname('www.youtube.com')
    except socket.gaierror:
        # Resolution failed — the network/DNS is likely down.
        return False
    return True
26
+
27
+
28
def simple_sentence_split(text):
    """
    Fast regex-based sentence splitter.

    Splits on whitespace that follows a sentence terminator (. ! ?),
    trimming each piece and dropping empties. Faster than KSS and
    effective enough for YouTube captions.
    """
    result = []
    for chunk in re.split(r'(?<=[.!?])\s+', text):
        chunk = chunk.strip()
        if chunk:
            result.append(chunk)
    return result
38
+
39
+
40
def extract_video_id(url):
    """
    Extract the video ID from a YouTube URL.

    Supports watch URLs, youtu.be short links, embed URLs, the legacy
    /v/ form, and (generalization) /shorts/ and /live/ URLs.

    Args:
        url: YouTube URL string.

    Returns:
        str: the extracted video ID.

    Raises:
        ValueError: if no known YouTube URL pattern matches.
    """
    patterns = [
        r'(?:youtube\.com/watch\?v=|youtu\.be/)([^&\n?#]+)',
        r'youtube\.com/embed/([^&\n?#]+)',
        r'youtube\.com/v/([^&\n?#]+)',
        # Newer URL forms: Shorts and live streams.
        r'youtube\.com/shorts/([^&\n?#]+)',
        r'youtube\.com/live/([^&\n?#]+)'
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    raise ValueError("์œ ํšจํ•œ ์œ ํŠœ๋ธŒ URL์ด ์•„๋‹™๋‹ˆ๋‹ค.")
56
+
57
+
58
def get_youtube_transcript(url, language='ko', max_retries=3):
    """
    Fetch the transcript for a YouTube URL.

    Lists the transcripts available for the video, preferring one in
    `language`; falls back to English translated into `language`, then to
    plain English. Network errors are retried with linear backoff.

    Args:
        url: YouTube URL.
        language: preferred transcript language code (default: 'ko').
        max_retries: maximum attempts on network errors (default: 3).

    Returns:
        tuple: (text, timeline_data)
            - text: full transcript text joined with spaces.
            - timeline_data: list of dicts with 'text', 'start', 'duration'
              for each caption entry.

    Raises:
        Exception: when captions are disabled, no transcript exists, or
            all retries are exhausted.
    """
    # Quick DNS sanity check before hitting the API (debug aid).
    print("๋„คํŠธ์›Œํฌ ์—ฐ๊ฒฐ ํ™•์ธ ์ค‘...")
    if not check_dns():
        print("๊ฒฝ๊ณ : DNS ํ•ด์„์— ๋ฌธ์ œ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค. ์žฌ์‹œ๋„ ์ค‘...")
        time.sleep(2)

    for attempt in range(max_retries):
        try:
            video_id = extract_video_id(url)
            print(f"๋น„๋””์˜ค ID: {video_id}")

            # Instantiate the API client (youtube-transcript-api >= 1.x style).
            api = YTApi()

            # 1. List the transcripts available for this video.
            print(f"์ž๋ง‰ ๋ชฉ๋ก ๊ฐ€์ ธ์˜ค๋Š” ์ค‘... (์‹œ๋„ {attempt + 1}/{max_retries})")
            transcript_list = api.list(video_id)

            # 2. Prefer a transcript in the requested language (the library
            #    prefers manually created over auto-generated transcripts).
            try:
                transcript = transcript_list.find_transcript([language])
                print(f"{language} ์ž๋ง‰์„ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.")
            except NoTranscriptFound:
                # None in the requested language: request English translated
                # into the requested language instead.
                print(f"{language} ์ž๋ง‰์ด ์—†์–ด ์˜์–ด ์ž๋ง‰์„ ๋ฒˆ์—ญํ•˜์—ฌ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค.")
                try:
                    transcript = transcript_list.find_transcript(['en']).translate(language)
                    print("์˜์–ด ์ž๋ง‰์„ ํ•œ๊ตญ์–ด๋กœ ๋ฒˆ์—ญํ–ˆ์Šต๋‹ˆ๋‹ค.")
                except Exception:
                    # FIX: was a bare `except:`, which also swallowed
                    # SystemExit/KeyboardInterrupt. Keep the best-effort
                    # fallback to raw English, but only for real errors.
                    transcript = transcript_list.find_transcript(['en'])
                    print("์˜์–ด ์ž๋ง‰์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")

            # 3. Download the caption entries.
            # fetch() returns a FetchedTranscript, iterable like a list.
            fetched_transcript = transcript.fetch()

            # 4. Extract the text plus per-entry timestamps.
            text_parts = []
            timeline_data = []

            for item in fetched_transcript:
                text_parts.append(item.text)
                timeline_data.append({
                    'text': item.text,
                    'start': item.start,
                    'duration': item.duration
                })

            text = " ".join(text_parts)

            print(f"์ž๋ง‰ ๊ฐ€์ ธ์˜ค๊ธฐ ์„ฑ๊ณต! (์ด {len(timeline_data)}๊ฐœ ํ•ญ๋ชฉ)")
            return text, timeline_data

        except TranscriptsDisabled:
            raise Exception("์ด ๋น„๋””์˜ค๋Š” ์ž๋ง‰์ด ๋น„ํ™œ์„ฑํ™”๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
        except NoTranscriptFound:
            raise Exception("์ด ๋น„๋””์˜ค์—์„œ ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ์ž๋ง‰์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
        except (ConnectionError, socket.gaierror, OSError) as e:
            # Network errors are transient: retry with linear backoff.
            if attempt < max_retries - 1:
                wait_time = (attempt + 1) * 2
                print(f"๋„คํŠธ์›Œํฌ ์˜ค๋ฅ˜ ๋ฐœ์ƒ. {wait_time}์ดˆ ํ›„ ์žฌ์‹œ๋„... ({str(e)})")
                time.sleep(wait_time)
                continue
            else:
                # Chain the cause so the traceback stays useful.
                raise Exception(f"์ž๋ง‰์„ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘ ๋„คํŠธ์›Œํฌ ์˜ค๋ฅ˜ ๋ฐœ์ƒ (์ตœ๋Œ€ ์žฌ์‹œ๋„ ์ดˆ๊ณผ): {str(e)}") from e
        except Exception as e:
            # Any other error fails immediately; chain the cause.
            raise Exception(f"์ž๋ง‰์„ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}") from e

    # All retries exhausted without success.
    raise Exception(f"์ž๋ง‰ ๊ฐ€์ ธ์˜ค๊ธฐ ์‹คํŒจ: ์ตœ๋Œ€ ์žฌ์‹œ๋„ ํšŸ์ˆ˜({max_retries})๋ฅผ ์ดˆ๊ณผํ–ˆ์Šต๋‹ˆ๋‹ค.")
146
+
147
+
148
def segment_youtube_transcript(text, penalty=5.0, threshold=90, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """
    Hybrid pipeline that splits a YouTube transcript into topical segments.

    Stage 1 splits the text into sentences, stage 2 embeds them, stage 3
    finds coarse topic boundaries with ruptures PELT, and stage 4 refines
    each coarse segment by cutting wherever adjacent-sentence similarity
    drops below a percentile-based level.

    Args:
        text: transcript text to analyze.
        penalty: PELT penalty (larger -> fewer change points detected).
        threshold: similarity percentile (larger -> finer splitting).
        model_name: sentence-embedding model to use.

    Returns:
        list[str]: the final segments ([text] when < 2 sentences).
    """
    # Stage 1: fast regex-based sentence split.
    print("1๋‹จ๊ณ„: ๋ฌธ์žฅ ๋ถ„๋ฆฌ ์ค‘...")
    sentences = simple_sentence_split(text)
    print(f"์ด {len(sentences)}๊ฐœ์˜ ๋ฌธ์žฅ์ด ๊ฐ์ง€๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

    if len(sentences) < 2:
        print("๋ฌธ์žฅ์ด ๋„ˆ๋ฌด ์ ์–ด ๋ถ„๋ฆฌํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
        return [text]

    # Stage 2: sentence embeddings (multilingual MiniLM).
    embed_t0 = time.time()
    print("2๋‹จ๊ณ„: ๋ฌธ์žฅ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ์ค‘...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"์‚ฌ์šฉ ์ค‘์ธ ๋””๋ฐ”์ด์Šค: {device}")

    model = SentenceTransformer(model_name, device=device)

    # FP16 on GPU to cut memory use.
    if device == "cuda":
        model.half()

    # Moderate batch size keeps memory bounded.
    embeddings = model.encode(sentences, batch_size=32, show_progress_bar=True)
    print(f"์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ์™„๋ฃŒ: {time.time() - embed_t0:.2f}์ดˆ")

    # Stage 3: coarse (major-topic) boundaries via ruptures PELT.
    pelt_t0 = time.time()
    print("3๋‹จ๊ณ„: ruptures PELT๋ฅผ ์ด์šฉํ•œ ๋Œ€์ฃผ์ œ ๋ถ„๋ฆฌ ์ค‘...")
    detector = rpt.Pelt(model="rbf").fit(embeddings)
    breakpoints = detector.predict(pen=penalty)
    print(f"๊ฐ์ง€๋œ ๋Œ€์ฃผ์ œ ๋ณ€ํ™”์ : {len(breakpoints)-1}๊ฐœ")
    print(f"PELT ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {time.time() - pelt_t0:.2f}์ดˆ")

    # Stage 4: refine each coarse segment (SemanticChunker-style cut).
    sub_t0 = time.time()
    print("4๋‹จ๊ณ„: ์˜๋ฏธ ์œ ์‚ฌ๋„ ๊ธฐ๋ฐ˜ ์†Œ์ฃผ์ œ ์ •๋ฐ€ ๋ถ„ํ•  ์ค‘...")
    final_segments = []
    seg_start = 0

    for boundary in breakpoints:
        span_embeds = embeddings[seg_start:boundary]
        span_sents = sentences[seg_start:boundary]

        if len(span_sents) > 1:
            # Cosine similarity of each adjacent sentence pair.
            adjacent = [
                cosine_similarity([span_embeds[j]], [span_embeds[j + 1]])[0][0]
                for j in range(len(span_embeds) - 1)
            ]

            # Cut where similarity falls below the (100 - threshold)th percentile.
            cut_level = np.percentile(adjacent, 100 - threshold)

            chunk = [span_sents[0]]
            # adjacent[k] compares span_sents[k] with span_sents[k+1],
            # so zip pairs each similarity with the *following* sentence.
            for sim, nxt in zip(adjacent, span_sents[1:]):
                if sim < cut_level:
                    final_segments.append(" ".join(chunk))
                    chunk = [nxt]
                else:
                    chunk.append(nxt)
            final_segments.append(" ".join(chunk))
        else:
            final_segments.extend(span_sents)

        seg_start = boundary

    print(f"์ตœ์ข… ๋ถ„ํ•  ์™„๋ฃŒ: ์ด {len(final_segments)}๊ฐœ์˜ ์„ธ๊ทธ๋จผํŠธ")
    return final_segments
226
+
227
+
228
def process_youtube_video(youtube_url, penalty=5.0, threshold=90, output_dir="."):
    """
    Process a YouTube video into topical segments (callable from outside).

    Fetches the transcript, segments it, maps each segment back to a start
    timestamp, and writes the result as JSON named after the video ID.

    Args:
        youtube_url: YouTube URL.
        penalty: ruptures PELT penalty value.
        threshold: semantic-similarity percentile threshold.
        output_dir: output directory (created if it does not exist).

    Returns:
        str: path of the generated JSON file.

    Raises:
        Exception: wraps any failure during download/segmentation/saving.
    """
    try:
        # Fetch the transcript.
        print("์ž๋ง‰์„ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘...")
        transcript_text, timeline_data = get_youtube_transcript(youtube_url)
        print(f"์ž๋ง‰ ๊ธธ์ด: {len(transcript_text)} ๊ธ€์ž\n")

        # Run topic segmentation.
        print("์ฃผ์ œ ๋ถ„๋ฆฌ ์‹œ์ž‘...\n")
        segments = segment_youtube_transcript(
            transcript_text,
            penalty=penalty,
            threshold=threshold
        )

        # Timestamp-mapping helper.
        def find_start_time(segment_text, timeline_data):
            """
            Find the timeline entry matching the start of the segment and
            return its start time in seconds (0.0 when nothing matches).
            """
            # Leading part of the segment used as the search needle.
            segment_start = segment_text.strip()[:100]  # first 100 chars

            if not segment_start:
                return 0.0

            # Full caption text, joined the same way the transcript was built.
            full_timeline_text = " ".join([item['text'] for item in timeline_data])

            # Locate where the segment begins inside the full caption text.
            try:
                # Search with the first 30 chars (longer needles may fail to match).
                search_text = segment_start[:30].strip()
                position = full_timeline_text.find(search_text)

                if position == -1:
                    # Not found: retry with a shorter needle.
                    search_text = segment_start[:15].strip()
                    position = full_timeline_text.find(search_text)

                if position >= 0:
                    # Walk the timeline to find which entry covers that offset.
                    char_count = 0
                    for item in timeline_data:
                        if char_count + len(item['text']) + 1 > position:  # +1 for the joining space
                            return item['start']
                        char_count += len(item['text']) + 1

            except Exception:
                # Best-effort mapping: any failure falls through to 0.0.
                pass

            # Fall back to the start of the video.
            return 0.0

        def format_time(seconds):
            """Convert seconds to HH:MM:SS."""
            hours = int(seconds // 3600)
            minutes = int((seconds % 3600) // 60)
            secs = int(seconds % 60)
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"

        # Build the JSON payload (flat key structure).
        result_data = []
        for i, segment in enumerate(segments):
            start_time = find_start_time(segment, timeline_data)
            result_data.append({
                "url": youtube_url,
                "chunk_id": f"chunk_{i+1}",
                "chunk_time": format_time(start_time),
                "text": segment
            })

        # Name the file after the video ID; create the directory if missing
        # (FIX: previously a nonexistent output_dir made open() fail).
        video_id = extract_video_id(youtube_url)
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f"{video_id}.json")

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        print(f"\n๊ฒฐ๊ณผ๊ฐ€ {output_file}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
        print(f"์ด {len(segments)}๊ฐœ์˜ ์„ธ๊ทธ๋จผํŠธ๊ฐ€ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\n")

        return output_file

    except Exception as e:
        # Chain the original cause so the traceback stays useful.
        raise Exception(f"์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}") from e
328
+
329
+
330
def main():
    """
    Interactive CLI entry point.

    Prompts for a YouTube URL and segmentation parameters, fetches and
    segments the transcript, writes the result to <video_id>.json in the
    current directory, and prints a preview of the first three segments.
    """
    print("=== ์œ ํŠœ๋ธŒ ํŠธ๋žœ์Šคํฌ๋ฆฝํŠธ ์ฃผ์ œ ๋ถ„๋ฆฌ ๋„๊ตฌ ===\n")

    # Report whether an API key is present in .env.
    # NOTE(review): the key is only printed here, never used below — presumably
    # reserved for a future YouTube Data API integration; confirm.
    youtube_api_key = os.getenv('YOUTUBE_API_KEY')
    if youtube_api_key:
        print(f"YOUTUBE_API_KEY ๋กœ๋“œ ์™„๋ฃŒ: {youtube_api_key[:10]}...\n")
    else:
        print("๊ฒฝ๊ณ : YOUTUBE_API_KEY๊ฐ€ .env ํŒŒ์ผ์— ์—†์Šต๋‹ˆ๋‹ค.\n")

    # Prompt for the YouTube URL.
    youtube_url = input("์œ ํŠœ๋ธŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”: ").strip()

    if not youtube_url:
        print("URL์ด ์ž…๋ ฅ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
        return

    try:
        # Fetch the transcript.
        print("\n์ž๋ง‰์„ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘...")
        transcript_text, timeline_data = get_youtube_transcript(youtube_url)
        print(f"์ž๋ง‰ ๊ธธ์ด: {len(transcript_text)} ๊ธ€์ž\n")

        # Read analysis parameters from the user.
        print("๋ถ„์„ ํŒŒ๋ผ๋ฏธํ„ฐ ์„ค์ •:")
        print("Penalty: ๋Œ€์ฃผ์ œ ๋ถ„๋ฆฌ ๋ฏผ๊ฐ๋„ (์ž‘์„์ˆ˜๋ก ๋” ๋งŽ์€ ์ฃผ์ œ๋กœ ๋ถ„๋ฆฌ, ๊ถŒ์žฅ: 3-7)")
        penalty_input = input("Penalty ๊ฐ’ (๊ธฐ๋ณธ๊ฐ’: 5.0): ").strip()
        penalty = float(penalty_input) if penalty_input else 5.0

        print("\nThreshold: ์†Œ์ฃผ์ œ ๋ถ„๋ฆฌ ๋ฏผ๊ฐ๋„")
        print(" - ๋‚ฎ์„์ˆ˜๋ก(70-80): ๋” ๋งŽ์€ ์„ธ๊ทธ๋จผํŠธ๋กœ ๋ถ„๋ฆฌ (์„ธ๋ฐ€ํ•œ ๋ถ„๋ฆฌ)")
        print(" - ์ค‘๊ฐ„(85-92): ๊ท ํ˜•์žกํžŒ ๋ถ„๋ฆฌ (๊ถŒ์žฅ)")
        print(" - ๋†’์„์ˆ˜๋ก(95-98): ๋” ์ ์€ ์„ธ๊ทธ๋จผํŠธ๋กœ ๋ถ„๋ฆฌ (ํฐ ๋ฉ์–ด๋ฆฌ)")
        threshold_input = input("Threshold ๊ฐ’ (๊ธฐ๋ณธ๊ฐ’: 90): ").strip()
        threshold = int(threshold_input) if threshold_input else 90

        # Run topic segmentation.
        print("\n์ฃผ์ œ ๋ถ„๋ฆฌ ์‹œ์ž‘...\n")
        segments = segment_youtube_transcript(
            transcript_text,
            penalty=penalty,
            threshold=threshold
        )

        # Timestamp-mapping helper.
        # NOTE(review): duplicated verbatim in process_youtube_video —
        # a candidate for extraction to module level.
        def find_start_time(segment_text, timeline_data):
            """
            Find the timeline entry matching the start of the segment and
            return its start time in seconds (0.0 when nothing matches).
            """
            # Leading part of the segment used as the search needle.
            segment_start = segment_text.strip()[:100]  # first 100 chars

            if not segment_start:
                return 0.0

            # Full caption text, joined the same way the transcript was built.
            full_timeline_text = " ".join([item['text'] for item in timeline_data])

            # Locate where the segment begins inside the full caption text.
            try:
                # Search with the first 30 chars (longer needles may fail to match).
                search_text = segment_start[:30].strip()
                position = full_timeline_text.find(search_text)

                if position == -1:
                    # Not found: retry with a shorter needle.
                    search_text = segment_start[:15].strip()
                    position = full_timeline_text.find(search_text)

                if position >= 0:
                    # Walk the timeline to find which entry covers that offset.
                    char_count = 0
                    for item in timeline_data:
                        if char_count + len(item['text']) + 1 > position:  # +1 for the joining space
                            return item['start']
                        char_count += len(item['text']) + 1

            except Exception:
                pass

            # Fall back to the start of the video.
            return 0.0

        def format_time(seconds):
            """
            Convert seconds to HH:MM:SS.
            """
            hours = int(seconds // 3600)
            minutes = int((seconds % 3600) // 60)
            secs = int(seconds % 60)
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"

        # Build the JSON payload (flat key structure).
        result_data = []
        for i, segment in enumerate(segments):
            start_time = find_start_time(segment, timeline_data)
            result_data.append({
                "url": youtube_url,
                "chunk_id": f"chunk_{i+1}",
                "chunk_time": format_time(start_time),
                "text": segment
            })

        # Name the output file after the video ID (current directory).
        video_id = extract_video_id(youtube_url)
        output_file = f"{video_id}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        print(f"\n๊ฒฐ๊ณผ๊ฐ€ {output_file}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
        print(f"์ด {len(segments)}๊ฐœ์˜ ์„ธ๊ทธ๋จผํŠธ๊ฐ€ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\n")

        # Preview the first three segments.
        print("=" * 80)
        print("๊ฒฐ๊ณผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ (์ฒซ 3๊ฐœ ์„ธ๊ทธ๋จผํŠธ)")
        print("=" * 80)
        for item in result_data[:3]:
            print(f"\nChunk ID: {item['chunk_id']}")
            print(f"URL: {item['url']}")
            print(f"์‹œ์ž‘ ์‹œ๊ฐ„: {item['chunk_time']}")
            print(f"Text: {item['text'][:200]}..." if len(item['text']) > 200 else f"Text: {item['text']}")
            print("-" * 80)

    except Exception as e:
        print(f"\n์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
458
+
459
+
460
# Script entry point: launch the interactive CLI only when run directly.
if __name__ == "__main__":
    main()