yoon2566 committed on
Commit
ca4368e
·
verified ·
1 Parent(s): 6cb0513

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -56
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import gradio as gr
2
- from youtube_transcript_api import YouTubeTranscriptApi
3
  import re
 
 
4
 
5
  def extract_video_id(url):
6
  """Extract video ID from YouTube URL"""
@@ -16,67 +18,54 @@ def extract_video_id(url):
16
  return None
17
 
18
  def get_transcript(url):
19
- """Get transcript from YouTube video"""
20
  try:
21
- # 1. Extract video ID
22
- video_id = extract_video_id(url)
23
- if not video_id:
24
- return "유효한 YouTube URL이 아닙니다. 다시 확인해주세요."
25
-
26
- # 2. Get transcript list
27
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
28
 
29
- # 3. Try to get transcript in preferred order
30
- transcript = None
31
- try:
32
- # Try manual Korean transcript first
33
- transcript = transcript_list.find_manually_created_transcript(['ko'])
34
- except:
35
- try:
36
- # Try auto-generated Korean transcript
37
- transcript = transcript_list.find_generated_transcript(['ko'])
38
- except:
39
- try:
40
- # Try English transcript
41
- transcript = transcript_list.find_transcript(['en'])
42
- except:
43
- # Try any available transcript
44
- transcript = transcript_list.find_transcript(['ko', 'en', 'ja', 'zh-Hans'])
45
-
46
- # 4. Fetch and format transcript
47
- if transcript:
48
- transcript_data = transcript.fetch()
49
- formatted_transcript = ""
 
 
 
 
 
 
 
 
 
50
 
51
- for entry in transcript_data:
52
- time = int(entry['start'])
53
- minutes = time // 60
54
- seconds = time % 60
55
- text = entry['text'].replace('\n', ' ')
56
- formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
57
 
58
- return formatted_transcript if formatted_transcript else "자막을 추출할 수 없습니다."
59
- else:
60
- return "자막을 찾을 수 없습니다."
61
 
62
  except Exception as e:
63
  error_msg = str(e)
64
- if "Subtitles are disabled" in error_msg:
65
- # Try alternative method
66
- try:
67
- transcript_data = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
68
- formatted_transcript = ""
69
- for entry in transcript_data:
70
- time = int(entry['start'])
71
- minutes = time // 60
72
- seconds = time % 60
73
- text = entry['text'].replace('\n', ' ')
74
- formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
75
- return formatted_transcript
76
- except:
77
- return "이 영상에서 자막을 추출할 수 없습니다."
78
- else:
79
- return f"오류가 발생했습니다: {error_msg}"
80
 
81
  # Create Gradio interface
82
  iface = gr.Interface(
@@ -89,7 +78,7 @@ iface = gr.Interface(
89
  label="추출된 스크립트",
90
  lines=20
91
  ),
92
- title="YouTube 자막 추출기",
93
  description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
94
  allow_flagging="never"
95
  )
 
1
  import gradio as gr
2
+ from pytube import YouTube
3
  import re
4
+ import json
5
+ from html import unescape
6
 
7
  def extract_video_id(url):
8
  """Extract video ID from YouTube URL"""
 
18
  return None
19
 
20
  def get_transcript(url):
21
+ """Get transcript from YouTube video using pytube"""
22
  try:
23
+ # Create YouTube object
24
+ yt = YouTube(url)
 
 
 
 
 
25
 
26
+ # Get captions
27
+ captions = yt.captions
28
+
29
+ # Try to get Korean captions first, then English
30
+ caption_track = None
31
+ if 'ko' in captions:
32
+ caption_track = captions['ko']
33
+ elif 'a.ko' in captions: # auto-generated Korean
34
+ caption_track = captions['a.ko']
35
+ elif 'en' in captions:
36
+ caption_track = captions['en']
37
+ elif 'a.en' in captions: # auto-generated English
38
+ caption_track = captions['a.en']
39
+
40
+ if caption_track is None:
41
+ return f"자막을 찾을 수 없습니다.\n제목: {yt.title}\n길이: {yt.length}초"
42
+
43
+ # Get the XML captions
44
+ xml_captions = caption_track.xml_captions
45
+
46
+ # Parse the captions
47
+ formatted_transcript = f"제목: {yt.title}\n\n"
48
+
49
+ # Simple XML parsing for timestamps and text
50
+ caption_pattern = r'<text start="(\d+(?:\.\d+)?)"[^>]*>(.*?)</text>'
51
+ matches = re.finditer(caption_pattern, xml_captions)
52
+
53
+ for match in matches:
54
+ start_time = float(match.group(1))
55
+ text = unescape(match.group(2)).replace('\n', ' ')
56
 
57
+ minutes = int(start_time // 60)
58
+ seconds = int(start_time % 60)
 
 
 
 
59
 
60
+ formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
61
+
62
+ return formatted_transcript
63
 
64
  except Exception as e:
65
  error_msg = str(e)
66
+ if "age restricted" in error_msg.lower():
67
+ return "연령 제한이 있는 영상입니다."
68
+ return f"자막 추출 중 오류가 발생했습니다: {error_msg}"
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  # Create Gradio interface
71
  iface = gr.Interface(
 
78
  label="추출된 스크립트",
79
  lines=20
80
  ),
81
+ title="YouTube 자막 추출기 (pytube 버전)",
82
  description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
83
  allow_flagging="never"
84
  )