Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from
|
| 3 |
import re
|
|
|
|
|
|
|
| 4 |
|
| 5 |
def extract_video_id(url):
|
| 6 |
"""Extract video ID from YouTube URL"""
|
|
@@ -16,67 +18,54 @@ def extract_video_id(url):
|
|
| 16 |
return None
|
| 17 |
|
| 18 |
def get_transcript(url):
|
| 19 |
-
"""Get transcript from YouTube video"""
|
| 20 |
try:
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
if not video_id:
|
| 24 |
-
return "μ ν¨ν YouTube URLμ΄ μλλλ€. λ€μ νμΈν΄μ£ΌμΈμ."
|
| 25 |
-
|
| 26 |
-
# 2. Get transcript list
|
| 27 |
-
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
#
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
minutes = time // 60
|
| 54 |
-
seconds = time % 60
|
| 55 |
-
text = entry['text'].replace('\n', ' ')
|
| 56 |
-
formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
|
| 62 |
except Exception as e:
|
| 63 |
error_msg = str(e)
|
| 64 |
-
if "
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
transcript_data = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
|
| 68 |
-
formatted_transcript = ""
|
| 69 |
-
for entry in transcript_data:
|
| 70 |
-
time = int(entry['start'])
|
| 71 |
-
minutes = time // 60
|
| 72 |
-
seconds = time % 60
|
| 73 |
-
text = entry['text'].replace('\n', ' ')
|
| 74 |
-
formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
|
| 75 |
-
return formatted_transcript
|
| 76 |
-
except:
|
| 77 |
-
return "이 영상에서 자막을 추출할 수 없습니다."
|
| 78 |
-
else:
|
| 79 |
-
return f"오류가 발생했습니다: {error_msg}"
|
| 80 |
|
| 81 |
# Create Gradio interface
|
| 82 |
iface = gr.Interface(
|
|
@@ -89,7 +78,7 @@ iface = gr.Interface(
|
|
| 89 |
label="추출된 스크립트",
|
| 90 |
lines=20
|
| 91 |
),
|
| 92 |
-
title="YouTube 자막 추출기",
|
| 93 |
description="YouTube μμμ URLμ μ
λ ₯νλ©΄ μλ§μ μΆμΆν©λλ€. (νκ΅μ΄ μ°μ , μμ΄ μ°¨μ )",
|
| 94 |
allow_flagging="never"
|
| 95 |
)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from pytube import YouTube
|
| 3 |
import re
|
| 4 |
+
import json
|
| 5 |
+
from html import unescape
|
| 6 |
|
| 7 |
def extract_video_id(url):
|
| 8 |
"""Extract video ID from YouTube URL"""
|
|
|
|
| 18 |
return None
|
| 19 |
|
| 20 |
def get_transcript(url):
    """Fetch a YouTube video's captions via pytube and format them as text.

    Caption tracks are tried in a fixed preference order: Korean,
    auto-generated Korean, English, auto-generated English. The first
    track that exists is used.

    Args:
        url: Address of the YouTube video; passed unchanged to
            ``pytube.YouTube``.

    Returns:
        On success, the video title followed by one ``[MM:SS] text`` line
        per caption entry. When none of the preferred tracks exists, a
        Korean message that also reports the video title and length. When
        any ``Exception`` is raised along the way, a Korean error message
        (a dedicated one for age-restricted videos) instead of the
        exception propagating to the caller.
    """
    try:
        yt = YouTube(url)
        captions = yt.captions

        # Preference order: manual Korean, auto Korean, manual English,
        # auto English.
        caption_track = None
        for code in ('ko', 'a.ko', 'en', 'a.en'):
            if code in captions:
                caption_track = captions[code]
                break

        if caption_track is None:
            return f"자막을 찾을 수 없습니다.\n제목: {yt.title}\n길이: {yt.length}초"

        xml_captions = caption_track.xml_captions
        header = f"제목: {yt.title}\n\n"
        return header + _format_caption_xml(xml_captions)

    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        error_msg = str(e)
        if "age restricted" in error_msg.lower():
            return "연령 제한이 있는 영상입니다."
        return f"자막 추출 중 오류가 발생했습니다: {error_msg}"


def _format_caption_xml(xml_captions):
    """Turn ``<text start="...">...</text>`` entries into ``[MM:SS] text`` lines."""
    caption_pattern = r'<text start="(\d+(?:\.\d+)?)"[^>]*>(.*?)</text>'
    lines = []
    # re.DOTALL lets '.' span line breaks, so a caption whose text contains
    # a newline is still matched; the newline is flattened to a space below.
    for match in re.finditer(caption_pattern, xml_captions, re.DOTALL):
        minutes, seconds = divmod(int(float(match.group(1))), 60)
        text = unescape(match.group(2)).replace('\n', ' ')
        lines.append(f"[{minutes:02d}:{seconds:02d}] {text}\n")
    return "".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
# Create Gradio interface
|
| 71 |
iface = gr.Interface(
|
|
|
|
| 78 |
label="추출된 스크립트",
|
| 79 |
lines=20
|
| 80 |
),
|
| 81 |
+
title="YouTube 자막 추출기 (pytube 버전)",
|
| 82 |
description="YouTube μμμ URLμ μ
λ ₯νλ©΄ μλ§μ μΆμΆν©λλ€. (νκ΅μ΄ μ°μ , μμ΄ μ°¨μ )",
|
| 83 |
allow_flagging="never"
|
| 84 |
)
|