Spaces:

yoon2566
/

script

Sleeping

App Files Files Community

yoon2566 committed on Jan 9, 2025

Commit

453a2d3

verified ·

1 Parent(s): fcb165f

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -60

app.py CHANGED Viewed

@@ -1,75 +1,114 @@
 import gradio as gr
-from pytube import YouTube
 import re
-import json
-from html import unescape
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised URL shapes are ``youtube.com/watch?...v=ID``,
    ``youtu.be/ID``, ``youtube.com/embed/ID`` and ``youtube.com/shorts/ID``.

    Args:
        url: The URL text to inspect. Empty or ``None`` yields ``None``.

    Returns:
        The video ID as a string, or ``None`` when no non-empty ID is found.
    """
    if not url:
        return None

    # An ID ends at the next query separator, fragment marker or whitespace.
    # Using ``+`` (not ``*``) means an empty ID is reported as "not found"
    # instead of being returned as ''.
    video_id = r'([^&?#\s]+)'
    patterns = (
        # ``v`` may follow other query parameters (e.g. ?feature=share&v=ID).
        r'youtube\.com/watch\?(?:[^#\s]*&)?v=' + video_id,
        r'youtu\.be/' + video_id,
        r'youtube\.com/(?:embed|shorts)/' + video_id,
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
def get_transcript(url):
    """Get a timestamped transcript for a YouTube video using pytube.

    Caption tracks are tried in this order: Korean, auto-generated Korean,
    English, auto-generated English. The chosen track's XML is turned into
    one ``[MM:SS] text`` line per caption, preceded by the video title.

    Args:
        url: The YouTube video URL passed to ``YouTube``.

    Returns:
        The formatted transcript, or a user-facing (Korean) message when no
        caption track exists or when any exception is raised.
    """
    try:
        yt = YouTube(url)
        captions = yt.captions

        # First matching language code wins; 'a.' marks auto-generated tracks.
        caption_track = None
        for code in ('ko', 'a.ko', 'en', 'a.en'):
            if code in captions:
                caption_track = captions[code]
                break

        if caption_track is None:
            return f"자막을 찾을 수 없습니다.\n제목: {yt.title}\n길이: {yt.length}초"

        xml_captions = caption_track.xml_captions

        # re.DOTALL lets a caption body span several lines; without it such
        # captions never match and are silently dropped from the transcript.
        caption_pattern = re.compile(
            r'<text start="(\d+(?:\.\d+)?)"[^>]*>(.*?)</text>',
            re.DOTALL,
        )

        # Collect the pieces and join once instead of repeated string +=.
        pieces = [f"제목: {yt.title}\n\n"]
        for match in caption_pattern.finditer(xml_captions):
            start_time = float(match.group(1))
            text = unescape(match.group(2)).replace('\n', ' ')
            minutes = int(start_time // 60)
            seconds = int(start_time % 60)
            pieces.append(f"[{minutes:02d}:{seconds:02d}] {text}\n")
        return "".join(pieces)
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        error_msg = str(e)
        if "age restricted" in error_msg.lower():
            return "연령 제한이 있는 영상입니다."
        return f"자막 추출 중 오류가 발생했습니다: {error_msg}"
 # Create Gradio interface
 iface = gr.Interface(
-    fn=get_transcript,
     inputs=gr.Textbox(
         label="YouTube URL",
         placeholder="https://www.youtube.com/watch?v=..."
@@ -78,7 +117,7 @@ iface = gr.Interface(
         label="추출된 스크립트",
         lines=20
     ),
-    title="YouTube 자막 추출기 (pytube 버전)",
     description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
     allow_flagging="never"
 )

 import gradio as gr
+import yt_dlp
 import re
+from datetime import timedelta
def format_timestamp(seconds):
    """Convert seconds to MM:SS format.

    Hours are folded into the minutes field, so 3661 seconds becomes
    ``"61:01"``. Fractional seconds are truncated and negative values are
    clamped to zero.

    Args:
        seconds: Offset in seconds (int or float).

    Returns:
        A zero-padded ``"MM:SS"`` string.
    """
    total_seconds = max(int(seconds), 0)
    minutes, secs = divmod(total_seconds, 60)
    return f"{minutes:02d}:{secs:02d}"
# Preferred subtitle languages, most preferred first.
_SUBTITLE_LANGS = ('ko', 'en')

# Start of a WebVTT cue timing line; the hours field is optional.
_VTT_CUE_TIMING = re.compile(r'^(?:(\d+):)?(\d{2}):(\d{2})\.\d{3}\s+-->\s')

# Inline cue markup such as <c> or <00:00:01.000> found in auto captions.
_VTT_INLINE_TAG = re.compile(r'<[^>]+>')


def _find_vtt_url(info):
    """Return the URL of the preferred WebVTT subtitle track, or None.

    For each language in ``_SUBTITLE_LANGS``, manual subtitles are tried
    before automatic captions. Only ``vtt`` tracks are considered because
    ``_format_vtt`` is the only parser available.
    """
    for lang in _SUBTITLE_LANGS:
        for source in ('subtitles', 'automatic_captions'):
            tracks = (info.get(source) or {}).get(lang) or []
            for track in tracks:
                if track.get('ext') == 'vtt' and track.get('url'):
                    return track['url']
    return None


def _format_vtt(vtt_text):
    """Convert WebVTT text into one "[MM:SS] text" line per cue."""
    cues = []  # (start offset in whole seconds, list of text lines)
    for raw_line in vtt_text.splitlines():
        line = raw_line.strip()
        timing = _VTT_CUE_TIMING.match(line)
        if timing:
            hours, minutes, secs = timing.groups()
            start = int(hours or 0) * 3600 + int(minutes) * 60 + int(secs)
            cues.append((start, []))
        elif cues and line:
            # Lines before the first cue (header, metadata) are skipped.
            text = _VTT_INLINE_TAG.sub('', line).strip()
            if text:
                cues[-1][1].append(text)

    formatted = []
    for start, texts in cues:
        if texts:
            # Hours are folded into the minutes field.
            minutes, secs = divmod(start, 60)
            formatted.append(f"[{minutes:02d}:{secs:02d}] {' '.join(texts)}\n")
    return ''.join(formatted)


def extract_transcript(url):
    """Extract a timestamped transcript from a YouTube video using yt-dlp.

    Subtitle preference is: manual Korean, automatic Korean, manual English,
    automatic English. The selected WebVTT track is fetched through the same
    ``YoutubeDL`` instance and rendered as ``[MM:SS] text`` lines under the
    video title.

    Args:
        url: The video URL passed to ``YoutubeDL.extract_info``.

    Returns:
        The formatted transcript, or a user-facing (Korean) message when no
        subtitle track is available or when any exception is raised.
    """
    try:
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': list(_SUBTITLE_LANGS),
            'skip_download': True,
            'quiet': True
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            title = info.get('title')

            vtt_url = _find_vtt_url(info)
            if vtt_url is None:
                return f"자막을 찾을 수 없습니다.\n제목: {title}"

            # Download the subtitle file itself; the info dict only holds
            # the track URL, not its contents.
            vtt_text = ydl.urlopen(vtt_url).read().decode('utf-8', errors='replace')

        return f"제목: {title}\n\n{_format_vtt(vtt_text)}"
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"자막 추출 중 오류가 발생했습니다: {e}"
 # Create Gradio interface
 iface = gr.Interface(
+    fn=extract_transcript,
     inputs=gr.Textbox(
         label="YouTube URL",
         placeholder="https://www.youtube.com/watch?v=..."
         label="추출된 스크립트",
         lines=20
     ),
+    title="YouTube 자막 추출기 (yt-dlp 버전)",
     description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
     allow_flagging="never"
 )