Spaces:

yoon2566
/

script

Sleeping

App Files Community

yoon2566 committed on Jan 9, 2025

Commit

ca4368e

verified ·

1 Parent(s): 6cb0513

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -56

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import gradio as gr
-from youtube_transcript_api import YouTubeTranscriptApi
 import re
 def extract_video_id(url):
     """Extract video ID from YouTube URL"""
@@ -16,67 +18,54 @@ def extract_video_id(url):
     return None
 def get_transcript(url):
-    """Get transcript from YouTube video"""
     try:
-        # 1. Extract video ID
-        video_id = extract_video_id(url)
-        if not video_id:
-            return "유효한 YouTube URL이 아닙니다. 다시 확인해주세요."
-        # 2. Get transcript list
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
-        # 3. Try to get transcript in preferred order
-        transcript = None
-        try:
-            # Try manual Korean transcript first
-            transcript = transcript_list.find_manually_created_transcript(['ko'])
-        except:
-            try:
-                # Try auto-generated Korean transcript
-                transcript = transcript_list.find_generated_transcript(['ko'])
-            except:
-                try:
-                    # Try English transcript
-                    transcript = transcript_list.find_transcript(['en'])
-                except:
-                    # Try any available transcript
-                    transcript = transcript_list.find_transcript(['ko', 'en', 'ja', 'zh-Hans'])
-        # 4. Fetch and format transcript
-        if transcript:
-            transcript_data = transcript.fetch()
-            formatted_transcript = ""
-            for entry in transcript_data:
-                time = int(entry['start'])
-                minutes = time // 60
-                seconds = time % 60
-                text = entry['text'].replace('\n', ' ')
-                formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
-            return formatted_transcript if formatted_transcript else "자막을 추출할 수 없습니다."
-        else:
-            return "자막을 찾을 수 없습니다."
     except Exception as e:
         error_msg = str(e)
-        if "Subtitles are disabled" in error_msg:
-            # Try alternative method
-            try:
-                transcript_data = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
-                formatted_transcript = ""
-                for entry in transcript_data:
-                    time = int(entry['start'])
-                    minutes = time // 60
-                    seconds = time % 60
-                    text = entry['text'].replace('\n', ' ')
-                    formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
-                return formatted_transcript
-            except:
-                return "이 영상에서 자막을 추출할 수 없습니다."
-        else:
-            return f"오류가 발생했습니다: {error_msg}"
 # Create Gradio interface
 iface = gr.Interface(
@@ -89,7 +78,7 @@ iface = gr.Interface(
         label="추출된 스크립트",
         lines=20
     ),
-    title="YouTube 자막 추출기",
     description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
     allow_flagging="never"
 )

 import gradio as gr
+from pytube import YouTube
 import re
+import json
+from html import unescape
 def extract_video_id(url):
     """Extract video ID from YouTube URL"""
     return None
 def get_transcript(url):
+    """Get transcript from YouTube video using pytube"""
     try:
+        # Create YouTube object
+        yt = YouTube(url)
+        # Get captions
+        captions = yt.captions
+        # Try to get Korean captions first, then English
+        caption_track = None
+        if 'ko' in captions:
+            caption_track = captions['ko']
+        elif 'a.ko' in captions:  # auto-generated Korean
+            caption_track = captions['a.ko']
+        elif 'en' in captions:
+            caption_track = captions['en']
+        elif 'a.en' in captions:  # auto-generated English
+            caption_track = captions['a.en']
+        if caption_track is None:
+            return f"자막을 찾을 수 없습니다.\n제목: {yt.title}\n길이: {yt.length}초"
+        # Get the XML captions
+        xml_captions = caption_track.xml_captions
+        # Parse the captions
+        formatted_transcript = f"제목: {yt.title}\n\n"
+        # Simple XML parsing for timestamps and text
+        caption_pattern = r'<text start="(\d+(?:\.\d+)?)"[^>]*>(.*?)</text>'
+        matches = re.finditer(caption_pattern, xml_captions)
+        for match in matches:
+            start_time = float(match.group(1))
+            text = unescape(match.group(2)).replace('\n', ' ')
+            minutes = int(start_time // 60)
+            seconds = int(start_time % 60)
+            formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
+        return formatted_transcript
     except Exception as e:
         error_msg = str(e)
+        if "age restricted" in error_msg.lower():
+            return "연령 제한이 있는 영상입니다."
+        return f"자막 추출 중 오류가 발생했습니다: {error_msg}"
 # Create Gradio interface
 iface = gr.Interface(
         label="추출된 스크립트",
         lines=20
     ),
+    title="YouTube 자막 추출기 (pytube 버전)",
     description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
     allow_flagging="never"
 )