Spaces:

yoon2566
/

script

Sleeping

App Files Files Community

yoon2566 committed on Jan 9, 2025

Commit

bc49d10

verified ·

1 Parent(s): 8e48050

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -102

app.py CHANGED Viewed

@@ -1,126 +1,99 @@
 import gradio as gr
 from youtube_transcript_api import YouTubeTranscriptApi
-from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, NoTranscriptAvailable
-import requests
-import json
-import os
-def get_video_info(video_id):
-    """YouTube 영상 정보를 가져오는 함수"""
-    try:
-        response = requests.get(f'https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v={video_id}&format=json')
-        return response.json()
-    except:
-        return None
 def extract_video_id(url):
     """Extract the video ID from a YouTube URL.

     Two URL shapes are recognised: ``youtube.com/watch?...v=<id>`` (the
     ``v`` parameter may appear anywhere in the query string) and
     ``youtu.be/<id>``. At most the first 11 characters of the ID are
     returned.

     Args:
         url: The URL text entered by the user.

     Returns:
         The video ID string, or None when *url* is not a string or does
         not contain a recognisable ID.
     """
     import re

     if not isinstance(url, str):
         return None

     # Lazy optional prefix: prefer a "v=" directly after "?", otherwise
     # accept the first "&v=" later in the query string.
     match = re.search(
         r'youtube\.com/watch\?(?:[^#\s]*?&)??v=([^&#\s]{1,11})', url
     )
     if match is None:
         match = re.search(r'youtu\.be/([^?&#/\s]{1,11})', url)
     return match.group(1) if match else None
-def get_available_transcripts(video_id):
-    """사용 가능한 모든 자막 목록을 가져오는 함수"""
-    try:
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
-        return transcript_list
-    except:
-        return None
 def get_transcript(url):
     """Extract a timestamped transcript for the YouTube video at *url*.

     Transcripts are tried in this order: manually created Korean,
     auto-generated Korean, then English. If no transcript listing could be
     obtained, ``YouTubeTranscriptApi.get_transcript`` is called directly
     with Korean and English as the accepted languages.

     Args:
         url: The YouTube URL entered by the user.

     Returns:
         A string: the video title followed by one ``[MM:SS] text`` line per
         transcript entry, or a Korean user-facing error message.
     """
     try:
         video_id = extract_video_id(url)
         if not video_id:
             return "올바른 YouTube URL을 입력해주세요."

         # The metadata lookup supplies the title and rejects unknown videos.
         video_info = get_video_info(video_id)
         if not video_info:
             return "영상 정보를 가져올 수 없습니다. URL을 확인해주세요."

         try:
             transcript_data = None
             transcript_list = get_available_transcripts(video_id)
             if transcript_list:
                 # Ordered preference; the first lookup that both resolves
                 # and fetches successfully wins.
                 lookups = (
                     (transcript_list.find_manually_created_transcript, ['ko']),
                     (transcript_list.find_generated_transcript, ['ko']),
                     (transcript_list.find_transcript, ['en']),
                 )
                 for find, languages in lookups:
                     try:
                         transcript_data = find(languages).fetch()
                         break
                     except Exception:
                         continue
                 else:
                     return "지원되는 자막을 찾을 수 없습니다."

             if not transcript_data:
                 # No listing (or an empty fetch): ask the API directly.
                 transcript_data = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])

             # Title line, a blank line, then one line per entry.
             lines = [f"제목: {video_info.get('title', '제목 없음')}\n"]
             for entry in transcript_data:
                 minutes, seconds = divmod(int(entry['start']), 60)
                 lines.append(f"[{minutes:02d}:{seconds:02d}] {entry['text']}")
             return "\n".join(lines) + "\n"
         except NoTranscriptAvailable:
             return "이 영상에는 자막이 없습니다."
         except TranscriptsDisabled:
             return "이 영상은 자막이 비활성화되어 있습니다."
         except Exception as e:
             return f"자막 추출 중 오류가 발생했습니다: {str(e)}"
     except Exception as e:
         return f"처리 중 오류가 발생했습니다: {str(e)}"
-# Gradio 인터페이스 생성
 iface = gr.Interface(
     fn=get_transcript,
-    inputs=[
-        gr.Textbox(label="YouTube URL을 입력하세요", placeholder="https://www.youtube.com/watch?v=...")
-    ],
-    outputs=gr.Textbox(label="추출된 스크립트", lines=20),
-    title="YouTube 스크립트 추출기",
-    description="""
-    YouTube 영상의 URL을 입력하면 자동으로 스크립트를 추출합니다.
-    - 한국어 자막 (수동/자동)
-    - 영어 자막 (수동/자동)
-    을 순차적으로 시도합니다.
-    """,
-    examples=[
-        ["https://www.youtube.com/watch?v=example1"],
-        ["https://youtu.be/example2"]
-    ]
 )
-# 환경변수 설정
-os.environ['GRADIO_SERVER_NAME'] = "0.0.0.0"
-os.environ['GRADIO_SERVER_PORT'] = "7860"
-# 애플리케이션 실행
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 from youtube_transcript_api import YouTubeTranscriptApi
+import re
 def extract_video_id(url):
     """Extract the video ID from a YouTube URL.

     Recognised URL shapes:
       * ``youtube.com/watch?...v=<id>`` (``v`` anywhere in the query string)
       * ``youtu.be/<id>``
       * ``youtube.com/embed/<id>``
       * ``youtube.com/shorts/<id>``

     Args:
         url: The URL text entered by the user.

     Returns:
         The video ID (letters, digits, ``_`` and ``-``), or None when *url*
         is not a string or holds no non-empty ID in one of the shapes above.
     """
     if not isinstance(url, str):
         return None

     patterns = (
         # Lazy optional prefix: prefer "v=" right after "?", otherwise take
         # the first "&v=" later in the query string.
         r'youtube\.com/watch\?(?:[^#\s]*?&)??v=([A-Za-z0-9_-]+)',
         # Dots are escaped so look-alike hosts such as "youtuXbe" are rejected.
         r'(?:youtu\.be|youtube\.com/(?:embed|shorts))/([A-Za-z0-9_-]+)',
     )
     for pattern in patterns:
         match = re.search(pattern, url)
         if match:
             return match.group(1)
     return None
 def _format_transcript_entries(entries):
     """Render transcript entries as ``[MM:SS] text`` lines, one per entry."""
     lines = []
     for entry in entries:
         minutes, seconds = divmod(int(entry['start']), 60)
         # Collapse in-caption line breaks so each entry stays on one line.
         text = entry['text'].replace('\n', ' ')
         lines.append(f"[{minutes:02d}:{seconds:02d}] {text}\n")
     return "".join(lines)


 def get_transcript(url):
     """Return a timestamped transcript for the YouTube video at *url*.

     Transcripts are tried in this order: manually created Korean,
     auto-generated Korean, English, then Korean/English/Japanese/Simplified
     Chinese in any form. If none of those resolves, the first transcript in
     the listing is used.

     Args:
         url: The YouTube URL entered by the user.

     Returns:
         A string: one ``[MM:SS] text`` line per transcript entry, or a
         Korean user-facing error message.
     """
     # Bound before the try so the error handler can always refer to it.
     video_id = None
     try:
         video_id = extract_video_id(url)
         if not video_id:
             return "유효한 YouTube URL이 아닙니다. 다시 확인해주세요."

         transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

         # Ordered preference; the first lookup that resolves wins.
         lookups = (
             (transcript_list.find_manually_created_transcript, ['ko']),
             (transcript_list.find_generated_transcript, ['ko']),
             (transcript_list.find_transcript, ['en']),
             (transcript_list.find_transcript, ['ko', 'en', 'ja', 'zh-Hans']),
         )
         transcript = None
         for find, languages in lookups:
             try:
                 transcript = find(languages)
                 break
             except Exception:
                 continue

         if transcript is None:
             # Last resort: take whatever the listing offers first.
             transcript = next(iter(transcript_list), None)
         if transcript is None:
             return "자막을 찾을 수 없습니다."

         formatted_transcript = _format_transcript_entries(transcript.fetch())
         return formatted_transcript if formatted_transcript else "자막을 추출할 수 없습니다."
     except Exception as e:
         error_msg = str(e)
         if "Subtitles are disabled" not in error_msg:
             return f"오류가 발생했습니다: {error_msg}"
         # Retry through the direct API call before giving up.
         try:
             transcript_data = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
             return _format_transcript_entries(transcript_data)
         except Exception:
             return "이 영상에서 자막을 추출할 수 없습니다."
+# Create Gradio interface
 iface = gr.Interface(
     fn=get_transcript,
+    inputs=gr.Textbox(
+        label="YouTube URL",
+        placeholder="https://www.youtube.com/watch?v=..."
+    ),
+    outputs=gr.Textbox(
+        label="추출된 스크립트",
+        lines=20
+    ),
+    title="YouTube 자막 추출기",
+    description="YouTube 영상의 URL을 입력하면 자막을 추출합니다. (한국어 우선, 영어 차선)",
+    allow_flagging="never"
 )
+# Launch the app
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0")