Spaces:

tregu0458
/

TubeTranscriptExtractor

Build error

File size: 3,669 Bytes

import re
from typing import Dict, List, Optional, Tuple

import gradio as gr
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable
)

class YoutubeTranscript:
    """Helpers for fetching YouTube transcripts via youtube_transcript_api."""

    @staticmethod
    def get_video_id(url: str) -> Optional[str]:
        """Extract the video ID from a YouTube URL.

        Recognises ``v=<id>`` query parameters as well as ``youtu.be/<id>``,
        ``/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>`` paths.

        Args:
            url: URL of the YouTube video.

        Returns:
            The video ID, or None when ``url`` is empty or contains no ID.
        """
        if not url:
            return None
        # The ID ends at the first query/fragment/path delimiter or whitespace,
        # so share links such as ``youtu.be/<id>?si=...`` do not leak their
        # extra parameters into the returned ID.
        match = re.search(
            r"(?:v=|be/|/shorts/|/embed/|/live/)([^&#?/\s]+)", url
        )
        return match.group(1) if match else None

    @staticmethod
    def get_transcript(url: str, language: str = 'ja') -> Optional[List[Dict]]:
        """Fetch the transcript of a YouTube video.

        Args:
            url: URL of the YouTube video.
            language: Language code of the transcript (ja, en, en-US).

        Returns:
            The transcript entries returned by the API, or None when the URL
            is invalid, transcripts are disabled, the video is unavailable,
            or any other unexpected error occurs.

        Raises:
            NoTranscriptFound: No transcript exists in the requested language.
        """
        try:
            video_id = YoutubeTranscript.get_video_id(url)
            if video_id is None:
                print(f"無効なURLです: {url}")
                return None

            # NOTE(review): this is the static ``get_transcript`` entry point;
            # newer library releases may expose a different API - confirm
            # against the pinned youtube_transcript_api version.
            return YouTubeTranscriptApi.get_transcript(
                video_id,
                languages=[language]  # use the requested language only
            )

        except NoTranscriptFound:
            # Re-raise the library's own exception. Constructing a new
            # NoTranscriptFound from a single message string is not possible:
            # its constructor requires the video ID, the requested language
            # codes and the transcript data.
            print(f"指定した言語 ({language}) の字幕が見つかりません: {url}")
            raise

        except TranscriptsDisabled:
            print(f"この動画では字幕が無効になっています: {url}")
            return None

        except VideoUnavailable:
            print(f"動画が利用できません: {url}")
            return None

        except Exception as e:
            # Best-effort boundary: report anything else and signal failure
            # with None, as documented above.
            print(f"予期せぬエラーが発生しました: {str(e)}")
            return None


# Gradio callback, defined at module level rather than as a method of YoutubeTranscript.
def get_transcript_for_gradio(url: str, language: str) -> Tuple[str, str]:
    """Fetch a transcript and format it for the Gradio UI.

    Args:
        url: URL of the YouTube video.
        language: Language code of the transcript to fetch.

    Returns:
        A ``(text, count)`` pair of strings. On success ``text`` is the
        concatenated transcript text and ``count`` its character count. When
        no transcript is returned, both elements are the failure message. When
        an exception is caught, ``text`` is the error message and ``count``
        is the length of that message.
    """
    try:
        transcript = YoutubeTranscript.get_transcript(url, language)
        if not transcript:
            return "字幕の取得に失敗しました。", "字幕の取得に失敗しました。"

        # Entries are joined without a separator and without timestamps.
        formatted_transcript = "".join(entry['text'] for entry in transcript)
        return formatted_transcript, str(len(formatted_transcript))
    except NoTranscriptFound as e:
        # Exceptions have no len(); measure the message text instead.
        message = str(e)
        return message, str(len(message))
    except Exception as e:
        message = f"予期せぬエラーが発生しました: {str(e)}"
        return message, str(len(message))

# Input widgets: the video URL and the transcript language selector.
url_input = gr.Textbox(
    lines=1,
    placeholder="YouTube動画のURLを入力してください",
    show_copy_button=True,
)
language_input = gr.Radio(["ja", "en", "en-US"], label="言語", value="ja")

# Output widgets: the transcript text and its character count.
transcript_output = gr.Code(label="字幕", max_lines=10)
char_count_output = gr.Textbox(label="文字数")

# Wire the callback to the widgets; outputs map positionally onto the
# two values returned by get_transcript_for_gradio.
iface = gr.Interface(
    fn=get_transcript_for_gradio,
    inputs=[url_input, language_input],
    outputs=[transcript_output, char_count_output],
    title="YouTube字幕取得アプリ",
    description="YouTube動画のURLを入力すると、字幕を取得して表示します。",
)

# Start the web UI as soon as the script runs.
iface.launch()