Spaces:
Running
Running
| from langchain_community.document_loaders import YoutubeLoader | |
| from langchain_community.document_loaders.youtube import TranscriptFormat | |
| from langflow.custom import Component | |
| from langflow.inputs import DropdownInput, IntInput, MultilineInput | |
| from langflow.schema import Data | |
| from langflow.template import Output | |
class YouTubeTranscriptsComponent(Component):
    """A component that extracts spoken content from YouTube videos as transcripts.

    The component exposes the video URL, the transcript format ("text" or
    "chunks"), the chunk size, the transcript language and an optional
    translation language as inputs, and produces its result through
    ``build_youtube_transcripts``.
    """

    display_name: str = "YouTube Transcripts"
    description: str = "Extracts spoken content from YouTube videos as transcripts."
    icon: str = "YouTube"
    name = "YouTubeTranscripts"

    inputs = [
        MultilineInput(
            name="url",
            display_name="Video URL",
            info="Enter the YouTube video URL to get transcripts from.",
            tool_mode=True,
        ),
        DropdownInput(
            name="transcript_format",
            display_name="Transcript Format",
            options=["text", "chunks"],
            value="text",
            info="The format of the transcripts. Either 'text' for a single output "
            "or 'chunks' for timestamped chunks.",
            advanced=True,
        ),
        IntInput(
            name="chunk_size_seconds",
            display_name="Chunk Size (seconds)",
            value=60,
            advanced=True,
            info="The size of each transcript chunk in seconds. Only applicable when "
            "'Transcript Format' is set to 'chunks'.",
        ),
        DropdownInput(
            name="language",
            display_name="Language",
            # Language codes offered in the dropdown; order is preserved as-is.
            options=[
                "af", "ak", "sq", "am", "ar", "hy", "as", "ay", "az", "bn",
                "eu", "be", "bho", "bs", "bg", "my", "ca", "ceb", "zh", "zh-HK",
                "zh-CN", "zh-SG", "zh-TW", "zh-Hans", "zh-Hant", "hak-TW", "nan-TW", "co", "hr", "cs",
                "da", "dv", "nl", "en", "en-US", "eo", "et", "ee", "fil", "fi",
                "fr", "gl", "lg", "ka", "de", "el", "gn", "gu", "ht", "ha",
                "haw", "iw", "hi", "hmn", "hu", "is", "ig", "id", "ga", "it",
                "ja", "jv", "kn", "kk", "km", "rw", "ko", "kri", "ku", "ky",
                "lo", "la", "lv", "ln", "lt", "lb", "mk", "mg", "ms", "ml",
                "mt", "mi", "mr", "mn", "ne", "nso", "no", "ny", "or", "om",
                "ps", "fa", "pl", "pt", "pa", "qu", "ro", "ru", "sm", "sa",
                "gd", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es",
                "su", "sw", "sv", "tg", "ta", "tt", "te", "th", "ti", "ts",
                "tr", "tk", "uk", "ur", "ug", "uz", "vi", "cy", "fy", "xh",
                "yi", "yo", "zu",
            ],
            value="en",
            info=(
                "Specify to make sure the transcripts are retrieved in your desired language. "
                "Defaults to English: 'en'"
            ),
        ),
        DropdownInput(
            name="translation",
            display_name="Translation Language",
            advanced=True,
            # The empty option means "do not translate"; it is mapped to None
            # before being handed to the loader.
            options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
            info="Translate the transcripts to the specified language. Leave empty for no translation.",
        ),
    ]

    outputs = [
        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
    ]

    def build_youtube_transcripts(self) -> Data | list[Data]:
        """Build transcripts from the provided YouTube URL.

        Returns:
            Data | list[Data]: For the "text" format, a single Data object whose
                "transcripts" key holds the page content of the first loaded
                document. For any other format, a list of Data objects, each
                with "content" and "metadata" keys. If loading raises, or the
                loader yields no documents, a single Data object with an
                "error" key describing the failure.
        """
        try:
            as_text = self.transcript_format == "text"
            loader = YoutubeLoader.from_youtube_url(
                self.url,
                transcript_format=TranscriptFormat.TEXT if as_text else TranscriptFormat.CHUNKS,
                chunk_size_seconds=self.chunk_size_seconds,
                language=[self.language],
                # An empty dropdown selection means no translation was requested.
                translation=self.translation or None,
            )
            transcripts = loader.load()

            # Guard against an empty result: indexing it would otherwise be
            # reported as an opaque "list index out of range" error, and in
            # chunk mode an empty list would be returned with no explanation.
            if not transcripts:
                return Data(
                    data={
                        "error": "Failed to get YouTube transcripts: "
                        "no transcripts were found for this video."
                    }
                )

            if as_text:
                # Extract only the page_content from the Document
                return Data(data={"transcripts": transcripts[0].page_content})

            # For chunks, extract page_content and metadata separately
            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: any loader failure is reported as an error
            # Data object instead of being raised to the caller.
            return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})