|
|
from langchain_community.document_loaders import YoutubeLoader |
|
|
from langchain_community.document_loaders.youtube import TranscriptFormat |
|
|
|
|
|
from langflow.custom import Component |
|
|
from langflow.inputs import DropdownInput, IntInput, MultilineInput |
|
|
from langflow.schema import Data |
|
|
from langflow.template import Output |
|
|
|
|
|
|
|
|
class YouTubeTranscriptsComponent(Component):
    """A component that extracts spoken content from YouTube videos as transcripts.

    The component exposes the video URL, the transcript format ("text" or
    "chunks"), the chunk size in seconds, the transcript language and an
    optional translation language as inputs, and produces its result through
    ``build_youtube_transcripts``.
    """

    display_name: str = "YouTube Transcripts"
    description: str = "Extracts spoken content from YouTube videos as transcripts."
    icon: str = "YouTube"
    name = "YouTubeTranscripts"

    inputs = [
        MultilineInput(
            name="url",
            display_name="Video URL",
            info="Enter the YouTube video URL to get transcripts from.",
            tool_mode=True,
        ),
        DropdownInput(
            name="transcript_format",
            display_name="Transcript Format",
            options=["text", "chunks"],
            value="text",
            info="The format of the transcripts. Either 'text' for a single output "
            "or 'chunks' for timestamped chunks.",
            advanced=True,
        ),
        IntInput(
            name="chunk_size_seconds",
            display_name="Chunk Size (seconds)",
            value=60,
            advanced=True,
            info="The size of each transcript chunk in seconds. Only applicable when "
            "'Transcript Format' is set to 'chunks'.",
        ),
        DropdownInput(
            name="language",
            display_name="Language",
            options=[
                "af",
                "ak",
                "sq",
                "am",
                "ar",
                "hy",
                "as",
                "ay",
                "az",
                "bn",
                "eu",
                "be",
                "bho",
                "bs",
                "bg",
                "my",
                "ca",
                "ceb",
                "zh",
                "zh-HK",
                "zh-CN",
                "zh-SG",
                "zh-TW",
                "zh-Hans",
                "zh-Hant",
                "hak-TW",
                "nan-TW",
                "co",
                "hr",
                "cs",
                "da",
                "dv",
                "nl",
                "en",
                "en-US",
                "eo",
                "et",
                "ee",
                "fil",
                "fi",
                "fr",
                "gl",
                "lg",
                "ka",
                "de",
                "el",
                "gn",
                "gu",
                "ht",
                "ha",
                "haw",
                "iw",
                "hi",
                "hmn",
                "hu",
                "is",
                "ig",
                "id",
                "ga",
                "it",
                "ja",
                "jv",
                "kn",
                "kk",
                "km",
                "rw",
                "ko",
                "kri",
                "ku",
                "ky",
                "lo",
                "la",
                "lv",
                "ln",
                "lt",
                "lb",
                "mk",
                "mg",
                "ms",
                "ml",
                "mt",
                "mi",
                "mr",
                "mn",
                "ne",
                "nso",
                "no",
                "ny",
                "or",
                "om",
                "ps",
                "fa",
                "pl",
                "pt",
                "pa",
                "qu",
                "ro",
                "ru",
                "sm",
                "sa",
                "gd",
                "sr",
                "sn",
                "sd",
                "si",
                "sk",
                "sl",
                "so",
                "st",
                "es",
                "su",
                "sw",
                "sv",
                "tg",
                "ta",
                "tt",
                "te",
                "th",
                "ti",
                "ts",
                "tr",
                "tk",
                "uk",
                "ur",
                "ug",
                "uz",
                "vi",
                "cy",
                "fy",
                "xh",
                "yi",
                "yo",
                "zu",
            ],
            value="en",
            info=(
                "Specify to make sure the transcripts are retrieved in your desired language. "
                "Defaults to English: 'en'"
            ),
        ),
        DropdownInput(
            name="translation",
            display_name="Translation Language",
            advanced=True,
            options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
            info="Translate the transcripts to the specified language. Leave empty for no translation.",
        ),
    ]

    outputs = [
        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
    ]

    def build_youtube_transcripts(self) -> Data | list[Data]:
        """Build transcripts from the provided YouTube URL.

        Returns:
            Data | list[Data]: In "text" mode, a single Data object whose
            ``transcripts`` key holds the text of the first loaded document.
            In any other mode, a list of Data objects (one per loaded
            document) with ``content`` and ``metadata`` keys. If anything
            fails, or "text" mode yields no documents, a single Data object
            with an ``error`` key describing the problem is returned instead
            of raising.
        """
        try:
            as_text = self.transcript_format == "text"
            loader = YoutubeLoader.from_youtube_url(
                self.url,
                transcript_format=TranscriptFormat.TEXT if as_text else TranscriptFormat.CHUNKS,
                chunk_size_seconds=self.chunk_size_seconds,
                language=[self.language],
                # The empty dropdown option (or an unset value) means "no translation".
                translation=self.translation or None,
            )
            transcripts = loader.load()

            if not as_text:
                # An empty result stays an empty list here, as before.
                return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]

            if not transcripts:
                # Without this guard, indexing [0] below raised IndexError and the
                # user only saw "list index out of range".
                # NOTE(review): presumably the loader returns no documents when the
                # video has no transcripts available - confirm against the loader.
                return Data(
                    data={
                        "error": "Failed to get YouTube transcripts: "
                        "no transcripts were returned for this video."
                    }
                )

            return Data(data={"transcripts": transcripts[0].page_content})
        except Exception as exc:
            # Deliberate best-effort boundary: every failure is reported as an
            # error payload rather than propagated to the caller.
            return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})
|
|
|