SerotoninRonin committed on
Commit
da66358
·
1 Parent(s): 4ad672b

Add YouTube transcript extraction tool and update imports

Browse files
Files changed (3) hide show
  1. agents.py +4 -2
  2. requirements.txt +2 -1
  3. tools.py +31 -1
agents.py CHANGED
@@ -11,7 +11,8 @@ from tools import (
11
  webpage_extraction_tool,
12
  brave_web_search,
13
  python_code_interpreter_tool,
14
- audio_file_transcriber
 
15
  )
16
 
17
  class AgentState(TypedDict):
@@ -26,7 +27,8 @@ tools = [
26
  webpage_extraction_tool,
27
  brave_web_search,
28
  python_code_interpreter_tool,
29
- audio_file_transcriber
 
30
  ]
31
 
32
  rate_limiter = InMemoryRateLimiter(
 
11
  webpage_extraction_tool,
12
  brave_web_search,
13
  python_code_interpreter_tool,
14
+ audio_file_transcriber,
15
+ get_youtube_transcript
16
  )
17
 
18
  class AgentState(TypedDict):
 
27
  webpage_extraction_tool,
28
  brave_web_search,
29
  python_code_interpreter_tool,
30
+ audio_file_transcriber,
31
+ get_youtube_transcript
32
  ]
33
 
34
  rate_limiter = InMemoryRateLimiter(
requirements.txt CHANGED
@@ -8,4 +8,5 @@ assemblyai
8
  openpyxl
9
  langchain-community
10
  pandas
11
- rizaio
 
 
8
  openpyxl
9
  langchain-community
10
  pandas
11
+ rizaio
12
+ youtube_transcript_api
tools.py CHANGED
@@ -7,7 +7,7 @@ from langchain_core.messages import HumanMessage
7
  from langchain_community.tools.riza.command import ExecPython
8
  from langchain_community.tools import BraveSearch
9
  from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
10
- import requests
11
  import base64
12
  import pandas as pd
13
 
@@ -67,6 +67,36 @@ def audio_file_transcriber(file_path: str) -> str :
67
  docs = loader.load()
68
  return docs[0].page_content
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  brave_web_search = BraveSearch.from_search_kwargs({"max_results": 4})
72
 
 
7
  from langchain_community.tools.riza.command import ExecPython
8
  from langchain_community.tools import BraveSearch
9
  from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
10
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
11
  import base64
12
  import pandas as pd
13
 
 
67
  docs = loader.load()
68
  return docs[0].page_content
69
 
70
@tool
def get_youtube_transcript(video_url: str, lang: Optional[str] = 'en') -> str:
    """Extracts and returns the transcript of a YouTube video.

    Args:
        video_url (str): The full YouTube video URL. Watch (?v=), short
            (youtu.be/), /shorts/ and /embed/ links are recognised.
        lang (Optional[str]): The language code of the transcript.
            Defaults to 'en'; None is treated as 'en'.

    Returns:
        str: The full transcript as a single space-joined string, or a
        message if the URL is invalid or the transcript is not available.
    """
    import re

    # The dot in "youtu.be" is escaped so it only matches a literal dot;
    # the capture group is the 11-character video id.
    video_id_match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/embed/)([\w-]{11})", video_url
    )
    if not video_id_match:
        return "Invalid YouTube URL"

    video_id = video_id_match.group(1)
    # The signature allows None; fall back to the documented default.
    language = lang or 'en'

    try:
        # Older youtube-transcript-api releases expose a static
        # `list_transcripts`; newer ones replaced it with the instance
        # method `list`. Support whichever is installed.
        if hasattr(YouTubeTranscriptApi, "list_transcripts"):
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        else:
            transcript_list = YouTubeTranscriptApi().list(video_id)

        transcript = transcript_list.find_transcript([language])

        # Older releases yield dicts with a 'text' key; newer ones yield
        # snippet objects with a `.text` attribute.
        return " ".join(
            entry['text'] if isinstance(entry, dict) else entry.text
            for entry in transcript.fetch()
        )
    except TranscriptsDisabled:
        return "Transcript is disabled for this video."
    except NoTranscriptFound:
        return f"No transcript found in language: {language}"
    except Exception as e:
        # Tool boundary: report the failure as text to the calling agent
        # instead of raising.
        return f"Error retrieving transcript: {str(e)}"
100
 
101
  brave_web_search = BraveSearch.from_search_kwargs({"max_results": 4})
102