Commit
·
da66358
1
Parent(s):
4ad672b
Add YouTube transcript extraction tool and update imports
Browse files- agents.py +4 -2
- requirements.txt +2 -1
- tools.py +31 -1
agents.py
CHANGED
|
@@ -11,7 +11,8 @@ from tools import (
|
|
| 11 |
webpage_extraction_tool,
|
| 12 |
brave_web_search,
|
| 13 |
python_code_interpreter_tool,
|
| 14 |
-
audio_file_transcriber
|
|
|
|
| 15 |
)
|
| 16 |
|
| 17 |
class AgentState(TypedDict):
|
|
@@ -26,7 +27,8 @@ tools = [
|
|
| 26 |
webpage_extraction_tool,
|
| 27 |
brave_web_search,
|
| 28 |
python_code_interpreter_tool,
|
| 29 |
-
audio_file_transcriber
|
|
|
|
| 30 |
]
|
| 31 |
|
| 32 |
rate_limiter = InMemoryRateLimiter(
|
|
|
|
| 11 |
webpage_extraction_tool,
|
| 12 |
brave_web_search,
|
| 13 |
python_code_interpreter_tool,
|
| 14 |
+
audio_file_transcriber,
|
| 15 |
+
get_youtube_transcript
|
| 16 |
)
|
| 17 |
|
| 18 |
class AgentState(TypedDict):
|
|
|
|
| 27 |
webpage_extraction_tool,
|
| 28 |
brave_web_search,
|
| 29 |
python_code_interpreter_tool,
|
| 30 |
+
audio_file_transcriber,
|
| 31 |
+
get_youtube_transcript
|
| 32 |
]
|
| 33 |
|
| 34 |
rate_limiter = InMemoryRateLimiter(
|
requirements.txt
CHANGED
|
@@ -8,4 +8,5 @@ assemblyai
|
|
| 8 |
openpyxl
|
| 9 |
langchain-community
|
| 10 |
pandas
|
| 11 |
-
rizaio
|
|
|
|
|
|
| 8 |
openpyxl
|
| 9 |
langchain-community
|
| 10 |
pandas
|
| 11 |
+
rizaio
|
| 12 |
+
youtube_transcript_api
|
tools.py
CHANGED
|
@@ -7,7 +7,7 @@ from langchain_core.messages import HumanMessage
|
|
| 7 |
from langchain_community.tools.riza.command import ExecPython
|
| 8 |
from langchain_community.tools import BraveSearch
|
| 9 |
from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
|
| 10 |
-
import
|
| 11 |
import base64
|
| 12 |
import pandas as pd
|
| 13 |
|
|
@@ -67,6 +67,36 @@ def audio_file_transcriber(file_path: str) -> str :
|
|
| 67 |
docs = loader.load()
|
| 68 |
return docs[0].page_content
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
brave_web_search = BraveSearch.from_search_kwargs({"max_results": 4})
|
| 72 |
|
|
|
|
| 7 |
from langchain_community.tools.riza.command import ExecPython
|
| 8 |
from langchain_community.tools import BraveSearch
|
| 9 |
from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
|
| 10 |
+
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 11 |
import base64
|
| 12 |
import pandas as pd
|
| 13 |
|
|
|
|
| 67 |
docs = loader.load()
|
| 68 |
return docs[0].page_content
|
| 69 |
|
| 70 |
+
@tool
def get_youtube_transcript(video_url: str, lang: Optional[str] = 'en') -> str:
    """Extracts and returns the transcript of a YouTube video.

    Args:
        video_url (str): The full YouTube video URL. Watch ("?v="), "youtu.be/",
            "/shorts/", "/embed/" and "/live/" style links are recognised.
        lang (Optional[str]): The language code of the transcript. Defaults to 'en';
            passing None also falls back to 'en'.

    Returns:
        str: The full transcript as a single space-joined string, or a
            human-readable message when the URL is invalid, transcripts are
            disabled, the language is unavailable, or retrieval fails.
    """
    import re

    # The dot in "youtu.be" is escaped so it only matches a literal dot; the
    # extra alternatives cover shorts/embed/live links that carry the id in the path.
    video_id_match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([\w-]{11})", video_url
    )
    if not video_id_match:
        return "Invalid YouTube URL"

    video_id = video_id_match.group(1)

    # The signature allows None; treat it as "use the default language" rather
    # than searching for a transcript whose language code is None.
    language = lang or 'en'

    try:
        # Older youtube_transcript_api releases expose a static list_transcripts();
        # newer ones replace it with the instance method list().
        if hasattr(YouTubeTranscriptApi, "list_transcripts"):
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        else:
            transcript_list = YouTubeTranscriptApi().list(video_id)

        transcript = transcript_list.find_transcript([language])

        # fetch() yields dicts in older releases and snippet objects with a
        # .text attribute in newer ones; accept both.
        return " ".join(
            entry["text"] if isinstance(entry, dict) else entry.text
            for entry in transcript.fetch()
        )
    except TranscriptsDisabled:
        return "Transcript is disabled for this video."
    except NoTranscriptFound:
        return f"No transcript found in language: {language}"
    except Exception as e:
        # Deliberately broad: the tool reports failures as text to the calling
        # agent instead of raising.
        return f"Error retrieving transcript: {str(e)}"
|
| 100 |
|
| 101 |
brave_web_search = BraveSearch.from_search_kwargs({"max_results": 4})
|
| 102 |
|