import sys
import os
from typing import Optional

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))


class YoutubeSearchTool:
    """Answer a question about a YouTube video by loading its transcript,
    chunking it, indexing the chunks in a FAISS vector store, and running
    a similarity search for the question.
    """

    def __init__(self, query: str, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Args:
            query: User question; expected to contain a YouTube watch URL.
            chunk_size: Maximum characters per transcript chunk.
            chunk_overlap: Character overlap between consecutive chunks.
        """
        # Deferred imports: project config and helpers are only needed when
        # the tool is actually constructed, keeping module import cheap.
        from configs.config import Config
        from utils.generate_search_string import generate_search_string

        env = Config()
        self.generate_search_string = generate_search_string
        self.llm = env.LLM_VIDEO_MODEL_NAME
        self.embedding_model = env.EMBED_MODEL
        self.query = query
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.language = "en"

    def extract_youtube_link(self) -> Optional[str]:
        """Return the first YouTube watch URL found in the query, or None.

        (Return annotation fixed: the original claimed ``str`` but returns
        ``None`` when no URL is present.)
        """
        import re

        youtube_url_pattern = r"https?://www\.youtube\.com/watch\?v=[\w-]+"
        match = re.search(youtube_url_pattern, self.query)
        return match.group(0) if match else None

    def video_loader(self):
        """Load the video transcript and split it into text chunks.

        Returns:
            list[str]: Transcript chunk strings (may be empty).

        Raises:
            ValueError: If the query contains no YouTube URL.
        """
        from langchain_community.document_loaders import YoutubeLoader
        from langchain_community.document_loaders.youtube import TranscriptFormat
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        url = self.extract_youtube_link()
        if url is None:
            # BUG FIX: previously a None url was passed straight to the loader.
            raise ValueError("No YouTube URL found in the query.")

        loader = YoutubeLoader.from_youtube_url(
            url,
            add_video_info=True,
            transcript_format=TranscriptFormat.CHUNKS,
            chunk_size_seconds=30,
            language=self.language,
        )
        documents = loader.load()

        # BUG FIX: the chunking/return below was commented out, so this method
        # returned None and downstream code never saw any transcript text.
        combined_text = "".join(doc.page_content for doc in documents)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return splitter.split_text(combined_text)

    def vector_store(self):
        """Build and return a FAISS vector store over the transcript chunks.

        Returns:
            A FAISS store, or None when no transcript chunks were found.
        """
        from langchain_community.vectorstores import FAISS
        from langchain_openai import OpenAIEmbeddings
        # BUG FIX: Document was referenced here without being imported in
        # this scope (it was only imported inside video_loader).
        from langchain.schema import Document

        chunks = self.video_loader()
        if not chunks:
            return None

        docs = [
            Document(page_content=chunk, metadata={"source": self.query})
            for chunk in chunks
        ]
        # BUG FIX: the original assigned the store to self.vector_store,
        # clobbering this method on the instance, and implicitly returned
        # None to run() — which then crashed on similarity_search_with_score.
        return FAISS.from_documents(docs, embedding=OpenAIEmbeddings())

    def run(self, query: str):
        """Answer ``query`` against the indexed video transcript.

        Returns:
            A list of ``{"content", "score", "source"}`` dicts for the best
            match, or an explanatory message string when nothing matched.
        """
        print("----- YouTube Search Tool Run ---")
        store = self.vector_store()
        if store is None:
            # No chunks were produced; report it instead of crashing on a
            # missing store (the original returned this string from
            # vector_store and then called a search method on it).
            return "No relevant video chunks found."
        results = store.similarity_search_with_score(query, k=1)
        if not results:
            return "No relevant video chunks found in the vector store."
        return [
            {
                "content": doc.page_content,
                "score": score,
                "source": doc.metadata.get("source", "Unknown"),
            }
            for doc, score in results
        ]


if __name__ == "__main__":
    # Example usage
    query = (
        "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. "
        "What does Teal'c say in response to the question 'Isn't that hot?'"
    )
    youtube_tool = YoutubeSearchTool(query=query)
    # BUG FIX: the result was previously discarded; print it.
    print(youtube_tool.run(query))
    # BUG FIX: extract_youtube_link takes no arguments (it reads self.query).
    print(f"Search URL: {youtube_tool.extract_youtube_link()}")
    print("Video chunks loaded successfully.")