# Bshraman's picture
# Upload 16 files
# 08583a4 verified
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
class YoutubeSearchTool:
    """Answer questions about a YouTube video via transcript retrieval.

    Pipeline: extract the watch URL from the query, load and chunk the
    video transcript, index the chunks in a FAISS vector store, then
    return the transcript chunk(s) most similar to the question.
    """

    def __init__(self, query: str, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Store the query and chunking parameters and load model config.

        Args:
            query: User question, expected to contain a YouTube watch URL.
            chunk_size: Maximum characters per transcript chunk.
            chunk_overlap: Character overlap between adjacent chunks.
        """
        # Project-local imports kept at call time (matches original style).
        from configs.config import Config
        from utils.generate_search_string import generate_search_string

        env = Config()
        self.generate_search_string = generate_search_string
        self.llm = env.LLM_VIDEO_MODEL_NAME
        self.embedding_model = env.EMBED_MODEL
        self.query = query
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.language = "en"

    def extract_youtube_link(self) -> str:
        """Return the first YouTube watch URL found in the query.

        Returns:
            The matched URL string, or None when the query has no
            `https://www.youtube.com/watch?v=...` link.
        """
        import re

        youtube_url_pattern = r"https?://www\.youtube\.com/watch\?v=[\w-]+"
        match = re.search(youtube_url_pattern, self.query)
        return match.group(0) if match else None

    def video_loader(self):
        """Load the video transcript and split it into text chunks.

        Returns:
            list[str]: transcript chunks sized by ``chunk_size`` /
            ``chunk_overlap``; empty list when no URL is in the query.
        """
        from langchain_community.document_loaders import YoutubeLoader
        from langchain_community.document_loaders.youtube import TranscriptFormat
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        url = self.extract_youtube_link()
        if url is None:
            # BUG FIX: from_youtube_url(None, ...) would raise; fail soft.
            return []
        loader = YoutubeLoader.from_youtube_url(
            url,
            add_video_info=True,
            transcript_format=TranscriptFormat.CHUNKS,
            chunk_size_seconds=30,
            language=self.language,
        )
        documents = loader.load()
        # BUG FIX: the original had the splitting commented out and
        # implicitly returned None, which broke vector_store().
        combined_text = "".join(doc.page_content for doc in documents)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return splitter.split_text(combined_text)

    def vector_store(self):
        """Build and return a FAISS vector store over the transcript chunks.

        Returns:
            A FAISS store, or None when no transcript chunks were produced.
        """
        from langchain_community.vectorstores import FAISS
        from langchain_openai import OpenAIEmbeddings
        from langchain.schema import Document  # BUG FIX: was not in scope here

        chunks = self.video_loader()
        if not chunks:
            return None
        docs = [
            Document(page_content=chunk, metadata={"source": self.query})
            for chunk in chunks
        ]
        # BUG FIX: the original assigned the store to self.vector_store,
        # shadowing this method, and returned None so run() crashed.
        return FAISS.from_documents(docs, embedding=OpenAIEmbeddings())

    def run(self, query: str):
        """Search the video transcript for text relevant to *query*.

        Returns:
            A list of {"content", "score", "source"} dicts for the best
            matching chunk(s), or an explanatory string when nothing matched.
        """
        print("----- YouTube Search Tool Run ---")
        store = self.vector_store()
        if store is None:
            return "No relevant video chunks found."
        results = store.similarity_search_with_score(query, k=1)
        if not results:
            return "No relevant video chunks found in the vector store."
        return [
            {
                "content": doc.page_content,
                "score": score,
                "source": doc.metadata.get("source", "Unknown"),
            }
            for doc, score in results
        ]
if __name__ == "__main__":
    # Example usage: ask a question about a specific video.
    query = (
        "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. "
        "What does Teal'c say in response to the question 'Isn't that hot?'"
    )
    youtube_tool = YoutubeSearchTool(query=query)
    # BUG FIX: extract_youtube_link() takes no argument (it reads
    # self.query); the original passed `query` and raised TypeError.
    print(f"Search URL: {youtube_tool.extract_youtube_link()}")
    # BUG FIX: the original discarded run()'s return value entirely.
    results = youtube_tool.run(query)
    print(results)
    print("Video chunks loaded successfully.")