Spaces:
Sleeping
Sleeping
| import sys | |
| import os | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) | |
class YoutubeSearchTool:
    """Answer questions about a YouTube video by searching its transcript.

    The tool extracts the first YouTube link from ``query``, loads that
    video's transcript, splits it into text chunks, indexes the chunks in a
    FAISS vector store and returns the chunk(s) most similar to a search
    query.
    """

    def __init__(self, query: str, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Store the query and chunking settings and read model names from Config.

        Args:
            query: Free-form text expected to contain a YouTube video URL.
            chunk_size: ``chunk_size`` passed to the transcript text splitter.
            chunk_overlap: ``chunk_overlap`` passed to the transcript text
                splitter.
        """
        # Imported lazily so that importing this module does not require the
        # configuration package to be resolvable.
        from configs.config import Config
        from utils.generate_search_string import generate_search_string

        env = Config()
        self.generate_search_string = generate_search_string
        self.llm = env.LLM_VIDEO_MODEL_NAME
        # NOTE(review): stored but not passed to OpenAIEmbeddings() below --
        # confirm whether EMBED_MODEL is meant to select the embedding model.
        self.embedding_model = env.EMBED_MODEL
        self.query = query
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.language = "en"
        # FAISS index built on first use by vector_store().
        self._store = None

    def extract_youtube_link(self, query: "str | None" = None) -> "str | None":
        """Return the first YouTube video URL found in the query text.

        Args:
            query: Text to scan. Defaults to the query given to the
                constructor.

        Returns:
            The matched URL, or ``None`` when the text holds no YouTube link.
        """
        import re

        text = self.query if query is None else query
        if not text:
            return None
        # Accepts youtube.com/watch?v=<id> (with or without "www."/"m.") and
        # the short youtu.be/<id> form.
        youtube_url_pattern = (
            r"https?://(?:(?:www\.|m\.)?youtube\.com/watch\?v=|youtu\.be/)[\w-]+"
        )
        match = re.search(youtube_url_pattern, text)
        return match.group(0) if match else None

    def video_loader(self) -> "list[str]":
        """Load the linked video's transcript and split it into text chunks.

        Returns:
            Transcript chunks produced by the text splitter using
            ``chunk_size`` / ``chunk_overlap``. Empty when the query contains
            no YouTube link or the loader yields no transcript text.
        """
        url = self.extract_youtube_link()
        if url is None:
            # Nothing to load; callers treat an empty list as "no chunks".
            return []

        from langchain_community.document_loaders import YoutubeLoader
        from langchain_community.document_loaders.youtube import TranscriptFormat
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        loader = YoutubeLoader.from_youtube_url(
            url,
            add_video_info=True,
            transcript_format=TranscriptFormat.CHUNKS,
            chunk_size_seconds=30,
            language=self.language,
        )
        documents = loader.load()

        # Join with a space so the last word of one transcript piece is not
        # glued to the first word of the next.
        combined_text = " ".join(doc.page_content for doc in documents)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return splitter.split_text(combined_text)

    def vector_store(self):
        """Build (once) and return a FAISS index over the transcript chunks.

        Returns:
            The FAISS vector store, or the string
            ``"No relevant video chunks found."`` when there is nothing to
            index.
        """
        # getattr keeps this working on instances created without __init__.
        cached = getattr(self, "_store", None)
        if cached is not None:
            return cached

        chunks = self.video_loader()
        if not chunks:
            return "No relevant video chunks found."

        from langchain_community.vectorstores import FAISS
        from langchain_openai import OpenAIEmbeddings
        from langchain.schema import Document

        docs = [
            Document(page_content=chunk, metadata={"source": self.query})
            for chunk in chunks
        ]
        # Kept under a private name: assigning to ``self.vector_store`` would
        # shadow this method on the instance.
        self._store = FAISS.from_documents(docs, embedding=OpenAIEmbeddings())
        return self._store

    def run(self, query: str, k: int = 1):
        """Search the video transcript for the chunks most similar to ``query``.

        Args:
            query: Text to search the transcript index for.
            k: Number of matches to request from the vector store.

        Returns:
            A list of ``{"content", "score", "source"}`` dicts, one per match,
            or a message string when no chunks could be indexed or nothing
            matched.
        """
        print("----- YouTube Search Tool Run ---")
        store = self.vector_store()
        if isinstance(store, str):
            # vector_store() reports "nothing to index" as a message string.
            return store

        results = store.similarity_search_with_score(query, k=k)
        if not results:
            return "No relevant video chunks found in the vector store."
        return [
            {
                "content": doc.page_content,
                "score": score,
                "source": doc.metadata.get("source", "Unknown"),
            }
            for doc, score in results
        ]
if __name__ == "__main__":
    # Example usage: ask a question about a video whose URL is embedded in
    # the query text.
    query = (
        "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. "
        "What does Teal'c say in response to the question 'Isn't that hot?'"
    )
    youtube_tool = YoutubeSearchTool(query=query)
    results = youtube_tool.run(query)
    # extract_youtube_link() reads the query stored on the instance, so it is
    # called without arguments.
    print(f"Search URL: {youtube_tool.extract_youtube_link()}")
    if isinstance(results, str):
        # run() reports "nothing found" as a plain message string.
        print(results)
    else:
        for hit in results:
            print(hit)
        print("Video chunks loaded successfully.")