File size: 3,630 Bytes
08583a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

class YoutubeSearchTool:
    """Answer questions about a YouTube video by searching its transcript.

    The video URL is pulled out of the free-text query, the transcript is
    loaded and split into text chunks, the chunks are embedded into a FAISS
    index, and ``run`` returns the chunk(s) most similar to a question.
    """

    # Lazily built FAISS index; kept under a private name so it never
    # shadows the ``vector_store`` method on the instance.
    _store = None

    def __init__(self, query: str, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Store the query and the transcript-splitting parameters.

        Args:
            query: Free text expected to contain a
                ``https://www.youtube.com/watch?v=...`` link.
            chunk_size: Target size, in characters, of each transcript chunk.
            chunk_overlap: Number of characters shared by consecutive chunks.
        """
        # Project-local modules are resolved at construction time.
        from configs.config import Config
        from utils.generate_search_string import generate_search_string

        env = Config()
        self.generate_search_string = generate_search_string
        self.llm = env.LLM_VIDEO_MODEL_NAME
        self.embedding_model = env.EMBED_MODEL
        self.query = query
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.language = "en"
        self._store = None

    def extract_youtube_link(self, query: "str | None" = None) -> "str | None":
        """Return the first YouTube watch URL found in the query text.

        Args:
            query: Text to scan. Defaults to the query given to the
                constructor.

        Returns:
            The matched ``https://www.youtube.com/watch?v=<id>`` URL, or
            ``None`` when the text is empty or contains no such link.
        """
        import re

        text = self.query if query is None else query
        if not text:
            return None

        youtube_url_pattern = r"https?://www\.youtube\.com/watch\?v=[\w-]+"
        match = re.search(youtube_url_pattern, text)
        return match.group(0) if match else None

    def video_loader(self) -> "list[str]":
        """Load the transcript of the video named in the query as text chunks.

        Returns:
            The transcript split into overlapping text chunks (sized by
            ``chunk_size`` / ``chunk_overlap``). An empty list is returned
            when the query holds no YouTube link or the transcript is empty.
        """
        url = self.extract_youtube_link()
        if url is None:
            # Nothing to load; callers treat an empty list as "no chunks".
            return []

        from langchain_community.document_loaders import YoutubeLoader
        from langchain_community.document_loaders.youtube import TranscriptFormat
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        loader = YoutubeLoader.from_youtube_url(
            url,
            add_video_info=True,
            transcript_format=TranscriptFormat.CHUNKS,
            chunk_size_seconds=30,
            language=self.language,
        )
        documents = loader.load()

        # Join with a space so the last word of one transcript piece is not
        # glued to the first word of the next.
        combined_text = " ".join(doc.page_content for doc in documents)
        if not combined_text.strip():
            return []

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return splitter.split_text(combined_text)

    def vector_store(self):
        """Build (once) and return the FAISS index over the transcript chunks.

        Returns:
            The FAISS vector store, or ``None`` when no transcript chunks
            could be loaded. The store is cached on the instance, so later
            calls reuse it instead of reloading the video.
        """
        if self._store is not None:
            return self._store

        chunks = self.video_loader()
        if not chunks:
            return None

        from langchain_community.vectorstores import FAISS
        from langchain_openai import OpenAIEmbeddings
        from langchain.schema import Document

        docs = [
            Document(page_content=chunk, metadata={"source": self.query})
            for chunk in chunks
        ]
        self._store = FAISS.from_documents(docs, embedding=OpenAIEmbeddings())
        return self._store

    def run(self, query: str, k: int = 1):
        """Search the video's transcript for the chunks closest to ``query``.

        Args:
            query: Question or phrase to look up in the transcript index.
            k: Number of best-matching chunks to return.

        Returns:
            A list of dicts with ``content``, ``score`` and ``source`` keys,
            or a human-readable message string when there is nothing to
            search or nothing matched.
        """
        print("----- YouTube Search Tool Run ---")

        store = self.vector_store()
        if store is None:
            return "No relevant video chunks found."

        results = store.similarity_search_with_score(query, k=k)
        if not results:
            return "No relevant video chunks found in the vector store."

        return [
            {
                "content": doc.page_content,
                "score": score,
                "source": doc.metadata.get("source", "Unknown"),
            }
            for doc, score in results
        ]


if __name__ == "__main__":
    # Example usage: ask a question about the transcript of a specific video.
    query = "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Teal'c say in response to the question 'Isn't that hot?'"

    youtube_tool = YoutubeSearchTool(query=query)
    results = youtube_tool.run(query)
    # The link is extracted from the query that was given to the constructor.
    print(f"Search URL: {youtube_tool.extract_youtube_link()}")
    print("Video chunks loaded successfully.")
    print(results)