File size: 2,664 Bytes
7e24b41
 
 
 
 
 
 
 
 
 
1802405
7e24b41
 
 
 
1802405
7e24b41
1802405
 
 
 
 
 
 
 
 
 
7e24b41
 
 
1802405
 
 
 
 
 
 
 
 
 
 
7e24b41
 
 
1802405
 
 
 
 
 
 
 
 
 
 
7e24b41
 
 
1802405
 
 
 
 
 
 
 
 
 
 
7e24b41
 
 
1802405
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from src.components.loaders.websiteCrawler import WebsiteCrawler
from src.components.loaders.youtubeLoader import YoutubeTranscriptLoader
from src.components.loaders.pdfLoader import PdfLoader
from src.components.rag.RAG import Chain
from dotenv import load_dotenv

# Load environment variables from the "secrets.env" file as soon as this
# module is imported. This runs after the loader/RAG modules above have been
# imported but before any Pipeline instance is constructed.
# NOTE(review): the path is relative, so it is presumably resolved against the
# process's current working directory — confirm against how the app is run.
load_dotenv("secrets.env")

class Pipeline:
    """Route each supported input source through its loader and the RAG chain.

    Every public method obtains text (either directly from the caller or
    from one of the loaders created in ``__init__``) and hands it to
    ``self.ragChain.returnChain``, returning that call's result unchanged.
    """

    def __init__(self):
        """Create one instance of each loader and of the RAG chain."""
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def plainText(self, text: str):
        """Build a chain directly from caller-supplied text.

        Args:
            text (str): The text forwarded to the RAG chain as-is.

        Returns:
            The result of ``self.ragChain.returnChain`` for ``text``.
        """
        return self.ragChain.returnChain(text=text)

    def searchablePdf(self, path: str):
        """Build a chain from a PDF read via ``PdfLoader.searchablePdf``.

        Args:
            path (str): Location of the PDF, passed to the loader as
                ``pdfPath``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the loader's
            output.
        """
        pdfText = self.pdfLoader.searchablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=pdfText)

    def scannablePdf(self, path: str):
        """Build a chain from a PDF read via ``PdfLoader.scannablePdf``.

        Args:
            path (str): Location of the PDF, passed to the loader as
                ``pdfPath``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the loader's
            output.
        """
        pdfText = self.pdfLoader.scannablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=pdfText)

    def webCrawl(self, urls: list[str]):
        """Build a chain from the crawler's output for the given URLs.

        Args:
            urls (list[str]): URLs handed to
                ``WebsiteCrawler.extractTextFromUrlList``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the crawler's
            output.
        """
        crawledText = self.webCrawler.extractTextFromUrlList(urls=urls)
        return self.ragChain.returnChain(text=crawledText)

    def youtubeLinks(self, urls: list[str]):
        """Build a chain from the transcripts of the given YouTube URLs.

        Args:
            urls (list[str]): Video URLs handed to
                ``YoutubeTranscriptLoader.getTranscripts``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the loader's
            output.
        """
        transcripts = self.youtubeLoader.getTranscripts(urls=urls)
        return self.ragChain.returnChain(text=transcripts)