Spaces:

techconsptr
/

ConversAI

Sleeping

File size: 2,664 Bytes

from src.components.loaders.websiteCrawler import WebsiteCrawler
from src.components.loaders.youtubeLoader import YoutubeTranscriptLoader
from src.components.loaders.pdfLoader import PdfLoader
from src.components.rag.RAG import Chain
from dotenv import load_dotenv

# Module-level side effect: runs once when this module is imported, asking
# python-dotenv to read the "secrets.env" file before Pipeline is defined.
# NOTE(review): presumably the loaders / RAG chain read credentials from the
# environment populated here — confirm against those modules.
load_dotenv("secrets.env")

class Pipeline:
    """Single entry point that feeds text from several sources into the RAG chain.

    Every public method follows the same two steps: obtain text (directly
    from the caller, or through one of the loaders held on the instance) and
    pass it to ``self.ragChain.returnChain(text=...)``.
    """

    def __init__(self):
        """Create the PDF, website and YouTube loaders plus the RAG chain object."""
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def plainText(self, text: str):
        """Build a chain straight from caller-supplied text.

        Args:
            text (str): The raw text to hand to the RAG chain.

        Returns:
            The result of ``self.ragChain.returnChain`` for ``text``.
        """
        return self.ragChain.returnChain(text=text)

    def searchablePdf(self, path: str):
        """Build a chain from a PDF read via ``PdfLoader.searchablePdf``.

        Args:
            path (str): Location of the PDF file; forwarded as ``pdfPath``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the loader output.
        """
        pdfText = self.pdfLoader.searchablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=pdfText)

    def scannablePdf(self, path: str):
        """Build a chain from a PDF read via ``PdfLoader.scannablePdf``.

        Args:
            path (str): Location of the PDF file; forwarded as ``pdfPath``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the loader output.
        """
        scannedText = self.pdfLoader.scannablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=scannedText)

    def webCrawl(self, urls: list[str]):
        """Build a chain from the text the website crawler extracts.

        Args:
            urls (list[str]): URLs forwarded to
                ``WebsiteCrawler.extractTextFromUrlList``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the crawler output.
        """
        crawledText = self.webCrawler.extractTextFromUrlList(urls=urls)
        return self.ragChain.returnChain(text=crawledText)

    def youtubeLinks(self, urls: list[str]):
        """Build a chain from the transcripts of the given YouTube videos.

        Args:
            urls (list[str]): Video URLs forwarded to
                ``YoutubeTranscriptLoader.getTranscripts``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the loader output.
        """
        transcriptText = self.youtubeLoader.getTranscripts(urls=urls)
        return self.ragChain.returnChain(text=transcriptText)