Spaces:
Sleeping
Sleeping
| from src.components.loaders.websiteCrawler import WebsiteCrawler | |
| from src.components.loaders.youtubeLoader import YoutubeTranscriptLoader | |
| from src.components.loaders.pdfLoader import PdfLoader | |
| from src.components.rag.RAG import Chain | |
| from dotenv import load_dotenv | |
| load_dotenv("secrets.env") | |
class Pipeline:
    """Funnel text from several kinds of sources into the RAG chain.

    Each public method obtains text (either directly from the caller or from
    one of the loader objects created in ``__init__``) and forwards it to
    ``self.ragChain.returnChain``; the value of that call is returned as-is.
    """

    def __init__(self):
        """Instantiate the three loaders and the RAG chain used by every method."""
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def _chainFromText(self, content):
        """Pass ``content`` to the RAG chain as ``text`` and return the result."""
        return self.ragChain.returnChain(text=content)

    def plainText(self, text: str):
        """
        Build a chain straight from caller-supplied text.

        Args:
            text (str): Text handed to the RAG chain unchanged.

        Returns:
            Whatever ``ragChain.returnChain`` produces for ``text``.
        """
        return self._chainFromText(text)

    def searchablePdf(self, path: str):
        """
        Build a chain from a PDF read via ``PdfLoader.searchablePdf``.

        Args:
            path (str): Location of the PDF, forwarded as ``pdfPath``.

        Returns:
            Whatever ``ragChain.returnChain`` produces for the loader output.
        """
        return self._chainFromText(self.pdfLoader.searchablePdf(pdfPath=path))

    def scannablePdf(self, path: str):
        """
        Build a chain from a PDF read via ``PdfLoader.scannablePdf``.

        NOTE(review): presumably the scanned/image-based counterpart of
        ``searchablePdf`` -- confirm against ``PdfLoader``.

        Args:
            path (str): Location of the PDF, forwarded as ``pdfPath``.

        Returns:
            Whatever ``ragChain.returnChain`` produces for the loader output.
        """
        return self._chainFromText(self.pdfLoader.scannablePdf(pdfPath=path))

    def webCrawl(self, urls: list[str]):
        """
        Build a chain from the output of ``WebsiteCrawler.extractTextFromUrlList``.

        Args:
            urls (list[str]): URLs forwarded to the crawler.

        Returns:
            Whatever ``ragChain.returnChain`` produces for the crawler output.
        """
        return self._chainFromText(
            self.webCrawler.extractTextFromUrlList(urls=urls)
        )

    def youtubeLinks(self, urls: list[str]):
        """
        Build a chain from the output of ``YoutubeTranscriptLoader.getTranscripts``.

        Args:
            urls (list[str]): YouTube video URLs forwarded to the loader.

        Returns:
            Whatever ``ragChain.returnChain`` produces for the loader output.
        """
        return self._chainFromText(self.youtubeLoader.getTranscripts(urls=urls))