Spaces:
Sleeping
Sleeping
File size: 2,664 Bytes
7e24b41 1802405 7e24b41 1802405 7e24b41 1802405 7e24b41 1802405 7e24b41 1802405 7e24b41 1802405 7e24b41 1802405 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | from src.components.loaders.websiteCrawler import WebsiteCrawler
from src.components.loaders.youtubeLoader import YoutubeTranscriptLoader
from src.components.loaders.pdfLoader import PdfLoader
from src.components.rag.RAG import Chain
from dotenv import load_dotenv
# Load variables from the "secrets.env" file at import time, before Pipeline is defined.
# NOTE(review): presumably this supplies credentials used by the loaders / RAG chain — confirm.
load_dotenv("secrets.env")
class Pipeline:
    """Feed text from several kinds of sources into one RAG chain builder.

    Every public method obtains text (directly from the caller or from one of
    the loaders created in ``__init__``) and passes it to
    ``self.ragChain.returnChain``, returning that call's result unchanged.
    """

    def __init__(self):
        """Create the PDF, website and YouTube loaders and the RAG chain."""
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def plainText(self, text: str):
        """Build a chain from caller-supplied text.

        Args:
            text (str): The input text to process.

        Returns:
            The result of ``self.ragChain.returnChain`` for ``text``.
        """
        return self.ragChain.returnChain(text=text)

    def searchablePdf(self, path: str):
        """Build a chain from the text of a searchable PDF.

        Args:
            path (str): The path to the PDF file; forwarded to the loader
                as ``pdfPath``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the text that
            ``self.pdfLoader.searchablePdf`` produced.
        """
        extracted = self.pdfLoader.searchablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=extracted)

    def scannablePdf(self, path: str):
        """Build a chain from the text of a scannable PDF.

        Args:
            path (str): The path to the PDF file; forwarded to the loader
                as ``pdfPath``.

        Returns:
            The result of ``self.ragChain.returnChain`` for the text that
            ``self.pdfLoader.scannablePdf`` produced.
        """
        extracted = self.pdfLoader.scannablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=extracted)

    def webCrawl(self, urls: list[str]):
        """Build a chain from text extracted from a list of web pages.

        Args:
            urls (list[str]): A list of URLs to crawl.

        Returns:
            The result of ``self.ragChain.returnChain`` for the text that
            ``self.webCrawler.extractTextFromUrlList`` produced.
        """
        extracted = self.webCrawler.extractTextFromUrlList(urls=urls)
        return self.ragChain.returnChain(text=extracted)

    def youtubeLinks(self, urls: list[str]):
        """Build a chain from the transcripts of YouTube videos.

        Args:
            urls (list[str]): A list of YouTube video URLs.

        Returns:
            The result of ``self.ragChain.returnChain`` for the transcripts
            that ``self.youtubeLoader.getTranscripts`` produced.
        """
        extracted = self.youtubeLoader.getTranscripts(urls=urls)
        return self.ragChain.returnChain(text=extracted)