Spaces:

techconsptr
/

ConversAI

Sleeping

App Files Community

techconsptrs committed on Oct 12, 2024

Commit

1802405

1 Parent(s): d14c53b

UPDATE: code update

Browse files

Files changed (12) hide show

app.py +163 -228
requirements.txt +0 -134
setup.py +8 -6
src/components/loaders/pdfLoader.py +51 -13
src/components/loaders/websiteCrawler.py +55 -20
src/components/loaders/youtubeLoader.py +15 -8
src/components/rag/RAG.py +33 -20
src/components/vectors/vectorstore.py +25 -15
src/pipelines/completePipeline.py +57 -11
src/utils/exceptions.py +16 -0
src/utils/functions.py +27 -0
src/utils/logging.py +5 -1

app.py CHANGED Viewed

@@ -1,314 +1,249 @@
 from src.pipelines.completePipeline import Pipeline
 import gradio as gr
 import spaces
-import os
-# os.system("apt-get update -y")
-# os.system("apt-get upgrade -y")
-# os.system("apt install poppler-utils -y")
-chain = None
-pipeline = Pipeline()
 @spaces.GPU
-def getTextResponse(text: str, inputQuery: str):
     global chain
     if chain is None:
-        chain = pipeline.plainText(text = text)
-    else:
-        pass
-    response = chain.invoke(
-        {
-            "question": inputQuery
-        }
-    )
     return response
 @spaces.GPU
-def getSearchablePdfResponse(path: str, inputQuery: str):
     global chain
     if chain is None:
-        chain = pipeline.searchablePdf(path = path)
-    else:
-        pass
-    response = chain.invoke(
-        {
-            "question": inputQuery
-        }
-    )
     return response
 @spaces.GPU
-def getScannablePdfResponse(path: str, inputQuery: str):
     global chain
     if chain is None:
-        chain = pipeline.scannablePdf(path = path)
-    else:
-        pass
-    response = chain.invoke(
-        {
-            "question": inputQuery
-        }
-    )
     return response
-def clearFunction():
     global chain
     chain = None
 with gr.Blocks() as textInterface:
     with gr.Row():
         inputText = gr.Textbox(
-            label = "Input Text",
-            placeholder = "Enter you text here"
         )
     with gr.Row():
         question = gr.Textbox(
-            label = "Question",
-            placeholder = "Enter your question here"
         )
         answer = gr.Textbox(
-            label = "Response",
-            interactive = False
         )
     with gr.Row():
-        submitButton = gr.Button(
-            value = "Submit",
-            variant = "primary"
-        )
         clearButton = gr.ClearButton(
-            components = [inputText, question, answer],
-            value = "Clear",
-            variant = "secondary"
         )
-    submitButton.click(
-        fn = getTextResponse,
-        inputs = [inputText, question],
-        outputs = [answer]
-    )
-    clearButton.click(
-        fn = clearFunction
-    )
 with gr.Blocks() as searchablePdf:
     with gr.Row():
         inputFile = gr.File(
-            file_types = [".pdf"],
-            file_count = "single",
-            label = "Select PDF"
         )
     with gr.Row():
-        question = gr.Textbox(
-            label = "Question",
-            placeholder = "Enter your question here"
-        )
-        answer = gr.Textbox(
-            label = "Response",
-            interactive = False
-        )
     with gr.Row():
-        submitButton = gr.Button(
-            value = "Submit",
-            variant = "primary"
-        )
         clearButton = gr.ClearButton(
-            components = [inputFile, question, answer],
-            value = "Clear",
-            variant = "secondary"
         )
-    submitButton.click(
-        fn = getSearchablePdfResponse,
-        inputs = [inputFile, question],
-        outputs = [answer]
-    )
-    clearButton.click(
-        fn = clearFunction
-    )
 with gr.Blocks() as scannablePdf:
     with gr.Row():
-        inputFile = gr.File(
-            file_types = [".pdf"],
-            file_count = "single",
-            label = "Select PDF"
-        )
     with gr.Row():
-        question = gr.Textbox(
-            label = "Question",
-            placeholder = "Enter your question here"
-        )
-        answer = gr.Textbox(
-            label = "Response",
-            interactive = False
-        )
     with gr.Row():
-        submitButton = gr.Button(
-            value = "Submit",
-            variant = "primary"
-        )
         clearButton = gr.ClearButton(
-            components = [inputFile, question, answer],
-            value = "Clear",
-            variant = "secondary"
-        )
-    submitButton.click(
-        fn = getScannablePdfResponse,
-        inputs = [inputFile, question],
-        outputs = [answer]
-    )
-    clearButton.click(
-        fn = clearFunction
-    )
-def getLinksButtonFn(baseUrl: str):
-    links = pipeline.webCrawler.getLinks(url = baseUrl)
-    checkboxes = gr.CheckboxGroup(
-        choices = links,
-        label = "Fetched Links",
-        visible = True
-    )
-    row2 = gr.Row(visible = True)
-    row3 = gr.Row(visible = True)
-    return (
-        checkboxes,
-        row2,
-        row3
-    )
-@spaces.GPU
-def getWebsiteResponse(links: list[str], inputQuery: str):
     global chain
     if chain is None:
-        print(links)
-        chain = pipeline.webCrawl(urls = links)
-    else:
-        pass
-    response = chain.invoke(
-        {
-            "question": inputQuery
-        }
-    )
     return response
-def clearWebsiteResponse():
     global chain
-    chain = None
-    checkboxes = gr.CheckboxGroup(
-        choices = [],
-        label = "Fetched Links",
-        visible = False
-    )
     return checkboxes
 with gr.Blocks() as websiteCrawler:
     with gr.Row():
         inputUrl = gr.Textbox(
-            label = "Base URL",
-            placeholder = "Enter the Base URL to fetch other links",
-            scale = 3
-        )
-        getLinksButton = gr.Button(
-            value = "Get Links",
-            variant = "primary",
-            scale = 1
-        )
-    checkboxes = gr.CheckboxGroup(
-        choices = [],
-        label = "Fetched Links",
-    )
-    with gr.Row(visible = False) as row2:
-        question = gr.Textbox(
-            label = "Question",
-            placeholder = "Enter your question here"
-        )
-        answer = gr.Textbox(
-            label = "Response",
-            interactive = False
-        )
-    with gr.Row(visible = False) as row3:
-        submitButton = gr.Button(
-            value = "Submit",
-            variant = "primary"
-        )
         clearButton = gr.ClearButton(
-            components = [question, answer],
-            value = "Clear",
-            variant = "secondary"
         )
-    getLinksButton.click(
-        fn = getLinksButtonFn,
-        inputs = [inputUrl],
-        outputs = [checkboxes, row2, row3]
-    )
-    submitButton.click(
-        fn = getWebsiteResponse,
-        inputs = [checkboxes, question],
-        outputs = [answer]
-    )
-    clearButton.click(
-        fn = clearWebsiteResponse,
-        inputs = None,
-        outputs = [checkboxes]
-    )
 @spaces.GPU
-def getYoutubeResponse(links: str, inputQuery: str):
     global chain
-    links = [link.strip() for link in links.split(",")]
     if chain is None:
-        chain = pipeline.youtubeLinks(urls = links)
-    else:
-        pass
-    response = chain.invoke(
-        {
-            "question": inputQuery
-        }
-    )
     return response
 with gr.Blocks() as youtubeInterface:
     with gr.Row():
         inputLinks = gr.Textbox(
-            label = "Youtube Links",
-            placeholder = 'Enter comma(,)-separated youtube video links'
         )
     with gr.Row():
-        question = gr.Textbox(
-            label = "Question",
-            placeholder = "Enter your question here"
-        )
-        answer = gr.Textbox(
-            label = "Response",
-            interactive = False
-        )
     with gr.Row():
-        submitButton = gr.Button(
-            value = "Submit",
-            variant = "primary"
-        )
         clearButton = gr.ClearButton(
-            components = [inputLinks, question, answer],
-            value = "Clear",
-            variant = "secondary"
         )
-    submitButton.click(
-        fn = getYoutubeResponse,
-        inputs = [inputLinks, question],
-        outputs = [answer]
-    )
-    clearButton.click(
-        fn = clearFunction
-    )
 application = gr.TabbedInterface(
     [textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
     ["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
 )
 application.launch()

+# Import necessary libraries and modules
 from src.pipelines.completePipeline import Pipeline
 import gradio as gr
 import spaces
+# Initialize global variables
+chain = None  # Holds the current processing chain
+pipeline = Pipeline()  # Instantiate the processing pipeline
 @spaces.GPU
+def getTextResponse(text: str, inputQuery: str) -> str:
+    """
+    Generate a response based on the input text and query.
+    Args:
+        text (str): The input text to process.
+        inputQuery (str): The question to be answered.
+    Returns:
+        str: The response generated from the input text.
+    """
     global chain
     if chain is None:
+        chain = pipeline.plainText(text=text)  # Create a new processing chain for plain text
+    response = chain.invoke({"question": inputQuery})  # Process the query
     return response
 @spaces.GPU
+def getSearchablePdfResponse(path: str, inputQuery: str) -> str:
+    """
+    Generate a response based on a searchable PDF and query.
+    Args:
+        path (str): Path to the searchable PDF.
+        inputQuery (str): The question to be answered.
+    Returns:
+        str: The response generated from the searchable PDF.
+    """
     global chain
     if chain is None:
+        chain = pipeline.searchablePdf(path=path)  # Create a new processing chain for the PDF
+    response = chain.invoke({"question": inputQuery})
     return response
 @spaces.GPU
+def getScannablePdfResponse(path: str, inputQuery: str) -> str:
+    """
+    Generate a response based on a scannable PDF and query.
+    Args:
+        path (str): Path to the scannable PDF.
+        inputQuery (str): The question to be answered.
+    Returns:
+        str: The response generated from the scannable PDF.
+    """
     global chain
     if chain is None:
+        chain = pipeline.scannablePdf(path=path)  # Create a new processing chain for the scannable PDF
+    response = chain.invoke({"question": inputQuery})
     return response
+def clearFunction() -> None:
+    """Reset the processing chain to prepare for new queries."""
     global chain
     chain = None
+# User interface for text input
 with gr.Blocks() as textInterface:
     with gr.Row():
         inputText = gr.Textbox(
+            label="Input Text",
+            placeholder="Enter your text here"
         )
     with gr.Row():
         question = gr.Textbox(
+            label="Question",
+            placeholder="Enter your question here"
         )
         answer = gr.Textbox(
+            label="Response",
+            interactive=False  # Make the response field read-only
         )
     with gr.Row():
+        submitButton = gr.Button(value="Submit", variant="primary")
         clearButton = gr.ClearButton(
+            components=[inputText, question, answer],
+            value="Clear",
+            variant="secondary"
         )
+    # Define actions for buttons
+    submitButton.click(fn=getTextResponse, inputs=[inputText, question], outputs=[answer])
+    clearButton.click(fn=clearFunction)
+# User interface for searchable PDF input
 with gr.Blocks() as searchablePdf:
     with gr.Row():
         inputFile = gr.File(
+            file_types=[".pdf"],  # Restrict file types to PDFs
+            file_count="single",   # Allow only one PDF file selection
+            label="Select PDF"
         )
     with gr.Row():
+        question = gr.Textbox(label="Question", placeholder="Enter your question here")
+        answer = gr.Textbox(label="Response", interactive=False)
     with gr.Row():
+        submitButton = gr.Button(value="Submit", variant="primary")
         clearButton = gr.ClearButton(
+            components=[inputFile, question, answer],
+            value="Clear",
+            variant="secondary"
         )
+    # Define actions for buttons
+    submitButton.click(fn=getSearchablePdfResponse, inputs=[inputFile, question], outputs=[answer])
+    clearButton.click(fn=clearFunction)
+# User interface for scannable PDF input
 with gr.Blocks() as scannablePdf:
     with gr.Row():
+        inputFile = gr.File(file_types=[".pdf"], file_count="single", label="Select PDF")
     with gr.Row():
+        question = gr.Textbox(label="Question", placeholder="Enter your question here")
+        answer = gr.Textbox(label="Response", interactive=False)
     with gr.Row():
+        submitButton = gr.Button(value="Submit", variant="primary")
         clearButton = gr.ClearButton(
+            components=[inputFile, question, answer],
+            value="Clear",
+            variant="secondary"
+        )
+    # Define actions for buttons
+    submitButton.click(fn=getScannablePdfResponse, inputs=[inputFile, question], outputs=[answer])
+    clearButton.click(fn=clearFunction)
+def getLinksButtonFn(baseUrl: str) -> tuple:
+    """
+    Fetch links from the specified base URL.
+    Args:
+        baseUrl (str): The base URL from which to fetch links.
+    Returns:
+        tuple: A tuple containing a CheckboxGroup of fetched links and two rows for the UI.
+    """
+    links = pipeline.webCrawler.getLinks(url=baseUrl)  # Fetch links using the web crawler
+    checkboxes = gr.CheckboxGroup(choices=links, label="Fetched Links", visible=True)
+    row2 = gr.Row(visible=True)
+    row3 = gr.Row(visible=True)
+    return checkboxes, row2, row3
+@spaces.GPU
+def getWebsiteResponse(links: list[str], inputQuery: str) -> str:
+    """
+    Generate a response based on fetched website links and a query.
+    Args:
+        links (list[str]): List of links to process.
+        inputQuery (str): The question to be answered.
+    Returns:
+        str: The response generated from the website links.
+    """
     global chain
     if chain is None:
+        chain = pipeline.webCrawl(urls=links)  # Create a new processing chain for web crawling
+    response = chain.invoke({"question": inputQuery})
     return response
+def clearWebsiteResponse() -> gr.CheckboxGroup:
+    """Clear the website response and reset the checkboxes."""
     global chain
+    chain = None  # Reset the chain
+    checkboxes = gr.CheckboxGroup(choices=[], label="Fetched Links", visible=False)
     return checkboxes
+# User interface for website crawling
 with gr.Blocks() as websiteCrawler:
     with gr.Row():
         inputUrl = gr.Textbox(
+            label="Base URL",
+            placeholder="Enter the Base URL to fetch other links",
+            scale=3
+        )
+        getLinksButton = gr.Button(value="Get Links", variant="primary", scale=1)
+    checkboxes = gr.CheckboxGroup(choices=[], label="Fetched Links")
+    with gr.Row(visible=False) as row2:
+        question = gr.Textbox(label="Question", placeholder="Enter your question here")
+        answer = gr.Textbox(label="Response", interactive=False)
+    with gr.Row(visible=False) as row3:
+        submitButton = gr.Button(value="Submit", variant="primary")
         clearButton = gr.ClearButton(
+            components=[question, answer],
+            value="Clear",
+            variant="secondary"
         )
+    # Define actions for buttons
+    getLinksButton.click(fn=getLinksButtonFn, inputs=[inputUrl], outputs=[checkboxes, row2, row3])
+    submitButton.click(fn=getWebsiteResponse, inputs=[checkboxes, question], outputs=[answer])
+    clearButton.click(fn=clearWebsiteResponse, inputs=None, outputs=[checkboxes])
 @spaces.GPU
+def getYoutubeResponse(links: str, inputQuery: str) -> str:
+    """
+    Generate a response based on YouTube video links and a query.
+    Args:
+        links (str): Comma-separated YouTube video links.
+        inputQuery (str): The question to be answered.
+    Returns:
+        str: The response generated from the YouTube videos.
+    """
     global chain
+    links = [link.strip() for link in links.split(",")]  # Split and clean the links
     if chain is None:
+        chain = pipeline.youtubeLinks(urls=links)  # Create a new processing chain for YouTube links
+    response = chain.invoke({"question": inputQuery})
     return response
+# User interface for YouTube links
 with gr.Blocks() as youtubeInterface:
     with gr.Row():
         inputLinks = gr.Textbox(
+            label="Youtube Links",
+            placeholder='Enter comma(,)-separated youtube video links'
         )
     with gr.Row():
+        question = gr.Textbox(label="Question", placeholder="Enter your question here")
+        answer = gr.Textbox(label="Response", interactive=False)
     with gr.Row():
+        submitButton = gr.Button(value="Submit", variant="primary")
         clearButton = gr.ClearButton(
+            components=[inputLinks, question, answer],
+            value="Clear",
+            variant="secondary"
         )
+    # Define actions for buttons
+    submitButton.click(fn=getYoutubeResponse, inputs=[inputLinks, question], outputs=[answer])
+    clearButton.click(fn=clearFunction)
+# Create a tabbed interface for the different functionalities
 application = gr.TabbedInterface(
     [textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
     ["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
 )
+# Launch the Gradio application
 application.launch()

requirements.txt CHANGED Viewed

@@ -1,151 +1,17 @@
-aiofiles==23.2.1
-aiohappyeyeballs==2.4.0
-aiohttp==3.10.6
-aiosignal==1.3.1
-annotated-types==0.7.0
-anyio==4.6.0
-asttokens==2.4.1
-async-timeout==4.0.3
-attrs==24.2.0
-Authlib==1.3.2
 beautifulsoup4==4.12.3
-certifi==2024.8.30
-cffi==1.17.1
-charset-normalizer==3.3.2
-click==8.0.4
-cryptography==43.0.1
-dataclasses-json==0.6.7
-datasets==3.0.0
-decorator==5.1.1
-dill==0.3.8
-distro==1.9.0
 easyocr==1.7.2
-exceptiongroup==1.2.2
-executing==2.1.0
-fastapi==0.115.0
-ffmpy==0.4.0
-filelock==3.16.1
-frozenlist==1.4.1
-fsspec==2024.6.1
 gradio==5.0.2
-gradio_client==1.4.0
-greenlet==3.1.1
-groq==0.11.0
-h11==0.14.0
-hf_transfer==0.1.8
-httpcore==1.0.6
-httpx==0.27.2
-huggingface-hub==0.25.1
-idna==3.10
-imageio==2.35.1
-ipython==8.28.0
-itsdangerous==2.2.0
-jedi==0.19.1
-Jinja2==3.1.4
-joblib==1.4.2
-jsonpatch==1.33
-jsonpointer==3.0.0
 langchain==0.3.3
 langchain-community==0.3.2
 langchain-core==0.3.10
 langchain-groq==0.2.0
 langchain-huggingface==0.1.0
 langchain-text-splitters==0.3.0
-langsmith==0.1.134
-lazy_loader==0.4
-markdown-it-py==3.0.0
-MarkupSafe==2.1.5
-marshmallow==3.22.0
-matplotlib-inline==0.1.7
-mdurl==0.1.2
-mpmath==1.3.0
-multidict==6.1.0
-multiprocess==0.70.16
-mypy-extensions==1.0.0
-networkx==3.3
-ninja==1.11.1.1
 numpy==1.26.4
-nvidia-cublas-cu12==12.1.3.1
-nvidia-cuda-cupti-cu12==12.1.105
-nvidia-cuda-nvrtc-cu12==12.1.105
-nvidia-cuda-runtime-cu12==12.1.105
-nvidia-cudnn-cu12==9.1.0.70
-nvidia-cufft-cu12==11.0.2.54
-nvidia-curand-cu12==10.3.2.106
-nvidia-cusolver-cu12==11.4.5.107
-nvidia-cusparse-cu12==12.1.0.106
-nvidia-nccl-cu12==2.20.5
-nvidia-nvjitlink-cu12==12.6.68
-nvidia-nvtx-cu12==12.1.105
-opencv-python-headless==4.10.0.84
-orjson==3.10.7
-packaging==24.1
-pandas==2.2.3
-parso==0.8.4
 pdf2image==1.17.0
-pexpect==4.9.0
-pillow==10.4.0
-prompt_toolkit==3.0.48
-protobuf==3.20.3
-psutil==5.9.8
-ptyprocess==0.7.0
-pure_eval==0.2.3
-pyarrow==17.0.0
-pyclipper==1.3.0.post5
-pycparser==2.22
-pydantic==2.9.2
-pydantic-settings==2.5.2
-pydantic_core==2.23.4
-pydub==0.25.1
-Pygments==2.18.0
 PyMuPDF==1.24.11
-python-bidi==0.6.0
-python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
-python-multipart==0.0.12
-pytz==2024.2
-PyYAML==6.0.2
-regex==2024.9.11
 requests==2.32.3
-requests-toolbelt==1.0.0
-rich==13.9.2
-ruff==0.6.9
-safetensors==0.4.5
-scikit-image==0.24.0
-scikit-learn==1.5.2
-scipy==1.14.1
-semantic-version==2.10.0
-sentence-transformers==3.2.0
-shapely==2.0.6
-shellingham==1.5.4
-six==1.16.0
-sniffio==1.3.1
-soupsieve==2.6
-spaces==0.30.3
-SQLAlchemy==2.0.35
-stack-data==0.6.3
-starlette==0.38.6
-sympy==1.13.3
-tenacity==8.5.0
-threadpoolctl==3.5.0
-tifffile==2024.9.20
-tokenizers==0.20.1
-tomlkit==0.12.0
-torch==2.4.1
-torchvision==0.19.1
-tqdm==4.66.5
-traitlets==5.14.3
-transformers==4.45.2
-triton==3.0.0
-typer==0.12.5
-typing-inspect==0.9.0
-typing_extensions==4.12.2
-tzdata==2024.2
 urllib3==2.2.3
-uvicorn==0.31.1
-wcwidth==0.2.13
-websockets==12.0
-xxhash==3.5.0
-yarl==1.12.1
 youtube-transcript-api==0.6.2
 -e .

 beautifulsoup4==4.12.3
 easyocr==1.7.2
 gradio==5.0.2
 langchain==0.3.3
 langchain-community==0.3.2
 langchain-core==0.3.10
 langchain-groq==0.2.0
 langchain-huggingface==0.1.0
 langchain-text-splitters==0.3.0
 numpy==1.26.4
 pdf2image==1.17.0
 PyMuPDF==1.24.11
 python-dotenv==1.0.1
 requests==2.32.3
 urllib3==2.2.3
 youtube-transcript-api==0.6.2
 -e .

setup.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from setuptools import setup, find_packages
 HYPEN_E_DOT = "-e ."
 def getRequirements(requirementsPath: str) -> list[str]:
     with open(requirementsPath) as file:
         requirements = file.read().split("\n")
@@ -8,10 +9,11 @@ def getRequirements(requirementsPath: str) -> list[str]:
     return requirements
 setup(
-    name = "ConversAI",
-    author = "Rauhan Ahmed Siddiqui",
-    author_email = "rauhaan.siddiqui@gmail.com",
-    version = "0.1",
-    packages = find_packages(),
-    install_requires = getRequirements(requirementsPath = "requirements.txt")
 )

 from setuptools import setup, find_packages
 HYPEN_E_DOT = "-e ."
 def getRequirements(requirementsPath: str) -> list[str]:
     with open(requirementsPath) as file:
         requirements = file.read().split("\n")
     return requirements
 setup(
+    name="ConversAI",
+    author="Rauhan Ahmed Siddiqui",
+    author_email="rauhaan.siddiqui@gmail.com",
+    version="0.1",
+    packages=find_packages(),
+    install_requires=getRequirements(requirementsPath="requirements.txt"),
+    description="ConversAI: An innovative conversational AI framework for intelligent text extraction and querying.",
 )

src/components/loaders/pdfLoader.py CHANGED Viewed

@@ -7,17 +7,37 @@ import numpy as np
 import pymupdf
 import easyocr
 class PdfLoader:
     def __init__(self) -> None:
-        self.config = getConfig(path = "config.ini")
-        self.reader = easyocr.Reader(['en'], gpu = self.config.getboolean("EASYOCR", "gpu"))
-    def extractTextFromPage(self, page):
-        return cleanText(text = page.get_text())
-    def searchablePdf(self, pdfPath: str):
-        try:
             logger.info("Text Extraction Started from Searchable PDF")
             doc = pymupdf.open(pdfPath)
             pages = [doc.load_page(i) for i in range(len(doc))]
@@ -27,12 +47,30 @@ class PdfLoader:
             return "\n".join(texts)
         except Exception as e:
             logger.error(CustomException(e))
-    def getText(self, image):
         text = "\n".join([text[1] for text in self.reader.readtext(np.array(image), paragraph=True)])
-        return cleanText(text = text)
-    def scannablePdf(self, pdfPath: str):
         try:
             logger.info("Text Extraction Started from Scannable PDF")
             allImages = convert_from_path(pdfPath)

 import pymupdf
 import easyocr
 class PdfLoader:
     def __init__(self) -> None:
+        """
+        Initialize the PdfLoader with configuration settings and an EasyOCR reader.
+        """
+        self.config = getConfig(path="config.ini")
+        self.reader = easyocr.Reader(['en'], gpu=self.config.getboolean("EASYOCR", "gpu"))
+    def extractTextFromPage(self, page) -> str:
+        """
+        Extract and clean text from a PDF page.
+        Args:
+            page: A PyMuPDF page object.
+        Returns:
+            str: Cleaned text extracted from the page.
+        """
+        return cleanText(text=page.get_text())
+    def searchablePdf(self, pdfPath: str) -> str:
+        """
+        Extract text from a searchable PDF.
+        Args:
+            pdfPath (str): The file path to the searchable PDF.
+        Returns:
+            str: All extracted text from the PDF.
+        """
+        try:
             logger.info("Text Extraction Started from Searchable PDF")
             doc = pymupdf.open(pdfPath)
             pages = [doc.load_page(i) for i in range(len(doc))]
             return "\n".join(texts)
         except Exception as e:
             logger.error(CustomException(e))
+    def getText(self, image) -> str:
+        """
+        Extract and clean text from an image using EasyOCR.
+        Args:
+            image: An image (numpy array).
+        Returns:
+            str: Cleaned text extracted from the image.
+        """
         text = "\n".join([text[1] for text in self.reader.readtext(np.array(image), paragraph=True)])
+        return cleanText(text=text)
+    def scannablePdf(self, pdfPath: str) -> str:
+        """
+        Extract text from a scannable PDF using OCR.
+        Args:
+            pdfPath (str): The file path to the scannable PDF.
+        Returns:
+            str: All extracted text from the PDF.
+        """
         try:
             logger.info("Text Extraction Started from Scannable PDF")
             allImages = convert_from_path(pdfPath)

src/components/loaders/websiteCrawler.py CHANGED Viewed

@@ -1,65 +1,100 @@
 from concurrent.futures import ThreadPoolExecutor
 from src.utils.exceptions import CustomException
 from urllib.parse import urlparse, urljoin
-from src.utils.functions import getConfig
-from src.utils.functions import cleanText
 from src.utils.logging import logger
 from bs4 import BeautifulSoup
 import time
 import requests
 class WebsiteCrawler:
     def __init__(self):
-        self.config = getConfig(path = "config.ini")
-    def getLinksFromPage(self, url: str):
         response = requests.get(url)
         soup = BeautifulSoup(response.content, "html.parser")
         anchors = soup.find_all("a")
         links = []
         for anchor in anchors:
             if "href" in anchor.attrs:
                 if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
                     links.append(anchor.attrs["href"])
                 elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                     links.append(urljoin(url + "/", anchor.attrs["href"]))
-                else:
-                    pass
                 links = [link for link in links if "#" not in link]
                 links = list(set(links))
-            else:
-                continue
         return links
-    def getLinks(self, url: str):
         try:
-            logger.info("fetching links from url")
             start = time.time()
             links = self.getLinksFromPage(url)
             uniqueLinks = set()
             for link in links:
                 now = time.time()
                 if now - start > self.config.getint("WEBCRAWLER", "timeout"):
                     break
-                else:
-                    uniqueLinks = uniqueLinks.union(set(self.getLinksFromPage(link)))
-            return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
         except Exception as e:
             logger.error(CustomException(e))
-    def extractTextFromUrl(self, url: str):
         response = requests.get(url)
         response.raise_for_status()
         html = response.text
         soup = BeautifulSoup(html, 'html.parser')
-        return cleanText(text = soup.get_text(separator=' ', strip=True))
-    def extractTextFromUrlList(self, urls: list[str]):
         try:
-            logger.info("extracting text from urls")
             with ThreadPoolExecutor() as executor:
                 texts = list(executor.map(self.extractTextFromUrl, urls))
-            return "\n".join(texts)
         except Exception as e:
             logger.error(CustomException(e))

 from concurrent.futures import ThreadPoolExecutor
 from src.utils.exceptions import CustomException
 from urllib.parse import urlparse, urljoin
+from src.utils.functions import getConfig, cleanText
 from src.utils.logging import logger
 from bs4 import BeautifulSoup
 import time
 import requests
 class WebsiteCrawler:
     def __init__(self):
+        """Initialize the WebsiteCrawler with configuration settings."""
+        self.config = getConfig(path="config.ini")
+    def getLinksFromPage(self, url: str) -> list[str]:
+        """
+        Extract all valid links from a given webpage.
+        Args:
+            url (str): The URL of the webpage to extract links from.
+        Returns:
+            list[str]: A list of extracted links from the page.
+        """
         response = requests.get(url)
         soup = BeautifulSoup(response.content, "html.parser")
         anchors = soup.find_all("a")
         links = []
         for anchor in anchors:
             if "href" in anchor.attrs:
                 if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
                     links.append(anchor.attrs["href"])
                 elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                     links.append(urljoin(url + "/", anchor.attrs["href"]))
                 links = [link for link in links if "#" not in link]
                 links = list(set(links))
         return links
+    def getLinks(self, url: str) -> list[str]:
+        """
+        Fetch and return all unique links found from the given URL.
+        Args:
+            url (str): The starting URL to fetch links from.
+        Returns:
+            list[str]: A list of unique links found.
+        """
         try:
+            logger.info("Fetching links from URL")
             start = time.time()
             links = self.getLinksFromPage(url)
             uniqueLinks = set()
             for link in links:
                 now = time.time()
                 if now - start > self.config.getint("WEBCRAWLER", "timeout"):
                     break
+                uniqueLinks = uniqueLinks.union(set(self.getLinksFromPage(link)))
+            return list(set([x[:-1] if x[-1] == "/" else x for x in uniqueLinks]))
         except Exception as e:
             logger.error(CustomException(e))
+    def extractTextFromUrl(self, url: str) -> str:
+        """
+        Extract and clean text content from a given URL.
+        Args:
+            url (str): The URL of the webpage to extract text from.
+        Returns:
+            str: Cleaned text extracted from the webpage.
+        """
         response = requests.get(url)
         response.raise_for_status()
         html = response.text
         soup = BeautifulSoup(html, 'html.parser')
+        return cleanText(text=soup.get_text(separator=' ', strip=True))
+    def extractTextFromUrlList(self, urls: list[str]) -> str:
+        """
+        Extract text from a list of URLs concurrently.
+        Args:
+            urls (list[str]): A list of URLs to extract text from.
+        Returns:
+            str: All extracted text combined into a single string.
+        """
         try:
+            logger.info("Extracting text from URLs")
             with ThreadPoolExecutor() as executor:
                 texts = list(executor.map(self.extractTextFromUrl, urls))
+            return "\n".join(texts)
         except Exception as e:
             logger.error(CustomException(e))

src/components/loaders/youtubeLoader.py CHANGED Viewed

@@ -3,22 +3,29 @@ from src.utils.exceptions import CustomException
 from src.utils.functions import cleanText
 from src.utils.logging import logger
 class YoutubeTranscriptLoader:
     def __init__(self):
         pass
-    def getTranscripts(self, urls: str):
         texts = []
         for url in set(urls):
             try:
-                loader = YoutubeLoader.from_youtube_url(
-                    url, add_video_info=False
-                )
                 doc = " ".join([x.page_content for x in loader.load()])
-                texts.append(cleanText(text = doc))
             except Exception as e:
                 logger.error(CustomException(e))
-                doc = ""
-                texts.append(doc)
         return "\n".join(texts)

 from src.utils.functions import cleanText
 from src.utils.logging import logger
 class YoutubeTranscriptLoader:
+    """Load YouTube transcripts through YoutubeLoader and return them as cleaned text."""
     def __init__(self):
+        """Initialize the YoutubeTranscriptLoader (no state is kept)."""
         pass
+    def getTranscripts(self, urls: list[str]) -> str:
+        """
+        Retrieve transcripts for a collection of YouTube URLs.
+        Args:
+            urls (list[str]): YouTube video URLs. Duplicates are fetched once.
+                A single comma-separated string is also accepted and split
+                into individual URLs.
+        Returns:
+            str: One cleaned transcript per distinct URL, joined by newlines
+                in first-seen order. A URL that fails to load is logged and
+                contributes an empty entry.
+        """
+        if isinstance(urls, str):
+            # A bare string would otherwise be iterated character by character.
+            urls = [part.strip() for part in urls.split(",") if part.strip()]
         texts = []
+        # dict.fromkeys de-duplicates like set() but keeps first-seen order,
+        # so the combined transcript is deterministic.
+        for url in dict.fromkeys(urls):
             try:
+                loader = YoutubeLoader.from_youtube_url(url, add_video_info=False)
                 doc = " ".join([x.page_content for x in loader.load()])
+                texts.append(cleanText(text=doc))
             except Exception as e:
                 logger.error(CustomException(e))
+                texts.append("")  # Keep one entry per URL even when loading fails
         return "\n".join(texts)

src/components/rag/RAG.py CHANGED Viewed

@@ -3,39 +3,52 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnableLambda
 from src.utils.exceptions import CustomException
-from src.utils.functions import getConfig
-from src.utils.functions import loadYaml
 from src.utils.logging import logger
 from langchain_groq import ChatGroq
 class Chain:
     def __init__(self):
-        self.config = getConfig(path = "config.ini")
         self.store = VectorStore()
-        prompt = loadYaml(path = "params.yaml")["prompt"]
         self.prompt = ChatPromptTemplate.from_template(prompt)
-    def formatDocs(self, docs):
-        context = ""
-        for doc in docs:
-            context += f"{doc}\n\n\n"
-        if context == "":
-            context = "No Context Found"
-        else:
-            pass
         return context
     def returnChain(self, text: str):
         try:
-            logger.info("preparing chain")
-            store = self.store.setupStore(text = text)
             chain = (
-                    {"context": RunnableLambda(lambda x: x["question"]) | store | RunnableLambda(self.formatDocs),
-                     "question": RunnableLambda(lambda x: x["question"])}
-                    | self.prompt
-                    | ChatGroq(model_name = self.config.get("LLM", "llmModel"), temperature = self.config.getfloat("LLM", "temperature"), max_tokens = self.config.getint("LLM", "maxTokens"))
-                    | StrOutputParser()
             )
             return chain
         except Exception as e:

 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnableLambda
 from src.utils.exceptions import CustomException
+from src.utils.functions import getConfig, loadYaml
 from src.utils.logging import logger
 from langchain_groq import ChatGroq
 class Chain:
     def __init__(self):
+        """
+        Load config.ini, create the VectorStore, and build the chat prompt
+        template from the "prompt" entry of params.yaml.
+        """
+        self.config = getConfig(path="config.ini")
         self.store = VectorStore()
+        params = loadYaml(path="params.yaml")
+        self.prompt = ChatPromptTemplate.from_template(params["prompt"])
+    def formatDocs(self, docs) -> str:
+        """
+        Join retrieved documents into a single context string.
+        Every item is converted with str() before joining, so document
+        objects are accepted as well as plain strings (str.join alone raises
+        TypeError on non-string items).
+        Args:
+            docs: An iterable of documents to format.
+        Returns:
+            str: The items separated by three newlines, or
+                "No Context Found" when the joined result is empty.
+        """
+        context = "\n\n\n".join(str(doc) for doc in docs) or "No Context Found"
         return context
     def returnChain(self, text: str):
+        """
+        Create and return a processing chain based on the input text.
+        Args:
+            text (str): Input text to prepare the chain.
+        Returns:
+            Chain: Configured chain for processing input.
+        """
         try:
+            logger.info("Preparing chain")
+            store = self.store.setupStore(text=text)
             chain = (
+                {"context": RunnableLambda(lambda x: x["question"]) | store | RunnableLambda(self.formatDocs),
+                 "question": RunnableLambda(lambda x: x["question"])}
+                | self.prompt
+                | ChatGroq(model_name=self.config.get("LLM", "llmModel"),
+                           temperature=self.config.getfloat("LLM", "temperature"),
+                           max_tokens=self.config.getint("LLM", "maxTokens"))
+                | StrOutputParser()
             )
             return chain
         except Exception as e:

src/components/vectors/vectorstore.py CHANGED Viewed

@@ -8,31 +8,41 @@ from src.utils.logging import logger
 class VectorStore:
     def __init__(self):
-        self.config = getConfig(path = "config.ini")
         self.vectorEmbeddings = HuggingFaceEmbeddings(
-            model_name = self.config.get("EMBEDDINGS", "embeddingModel"),
-            model_kwargs = {"device": self.config.get("EMBEDDINGS", "device")},
-            encode_kwargs = {"normalize_embeddings": self.config.getboolean("EMBEDDINGS", "normalize_embeddings")}
         )
         self.splitter = RecursiveCharacterTextSplitter(
-            chunk_size = self.config.getint("VECTORSTORE", "chunkSize"),
-            chunk_overlap = self.config.getint("VECTORSTORE", "chunkOverlap"),
-            add_start_index = self.config.getboolean("VECTORSTORE", "addStartIndex")
         )
     def setupStore(self, text: str):
         try:
             store = InMemoryVectorStore(self.vectorEmbeddings)
-            textDocument = Document(page_content = text)
             documents = self.splitter.split_documents([textDocument])
-            store.add_documents(documents = documents)
             return store.as_retriever(
-                search_type = self.config.get("RETRIEVER", "searchType"),
-                search_kwargs = {
                     "k": self.config.getint("RETRIEVER", "k"),
-                    "fetch_k": self.config.getint("RETRIEVER", "fetchK")
                 }
-            )
         except Exception as e:
-            print(CustomException(e))
-            logger.error(CustomException(e))

 class VectorStore:
+    """In-memory vector store builder driven by the settings in config.ini."""
     def __init__(self):
+        """Read config.ini and build the embedding model and text splitter from it."""
+        self.config = getConfig(path="config.ini")
+        cfg = self.config
+        modelName = cfg.get("EMBEDDINGS", "embeddingModel")
+        modelOptions = {"device": cfg.get("EMBEDDINGS", "device")}
+        encodeOptions = {"normalize_embeddings": cfg.getboolean("EMBEDDINGS", "normalize_embeddings")}
         self.vectorEmbeddings = HuggingFaceEmbeddings(
+            model_name=modelName,
+            model_kwargs=modelOptions,
+            encode_kwargs=encodeOptions
         )
+        chunkSize = cfg.getint("VECTORSTORE", "chunkSize")
+        chunkOverlap = cfg.getint("VECTORSTORE", "chunkOverlap")
+        addStartIndex = cfg.getboolean("VECTORSTORE", "addStartIndex")
         self.splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunkSize,
+            chunk_overlap=chunkOverlap,
+            add_start_index=addStartIndex
         )
     def setupStore(self, text: str):
+        """
+        Split the text into chunks, index them in a fresh in-memory store,
+        and expose that store as a retriever.
+        Args:
+            text (str): The text to split and index.
+        Returns:
+            The retriever produced by as_retriever, configured from the
+            RETRIEVER section of the config. If any step raises, the error
+            is logged and printed and None is returned implicitly.
+        """
         try:
+            memoryStore = InMemoryVectorStore(self.vectorEmbeddings)
+            wholeText = Document(page_content=text)
+            chunks = self.splitter.split_documents([wholeText])
+            memoryStore.add_documents(documents=chunks)
+            searchType = self.config.get("RETRIEVER", "searchType")
+            searchOptions = {
+                "k": self.config.getint("RETRIEVER", "k"),
+                "fetch_k": self.config.getint("RETRIEVER", "fetchK")
+            }
+            return memoryStore.as_retriever(
+                search_type=searchType,
+                search_kwargs=searchOptions
+            )
         except Exception as e:
+            logger.error(CustomException(e))
+            print(CustomException(e))

src/pipelines/completePipeline.py CHANGED Viewed

@@ -8,31 +8,77 @@ load_dotenv("secrets.env")
 class Pipeline:
     def __init__(self):
         self.pdfLoader = PdfLoader()
         self.webCrawler = WebsiteCrawler()
         self.youtubeLoader = YoutubeTranscriptLoader()
         self.ragChain = Chain()
     def plainText(self, text: str):
-        chain = self.ragChain.returnChain(text = text)
         return chain
     def searchablePdf(self, path: str):
-        extractedText = self.pdfLoader.searchablePdf(pdfPath = path)
-        chain = self.ragChain.returnChain(text = extractedText)
         return chain
     def scannablePdf(self, path: str):
-        extractedText = self.pdfLoader.scannablePdf(pdfPath = path)
-        chain = self.ragChain.returnChain(text = extractedText)
         return chain
     def webCrawl(self, urls: list[str]):
-        extractedText = self.webCrawler.extractTextFromUrlList(urls = urls)
-        chain = self.ragChain.returnChain(text = extractedText)
         return chain
     def youtubeLinks(self, urls: list[str]):
-        extractedText = self.youtubeLoader.getTranscripts(urls = urls)
-        chain = self.ragChain.returnChain(text = extractedText)
-        return chain

 class Pipeline:
+    """Glue between the content loaders and the RAG chain builder."""
     def __init__(self):
+        """Create the PDF, website and YouTube loaders and the Chain builder."""
         self.pdfLoader = PdfLoader()
         self.webCrawler = WebsiteCrawler()
         self.youtubeLoader = YoutubeTranscriptLoader()
         self.ragChain = Chain()
     def plainText(self, text: str):
+        """
+        Build a RAG chain directly from raw text.
+        Args:
+            text (str): The text handed to Chain.returnChain.
+        Returns:
+            Whatever Chain.returnChain returns for that text.
+        """
+        return self.ragChain.returnChain(text=text)
     def searchablePdf(self, path: str):
+        """
+        Build a RAG chain from a PDF read with PdfLoader.searchablePdf.
+        Args:
+            path (str): The path to the PDF file.
+        Returns:
+            Whatever Chain.returnChain returns for the extracted text.
+        """
+        pdfText = self.pdfLoader.searchablePdf(pdfPath=path)
+        return self.ragChain.returnChain(text=pdfText)
     def scannablePdf(self, path: str):
+        """
+        Build a RAG chain from a PDF read with PdfLoader.scannablePdf.
+        Args:
+            path (str): The path to the PDF file.
+        Returns:
+            Whatever Chain.returnChain returns for the extracted text.
+        """
+        pdfText = self.pdfLoader.scannablePdf(pdfPath=path)
+        return self.ragChain.returnChain(text=pdfText)
     def webCrawl(self, urls: list[str]):
+        """
+        Build a RAG chain from the text of the given web pages.
+        Args:
+            urls (list[str]): URLs passed to WebsiteCrawler.extractTextFromUrlList.
+        Returns:
+            Whatever Chain.returnChain returns for the extracted text.
+        """
+        pageText = self.webCrawler.extractTextFromUrlList(urls=urls)
+        return self.ragChain.returnChain(text=pageText)
     def youtubeLinks(self, urls: list[str]):
+        """
+        Build a RAG chain from the transcripts of the given YouTube videos.
+        Args:
+            urls (list[str]): URLs passed to YoutubeTranscriptLoader.getTranscripts.
+        Returns:
+            Whatever Chain.returnChain returns for the combined transcripts.
+        """
+        transcriptText = self.youtubeLoader.getTranscripts(urls=urls)
+        return self.ragChain.returnChain(text=transcriptText)

src/utils/exceptions.py CHANGED Viewed

@@ -1,6 +1,15 @@
 import sys
 def error_message_detail(error):
     _, _, exc_info = sys.exc_info()
     filename = exc_info.tb_frame.f_code.co_filename
     lineno = exc_info.tb_lineno
@@ -9,8 +18,15 @@ def error_message_detail(error):
 class CustomException(Exception):
     def __init__(self, error_message):
         super().__init__(error_message)
         self.error_message = error_message_detail(error_message)
     def __str__(self) -> str:
         return self.error_message

 import sys
 def error_message_detail(error):
+    """
+    Generate a detailed error message.
+    Args:
+        error: The error object.
+    Returns:
+        str: A formatted error message including line number and filename.
+    """
     _, _, exc_info = sys.exc_info()
     filename = exc_info.tb_frame.f_code.co_filename
     lineno = exc_info.tb_lineno
 class CustomException(Exception):
+    """Exception wrapper whose string form is built by error_message_detail."""
     def __init__(self, error_message):
+        """
+        Wrap an error and compute its detailed message.
+        Args:
+            error_message: The original error; callers in this project pass
+                the caught exception object. error_message_detail reads
+                sys.exc_info(), so this is meant to be constructed inside an
+                `except` block.
+        """
+        detail = error_message_detail(error_message)
         super().__init__(error_message)
+        self.error_message = detail
     def __str__(self) -> str:
+        """Return the message produced by error_message_detail."""
         return self.error_message

src/utils/functions.py CHANGED Viewed

@@ -3,15 +3,42 @@ import string
 import yaml
 def getConfig(path: str):
     config = configparser.ConfigParser()
     config.read(path)
     return config
 def cleanText(text: str):
     text = text.replace("\n", " ")
     text = text.translate(str.maketrans('', '', string.punctuation.replace(".", "")))
     return text
 def loadYaml(path: str):
     with open(path) as file:
         return yaml.safe_load(file)

 import yaml
 def getConfig(path: str):
+    """
+    Parse an INI-style configuration file.
+    Args:
+        path (str): The path to the configuration file.
+    Returns:
+        ConfigParser: The parser after reading `path`. ConfigParser.read
+            skips files it cannot open, so a missing file yields a parser
+            with no sections rather than an error.
+    """
+    parser = configparser.ConfigParser()
+    parser.read(path)
+    return parser
 def cleanText(text: str):
+    """
+    Flatten text onto one line and strip punctuation, keeping periods.
+    Newlines are replaced by single spaces; every character of
+    string.punctuation except "." is deleted.
+    Args:
+        text (str): The text to be cleaned.
+    Returns:
+        str: The cleaned text.
+    """
+    removable = string.punctuation.replace(".", "")
+    singleLine = text.replace("\n", " ")
+    return singleLine.translate(str.maketrans('', '', removable))
 def loadYaml(path: str):
+    """
+    Parse a YAML file with yaml.safe_load.
+    Args:
+        path (str): The path to the YAML file.
+    Returns:
+        The parsed content as returned by yaml.safe_load (a dict for a
+        mapping document such as params.yaml).
+    """
+    with open(path) as handle:
+        parsed = yaml.safe_load(handle)
+    return parsed

src/utils/logging.py CHANGED Viewed

@@ -1,12 +1,16 @@
 import logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
-logFormatter = logging.Formatter(fmt = logFormat, style = "%")
 streamHandler = logging.StreamHandler()
 streamHandler.setFormatter(logFormatter)
 logger.addHandler(streamHandler)

 import logging
+# Build the formatter and handler first, then attach them to the module logger.
 logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
+logFormatter = logging.Formatter(fmt=logFormat, style="%")
+# StreamHandler() without an argument writes to sys.stderr.
 streamHandler = logging.StreamHandler()
 streamHandler.setFormatter(logFormatter)
+# Logger named after this module; it accepts records at INFO and above.
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 logger.addHandler(streamHandler)