gArthur98 committed
Commit 1aa76cb · Parent: 2a5e9b0

Update made
.gitignore ADDED
@@ -0,0 +1,39 @@
+ # Byte-compiled / cache
+ __pycache__/
+ *.py[cod]
+
+ # Virtual env
+ venv/
+ .env
+ .env.*
+ env
+
+ # Jupyter checkpoints
+ .ipynb_checkpoints/
+
+ # IDE configs
+ .vscode/
+ .idea/
+ *.sublime-workspace
+ *.sublime-project
+
+ # macOS
+ .DS_Store
+
+ # Logs
+ *.log
+
+ # Test output
+ htmlcov/
+ .coverage
+
+ # Distribution / packaging
+ build/
+ dist/
+ *.egg-info/
+
+ # Data files (if you don’t want to commit raw data)
+ data/
+
+ # Python coverage
+ .coverage.*
app.py ADDED
@@ -0,0 +1,85 @@
+ ## importing relevant libraries and modules
+ import os
+ import nltk
+ import requests
+ import gradio as gr
+ from pathlib import Path
+ from dotenv import load_dotenv
+
+ # Importing my personal RAG packages and modules
+ from rag_builder.Ingesting_phase import DocumentLoader
+ from rag_builder.Retrival_phase import dv, reset_database
+ from rag_builder.LLM_Inference import get_response
+
+
+ nltk.download("punkt")
+
+
+ # load the environment variables from .env
+ load_dotenv()
+
+ # building the gradio logic
+ def run_app(file_obj, url_input, user_query):
+     # Clearing out any previous input
+     reset_database()
+
+     # handling the ingestion
+     if url_input:
+         html = requests.get(url_input).text
+         temp_path = Path("./temp_url.html")
+         temp_path.write_text(html, encoding="utf-8")
+         loader = DocumentLoader(str(temp_path))
+         orig_chunks, proc_chunks = loader.load_html()
+         dv.original_docs.extend(orig_chunks)
+         dv.add_documents(proc_chunks)
+         temp_path.unlink()
+     elif file_obj:
+         ext = Path(file_obj.name).suffix.lower().lstrip('.')
+         loader = DocumentLoader(file_obj.name)
+         if ext == 'pdf':
+             orig_chunks, proc_chunks = loader.load_pdf()
+         elif ext == 'txt':
+             orig_chunks, proc_chunks = loader.load_text()
+         else:
+             return "Unsupported file type.\nPlease upload PDF or TXT.", ""
+         dv.original_docs.extend(orig_chunks)
+         dv.add_documents(proc_chunks)
+     else:
+         return "Please upload a file or enter a URL.", ""
+
+     # Base model output to handle cases with no context
+     base_output = get_response(user_query, "")
+
+     ## gathering the best matches as context
+     matches = dv.find_best_matches(user_query)  # returns (original chunks, processed chunks)
+     flat_context = []
+     for m in matches:
+         if isinstance(m, list):
+             flat_context.extend(m)
+         else:
+             flat_context.append(m)
+     context = "\n\n".join(flat_context)  # separate chunks so words don't run together at boundaries
+     rag_output = get_response(user_query, context)
+
+     return base_output, rag_output
+
+ # building the gradio interface
+ def main():
+     with gr.Blocks() as demo:
+         gr.Markdown("## RAG vs. Base Model Comparison: Kindly Provide a Document or a Link and Ask Questions")
+         with gr.Row():
+             file_input = gr.File(label="Upload PDF/TXT", file_types=[".pdf", ".txt"])
+             url_input = gr.Textbox(label="Or enter HTML URL", placeholder="https://...")
+         query_input = gr.Textbox(label="Ask a question:")
+         run_btn = gr.Button("Run")
+         out_base = gr.Textbox(label="Base Model Output", lines=5)
+         out_rag = gr.Textbox(label="RAG-Enhanced Output", lines=5)
+
+         run_btn.click(fn=run_app,
+                       inputs=[file_input, url_input, query_input],
+                       outputs=[out_base, out_rag])
+
+     demo.launch(share=True)
+
+ if __name__ == "__main__":
+     main()
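
A quick smoke test of run_app outside the UI (a sketch only: the URL is a placeholder, and it assumes SECRET_API_KEY is set and network access is available):

from app import run_app

base, rag = run_app(None, "https://example.com", "What is this page about?")
print(base)  # the model's answer without retrieval
print(rag)   # the answer grounded in the scraped page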
pyproject.toml ADDED
@@ -0,0 +1,27 @@
+ [build-system]
+ requires = ["setuptools", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "rag_builder"
+ version = "0.1.0"
+ description = "This package helps you build RAG projects"
+ authors = [
+     { name = "Gregory Arthur", email = "gregoryarthur98@gmail.com" }
+ ]
+ dependencies = [
+     "beautifulsoup4==4.13.4",
+     "cohere==5.15.0",
+     "nltk==3.9.1",
+     "PyPDF2==3.0.1",
+     "python-dotenv==1.1.0",
+     "requests==2.32.4",
+     "scikit-learn==1.7.0"
+ ]
+
+ [tool.setuptools]
+ package-dir = { "" = "src" }
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+ include = ["rag_builder*"]
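
Note on the src layout: `package-dir = { "" = "src" }` points setuptools at the src/ directory, so after installation `import rag_builder` resolves to src/rag_builder/ rather than a top-level package folder.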
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ -e .
+ beautifulsoup4==4.13.4
+ cohere==5.15.0
+ gradio==5.34.0
+ nltk==3.9.1
+ PyPDF2==3.0.1
+ python-dotenv==1.1.0
+ requests==2.32.4
+ scikit-learn==1.7.0
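
The `-e .` line installs the local rag_builder package itself (as defined in pyproject.toml) in editable mode, so a single `pip install -r requirements.txt` pulls in both the pinned dependencies and the project's own modules.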
src/rag_builder/Ingesting_phase.py ADDED
@@ -0,0 +1,138 @@
+ ## importing relevant dependencies
+
+ import nltk  ## for data preprocessing
+ from PyPDF2 import PdfReader  ## for reading PDFs
+ from bs4 import BeautifulSoup  ## for web scraping
+ from nltk.stem import PorterStemmer  ## for stemming
+ from nltk.tokenize import sent_tokenize  ## for tokenizing our inputs
+ nltk.download("punkt_tab")  ## data the sentence tokenizer needs
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ # Step 1: Data Pre-processing
+
+ ## Stemming of the incoming data
+
+ stemmer = PorterStemmer()  ## initializing our stemmer
+
+ ## building logic for stemming and data processing
+
+ Chunk_size = 999999  ## default chunk size in case the caller doesn't specify one
+
+ def process_text(text, Chunk_size=Chunk_size):
+     sentences = sent_tokenize(text)  ## tokenizing any text we receive
+     ## Three variables:
+     original_text = []   # stores the original text we got from the user, for easy retrieval
+     processed_text = []  # stores the processed text after it has passed through this function
+     segments = ""        ## accumulates the current chunk
+
+     # Split the text into chunks of at most Chunk_size:
+     # when adding the next sentence would overflow the chunk,
+     # flush the current segment (unchanged and stemmed) to the outputs
+     # and start a new segment.
+     for sentence in sentences:
+         if segments and len(segments) + len(sentence) > Chunk_size:
+             original_text.append(segments)
+             processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))
+             segments = sentence
+         else:
+             segments = (segments + " " + sentence).strip()
+
+     ## Handling the last segment
+     if segments:
+         original_text.append(segments)
+         processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))
+
+     return original_text, processed_text
+
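
A minimal sketch of how process_text chunks and stems (the sample text is made up):

from rag_builder.Ingesting_phase import process_text

sample = "Cats are running around. Dogs were barking loudly. Birds fly south."
original, processed = process_text(sample, Chunk_size=40)
print(original)   # raw chunks split on sentence boundaries, each at most ~40 characters
print(processed)  # the same chunks with every word Porter-stemmed, e.g. "running" -> "run"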
+
+ # Step 2: Ingesting the file. The loader accepts a PDF, text, or HTML file.
+
+ ## the initial code consisted of three functions, but I refactored them into a single class
+
+
+ class DocumentLoader:
+
+     def __init__(self, file_path):
+         self.file_path = file_path
+
+     ## a method for loading and reading PDFs
+     def load_pdf(self):
+         with open(self.file_path, "rb") as f:  ## 'rb' because PDFs are binary files, unlike plain text
+             reader = PdfReader(f)
+             text = ""
+             for page in reader.pages:
+                 text += page.extract_text() or ""  # extract_text() can return None on image-only pages
+         return process_text(text)
+
+     ## a method for handling txt files
+     def load_text(self):
+         with open(self.file_path, "r") as f:  ## 'r' reads the raw text
+             text = f.read()
+         return process_text(text)
+
+     ## a method for handling html files
+     def load_html(self):
+         with open(self.file_path, "r") as f:
+             data = BeautifulSoup(f, "html.parser")
+             text = data.get_text()
+         return process_text(text)
+
+
+ # Step 3: Vectorization and similarity searching. One class handles the vectorizer.
+
+
+ ## a class for storing and searching vectorized documents
+
+ class Doc_Vectorizer:
+
+     def __init__(self):
+         self.vectorizer = TfidfVectorizer()
+         self.vectorized_docs = []
+         self.original_docs = []
+         self.vectors = None
+
+     def add_documents(self, text):
+         self.vectorized_docs.extend(text)
+         self.vectors = self.vectorizer.fit_transform(self.vectorized_docs)
+         return self.vectors
+
+     def process_and_add_documents(self, file_path, file_type):  ## a method for handling multiple file types
+         file_type = file_type.lower()  ## normalize the casing of the file-type input
+         doc_loader = DocumentLoader(file_path)  ## initiating the DocumentLoader class
+         if file_type == "pdf":
+             original_data, processed_data = doc_loader.load_pdf()
+         elif file_type == "txt":
+             original_data, processed_data = doc_loader.load_text()
+         elif file_type == "html":
+             original_data, processed_data = doc_loader.load_html()
+         else:
+             raise TypeError("You provided an incorrect file type")
+         self.original_docs.extend(original_data)  # extend (not append) keeps indices aligned with vectorized_docs
+         self.vectors = self.add_documents(processed_data)
+
+         return self.vectors
+
+     def find_best_matches(self, query, k=3):
+         processed_query = process_text(query)[1]
+         query_vector = self.vectorizer.transform(processed_query)
+         similarity = (query_vector * self.vectors.T).toarray()  ## dot product of TF-IDF vectors as the similarity score
+         best_match = similarity.argsort()[0][-k:][::-1]  ## sort the scores, grab the top k, and reverse into descending order
+         return [self.original_docs[i] for i in best_match], [self.vectorized_docs[i] for i in best_match]
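
A toy retrieval example (made-up data; it mirrors how app.py wires the class up, keeping raw chunks alongside their stemmed counterparts):

from rag_builder.Ingesting_phase import Doc_Vectorizer, process_text

dv = Doc_Vectorizer()
orig, proc = process_text("Python is a programming language. The Eiffel Tower is in Paris.", Chunk_size=40)
dv.original_docs.extend(orig)  # raw chunks, used for display
dv.add_documents(proc)         # stemmed chunks, used for matching
originals, processed = dv.find_best_matches("Where is the Eiffel Tower?", k=1)
print(originals[0])  # should surface the Paris chunk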
src/rag_builder/LLM_Inference.py ADDED
@@ -0,0 +1,39 @@
+ import cohere
+ import os
+ from dotenv import load_dotenv
+ ## this ensures the variables in .env are loaded into the environment
+ load_dotenv()
+
+ API_KEY = os.getenv("SECRET_API_KEY")
+ if API_KEY is None:
+     raise RuntimeError("SECRET_API_KEY not set in environment")
+
+
+ model = cohere.ClientV2(API_KEY)  ## creating the client with my API key
+
+
+ def get_response(query, context=""):  ## a function wrapping the LLM call
+     messages = [{
+         "role": "system",
+         "content": (
+             "You are an AI assistant. Use the context provided by the user to give the user a concise answer to their prompt. "
+             "If the answer isn't present, do not make it up; rather, inform the user that you do not know the answer."
+         )
+     }]
+     if context:  # only add the context when non-empty
+         messages.append({"role": "system", "content": context})
+     messages.append({"role": "user", "content": query})
+
+     response = model.chat(
+         model="command-a-03-2025",
+         messages=messages
+     )
+     return response.message.content[0].text.strip()
+
+
+ ## testing the model (guarded so importing this module doesn't trigger an API call):
+ if __name__ == "__main__":
+     print(get_response("How are you?", "Just reply"))
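
For reference, a call like get_response(query, context) sends three messages: the fixed system instruction, a second system message carrying the retrieved context (added only when context is non-empty), and the user query. With an empty context the model answers from its own knowledge, which is exactly what app.py uses as the baseline output.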
src/rag_builder/Retrival_phase.py ADDED
@@ -0,0 +1,30 @@
+ from .LLM_Inference import get_response
+ from .Ingesting_phase import Doc_Vectorizer  ## importing the Doc_Vectorizer class
+
+
+ dv = Doc_Vectorizer()  ## instantiating the Doc_Vectorizer class
+
+ # the code below clears each storage variable after each session so previous docs don't interfere with new inputs
+ def reset_database():
+     dv.vectorized_docs.clear()
+     dv.original_docs.clear()
+     dv.vectors = None
+
+ # infer the file type from the file name, e.g. a .txt upload is treated as a txt file
+ def initialize(file_name):
+     file_type = file_name.split(".")[-1]
+     return dv.process_and_add_documents(file_path=file_name, file_type=file_type)
+
+
+ def chat(user_query, is_debug=False):
+     original_best_match, processed_best_match = dv.find_best_matches(user_query)
+     context = "\n\n".join(original_best_match)  # join the top-k original chunks into one context string
+
+     if is_debug:  ## lets us inspect the retrieved context when a response looks off
+         print(f"Context: {context}")
+
+     resp = get_response(user_query, context)
+     return resp
+
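
A minimal end-to-end sketch using this module directly (the file name is hypothetical; assumes SECRET_API_KEY is set):

from rag_builder.Retrival_phase import initialize, chat, reset_database

reset_database()
initialize("notes.txt")  # infers "txt" from the extension and ingests the file
answer = chat("What are the key points?", is_debug=True)  # prints the retrieved context first
print(answer)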
src/rag_builder/__init__.py ADDED
File without changes