gArthur98 committed
Commit 1aa76cb · Parent: 2a5e9b0

Update made
.gitignore ADDED
@@ -0,0 +1,39 @@
+ # Byte-compiled / cache
+ __pycache__/
+ *.py[cod]
+
+ # Virtual env
+ venv/
+ .env
+ .env.*
+ env
+
+ # Jupyter checkpoints
+ .ipynb_checkpoints/
+
+ # IDE configs
+ .vscode/
+ .idea/
+ *.sublime-workspace
+ *.sublime-project
+
+ # macOS
+ .DS_Store
+
+ # Logs
+ *.log
+
+ # Test output
+ htmlcov/
+ .coverage
+
+ # Distribution / packaging
+ build/
+ dist/
+ *.egg-info/
+
+ # Data files (if you don’t want to commit raw data)
+ data/
+
+ # Python coverage
+ .coverage.*
app.py ADDED
@@ -0,0 +1,85 @@
+ ## importing relevant libraries and modules
+ import os
+ import nltk
+ import requests
+ import gradio as gr
+ from pathlib import Path
+ from dotenv import load_dotenv
+
+ # Importing my personal RAG packages and modules
+ from rag_builder.Ingesting_phase import DocumentLoader
+ from rag_builder.Retrival_phase import dv, reset_database
+ from rag_builder.LLM_Inference import get_response
+
+
+ nltk.download("punkt")
+
+
+ # load the environment variables from .env
+ load_dotenv()
+
+ # building the gradio logic
+ def run_app(file_obj, url_input, user_query):
+     # Clearing out any previous input
+     reset_database()
+
+     # handling the ingestion
+     if url_input:
+         html = requests.get(url_input).text
+         temp_path = Path("./temp_url.html")
+         temp_path.write_text(html, encoding="utf-8")
+         loader = DocumentLoader(str(temp_path))
+         orig_chunks, proc_chunks = loader.load_html()
+         dv.original_docs.extend(orig_chunks)
+         dv.add_documents(proc_chunks)
+         temp_path.unlink()
+     elif file_obj:
+         ext = Path(file_obj.name).suffix.lower().lstrip('.')
+         loader = DocumentLoader(file_obj.name)
+         if ext == 'pdf':
+             orig_chunks, proc_chunks = loader.load_pdf()
+         elif ext == 'txt':
+             orig_chunks, proc_chunks = loader.load_text()
+         else:
+             return "Unsupported file type.\nPlease upload PDF or TXT.", ""
+         dv.original_docs.extend(orig_chunks)
+         dv.add_documents(proc_chunks)
+     else:
+         return "Please upload a file or enter a URL.", ""
+
+     # Base model output to handle cases with no context
+     base_output = get_response(user_query, "")
+
+     ## gathering the best matches as context
+     matches = dv.find_best_matches(user_query)  # returns (original chunks, processed chunks)
+     flat_context = []
+     for m in matches:
+         if isinstance(m, list):
+             flat_context.extend(m)
+         else:
+             flat_context.append(m)
+     context = "\n\n".join(flat_context)  # separate chunks so words don't run together at boundaries
+     rag_output = get_response(user_query, context)
+
+     return base_output, rag_output
+
+ # building the gradio interface
+ def main():
+     with gr.Blocks() as demo:
+         gr.Markdown("## RAG vs. Base Model Comparison: Kindly Provide a Document or a Link and Ask Questions")
+         with gr.Row():
+             file_input = gr.File(label="Upload PDF/TXT", file_types=[".pdf", ".txt"])
+             url_input = gr.Textbox(label="Or enter HTML URL", placeholder="https://...")
+         query_input = gr.Textbox(label="Ask a question:")
+         run_btn = gr.Button("Run")
+         out_base = gr.Textbox(label="Base Model Output", lines=5)
+         out_rag = gr.Textbox(label="RAG-Enhanced Output", lines=5)
+
+         run_btn.click(fn=run_app,
+                       inputs=[file_input, url_input, query_input],
+                       outputs=[out_base, out_rag])
+
+     demo.launch(share=True)
+
+ if __name__ == "__main__":
+     main()
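
A quick smoke test of run_app outside the UI (a sketch only: the URL is a placeholder, and it assumes SECRET_API_KEY is set and network access is available):

from app import run_app

base, rag = run_app(None, "https://example.com", "What is this page about?")
print(base)  # the model's answer without retrieval
print(rag)   # the answer grounded in the scraped page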
pyproject.toml ADDED
@@ -0,0 +1,27 @@
+ [build-system]
+ requires = ["setuptools", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "rag_builder"
+ version = "0.1.0"
+ description = "This package helps you build RAG projects"
+ authors = [
+     { name = "Gregory Arthur", email = "gregoryarthur98@gmail.com" }
+ ]
+ dependencies = [
+     "beautifulsoup4==4.13.4",
+     "cohere==5.15.0",
+     "nltk==3.9.1",
+     "PyPDF2==3.0.1",
+     "python-dotenv==1.1.0",
+     "requests==2.32.4",
+     "scikit-learn==1.7.0"
+ ]
+
+ [tool.setuptools]
+ package-dir = { "" = "src" }
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+ include = ["rag_builder*"]
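
Note on the src layout: `package-dir = { "" = "src" }` points setuptools at the src/ directory, so after installation `import rag_builder` resolves to src/rag_builder/ rather than a top-level package folder.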
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ -e .
+ beautifulsoup4==4.13.4
+ cohere==5.15.0
+ gradio==5.34.0
+ nltk==3.9.1
+ PyPDF2==3.0.1
+ python-dotenv==1.1.0
+ requests==2.32.4
+ scikit-learn==1.7.0
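
The `-e .` line installs the local rag_builder package itself (as defined in pyproject.toml) in editable mode, so a single `pip install -r requirements.txt` pulls in both the pinned dependencies and the project's own modules.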
src/rag_builder/Ingesting_phase.py ADDED
@@ -0,0 +1,138 @@
+ ## importing relevant dependencies
+
+ import nltk  ## for data preprocessing
+ from PyPDF2 import PdfReader  ## for reading PDFs
+ from bs4 import BeautifulSoup  ## for web scraping
+ from nltk.stem import PorterStemmer  ## for stemming
+ from nltk.tokenize import sent_tokenize  ## for tokenizing our inputs
+ nltk.download("punkt_tab")  ## data the sentence tokenizer needs
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ # Step 1: Data Pre-processing
+
+ ## Stemming of the incoming data
+
+ stemmer = PorterStemmer()  ## initializing our stemmer
+
+ ## building logic for stemming and data processing
+
+ Chunk_size = 999999  ## default chunk size in case the caller doesn't specify one
+
+ def process_text(text, Chunk_size=Chunk_size):
+     sentences = sent_tokenize(text)  ## tokenizing any text we receive
+     ## Three variables:
+     original_text = []   # stores the original text we got from the user, for easy retrieval
+     processed_text = []  # stores the processed text after it has passed through this function
+     segments = ""        ## accumulates the current chunk
+
+     # Split the text into chunks of at most Chunk_size:
+     # when adding the next sentence would overflow the chunk,
+     # flush the current segment (unchanged and stemmed) to the outputs
+     # and start a new segment.
+     for sentence in sentences:
+         if segments and len(segments) + len(sentence) > Chunk_size:
+             original_text.append(segments)
+             processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))
+             segments = sentence
+         else:
+             segments = (segments + " " + sentence).strip()
+
+     ## Handling the last segment
+     if segments:
+         original_text.append(segments)
+         processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))
+
+     return original_text, processed_text
+
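
A minimal sketch of how process_text chunks and stems (the sample text is made up):

from rag_builder.Ingesting_phase import process_text

sample = "Cats are running around. Dogs were barking loudly. Birds fly south."
original, processed = process_text(sample, Chunk_size=40)
print(original)   # raw chunks split on sentence boundaries, each at most ~40 characters
print(processed)  # the same chunks with every word Porter-stemmed, e.g. "running" -> "run"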
+
+ # Step 2: Ingesting the file. The loader accepts a PDF, text, or HTML file.
+
+ ## the initial code consisted of three functions, but I refactored them into a single class
+
+
+ class DocumentLoader:
+
+     def __init__(self, file_path):
+         self.file_path = file_path
+
+     ## a method for loading and reading PDFs
+     def load_pdf(self):
+         with open(self.file_path, "rb") as f:  ## 'rb' because PDFs are binary files, unlike plain text
+             reader = PdfReader(f)
+             text = ""
+             for page in reader.pages:
+                 text += page.extract_text() or ""  # extract_text() can return None on image-only pages
+         return process_text(text)
+
+     ## a method for handling txt files
+     def load_text(self):
+         with open(self.file_path, "r") as f:  ## 'r' reads the raw text
+             text = f.read()
+         return process_text(text)
+
+     ## a method for handling html files
+     def load_html(self):
+         with open(self.file_path, "r") as f:
+             data = BeautifulSoup(f, "html.parser")
+             text = data.get_text()
+         return process_text(text)
+
+
+ # Step 3: Vectorization and similarity searching. One class handles the vectorizer.
+
+
+ ## a class for storing and searching vectorized documents
+
+ class Doc_Vectorizer:
+
+     def __init__(self):
+         self.vectorizer = TfidfVectorizer()
+         self.vectorized_docs = []
+         self.original_docs = []
+         self.vectors = None
+
+     def add_documents(self, text):
+         self.vectorized_docs.extend(text)
+         self.vectors = self.vectorizer.fit_transform(self.vectorized_docs)
+         return self.vectors
+
+     def process_and_add_documents(self, file_path, file_type):  ## a method for handling multiple file types
+         file_type = file_type.lower()  ## normalize the casing of the file-type input
+         doc_loader = DocumentLoader(file_path)  ## initiating the DocumentLoader class
+         if file_type == "pdf":
+             original_data, processed_data = doc_loader.load_pdf()
+         elif file_type == "txt":
+             original_data, processed_data = doc_loader.load_text()
+         elif file_type == "html":
+             original_data, processed_data = doc_loader.load_html()
+         else:
+             raise TypeError("You provided an incorrect file type")
+         self.original_docs.extend(original_data)  # extend (not append) keeps indices aligned with vectorized_docs
+         self.vectors = self.add_documents(processed_data)
+
+         return self.vectors
+
+     def find_best_matches(self, query, k=3):
+         processed_query = process_text(query)[1]
+         query_vector = self.vectorizer.transform(processed_query)
+         similarity = (query_vector * self.vectors.T).toarray()  ## dot product of TF-IDF vectors as the similarity score
+         best_match = similarity.argsort()[0][-k:][::-1]  ## sort the scores, grab the top k, and reverse into descending order
+         return [self.original_docs[i] for i in best_match], [self.vectorized_docs[i] for i in best_match]
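
A toy retrieval example (made-up data; it mirrors how app.py wires the class up, keeping raw chunks alongside their stemmed counterparts):

from rag_builder.Ingesting_phase import Doc_Vectorizer, process_text

dv = Doc_Vectorizer()
orig, proc = process_text("Python is a programming language. The Eiffel Tower is in Paris.", Chunk_size=40)
dv.original_docs.extend(orig)  # raw chunks, used for display
dv.add_documents(proc)         # stemmed chunks, used for matching
originals, processed = dv.find_best_matches("Where is the Eiffel Tower?", k=1)
print(originals[0])  # should surface the Paris chunk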
src/rag_builder/LLM_Inference.py ADDED
@@ -0,0 +1,39 @@
+ import cohere
+ import os
+ from dotenv import load_dotenv
+ ## this ensures the variables in .env are loaded into the environment
+ load_dotenv()
+
+ API_KEY = os.getenv("SECRET_API_KEY")
+ if API_KEY is None:
+     raise RuntimeError("SECRET_API_KEY not set in environment")
+
+
+ model = cohere.ClientV2(API_KEY)  ## creating the client with my API key
+
+
+ def get_response(query, context=""):  ## a function wrapping the LLM call
+     messages = [{
+         "role": "system",
+         "content": (
+             "You are an AI assistant. Use the context provided by the user to give the user a concise answer to their prompt. "
+             "If the answer isn't present, do not make it up; rather, inform the user that you do not know the answer."
+         )
+     }]
+     if context:  # only add the context when non-empty
+         messages.append({"role": "system", "content": context})
+     messages.append({"role": "user", "content": query})
+
+     response = model.chat(
+         model="command-a-03-2025",
+         messages=messages
+     )
+     return response.message.content[0].text.strip()
+
+
+ ## testing the model (guarded so importing this module doesn't trigger an API call):
+ if __name__ == "__main__":
+     print(get_response("How are you?", "Just reply"))
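
For reference, a call like get_response(query, context) sends three messages: the fixed system instruction, a second system message carrying the retrieved context (added only when context is non-empty), and the user query. With an empty context the model answers from its own knowledge, which is exactly what app.py uses as the baseline output.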
src/rag_builder/Retrival_phase.py ADDED
@@ -0,0 +1,30 @@
+ from .LLM_Inference import get_response
+ from .Ingesting_phase import Doc_Vectorizer  ## importing the Doc_Vectorizer class
+
+
+ dv = Doc_Vectorizer()  ## instantiating the Doc_Vectorizer class
+
+ # the code below clears each storage variable after each session so previous docs don't interfere with new inputs
+ def reset_database():
+     dv.vectorized_docs.clear()
+     dv.original_docs.clear()
+     dv.vectors = None
+
+ # infer the file type from the file name, e.g. a .txt upload is treated as a txt file
+ def initialize(file_name):
+     file_type = file_name.split(".")[-1]
+     return dv.process_and_add_documents(file_path=file_name, file_type=file_type)
+
+
+ def chat(user_query, is_debug=False):
+     original_best_match, processed_best_match = dv.find_best_matches(user_query)
+     context = "\n\n".join(original_best_match)  # join the top-k original chunks into one context string
+
+     if is_debug:  ## lets us inspect the retrieved context when a response looks off
+         print(f"Context: {context}")
+
+     resp = get_response(user_query, context)
+     return resp
+
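
A minimal end-to-end sketch using this module directly (the file name is hypothetical; assumes SECRET_API_KEY is set):

from rag_builder.Retrival_phase import initialize, chat, reset_database

reset_database()
initialize("notes.txt")  # infers "txt" from the extension and ingests the file
answer = chat("What are the key points?", is_debug=True)  # prints the retrieved context first
print(answer)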
src/rag_builder/__init__.py ADDED
File without changes