Commit to add lecture PDFs in context
Browse files- code/modules/data_loader.py +34 -2
- code/modules/helpers.py +15 -6
- code/modules/vector_db.py +11 -0
- requirements.txt +1 -0
code/modules/data_loader.py
CHANGED
|
@@ -48,6 +48,27 @@ class DataLoader:
|
|
| 48 |
self.splitter = None
|
| 49 |
logger.info("InfoLoader instance created")
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
def get_chunks(self, uploaded_files, weblinks):
|
| 52 |
# Main list of all documents
|
| 53 |
self.document_chunks_full = []
|
|
@@ -78,6 +99,13 @@ class DataLoader:
|
|
| 78 |
logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
|
| 79 |
return document_chunks
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
def get_pdf(temp_file_path: str, title: str):
|
| 82 |
"""
|
| 83 |
Function to process PDF files
|
|
@@ -201,7 +229,10 @@ class DataLoader:
|
|
| 201 |
|
| 202 |
# Handle different file types
|
| 203 |
if file_type == "pdf":
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
| 205 |
elif file_type == "txt":
|
| 206 |
title, document_chunks = get_txt(file_path, file_name)
|
| 207 |
elif file_type == "docx":
|
|
@@ -215,7 +246,7 @@ class DataLoader:
|
|
| 215 |
if self.config["splitter_options"]["remove_chunks"]:
|
| 216 |
document_chunks = remove_chunks(document_chunks)
|
| 217 |
|
| 218 |
-
logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
|
| 219 |
self.document_names.append(title)
|
| 220 |
self.document_chunks_full.extend(document_chunks)
|
| 221 |
|
|
@@ -243,6 +274,7 @@ class DataLoader:
|
|
| 243 |
self.document_chunks_full.extend(document_chunks)
|
| 244 |
except:
|
| 245 |
logger.info(f"\t\tError splitting link {link_index+1} : {link}")
|
|
|
|
| 246 |
|
| 247 |
logger.info(
|
| 248 |
f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
|
|
|
|
| 48 |
self.splitter = None
|
| 49 |
logger.info("InfoLoader instance created")
|
| 50 |
|
def extract_text_from_pdf(self, pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path to a PDF file on disk.

    Returns:
        The concatenated text of all pages (an empty string for an
        image-only PDF with no extractable text).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # extract_text() can yield nothing for image-only pages; guard
        # with `or ""` and join once instead of quadratic `+=`.
        return "".join(page.extract_text() or "" for page in reader.pages)
def download_pdf_from_url(self, pdf_url):
    """Download a PDF to a temporary file on disk.

    Args:
        pdf_url: URL of the PDF to fetch.

    Returns:
        Path of the temporary ``.pdf`` file on success, ``None`` on
        failure. The caller is responsible for deleting the file.
    """
    # A timeout prevents the loader from hanging forever on a dead host.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code == 200:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(response.content)
        return temp_file.name
    # Report through the module logger instead of print() so failures
    # land in the same place as the rest of the loader's diagnostics.
    logger.error("Failed to download PDF from URL: %s", pdf_url)
    return None
|
| 72 |
def get_chunks(self, uploaded_files, weblinks):
|
| 73 |
# Main list of all documents
|
| 74 |
self.document_chunks_full = []
|
|
|
|
| 99 |
logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
|
| 100 |
return document_chunks
|
| 101 |
|
def get_pdf_from_url(pdf_url: str):
    """Download a PDF from *pdf_url* and split it into document chunks.

    Returns:
        (title, document_chunks) as produced by ``get_pdf``.

    Raises:
        ValueError: if the PDF could not be downloaded.
    """
    temp_pdf_path = self.download_pdf_from_url(pdf_url)
    if temp_pdf_path is None:
        # Fail loudly: the previous implicit `return None` made the
        # caller's tuple unpacking blow up with an opaque TypeError.
        raise ValueError(f"Could not download PDF from {pdf_url}")
    try:
        title, document_chunks = get_pdf(temp_pdf_path, pdf_url)
    finally:
        # Always remove the temp file, even if PDF parsing fails.
        os.remove(temp_pdf_path)
    return title, document_chunks
|
| 109 |
def get_pdf(temp_file_path: str, title: str):
|
| 110 |
"""
|
| 111 |
Function to process PDF files
|
|
|
|
| 229 |
|
| 230 |
# Handle different file types
|
| 231 |
if file_type == "pdf":
|
| 232 |
+
try:
|
| 233 |
+
title, document_chunks = get_pdf(file_path, file_name)
|
| 234 |
+
except:
|
| 235 |
+
title, document_chunks = get_pdf_from_url(file_path)
|
| 236 |
elif file_type == "txt":
|
| 237 |
title, document_chunks = get_txt(file_path, file_name)
|
| 238 |
elif file_type == "docx":
|
|
|
|
| 246 |
if self.config["splitter_options"]["remove_chunks"]:
|
| 247 |
document_chunks = remove_chunks(document_chunks)
|
| 248 |
|
| 249 |
+
logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)} from {file_name}")
|
| 250 |
self.document_names.append(title)
|
| 251 |
self.document_chunks_full.extend(document_chunks)
|
| 252 |
|
|
|
|
| 274 |
self.document_chunks_full.extend(document_chunks)
|
| 275 |
except:
|
| 276 |
logger.info(f"\t\tError splitting link {link_index+1} : {link}")
|
| 277 |
+
exit()
|
| 278 |
|
| 279 |
logger.info(
|
| 280 |
f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
|
code/modules/helpers.py
CHANGED
|
@@ -36,6 +36,10 @@ class WebpageCrawler:
|
|
| 36 |
soup = BeautifulSoup(html_data, "html.parser")
|
| 37 |
list_links = []
|
| 38 |
for link in soup.find_all("a", href=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
# Append to list if new link contains original link
|
| 40 |
if str(link["href"]).startswith((str(website_link))):
|
| 41 |
list_links.append(link["href"])
|
|
@@ -56,14 +60,19 @@ class WebpageCrawler:
|
|
| 56 |
|
| 57 |
def get_subpage_links(self, l, base_url):
|
| 58 |
for link in tqdm(l):
|
| 59 |
-
|
| 60 |
-
if
|
| 61 |
-
dict_links_subpages = self.get_links(link, base_url)
|
| 62 |
-
# Change the dictionary value of the link to "Checked"
|
| 63 |
l[link] = "Checked"
|
| 64 |
-
else:
|
| 65 |
-
# Create an empty dictionary in case every link is checked
|
| 66 |
dict_links_subpages = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
# Add new dictionary to old dictionary
|
| 68 |
l = {**dict_links_subpages, **l}
|
| 69 |
return l
|
|
|
|
| 36 |
soup = BeautifulSoup(html_data, "html.parser")
|
| 37 |
list_links = []
|
| 38 |
for link in soup.find_all("a", href=True):
|
| 39 |
+
|
| 40 |
+
# clean the link
|
| 41 |
+
# remove empty spaces
|
| 42 |
+
link["href"] = link["href"].strip()
|
| 43 |
# Append to list if new link contains original link
|
| 44 |
if str(link["href"]).startswith((str(website_link))):
|
| 45 |
list_links.append(link["href"])
|
|
|
|
| 60 |
|
def get_subpage_links(self, l, base_url):
    """Crawl one level of sub-pages for every unchecked link in *l*.

    Args:
        l: dict mapping link URL -> "Checked" / "Not-checked".
        base_url: root URL passed to ``get_links`` to filter results.

    Returns:
        A new dict containing the old entries plus any newly found
        sub-page links (new links default to their unchecked state).
    """
    # Iterate over the original key set; rebinding `l` below does not
    # affect the iterator.
    for link in tqdm(l):
        if not link.endswith("/"):
            # Not a directory-style URL (e.g. a file link): mark it
            # done without crawling into it.
            l[link] = "Checked"
            dict_links_subpages = {}
        else:
            if l[link] == "Not-checked":
                # First visit: collect the links found on this page.
                dict_links_subpages = self.get_links(link, base_url)
                # Change the dictionary value of the link to "Checked"
                l[link] = "Checked"
            else:
                # Already crawled; nothing new to merge.
                dict_links_subpages = {}
        # Merge new links into the old dictionary; existing entries
        # (and their Checked state) take precedence.
        l = {**dict_links_subpages, **l}
    return l
|
code/modules/vector_db.py
CHANGED
|
@@ -60,6 +60,14 @@ class VectorDB:
|
|
| 60 |
urls = all_urls
|
| 61 |
return files, urls
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def create_embedding_model(self):
|
| 64 |
self.logger.info("Creating embedding function")
|
| 65 |
self.embedding_model_loader = EmbeddingModelLoader(self.config)
|
|
@@ -79,6 +87,9 @@ class VectorDB:
|
|
| 79 |
data_loader = DataLoader(self.config)
|
| 80 |
self.logger.info("Loading data")
|
| 81 |
files, urls = self.load_files()
|
|
|
|
|
|
|
|
|
|
| 82 |
document_chunks, document_names = data_loader.get_chunks(files, urls)
|
| 83 |
self.logger.info("Completed loading data")
|
| 84 |
|
|
|
|
| 60 |
urls = all_urls
|
| 61 |
return files, urls
|
| 62 |
|
def clean_url_list(self, urls):
    """Split a crawled URL list into page links and lecture PDFs.

    Args:
        urls: iterable of URL strings.

    Returns:
        (page_urls, lecture_pdfs): *page_urls* keeps only
        directory-style links ending in "/" (files are extracted
        separately); *lecture_pdfs* holds links ending in ".pdf"
        whose URL contains "lecture" (case-insensitive).
    """
    # Single pass for the lecture PDFs instead of two filter passes.
    lecture_pdfs = [
        link for link in urls
        if link.endswith(".pdf") and "lecture" in link.lower()
    ]
    # Only keep links that end with '/'. Extract files separately.
    page_urls = [link for link in urls if link.endswith("/")]
    return page_urls, lecture_pdfs
|
| 71 |
def create_embedding_model(self):
|
| 72 |
self.logger.info("Creating embedding function")
|
| 73 |
self.embedding_model_loader = EmbeddingModelLoader(self.config)
|
|
|
|
| 87 |
data_loader = DataLoader(self.config)
|
| 88 |
self.logger.info("Loading data")
|
| 89 |
files, urls = self.load_files()
|
| 90 |
+
urls, lecture_pdfs = self.clean_url_list(urls)
|
| 91 |
+
files += lecture_pdfs
|
| 92 |
+
files.remove('storage/data/urls.txt')
|
| 93 |
document_chunks, document_names = data_loader.get_chunks(files, urls)
|
| 94 |
self.logger.info("Completed loading data")
|
| 95 |
|
requirements.txt
CHANGED
|
@@ -16,3 +16,4 @@ beautifulsoup4==4.12.2
|
|
| 16 |
fake-useragent==1.4.0
|
| 17 |
git+https://github.com/huggingface/accelerate.git
|
| 18 |
llama-cpp-python
|
|
|
|
|
|
| 16 |
fake-useragent==1.4.0
|
| 17 |
git+https://github.com/huggingface/accelerate.git
|
| 18 |
llama-cpp-python
|
| 19 |
+
PyPDF2==3.0.1
|