## importing relevant dependencies
import os
import nltk  ## for data preprocessing
from PyPDF2 import PdfReader  ## for handling reading of the PDFs
from bs4 import BeautifulSoup  ## for parsing HTML files
from nltk.stem import PorterStemmer  ## for stemming
from nltk.tokenize import sent_tokenize  ## for tokenizing our inputs
nltk.download("punkt_tab")  ## the Punkt sentence-tokenizer models that sent_tokenize needs
from sklearn.feature_extraction.text import TfidfVectorizer
# Step 1: Data Pre-processing
## Stemming of the incoming data
stemmer = PorterStemmer()  ## initializing our stemmer
## building the logic for stemming and data processing
CHUNK_SIZE = 999999  ## declaring my default chunk size in case the caller doesn't specify one
def process_text(text, chunk_size=CHUNK_SIZE):
    sentences = sent_tokenize(text)  ## tokenizing any text we receive
    ## I will be creating three variables
    original_text = []  ## stores the original chunks of the user's text for easy retrieval
    processed_text = []  ## stores the stemmed version of each chunk
    segments = ""  ## accumulates sentences for the chunk currently being built
    ## Split the text into chunks of at most chunk_size characters: when adding
    ## the next sentence would overflow the chunk, flush the current segment
    ## (both unchanged and stemmed) to the outputs and start a new segment.
    for sentence in sentences:
        if len(segments) + len(sentence) > chunk_size:
            original_text.append(segments.strip())
            processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))
            segments = sentence  ## start the next chunk with the sentence that overflowed
        else:
            segments += " " + sentence  ## otherwise keep growing the current chunk
    ## Handling the last segment
    if segments:
        original_text.append(segments.strip())
        processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))
    return original_text, processed_text
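## A quick sanity check for process_text (the sample string is hypothetical,
## not part of the pipeline): with a small chunk_size the two sentences land
## in separate chunks, and the processed list holds the stemmed copy of each.
if __name__ == "__main__":
    sample = "Running quickly is healthy. Runners run daily."
    originals, processed = process_text(sample, chunk_size=30)
    print(originals)  ## two original chunks, one per sentence
    print(processed)  ## the stemmed counterpart of each chunk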
# Step 2: Ingesting the file: we will allow PDF, text, or HTML files
## the initial code consisted of three functions, but I refactored them into a single class
class DocumentLoader:
    def __init__(self, file_path):
        self.file_path = file_path

    ## a method for loading and reading PDFs
    def load_pdf(self):
        with open(self.file_path, "rb") as f:  ## PDFs are binary files, so we open in binary mode instead of reading raw text as we would with text files
            reader = PdfReader(f)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or ""  ## extract_text can return None for pages with no extractable text
        return process_text(text)
    ## a method for handling txt files
    def load_text(self):
        with open(self.file_path, "r") as f:  ## we are using "r" to read the raw text
            text = f.read()
        return process_text(text)
    ## a method for handling HTML files
    def load_html(self):
        with open(self.file_path, "r") as f:
            data = BeautifulSoup(f, "html.parser")
            text = data.get_text()  ## strip the markup and keep only the visible text
        return process_text(text)
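## A short usage sketch for DocumentLoader. "sample.txt" is a placeholder
## path, and the existence check keeps the demo from crashing when the file
## isn't actually there.
if __name__ == "__main__":
    sample_path = "sample.txt"  ## hypothetical file; point this at a real document
    if os.path.exists(sample_path):
        loader = DocumentLoader(sample_path)
        originals, processed = loader.load_text()
        print(f"Loaded {len(originals)} chunk(s) from {sample_path}")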
# Step 3: Vectorization and similarity searching. I am creating one class for the vectorizer
## a class for handling adding documents
class Doc_Vectorizer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.vectorized_docs = []
        self.original_docs = []
        self.vectors = None

    def add_documents(self, text):
        self.vectorized_docs.extend(text)
        self.vectors = self.vectorizer.fit_transform(self.vectorized_docs)  ## refit over the whole corpus so every chunk shares one vocabulary
        return self.vectors
    def process_and_add_documents(self, file_path, file_type):  ## this is a method for handling the different file types
        file_type = file_type.lower()  ## normalize casing so inputs like "PDF" and "pdf" both work
        doc_loader = DocumentLoader(file_path)  ## initiating the DocumentLoader class
        if file_type == "pdf":
            original_data, processed_data = doc_loader.load_pdf()
        elif file_type == "txt":
            original_data, processed_data = doc_loader.load_text()
        elif file_type == "html":
            original_data, processed_data = doc_loader.load_html()
        else:
            raise ValueError("You provided an incorrect file type")
        self.original_docs.extend(original_data)  ## extend rather than append so chunk indices stay aligned with vectorized_docs
        self.vectors = self.add_documents(processed_data)
        return self.vectors
    def find_best_matches(self, query, k=3):
        processed_query = process_text(query)[1]  ## stem the query the same way the documents were stemmed
        query_vector = self.vectorizer.transform(processed_query)
        similarity = (query_vector @ self.vectors.T).toarray()  ## TF-IDF rows are L2-normalized, so this dot product is the cosine similarity score
        best_match = similarity.argsort()[0][-k:][::-1]  ## sort the scores, grab the top k, and reverse them into descending order
        return [self.original_docs[i] for i in best_match], [self.vectorized_docs[i] for i in best_match]
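## End-to-end sketch: ingest a couple of documents and run a query against
## them. The file paths are placeholders; swap in real files before running.
if __name__ == "__main__":
    store = Doc_Vectorizer()
    for path, kind in [("notes.txt", "txt"), ("paper.pdf", "pdf")]:  ## hypothetical files
        if os.path.exists(path):
            store.process_and_add_documents(path, kind)
    if store.vectors is not None:
        matches, _ = store.find_best_matches("what does this say about stemming?", k=3)
        for chunk in matches:
            print(chunk[:200])  ## preview the top-matching original chunks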