import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the Punkt sentence-tokenizer models required by sent_tokenize.
nltk.download("punkt_tab")

stemmer = PorterStemmer()

# Maximum number of characters allowed in a single chunk. At this size,
# most documents will end up as a single chunk.
CHUNK_SIZE = 999999


def process_text(text, chunk_size=CHUNK_SIZE):
    """Split text into sentence-aligned chunks, returning both the original
    and the Porter-stemmed version of each chunk."""
    sentences = sent_tokenize(text)

    original_text = []
    processed_text = []
    segments = ""

    for sentence in sentences:
        # If the next sentence would push the current chunk past the limit,
        # flush the chunk and start a new one with this sentence; otherwise
        # keep accumulating. (Without the else branch, a flushed sentence
        # would be appended twice.)
        if len(segments) + len(sentence) > chunk_size:
            original_text.append(segments.strip())
            processed_text.append(" ".join([stemmer.stem(word) for word in segments.split()]))
            segments = sentence
        else:
            segments += " " + sentence

    # Flush the final, partially filled chunk.
    if segments:
        original_text.append(segments.strip())
        processed_text.append(" ".join([stemmer.stem(word) for word in segments.split()]))

    return original_text, processed_text
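
# A quick doctest-style sketch of the chunking above. The sample string and
# the small chunk_size are hypothetical, chosen so the text is forced into
# more than one chunk:
# >>> process_text("Cats purr. Dogs bark. Birds sing.", chunk_size=25)[0]
# ['Cats purr. Dogs bark.', 'Birds sing.']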
class DocumentLoader:
    """Loads a document from disk and returns its chunked text."""

    def __init__(self, file_path):
        self.file_path = file_path

    def load_pdf(self):
        with open(self.file_path, "rb") as f:
            reader = PdfReader(f)
            text = ""
            for page in reader.pages:
                # extract_text() can come back empty for image-only pages;
                # guard against None just in case.
                text += page.extract_text() or ""
        return process_text(text)

    def load_text(self):
        with open(self.file_path, "r", encoding="utf-8") as f:
            text = f.read()
        return process_text(text)

    def load_html(self):
        with open(self.file_path, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")
            text = soup.get_text()
        return process_text(text)
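
# Example usage of DocumentLoader. "report.pdf" is a hypothetical path; each
# loader returns an (original_chunks, stemmed_chunks) pair:
# >>> loader = DocumentLoader("report.pdf")
# >>> original_chunks, stemmed_chunks = loader.load_pdf()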
class DocVectorizer:
    """Builds a TF-IDF index over stemmed chunks and retrieves the chunks
    that best match a query."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.vectorized_docs = []
        self.original_docs = []
        self.vectors = None

    def add_documents(self, processed_chunks):
        self.vectorized_docs.extend(processed_chunks)
        # Refit on the full corpus so the vocabulary covers every chunk.
        self.vectors = self.vectorizer.fit_transform(self.vectorized_docs)
        return self.vectors

    def process_and_add_documents(self, file_path, file_type):
        doc_loader = DocumentLoader(file_path)
        if file_type == "pdf":
            original_data, processed_data = doc_loader.load_pdf()
        elif file_type == "txt":
            original_data, processed_data = doc_loader.load_text()
        elif file_type == "html":
            original_data, processed_data = doc_loader.load_html()
        else:
            raise ValueError(f"Unsupported file type: {file_type}")
        # Extend (not append) so original_docs stays aligned, chunk for
        # chunk, with vectorized_docs.
        self.original_docs.extend(original_data)
        self.vectors = self.add_documents(processed_data)
        return self.vectors

    def find_best_matches(self, query, k=3):
        # Stem the query the same way the documents were processed.
        processed_query = process_text(query)[1]
        query_vector = self.vectorizer.transform(processed_query)
        # Similarity via the dot product of TF-IDF vectors.
        similarity = (query_vector @ self.vectors.T).toarray()
        # Indices of the k highest-scoring chunks, best first.
        best_match = similarity.argsort()[0][-k:][::-1]
        return ([self.original_docs[i] for i in best_match],
                [self.vectorized_docs[i] for i in best_match])
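
if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a plain-text file exists at the
    # hypothetical path below; swap in a real file and query before running.
    vectorizer = DocVectorizer()
    vectorizer.process_and_add_documents("example.txt", "txt")
    matches, stemmed_matches = vectorizer.find_best_matches("your search query", k=3)
    for match in matches:
        print(match[:200])  # print the first 200 characters of each hit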