##importing relevant dependencies 

import os 
import nltk ##for data preprocessing 
from PyPDF2 import PdfReader ##for reading PDFs
from bs4 import BeautifulSoup ##for parsing HTML
from nltk.stem import PorterStemmer ##for stemming
from nltk.tokenize import sent_tokenize ##for tokenizing our inputs
nltk.download("punkt_tab") ##for handling punctuations 
from sklearn.feature_extraction.text import TfidfVectorizer


# Step 1: Data Pre-processing 

##Stemming of the input data

stemmer = PorterStemmer() ##initializing our stemmer
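## for example, the Porter stemmer maps "running" -> "run" and "studies" -> "studi",
## so different surface forms of the same word end up sharing one TF-IDF feature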

## building logic for stemming and data processing

Chunk_size = 999999 ##declaring my default chunk size (in characters) in case the user doesn't specify

def process_text(text, Chunk_size= Chunk_size):
    sentences = sent_tokenize(text) ##tokenizing any text we receive
    ##I will be creating three variables 

    original_text = []   #this is for storing the original text we got from the user, for easy retrieval
    processed_text = []  #this will store our processed text after the original has been passed through this function
    segments = ""        ##this holds the chunk we are currently building
    ##I explain this code in the comment below
    ## the loop below splits the text into chunks of at most Chunk_size characters:
    ## when adding the next sentence would overflow the current chunk, we flush that
    ## segment (both the original version and the stemmed version) to the output lists
    ## and start a new segment with the sentence that did not fit
    for sentence in sentences:
        if len(segments) + len(sentence) > Chunk_size:
            original_text.append(segments)
            processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))
            segments = sentence
        else:
            segments = (segments + " " + sentence).strip()

    ##Handling the last segment
    if segments:
        original_text.append(segments)
        processed_text.append(" ".join(stemmer.stem(word) for word in segments.split()))

    return original_text, processed_text
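
## A small, illustrative check of the chunking logic (the sentences and the tiny
## Chunk_size below are made up purely for demonstration, to force a split):
if __name__ == "__main__":
    demo_originals, demo_processed = process_text(
        "Cats are running in the garden. Dogs are barking at night. Birds are flying south.",
        Chunk_size=60,
    )
    print(demo_originals)   ## the original chunks, each at most ~60 characters
    print(demo_processed)   ## the same chunks with every word stemmed, e.g. "running" -> "run"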


# Step 2: Ingesting the file: we will allow the loader to take in a PDF, text, or HTML file

##the initial code consisted of three functions but I refactored them into a single class 


class DocumentLoader:

    def __init__(self, file_path):
        self.file_path= file_path

## a method for loading and reading PDFs
    def load_pdf(self):
        with open(self.file_path, "rb") as f: ## We are using 'rb' since PDFs are compressed so we use rb to read it instead of reading it raw as we would with text files 
            reader= PdfReader(f)
            text= ""
            for x in reader.pages:
                text += x.extract_text()
            return process_text(text)

## a method for handling txt files 

    def load_text(self):
        with open(self.file_path, "r") as f: ## we are using r to have it read the raw text 
            text= f.read()
            return process_text(text)
    
## A method for handling html files 
    def load_html(self):
        with open(self.file_path, "r") as f:
            data= BeautifulSoup(f, "html.parser")
            text= data.get_text()
            return process_text(text)
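
## A hedged usage sketch for DocumentLoader: "sample.txt" is a hypothetical file name
## used only for illustration, so the block below only runs if such a file actually exists.
if __name__ == "__main__":
    if os.path.exists("sample.txt"):
        loader = DocumentLoader("sample.txt")
        original_chunks, stemmed_chunks = loader.load_text()
        print(len(original_chunks), "chunk(s) loaded from sample.txt")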


# Step 3: Vectorization and similarity search: I am creating one class for the vectorizer



## a class for adding documents and searching over them

class Doc_Vectorizer:

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.vectorized_docs = []  ## the processed (stemmed) chunks that get vectorized
        self.original_docs = []    ## the original chunks, kept for retrieval
        self.vectors = None        ## the TF-IDF matrix built over vectorized_docs


    def add_documents(self, text):
        self.vectorized_docs.extend(text)
        ## re-fit the vectorizer over the full corpus so any new vocabulary is picked up
        self.vectors = self.vectorizer.fit_transform(self.vectorized_docs)
        return self.vectors

    def process_and_add_documents(self, file_path, file_type): ##this is a method for handling the different file types
        file_type = file_type.lower() ##this should help handle various casings of the file type the user passes in
        doc_loader = DocumentLoader(file_path) ##instantiating the DocumentLoader class
        if file_type == "pdf":
            original_data, processed_data = doc_loader.load_pdf()
        elif file_type == "txt":
            original_data, processed_data = doc_loader.load_text()
        elif file_type == "html":
            original_data, processed_data = doc_loader.load_html()
        else:
            raise ValueError("You provided an unsupported file type: use 'pdf', 'txt', or 'html'")
        ## extend (not append) so original_docs stays aligned, chunk by chunk, with vectorized_docs
        self.original_docs.extend(original_data)
        self.vectors = self.add_documents(processed_data)

        return self.vectors 
        
    def find_best_matches(self, query, k=3):
        processed_query = process_text(query)[1] ##stem the query the same way the documents were stemmed
        query_vector = self.vectorizer.transform(processed_query)
        similarity = (query_vector * self.vectors.T).toarray() ##TF-IDF rows are L2-normalized, so this dot product is the cosine similarity
        best_match = similarity.argsort()[0][-k:][::-1] ##sort the scores, take the indices of the top k, and put the best match first
        return [self.original_docs[i] for i in best_match], [self.vectorized_docs[i] for i in best_match]
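
## A minimal end-to-end sketch (runs only when this file is executed directly): we write
## a tiny throwaway text file so the demo is self-contained, ingest it, and run a query.
## The file name "demo_corpus.txt" and its contents are made up purely for illustration.
if __name__ == "__main__":
    demo_path = "demo_corpus.txt"
    with open(demo_path, "w") as f:
        f.write("Cats are running in the garden. Dogs are barking at night. "
                "The stock market fell sharply today. Investors are worried about inflation.")

    store = Doc_Vectorizer()
    store.process_and_add_documents(demo_path, "txt")

    top_originals, top_processed = store.find_best_matches("dogs barking at night", k=1)
    print(top_originals)   ## the original chunk(s) closest to the query
    print(top_processed)   ## the stemmed version of the same chunk(s)

    os.remove(demo_path)   ## clean up the throwaway file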