File size: 1,168 Bytes
08ded12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import download_nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.data.path.append("./nltk_data")
class Preprocessing:
    """Text normalisation helpers for queries and content.

    Both public methods apply the same pipeline: replace every character
    outside ``a-z A-Z 0-9 . , ! ? ' : ;`` with a space, lowercase, split on
    whitespace, drop English stop words, and lemmatize the remaining tokens
    with NLTK's ``WordNetLemmatizer``.

    The methods are static, so they can be called either on the class
    (``Preprocessing.preprocess_query(t)``) or on an instance
    (``Preprocessing().preprocess_query(t)``).
    """

    # Matches every character that is NOT kept by the pipeline. Whitespace is
    # not in the allowed set, so it is turned into spaces here as well and a
    # separate whitespace-collapsing pass is unnecessary (``str.split()``
    # already treats runs of spaces as one separator).
    _DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9.,!?':;]")

    # NLTK resources, created on first use and then shared by all calls so
    # the stop-word corpus is not re-read for every piece of text.
    _lemmatizer = None
    _stop_words = None

    @classmethod
    def _normalize(cls, text):
        """Run the shared cleaning pipeline on *text* and return a string.

        Args:
            text: The string to clean.

        Returns:
            The surviving, lemmatized tokens joined by single spaces; an
            empty string when no tokens remain after character filtering.

        Raises:
            TypeError: If *text* is not a ``str`` (raised by the regex
                substitution).
        """
        tokens = cls._DISALLOWED_CHARS.sub(" ", text).lower().split()
        if not tokens:
            # Nothing to filter or lemmatize; skip loading NLTK resources.
            return ""

        if cls._stop_words is None:
            # Assign the lemmatizer first so that a non-None ``_stop_words``
            # always implies the lemmatizer is ready too.
            cls._lemmatizer = WordNetLemmatizer()
            cls._stop_words = frozenset(stopwords.words("english"))

        lemmatize = cls._lemmatizer.lemmatize
        stop_words = cls._stop_words
        # Stop-word matching is done on the raw token, so a token with
        # punctuation still attached (e.g. "the,") is not filtered out.
        return " ".join(
            lemmatize(token) for token in tokens if token not in stop_words
        )

    @staticmethod
    def preprocess_query(text):
        """Clean a query string; see the class docstring for the pipeline.

        Args:
            text: The raw query string.

        Returns:
            The cleaned query as a single space-separated string.
        """
        return Preprocessing._normalize(text)

    @staticmethod
    def preprocess_content(text):
        """Clean a content string using the same pipeline as queries.

        Args:
            text: The raw content string.

        Returns:
            The cleaned content as a single space-separated string.
        """
        return Preprocessing._normalize(text)