Spaces:
Runtime error
Runtime error
| import download_nltk | |
| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.stem import WordNetLemmatizer | |
| from nltk.stem import PorterStemmer | |
| nltk.data.path.append("./nltk_data") | |
| class Preprocessing: | |
| def preprocess_query(text): | |
| lemmatizer = WordNetLemmatizer() | |
| stop_words = set(stopwords.words('english')) | |
| text = re.sub(r'\s+', ' ', text) | |
| text = re.sub(r"[^a-zA-Z0-9.,!?':;]", ' ', text) | |
| text = text.lower() | |
| tokens = text.split() | |
| filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] | |
| return ' '.join(filtered_tokens) | |
| def preprocess_content(text): | |
| # Similar preprocessing as preprocess_query, but for content | |
| lemmatizer = WordNetLemmatizer() | |
| stop_words = set(stopwords.words('english')) | |
| text = re.sub(r'\s+', ' ', text) | |
| text = re.sub(r"[^a-zA-Z0-9.,!?':;]", ' ', text) | |
| text = text.lower() | |
| tokens = text.split() | |
| filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] | |
| return ' '.join(filtered_tokens) | |