# Text-preprocessing utility (snippet, commit 8a86e97).
import re
import json
import string
import emoji
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Load the chat-word expansion table (e.g. "LOL" -> "laughing out loud")
# and the number-meanings table from JSON files next to this script.
# encoding is pinned to UTF-8 so decoding does not depend on the OS
# locale (the bare open() default), which can corrupt non-ASCII entries.
with open(r'chat_words.json', 'r', encoding='utf-8') as f:
    chat_words = json.load(f)
with open(r'number_meanings.json', 'r', encoding='utf-8') as f:
    number_meanings = json.load(f)

# Single shared lemmatizer instance; WordNet data loads lazily on first use.
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalize a raw text string for downstream NLP.

    Pipeline: lowercase -> strip HTML tags and URLs -> expand chat
    abbreviations (module-level ``chat_words`` table, uppercase keys) ->
    convert emojis to their text names -> strip ASCII punctuation ->
    tokenize with NLTK and lemmatize each token as a verb -> rejoin.

    Args:
        text: Raw input string.

    Returns:
        A single space-separated string of lemmatized tokens.
    """
    text = text.lower()
    # Replace (not delete) tags and URLs with a space so adjacent words
    # are not glued together: "a<br>b" -> "a b", not "ab". Any extra
    # whitespace this leaves is collapsed by the split() below, so the
    # separate up-front space-cleaning pass is no longer needed.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Expand chat abbreviations; unknown words pass through unchanged.
    text = ' '.join(chat_words.get(word.upper(), word) for word in text.split())
    # Turn emojis into their textual names; empty delimiters match the
    # original behavior (names are embedded without surrounding markers).
    text = emoji.demojize(text, delimiters=("", ""))
    # Remove all ASCII punctuation in a single C-level translate pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize, then lemmatize every token with the verb POS tag.
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
    return ' '.join(tokens)