File size: 1,352 Bytes
8a86e97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import re
import json
import string
import emoji
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Load the lookup tables once at import time. The encoding is pinned to UTF-8
# so the JSON files decode identically on every platform instead of depending
# on the locale's default encoding.
with open(r'chat_words.json', 'r', encoding='utf-8') as f:
    # Looked up by upper-cased word in preprocess_text.
    chat_words = json.load(f)

with open(r'number_meanings.json', 'r', encoding='utf-8') as f:
    # NOTE(review): not referenced by the code visible in this part of the
    # file -- confirm whether another consumer still needs it.
    number_meanings = json.load(f)

# Shared lemmatizer instance, reused across preprocess_text calls.
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Return a normalized, lemmatized version of ``text``.

    The steps run in this order: lowercase, collapse whitespace, drop HTML
    tags, drop URLs, expand chat abbreviations found in ``chat_words``,
    convert emojis to text with ``emoji.demojize`` (empty delimiters), delete
    every character in ``string.punctuation``, tokenize with
    ``word_tokenize``, lemmatize each token with ``pos='v'``, and rejoin the
    tokens with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Case-fold and squeeze runs of whitespace down to single spaces.
    cleaned = ' '.join(text.lower().split())

    # Strip markup first, then links.
    cleaned = re.sub(r'<.*?>', '', cleaned)
    cleaned = re.sub(r'https?://\S+|www\.\S+', '', cleaned)

    # Expand chat abbreviations; the table is queried with the upper-cased word.
    expanded = [
        chat_words[piece.upper()] if piece.upper() in chat_words else piece
        for piece in cleaned.split()
    ]
    cleaned = " ".join(expanded)

    # Swap emojis for their text form, with no surrounding delimiters.
    cleaned = emoji.demojize(cleaned, delimiters=("", ""))

    # Delete all ASCII punctuation in a single translate pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(strip_punctuation)

    # Tokenize, lemmatize each token as a verb, and stitch the result together.
    lemmas = (
        lemmatizer.lemmatize(token, pos='v')
        for token in word_tokenize(cleaned)
    )
    return ' '.join(lemmas)