File size: 1,417 Bytes
42da79c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string  # Import the string module

# Shared NLTK resources, created once when the module is imported
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text):
    """Normalize raw text into a space-separated string of cleaned tokens.

    The text is lowercased, runs of whitespace are collapsed to one space,
    adjacent letter/digit runs are split apart, and the result is tokenized
    with ``word_tokenize``. Each token is then handled as follows:

    * purely alphabetic tokens are lemmatized, unless they appear in the
      module-level ``stop_words`` set, in which case they are dropped;
    * purely numeric tokens are kept unchanged;
    * tokens consisting only of ASCII punctuation characters are dropped,
      including multi-character ones such as ``...`` or ``--``;
    * other tokens containing at least one non-alphanumeric character are
      kept unchanged;
    * any remaining token (alphanumeric, but neither purely alphabetic nor
      purely numeric) is dropped.

    Args:
        text: The input string to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # Convert text to lowercase
    text = text.lower()

    # Normalize line breaks and remove unnecessary spaces
    text = re.sub(r'\s+', ' ', text.strip())

    # Split alphanumeric combinations (e.g., "hello1234world" -> "hello 1234 world")
    text = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', text)
    text = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', text)

    # Tokenize the text into words, numbers, and special characters
    tokens = word_tokenize(text)

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test and would miss multi-character punctuation tokens.
    # Checking character by character against a set avoids that.
    punctuation_chars = set(string.punctuation)

    # Process tokens: lemmatize words, keep numbers and special characters
    cleaned_tokens = []
    for token in tokens:
        if token.isalpha():  # Alphabetic words
            if token not in stop_words:
                cleaned_tokens.append(lemmatizer.lemmatize(token))
        elif token.isnumeric():  # Numbers
            cleaned_tokens.append(token)
        elif not token.isalnum() and not all(
            ch in punctuation_chars for ch in token
        ):  # Special characters (excluding punctuation-only tokens)
            cleaned_tokens.append(token)

    # Join the tokens back into a single string
    return ' '.join(cleaned_tokens)