File size: 5,375 Bytes
b7b041e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import re
import string
import pandas as pd
import spacy
import emoji
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex
from pathlib import Path 

from resource_path import resource_path


class MultilingualPreprocessor:
    """
    A robust text preprocessor using spaCy for multilingual support.

    Loads a spaCy pipeline at construction time and exposes
    :meth:`preprocess_series` for vectorized cleaning of a pandas Series.
    """
    def __init__(self, language: str):
        """
        Initializes the preprocessor and loads the appropriate spaCy model.

        Args:
            language (str): 'english' or 'multilingual'. Any other value
                falls back to the multilingual model.

        Raises:
            OSError: If the spaCy model cannot be loaded (not installed /
                not bundled).
        """
        import sys  # local import: only needed to detect a PyInstaller bundle

        model_map = {
            'english': 'en_core_web_sm',
            'multilingual': 'xx_ent_wiki_sm'
        }
        self.model_name = model_map.get(language, 'xx_ent_wiki_sm')

        try:
            # PyInstaller sets sys._MEIPASS when running from a frozen
            # bundle; the model must then be loaded from the bundled path.
            if hasattr(sys, '_MEIPASS'):
                model_path_obj = Path(resource_path(self.model_name))
                self.nlp = spacy.util.load_model_from_path(model_path_obj)
            else:
                # Normal development mode: load by model name.
                self.nlp = spacy.load(self.model_name)

        except OSError:
            print(f"spaCy Model Error: Could not load model '{self.model_name}'")
            print(f"Please run: python -m spacy download {self.model_name}")
            raise

        # Customize the tokenizer so it does not split on intra-word hyphens:
        # the default infix rules are replaced with ellipses, icons and
        # quotes only (CONCAT_QUOTES is a single pattern string, hence the
        # wrapping list).
        infixes = LIST_ELLIPSES + LIST_ICONS + [CONCAT_QUOTES]
        infix_regex = compile_infix_regex(infixes)
        self.nlp.tokenizer.infix_finditer = infix_regex.finditer

    def preprocess_series(self, text_series: pd.Series, options: dict, n_process_spacy: int = -1) -> pd.Series:
        """
        Applies a series of cleaning steps to a pandas Series of text.

        Args:
            text_series (pd.Series): The text to be cleaned.
            options (dict): Preprocessing options. Recognized keys:
                remove_html, remove_urls, handle_hashtags, handle_mentions,
                handle_emojis, remove_punctuation, remove_numbers,
                remove_stopwords, custom_stopwords, lemmatize, lowercase,
                remove_special_chars.
            n_process_spacy (int): Number of processes for spaCy's
                ``nlp.pipe`` (-1 means use all available cores).

        Returns:
            pd.Series: The cleaned text Series, preserving the input index.
        """
        # --- Stage 1: Fast, regex-based cleaning (combined for performance) ---
        processed_text = text_series.copy().astype(str)

        # Combine all regex patterns into a single pass for better performance.
        regex_patterns = []
        if options.get("remove_html"):
            regex_patterns.append(r"<.*?>")
        if options.get("remove_urls"):
            regex_patterns.append(r"http\S+|www\.\S+")
        if options.get("handle_hashtags") == "Remove Hashtags":
            regex_patterns.append(r"#\w+")
        if options.get("handle_mentions") == "Remove Mentions":
            regex_patterns.append(r"@\w+")

        # Apply all regex replacements in a single pass.
        if regex_patterns:
            combined_pattern = "|".join(regex_patterns)
            processed_text = processed_text.str.replace(combined_pattern, "", regex=True)

        # Emoji handling (separate, as it needs the emoji library).
        emoji_option = options.get("handle_emojis", "Keep Emojis")
        if emoji_option == "Remove Emojis":
            processed_text = processed_text.apply(lambda s: emoji.replace_emoji(s, replace=''))
        elif emoji_option == "Convert Emojis to Text":
            processed_text = processed_text.apply(emoji.demojize)

        # --- Stage 2: spaCy-based advanced processing ---
        # nlp.pipe streams documents efficiently over the whole Series.
        docs = self.nlp.pipe(processed_text, n_process=n_process_spacy, batch_size=500)

        # BUG FIX: tokens are matched via token.text.lower(), so the custom
        # stopwords must themselves be lowercased or uppercase entries would
        # never match (the old code built the set without lowercasing).
        custom_stopwords = {w.lower() for w in options.get("custom_stopwords", [])}

        # Hoist option lookups and regex compilation out of the per-token
        # hot loop; behavior is unchanged.
        remove_punct = bool(options.get("remove_punctuation"))
        remove_numbers = bool(options.get("remove_numbers"))
        remove_stopwords = bool(options.get("remove_stopwords"))
        lemmatize = bool(options.get("lemmatize"))
        lowercase = bool(options.get("lowercase"))
        remove_special = bool(options.get("remove_special_chars"))
        special_chars_re = re.compile(r'[^\w\s-]')

        cleaned_docs = []
        for doc in docs:
            tokens = []
            for token in doc:
                # Punctuation and number handling.
                if remove_punct and token.is_punct:
                    continue
                if remove_numbers and (token.is_digit or token.like_num):
                    continue

                # Stopword handling (built-in and custom stopwords).
                if remove_stopwords and (token.is_stop or token.text.lower() in custom_stopwords):
                    continue

                # Use lemma if lemmatization is on, otherwise the raw text.
                token_text = token.lemma_ if lemmatize else token.text

                if lowercase:
                    token_text = token_text.lower()

                # Remove any leftover special characters.
                if remove_special:
                    token_text = special_chars_re.sub('', token_text)

                token_text = token_text.strip()
                if token_text:
                    tokens.append(token_text)

            cleaned_docs.append(" ".join(tokens))

        return pd.Series(cleaned_docs, index=text_series.index)