import re
import sys
import pandas as pd
import spacy
import emoji
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex
from pathlib import Path
from resource_path import resource_path


class MultilingualPreprocessor:
    """
    A robust text preprocessor using spaCy for multilingual support.
    """

    def __init__(self, language: str):
        """
        Initializes the preprocessor and loads the appropriate spaCy model.

        Args:
            language (str): 'english' or 'multilingual'.
        """
        model_map = {
            'english': 'en_core_web_sm',
            'multilingual': 'xx_ent_wiki_sm'
        }
        self.model_name = model_map.get(language, 'xx_ent_wiki_sm')

        try:
            # When running from a PyInstaller bundle, load the model from the
            # bundled path; otherwise load it by name as usual.
            if hasattr(sys, '_MEIPASS'):
                model_path_obj = Path(resource_path(self.model_name))
                self.nlp = spacy.util.load_model_from_path(model_path_obj)
            else:
                self.nlp = spacy.load(self.model_name)
        except OSError:
            print(f"spaCy Model Error: Could not load model '{self.model_name}'")
            print(f"Please run: python -m spacy download {self.model_name}")
            raise

        # Customize the tokenizer so it no longer splits on intra-word hyphens:
        # rebuild the infix rules from ellipses, icons, and quotes only, omitting
        # the default hyphen rule. Note that CONCAT_QUOTES is a single pattern
        # string, so it must be wrapped in a list before concatenation.
        infixes = LIST_ELLIPSES + LIST_ICONS + [CONCAT_QUOTES]
        infix_regex = compile_infix_regex(infixes)
        self.nlp.tokenizer.infix_finditer = infix_regex.finditer

    def preprocess_series(self, text_series: pd.Series, options: dict,
                          n_process_spacy: int = -1) -> pd.Series:
        """
        Applies a series of cleaning steps to a pandas Series of text.

        Args:
            text_series (pd.Series): The text to be cleaned.
            options (dict): A dictionary of preprocessing options.
            n_process_spacy (int): Number of processes for spaCy's nlp.pipe;
                -1 uses all available cores.

        Returns:
            pd.Series: The cleaned text Series.
""" # --- Stage 1: Fast, Regex-based cleaning (combined for performance) --- processed_text = text_series.copy().astype(str) # Combine all regex patterns into a single pass for better performance regex_patterns = [] if options.get("remove_html"): regex_patterns.append(r"<.*?>") if options.get("remove_urls"): regex_patterns.append(r"http\S+|www\.\S+") if options.get("handle_hashtags") == "Remove Hashtags": regex_patterns.append(r"#\w+") if options.get("handle_mentions") == "Remove Mentions": regex_patterns.append(r"@\w+") # Apply all regex replacements in a single pass if regex_patterns: combined_pattern = "|".join(regex_patterns) processed_text = processed_text.str.replace(combined_pattern, "", regex=True) # Emoji handling (separate as it needs special library) emoji_option = options.get("handle_emojis", "Keep Emojis") if emoji_option == "Remove Emojis": processed_text = processed_text.apply(lambda s: emoji.replace_emoji(s, replace='')) elif emoji_option == "Convert Emojis to Text": processed_text = processed_text.apply(emoji.demojize) # --- Stage 2: spaCy-based advanced processing --- # Using nlp.pipe for efficiency on a Series cleaned_docs = [] # docs = self.nlp.pipe(processed_text, n_process=-1, batch_size=500) docs = self.nlp.pipe(processed_text, n_process=n_process_spacy, batch_size=500) # Get custom stopwords and convert to lowercase set for fast lookups custom_stopwords = set(options.get("custom_stopwords", [])) for doc in docs: tokens = [] for token in doc: # Punctuation and Number handling if options.get("remove_punctuation") and token.is_punct: continue if options.get("remove_numbers") and (token.is_digit or token.like_num): continue # Stopword handling (including custom stopwords) is_stopword = token.is_stop or token.text.lower() in custom_stopwords if options.get("remove_stopwords") and is_stopword: continue # Use lemma if lemmatization is on, otherwise use the original text token_text = token.lemma_ if options.get("lemmatize") else token.text # Lowercasing (language-aware) if options.get("lowercase"): token_text = token_text.lower() # Remove any leftover special characters or whitespace if options.get("remove_special_chars"): token_text = re.sub(r'[^\w\s-]', '', token_text) if token_text.strip(): tokens.append(token_text.strip()) cleaned_docs.append(" ".join(tokens)) return pd.Series(cleaned_docs, index=text_series.index)