import re
import string
import sys
from pathlib import Path

import emoji
import pandas as pd
import spacy
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

from resource_path import resource_path
class MultilingualPreprocessor:
    """
    A robust text preprocessor using spaCy for multilingual support.

    Loads an English or multilingual spaCy pipeline (PyInstaller-aware model
    resolution) and exposes :meth:`preprocess_series`, which applies
    regex-based and token-level cleaning to a pandas Series of raw text.
    """

    def __init__(self, language: str):
        """
        Initializes the preprocessor and loads the appropriate spaCy model.

        Args:
            language (str): 'english' or 'multilingual'. Any other value
                falls back to the multilingual model.

        Raises:
            OSError: If the spaCy model is neither installed nor bundled.
        """
        model_map = {
            'english': 'en_core_web_sm',
            'multilingual': 'xx_ent_wiki_sm'
        }
        self.model_name = model_map.get(language, 'xx_ent_wiki_sm')
        try:
            # Check if running from a PyInstaller bundle.
            if hasattr(sys, '_MEIPASS'):
                # PyInstaller mode: load from the bundled data path.
                model_path_obj = Path(resource_path(self.model_name))
                self.nlp = spacy.util.load_model_from_path(model_path_obj)
            else:
                # Normal development mode: load by installed package name.
                self.nlp = spacy.load(self.model_name)
        except OSError:
            print(f"spaCy Model Error: Could not load model '{self.model_name}'")
            print(f"Please run: python -m spacy download {self.model_name}")
            raise

        # Customize the tokenizer so intra-word hyphens are NOT split
        # (e.g. "state-of-the-art" stays one token). This is the recipe from
        # the spaCy docs: rebuild the infix rules with all default patterns
        # EXCEPT the hyphen one. The previous code dropped every default
        # infix rule (which is why ALPHA/ALPHA_LOWER/ALPHA_UPPER were
        # imported but never used).
        infixes = (
            LIST_ELLIPSES
            + LIST_ICONS
            + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
                ),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                # NOTE: the default hyphen infix rule is intentionally
                # omitted here so hyphenated words stay intact.
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ]
        )
        infix_regex = compile_infix_regex(infixes)
        self.nlp.tokenizer.infix_finditer = infix_regex.finditer

    def preprocess_series(self, text_series: pd.Series, options: dict, n_process_spacy: int = -1) -> pd.Series:
        """
        Applies a series of cleaning steps to a pandas Series of text.

        Args:
            text_series (pd.Series): The text to be cleaned.
            options (dict): A dictionary of preprocessing options (keys such
                as "remove_html", "remove_urls", "handle_hashtags",
                "handle_mentions", "handle_emojis", "remove_punctuation",
                "remove_numbers", "remove_stopwords", "custom_stopwords",
                "lemmatize", "lowercase", "remove_special_chars").
            n_process_spacy (int): Number of worker processes for
                ``nlp.pipe`` (-1 uses all available cores).

        Returns:
            pd.Series: The cleaned text Series, preserving the input index.
        """
        # --- Stage 1: Fast, regex-based cleaning (combined for performance) ---
        processed_text = text_series.copy().astype(str)

        # Collect all active regex patterns so the Series is scanned once.
        regex_patterns = []
        if options.get("remove_html"):
            regex_patterns.append(r"<.*?>")
        if options.get("remove_urls"):
            regex_patterns.append(r"http\S+|www\.\S+")
        if options.get("handle_hashtags") == "Remove Hashtags":
            regex_patterns.append(r"#\w+")
        if options.get("handle_mentions") == "Remove Mentions":
            regex_patterns.append(r"@\w+")
        if regex_patterns:
            combined_pattern = "|".join(regex_patterns)
            processed_text = processed_text.str.replace(combined_pattern, "", regex=True)

        # Emoji handling (separate pass; needs the emoji library).
        emoji_option = options.get("handle_emojis", "Keep Emojis")
        if emoji_option == "Remove Emojis":
            processed_text = processed_text.apply(lambda s: emoji.replace_emoji(s, replace=''))
        elif emoji_option == "Convert Emojis to Text":
            processed_text = processed_text.apply(emoji.demojize)

        # --- Stage 2: spaCy-based advanced processing ---
        # nlp.pipe streams the Series efficiently in batches.
        docs = self.nlp.pipe(processed_text, n_process=n_process_spacy, batch_size=500)

        # BUGFIX: lowercase the custom stopwords once so the case-insensitive
        # membership test below actually works — the token side is compared
        # with .lower(), so mixed-case entries were previously never matched.
        custom_stopwords = {w.lower() for w in options.get("custom_stopwords", [])}

        cleaned_docs = []
        for doc in docs:
            tokens = []
            for token in doc:
                # Punctuation and number handling.
                if options.get("remove_punctuation") and token.is_punct:
                    continue
                if options.get("remove_numbers") and (token.is_digit or token.like_num):
                    continue
                # Stopword handling (built-in plus custom, case-insensitive).
                is_stopword = token.is_stop or token.text.lower() in custom_stopwords
                if options.get("remove_stopwords") and is_stopword:
                    continue
                # Use the lemma if lemmatization is on, otherwise the surface form.
                token_text = token.lemma_ if options.get("lemmatize") else token.text
                # Lowercasing (language-aware via str.lower).
                if options.get("lowercase"):
                    token_text = token_text.lower()
                # Strip leftover special characters (keep word chars, spaces, hyphens).
                if options.get("remove_special_chars"):
                    token_text = re.sub(r'[^\w\s-]', '', token_text)
                if token_text.strip():
                    tokens.append(token_text.strip())
            cleaned_docs.append(" ".join(tokens))
        return pd.Series(cleaned_docs, index=text_series.index)