import re
import string
import sys
from pathlib import Path

import emoji
import pandas as pd
import spacy
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

from resource_path import resource_path
class MultilingualPreprocessor:
    """
    A robust text preprocessor using spaCy for multilingual support.

    Loads an English or multilingual spaCy pipeline (PyInstaller-aware model
    resolution) and exposes :meth:`preprocess_series`, which applies
    regex-based and token-level cleaning to a pandas Series of raw text.
    """

    def __init__(self, language: str):
        """
        Initializes the preprocessor and loads the appropriate spaCy model.

        Args:
            language (str): 'english' or 'multilingual'. Any other value
                falls back to the multilingual model.

        Raises:
            OSError: If the spaCy model is neither installed nor bundled.
        """
        model_map = {
            'english': 'en_core_web_sm',
            'multilingual': 'xx_ent_wiki_sm'
        }
        self.model_name = model_map.get(language, 'xx_ent_wiki_sm')
        try:
            # Check if running from a PyInstaller bundle.
            if hasattr(sys, '_MEIPASS'):
                # PyInstaller mode: load from the bundled data path.
                model_path_obj = Path(resource_path(self.model_name))
                self.nlp = spacy.util.load_model_from_path(model_path_obj)
            else:
                # Normal development mode: load by installed package name.
                self.nlp = spacy.load(self.model_name)
        except OSError:
            print(f"spaCy Model Error: Could not load model '{self.model_name}'")
            print(f"Please run: python -m spacy download {self.model_name}")
            raise

        # Customize the tokenizer so intra-word hyphens are NOT split
        # (e.g. "state-of-the-art" stays one token). This is the recipe from
        # the spaCy docs: rebuild the infix rules with all default patterns
        # EXCEPT the hyphen one. The previous code dropped every default
        # infix rule (which is why ALPHA/ALPHA_LOWER/ALPHA_UPPER were
        # imported but never used).
        infixes = (
            LIST_ELLIPSES
            + LIST_ICONS
            + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
                ),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                # NOTE: the default hyphen infix rule is intentionally
                # omitted here so hyphenated words stay intact.
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ]
        )
        infix_regex = compile_infix_regex(infixes)
        self.nlp.tokenizer.infix_finditer = infix_regex.finditer

    def preprocess_series(self, text_series: pd.Series, options: dict, n_process_spacy: int = -1) -> pd.Series:
        """
        Applies a series of cleaning steps to a pandas Series of text.

        Args:
            text_series (pd.Series): The text to be cleaned.
            options (dict): A dictionary of preprocessing options (keys such
                as "remove_html", "remove_urls", "handle_hashtags",
                "handle_mentions", "handle_emojis", "remove_punctuation",
                "remove_numbers", "remove_stopwords", "custom_stopwords",
                "lemmatize", "lowercase", "remove_special_chars").
            n_process_spacy (int): Number of worker processes for
                ``nlp.pipe`` (-1 uses all available cores).

        Returns:
            pd.Series: The cleaned text Series, preserving the input index.
        """
        # --- Stage 1: Fast, regex-based cleaning (combined for performance) ---
        processed_text = text_series.copy().astype(str)

        # Collect all active regex patterns so the Series is scanned once.
        regex_patterns = []
        if options.get("remove_html"):
            regex_patterns.append(r"<.*?>")
        if options.get("remove_urls"):
            regex_patterns.append(r"http\S+|www\.\S+")
        if options.get("handle_hashtags") == "Remove Hashtags":
            regex_patterns.append(r"#\w+")
        if options.get("handle_mentions") == "Remove Mentions":
            regex_patterns.append(r"@\w+")
        if regex_patterns:
            combined_pattern = "|".join(regex_patterns)
            processed_text = processed_text.str.replace(combined_pattern, "", regex=True)

        # Emoji handling (separate pass; needs the emoji library).
        emoji_option = options.get("handle_emojis", "Keep Emojis")
        if emoji_option == "Remove Emojis":
            processed_text = processed_text.apply(lambda s: emoji.replace_emoji(s, replace=''))
        elif emoji_option == "Convert Emojis to Text":
            processed_text = processed_text.apply(emoji.demojize)

        # --- Stage 2: spaCy-based advanced processing ---
        # nlp.pipe streams the Series efficiently in batches.
        docs = self.nlp.pipe(processed_text, n_process=n_process_spacy, batch_size=500)

        # BUGFIX: lowercase the custom stopwords once so the case-insensitive
        # membership test below actually works — the token side is compared
        # with .lower(), so mixed-case entries were previously never matched.
        custom_stopwords = {w.lower() for w in options.get("custom_stopwords", [])}

        cleaned_docs = []
        for doc in docs:
            tokens = []
            for token in doc:
                # Punctuation and number handling.
                if options.get("remove_punctuation") and token.is_punct:
                    continue
                if options.get("remove_numbers") and (token.is_digit or token.like_num):
                    continue
                # Stopword handling (built-in plus custom, case-insensitive).
                is_stopword = token.is_stop or token.text.lower() in custom_stopwords
                if options.get("remove_stopwords") and is_stopword:
                    continue
                # Use the lemma if lemmatization is on, otherwise the surface form.
                token_text = token.lemma_ if options.get("lemmatize") else token.text
                # Lowercasing (language-aware via str.lower).
                if options.get("lowercase"):
                    token_text = token_text.lower()
                # Strip leftover special characters (keep word chars, spaces, hyphens).
                if options.get("remove_special_chars"):
                    token_text = re.sub(r'[^\w\s-]', '', token_text)
                if token_text.strip():
                    tokens.append(token_text.strip())
            cleaned_docs.append(" ".join(tokens))
        return pd.Series(cleaned_docs, index=text_series.index)