import re

import pandas as pd  # retained: pd.isna was the original NaN guard; other modules may rely on it


# Category name -> indicator terms. A match on any term causes the category
# name and the term to be appended (twice each) to boost their weight in
# downstream vectorization. Matching is prefix-at-word-boundary, so 'run'
# matches 'running' but NOT 'brunch', and 'art' matches 'artistic' but NOT
# 'party' (the old substring check fired on all of those).
_DOMAIN_TERMS = {
    'music': ['music', 'guitar', 'band', 'concert', 'gig', 'sing', 'song', 'play music', 'musician'],
    'food': ['food', 'cook', 'cuisine', 'recipe', 'restaurant', 'eat', 'culinary', 'bake', 'chef'],
    'sports': ['sport', 'run', 'gym', 'fitness', 'workout', 'exercise', 'athletic', 'training'],
    'arts': ['art', 'paint', 'draw', 'museum', 'gallery', 'exhibit', 'creative', 'design'],
    'technology': ['tech', 'code', 'program', 'software', 'developer', 'computer', 'app', 'digital'],
    'education': ['education', 'learn', 'course', 'class', 'study', 'book', 'read', 'academic'],
    'travel': ['travel', 'trip', 'hike', 'explore', 'tour', 'visit', 'journey', 'destination'],
}

# Precompiled (term, pattern) pairs per category; built once at import time
# instead of re-scanning raw strings on every call.
_DOMAIN_PATTERNS = {
    category: [(term, re.compile(r'\b' + re.escape(term))) for term in terms]
    for category, terms in _DOMAIN_TERMS.items()
}

# Deliberately small stopword list so that domain-indicator words survive.
_CORE_STOPWORDS = frozenset({
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'the', 'a', 'an',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were',
})

# Hoisted, precompiled regexes (they ran on every call in the original).
_SPECIAL_CHARS_RE = re.compile(r'[^\w\s|-]')   # keep '|' (field separator) and '-'
_WHITESPACE_RE = re.compile(r'\s+')
_PART_SEPARATOR_RE = re.compile(r'\s*\|\s*')
_STANDALONE_NUMBER_RE = re.compile(r'\b\d+\b')  # drops '42' but keeps 'web3'


def preprocess_text(text):
    """Normalize free text and amplify domain-specific indicator terms.

    Lowercases, strips punctuation (preserving ``|`` separators and
    hyphens), collapses whitespace, appends matched domain-category
    markers to raise their vectorization weight, removes standalone
    numbers, and filters a small core-stopword list.

    Args:
        text: The raw text. ``None``, NaN, or any non-string value is
            treated as empty input.

    Returns:
        The preprocessed string, or ``""`` for non-string input.
    """
    # Guard: None / NaN / any other non-string yields "" rather than
    # raising AttributeError on .lower() as the original did for e.g. ints.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _SPECIAL_CHARS_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)

    # Emphasize matched domains: append "<category> <category> <term> <term>"
    # for every indicator term found (prefix match at a word boundary).
    emphasized = [text]
    for category, patterns in _DOMAIN_PATTERNS.items():
        for term, pattern in patterns:
            if pattern.search(text):
                emphasized.append(f"{category} {category} {term} {term}")
    modified_text = ' '.join(emphasized)

    # Per '|'-separated part: drop standalone numbers, then filter the
    # core stopwords; empty parts vanish entirely.
    processed_parts = []
    for part in _PART_SEPARATOR_RE.split(modified_text):
        words = _STANDALONE_NUMBER_RE.sub('', part).split()
        kept = [word for word in words if word not in _CORE_STOPWORDS]
        if kept:
            processed_parts.append(' '.join(kept))

    return ' '.join(processed_parts).strip()