import re
import pandas as pd

def preprocess_text(text):
    """
    Enhanced text preprocessing that better preserves domain-specific indicators
    """
    # Handle potential NaN values
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters while preserving important separators
    text = re.sub(r'[^\w\s|-]', ' ', text)

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Explicitly preserve key domain terms by adding them multiple times
    # This increases their weight in the vectorization
    domain_terms = {
        'music': ['music', 'guitar', 'band', 'concert', 'gig', 'sing', 'song', 'play music', 'musician'],
        'food': ['food', 'cook', 'cuisine', 'recipe', 'restaurant', 'eat', 'culinary', 'bake', 'chef'],
        'sports': ['sport', 'run', 'gym', 'fitness', 'workout', 'exercise', 'athletic', 'training'],
        'arts': ['art', 'paint', 'draw', 'museum', 'gallery', 'exhibit', 'creative', 'design'],
        'technology': ['tech', 'code', 'program', 'software', 'developer', 'computer', 'app', 'digital'],
        'education': ['education', 'learn', 'course', 'class', 'study', 'book', 'read', 'academic'],
        'travel': ['travel', 'trip', 'hike', 'explore', 'tour', 'visit', 'journey', 'destination']
    }

    # Check for domain terms and emphasize them
    modified_text = text
    for category, terms in domain_terms.items():
        for term in terms:
            if term in text:
                # Add the category name explicitly if a related term is found
                modified_text += f" {category} {category} {term} {term}"

    # Split on common separators but preserve the important phrases
    parts = []
    for part in re.split(r'\s*\|\s*', modified_text):
        # Remove standalone numbers (but keep words with numbers like "web3")
        part = re.sub(r'\b\d+\b', '', part)
        parts.append(part)

    # Define a more focused stopwords list (smaller to keep more domain indicators)
    core_stopwords = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'the', 'a', 'an', 'and', 'but',
                      'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
                      'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
                      'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
                      'under', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were'}

    # Process each part and filter stopwords
    processed_parts = []
    for part in parts:
        words = part.split()
        filtered_words = [word for word in words if word not in core_stopwords]
        if filtered_words:
            processed_parts.append(' '.join(filtered_words))

    # Join the processed parts back together
    processed_text = ' '.join(processed_parts)

    return processed_text.strip()
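

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the sample input below is made up,
# and the optional TF-IDF step assumes scikit-learn is installed, which this
# module does not otherwise require.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = "I play guitar in a band | love cooking Thai food | hiking 10 miles"
    cleaned = preprocess_text(sample)
    print(cleaned)
    # Expected shape of the output: lowercased text with stopwords and
    # standalone numbers removed, plus appended emphasis tokens such as
    # "music music guitar guitar" for each matched domain term.

    # Optionally feed the cleaned text into a vectorizer, where the repeated
    # domain tokens raise the weight of the corresponding features.
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer

        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform([cleaned])
        print(matrix.shape)
    except ImportError:
        pass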