Spaces:
Sleeping
Sleeping
File size: 7,894 Bytes
737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f e50b5c0 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 737d99f 5ab54b7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import re
import pandas as pd
# from sentiment_train import predict_sentiment
from sentiment import predict_sentiment_bert_batch
import spacy
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
# Load the small spaCy pipelines for French and English; both must be
# installed beforehand (`python -m spacy download fr_core_news_sm` / `en_core_web_sm`).
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
# Combined English + French stop-word list, used by CountVectorizer below
# (it requires a plain list, hence the list() conversion).
custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))
def lemmatize_text(text, lang):
    """Return *text* lemmatized with the French pipeline when lang == 'fr',
    otherwise the English one. Punctuation tokens are discarded."""
    pipeline = nlp_fr if lang == 'fr' else nlp_en
    lemmas = (token.lemma_ for token in pipeline(text) if not token.is_punct)
    return " ".join(lemmas)
def clean_message(text):
    """Normalize one raw WhatsApp message for NLP processing.

    Lowercases the text, strips WhatsApp export placeholders (media
    notices, deleted-message markers, literal "null" bodies), removes
    URLs, and drops every character that is not a Latin letter
    (including accented À-ÿ), digit, or whitespace.

    Args:
        text: Raw message. Non-string input (e.g. NaN from pandas) yields "".

    Returns:
        str: Cleaned, lowercased message (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # normalize case first so patterns below stay simple
    text = re.sub(r"<media omitted>", "", text)  # WhatsApp media placeholder
    text = re.sub(r"this message was deleted", "", text)
    # Fix: anchor "null" so only messages that ARE the literal export
    # artifact "null" are blanked. The unanchored original deleted the
    # substring everywhere, mangling words like "annulled" -> "aed".
    text = re.sub(r"^null$", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # links
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)  # strip remaining symbols
    return text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
def preprocess_for_clustering(df, n_clusters=5):
    """
    Vectorize lemmatized messages with TF-IDF and cluster them with K-Means.

    Args:
        df (pd.DataFrame): DataFrame containing a 'lemmatized_message' column.
        n_clusters (int): Number of K-Means clusters to create.

    Returns:
        tuple:
            df (pd.DataFrame): Input DataFrame with an added 'cluster'
                column (note: the caller's DataFrame is mutated in place).
            reduced_features (np.ndarray): 2-D t-SNE projection of the
                TF-IDF matrix, one row per message, for plotting.
            cluster_centers (np.ndarray): K-Means centroids in TF-IDF space.
    """
    # Step 1: TF-IDF vectorization (English stop words, capped vocabulary).
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    # Step 2: K-Means clustering on the sparse TF-IDF matrix.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(tfidf_matrix)
    # Step 3: Attach cluster labels.
    df['cluster'] = clusters
    # Step 4: 2-D t-SNE embedding for visualization.
    # Fix: t-SNE requires perplexity < n_samples; the default (30) raises
    # on small chats, so clamp it. For >=31 messages this equals the old
    # default and behavior is unchanged.
    n_samples = tfidf_matrix.shape[0]
    perplexity = min(30, max(1, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())
    return df, reduced_features, kmeans.cluster_centers_
def parse_data(data):
    """
    Parse a raw WhatsApp chat export into a cleaned DataFrame.

    Lines are matched against the "M/D/YY, H:MM AM/PM - Sender: message"
    export format; non-matching lines, 'system' senders, group
    notifications and rows with unparseable dates are dropped, messages
    are cleaned, and calendar/clock features are derived.

    Args:
        data (str): Entire chat export as one newline-separated string.

    Returns:
        pd.DataFrame: One row per user message with columns 'user',
        'message' (cleaned), 'unfiltered_messages' (raw), 'message_date',
        'date', 'year', 'month', 'day', 'hour', 'minute', 'day_of_week'
        and 'period' (hour bucket such as "13-14").
    """
    # Vectorized parsing: put every raw line in one column, then run a
    # single regex extract instead of a Python-level loop.
    lines = data.strip().split("\n")
    df = pd.DataFrame({'line': lines})
    # Named groups: Date, Time (optionally followed by AM/PM), optional
    # "Sender:" prefix, and Message. Lines without a sender are
    # group/system notifications.
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    extracted = df['line'].str.extract(pattern)
    # Drop lines that did not match the export format (e.g. continuation
    # lines of multi-line messages).
    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])
    # NOTE(review): the first replace argument appears to be the narrow
    # no-break space (U+202F) WhatsApp inserts before AM/PM, normalized
    # here to a plain ASCII space so strptime can parse it — confirm the
    # character survived copy/paste.
    extracted['Time'] = extracted['Time'].str.replace(' ', ' ', regex=False)
    # Combine date and time into the single string strptime expects.
    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']
    # Rows without a "Sender:" prefix are group notifications.
    extracted['Sender'] = extracted['Sender'].fillna('group_notification')
    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})
    # Filter out messages attributed to a literal 'system' user.
    df = df[df['user'].str.lower() != 'system']
    # Hard-coded US-style 12-hour format; rows in any other locale fail
    # to parse (errors='coerce' -> NaT) and are dropped just below.
    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df = df.dropna(subset=['date'])
    # Drop group notifications; only real user messages remain.
    df = df[df["user"] != "group_notification"]
    df.reset_index(drop=True, inplace=True)
    # Keep the raw text before cleaning, for display purposes.
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)
    # Calendar/clock features for activity analysis.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.day_name()
    df['minute'] = df['date'].dt.minute
    # Hour buckets for the activity heatmap, e.g. "13-14". Midnight edges
    # are rendered as "23-00" and "00-1" (asymmetric, but preserved).
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + "-" + str('00'))
        elif hour == 0:
            period.append(str('00') + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))
    df['period'] = period
    return df
def analyze_sentiment_and_topics(df):
    """
    Run the heavy NLP pipeline on (a sample of) the parsed chat:
    lemmatization, BERT sentiment analysis, and LDA topic modeling.

    Args:
        df (pd.DataFrame): Parsed chat with a cleaned 'message' column.

    Returns:
        tuple:
            df_sample (pd.DataFrame): Analyzed rows (at most 5000) with
                added 'lemmatized_message', 'sentiment' and 'topic'
                columns. May be a sample rather than the full input.
            topics (list[list[str]]): Top-10 words per LDA topic
                (ascending weight, strongest last), or [] when the
                vocabulary is empty after stop-word filtering.
    """
    # Cap deep analysis at 5000 messages; a random sample is
    # representative enough for the aggregate charts downstream.
    if len(df) > 5000:
        print(f"Sampling 5000 messages from {len(df)}...")
        df_sample = df.sample(5000, random_state=42).copy()
    else:
        df_sample = df.copy()
    # Detect the chat's dominant language once from a 500-message sample
    # instead of per message (langdetect is slow and noisy on short texts).
    sample_size = min(len(df_sample), 500)
    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
    try:
        dominant_lang = detect(sample_text)
    except LangDetectException:
        dominant_lang = 'en'  # fall back to English when detection fails
    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en
    # Batch lemmatization via nlp.pipe; NER and the parser are disabled
    # since only lemmas are needed. (Fix: the original initialized
    # lemmatized_messages twice and kept an unused original_df_len.)
    lemmatized_messages = [
        " ".join(token.lemma_ for token in doc if not token.is_punct)
        for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"])
    ]
    df_sample["lemmatized_message"] = lemmatized_messages
    # Batched BERT sentiment scoring for speed.
    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)
    df_sample = df_sample.dropna(subset=['lemmatized_message'])
    # Bilingual (English + French) stop-word list for the term matrix.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    try:
        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
    except ValueError:
        # The vocabulary can end up empty (e.g. everything was a stop word).
        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
        return df_sample, []
    # LDA topic modeling; assign each message its most probable topic.
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)
    topic_results = lda.transform(dtm)
    # dtm has one row per df_sample row, so this slice is normally a
    # no-op; kept as a defensive guard against length mismatches.
    df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
    df_sample['topic'] = topic_results.argmax(axis=1)
    # Top-10 words per topic. Hoisted get_feature_names_out() out of the
    # loop — the original recomputed it once per topic.
    feature_names = vectorizer.get_feature_names_out()
    topics = [[feature_names[i] for i in topic.argsort()[-10:]] for topic in lda.components_]
    return df_sample, topics
|