File size: 7,894 Bytes
737d99f
 
5ab54b7
 
737d99f
5ab54b7
 
737d99f
 
 
5ab54b7
737d99f
 
 
e50b5c0
5ab54b7
 
737d99f
 
5ab54b7
 
737d99f
 
 
 
 
 
 
 
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
737d99f
5ab54b7
737d99f
 
 
5ab54b7
737d99f
5ab54b7
 
737d99f
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
737d99f
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
 
 
 
 
 
737d99f
 
 
 
 
 
 
5ab54b7
 
737d99f
 
 
 
 
 
5ab54b7
 
 
 
737d99f
5ab54b7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import re
import pandas as pd
# from sentiment_train import predict_sentiment
from sentiment import predict_sentiment_bert_batch
import spacy
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np

# Load language models
# Module-level spaCy pipelines shared by lemmatize_text() and
# analyze_sentiment_and_topics(); loading them once here avoids
# re-initializing the models on every call.
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

# Merge English and French stop words
# Combined EN+FR stop-word list used by the CountVectorizer in
# analyze_sentiment_and_topics(); converted to list because sklearn
# vectorizers expect a list, not a frozenset.
custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))

def lemmatize_text(text, lang):
    """Lemmatize *text* with the French pipeline when lang == 'fr',
    otherwise the English one; punctuation tokens are dropped.

    Returns the space-joined lemmas as a single string.
    """
    pipeline = nlp_fr if lang == 'fr' else nlp_en
    lemmas = [tok.lemma_ for tok in pipeline(text) if not tok.is_punct]
    return " ".join(lemmas)

def clean_message(text):
    """Normalize a raw chat message for downstream NLP.

    Lowercases the text, strips WhatsApp export artifacts
    ("<media omitted>", "this message was deleted", the literal word
    "null"), removes URLs, and drops every character except letters
    (including Latin-1 accented ones), digits, and whitespace.

    Args:
        text: Raw message; any non-string input yields "".

    Returns:
        str: The cleaned, lowercased message (may be empty).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<media omitted>", "", text)  # Remove media notifications
    text = re.sub(r"this message was deleted", "", text)
    # Fix: match "null" only as a whole word; the previous bare pattern
    # mangled ordinary words containing the substring ("annulled" -> "aned").
    text = re.sub(r"\bnull\b", "", text)

    # Remove links; "http\S+" already covers "https\S+", so the redundant
    # alternative was dropped.
    text = re.sub(r"http\S+|www\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)  # Remove special characters
    return text

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np

def preprocess_for_clustering(df, n_clusters=5):
    """
    Vectorize lemmatized messages, cluster them, and embed them in 2-D.

    Args:
        df (pd.DataFrame): Must contain a 'lemmatized_message' column.
            NOTE: mutated in place -- a 'cluster' column is added.
        n_clusters (int): Number of K-Means clusters to create.

    Returns:
        tuple:
            df (pd.DataFrame): Input frame with the added 'cluster' column.
            reduced_features (np.ndarray): (n_messages, 2) t-SNE embedding
                for visualization.
            cluster_centers (np.ndarray): K-Means centroids in TF-IDF space.
    """
    # Step 1: Vectorize text using TF-IDF (top 5000 terms).
    # NOTE(review): only English stop words are removed here, while the
    # rest of the file uses a merged EN+FR list (custom_stop_words) --
    # confirm whether French stop words should be filtered here too.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])

    # Step 2: Apply K-Means clustering (fixed seed for reproducibility).
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(tfidf_matrix)

    # Step 3: Add cluster labels to DataFrame (in-place mutation).
    df['cluster'] = clusters

    # Step 4: Reduce dimensionality for visualization.
    # Fix: t-SNE requires perplexity < n_samples; the default (30) raises
    # on fewer than ~31 messages, so cap it for small inputs. Behavior is
    # unchanged for larger datasets.
    n_samples = tfidf_matrix.shape[0]
    perplexity = min(30.0, max(1.0, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())

    return df, reduced_features, kmeans.cluster_centers_

def parse_data(data):
    """
    Parse raw WhatsApp-style chat export text into a cleaned DataFrame.

    Args:
        data (str): Full chat export, one "date, time - sender: message"
            entry per line (continuation lines of multi-line messages do
            not match the pattern and are dropped).

    Returns:
        pd.DataFrame: One row per user message, with columns: date, user,
        message (cleaned via clean_message), unfiltered_messages (original
        text), and time features (year, month, day, hour, minute,
        day_of_week, period).
    """
    # Vectorized parsing: one regex extract over all lines at once instead
    # of a Python loop.
    lines = data.strip().split("\n")
    df = pd.DataFrame({'line': lines})

    # "Sender" is optional so system/group notifications (no "name:") still
    # match and can be filtered below.
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"

    extracted = df['line'].str.extract(pattern)

    # Drop lines that didn't match (e.g. message continuation lines).
    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])

    # Fix: newer WhatsApp exports separate the time from AM/PM with a
    # narrow no-break space (U+202F) or NBSP (U+00A0). Normalize both to a
    # plain space so the strptime format below can parse the timestamp.
    # (The previous code replaced a space with an identical space -- a
    # no-op, presumably a mangled version of this normalization.)
    extracted['Time'] = (extracted['Time']
                         .str.replace('\u202f', ' ', regex=False)
                         .str.replace('\u00a0', ' ', regex=False))
    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']

    # Lines without a "Sender:" prefix are group notifications.
    extracted['Sender'] = extracted['Sender'].fillna('group_notification')

    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})

    # Filter out explicit system messages.
    df = df[df['user'].str.lower() != 'system']

    # NOTE(review): the format is fixed to US-style 12-hour timestamps
    # (m/d/yy, hh:mm AM/PM); rows in any other export format are coerced
    # to NaT and silently dropped just below -- confirm this matches the
    # expected export locale.
    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df = df.dropna(subset=['date'])

    # Keep only real user messages.
    df = df[df["user"] != "group_notification"]
    df.reset_index(drop=True, inplace=True)

    # Preserve the raw text before cleaning overwrites 'message'.
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)

    # Time-based features for the dashboard.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.day_name()
    df['minute'] = df['date'].dt.minute

    def _period_label(hour):
        """Hourly activity bucket label, e.g. '13-14'; midnight wraps to '23-00' / '00-1'."""
        if hour == 23:
            return "23-00"
        if hour == 0:
            return "00-1"
        return f"{hour}-{hour + 1}"

    df['period'] = df['hour'].apply(_period_label)

    return df

def analyze_sentiment_and_topics(df):
    """
    Performs heavy NLP tasks: Lemmatization, Sentiment Analysis, and Topic Modeling.
    Includes sampling for large datasets.

    Args:
        df (pd.DataFrame): Parsed chat frame; must contain a 'message' column
            of cleaned text strings.

    Returns:
        tuple:
            df_sample (pd.DataFrame): The (possibly sampled) frame with added
                'lemmatized_message', 'sentiment', and 'topic' columns
                ('topic' is absent when vectorization fails -- see below).
            topics (list[list[str]]): For each LDA topic, its top-10 terms
                (ascending by weight); empty list on empty vocabulary.
    """
    # Sampling Logic: Cap at 5000 messages for deep analysis
    original_df_len = len(df)
    if len(df) > 5000:
        print(f"Sampling 5000 messages from {len(df)}...")
        # We keep the original index to potentially map back, but for now we just work on the sample
        df_sample = df.sample(5000, random_state=42).copy()
    else:
        df_sample = df.copy()

    # Filter and lemmatize messages
    lemmatized_messages = []
    # Optimization: Detect dominant language on a sample
    # One langdetect call on a concatenated sample instead of per message;
    # the whole chat is then lemmatized with a single pipeline.
    sample_size = min(len(df_sample), 500)
    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
    try:
        dominant_lang = detect(sample_text)
    except LangDetectException:
        # Detection fails on empty/ambiguous text; default to English.
        dominant_lang = 'en'
    
    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en
    
    # Use nlp.pipe for batch processing
    # NER and the dependency parser are disabled: only lemmas are needed.
    lemmatized_messages = []
    for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
        lemmatized_messages.append(" ".join([token.lemma_ for token in doc if not token.is_punct]))

    df_sample["lemmatized_message"] = lemmatized_messages

    # Apply sentiment analysis
    # Use batch processing for speed
    # Sentiment is computed on the cleaned 'message' text, not the lemmas.
    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)

    # Filter out rows with null lemmatized_message
    # NOTE(review): 'lemmatized_message' is always a string (possibly empty)
    # as built above, so this dropna looks like a no-op -- confirm intent.
    df_sample = df_sample.dropna(subset=['lemmatized_message'])

    # **Fix: Use a custom stop word list**
    # Bilingual (EN+FR) stop words; max_df/min_df prune near-universal and
    # near-unique terms before topic modeling.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    try:
        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
    except ValueError:
        # Handle case where vocabulary is empty (e.g. all stop words)
        # Returned frame has sentiment but no 'topic' column in this path.
        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
        return df_sample, []

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)

    # Assign topics to messages
    topic_results = lda.transform(dtm)
    # NOTE(review): transform() returns one row per dtm row, which matches
    # len(df_sample), so this slice appears to be a defensive no-op.
    df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
    df_sample['topic'] = topic_results.argmax(axis=1)

    # Store topics for visualization
    # Top-10 terms per topic (argsort ascending: last entry is the heaviest).
    topics = []
    for topic in lda.components_:
        topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    
    # If we sampled, we return the sampled dataframe with sentiment/topics.
    # The main app will need to handle that 'df' (full) and 'df_analyzed' (sample) might be different.
    # Or we can try to merge back? Merging back 5000 sentiments to 40000 messages leaves 35000 nulls.
    # For visualization purposes (pie charts, etc), using the sample is usually fine as it's representative.
    
    return df_sample, topics