# WhatsApp chat analysis module: parsing, cleaning, sentiment analysis,
# topic modeling (LDA), and message clustering (TF-IDF + K-Means + t-SNE).
| import re | |
| import pandas as pd | |
| # from sentiment_train import predict_sentiment | |
| from sentiment import predict_sentiment_bert_batch | |
| import spacy | |
| from langdetect import detect, LangDetectException | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.decomposition import LatentDirichletAllocation | |
| from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS | |
| from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.cluster import KMeans | |
| from sklearn.manifold import TSNE | |
| import numpy as np | |
# Load language models
# spaCy pipelines for the two languages this module handles (FR / EN).
# Loaded once at import time so model-availability errors surface immediately.
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
# Merge English and French stop words
# Materialized as a list for use with the scikit-learn vectorizers below.
custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))
def lemmatize_text(text, lang):
    """Lemmatize *text* with the spaCy pipeline for *lang*.

    'fr' selects the French pipeline; any other value falls back to English.
    Punctuation tokens are dropped; remaining lemmas are space-joined.
    """
    pipeline = nlp_fr if lang == 'fr' else nlp_en
    lemmas = (token.lemma_ for token in pipeline(text) if not token.is_punct)
    return " ".join(lemmas)
def clean_message(text):
    """Normalize a raw WhatsApp message for NLP.

    Lowercases, strips WhatsApp artifacts (media placeholder, deleted-message
    notice, literal "null" payloads), removes URLs, then drops characters that
    are not Latin letters (including accented À-ÿ), digits, or whitespace.

    Args:
        text: Raw message text; non-strings (e.g. NaN from pandas) yield "".

    Returns:
        str: The cleaned, lowercased message.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<media omitted>", "", text)  # Remove media notifications
    text = re.sub(r"this message was deleted", "", text)
    # Fix: anchor with word boundaries so words that merely *contain* "null"
    # (e.g. "annulled" -> "aned" under the old substring removal) are kept.
    text = re.sub(r"\bnull\b", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove links
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)  # Remove special characters
    return text
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.cluster import KMeans | |
| from sklearn.manifold import TSNE | |
| import numpy as np | |
def preprocess_for_clustering(df, n_clusters=5):
    """Cluster lemmatized messages with TF-IDF + K-Means and embed them in 2-D.

    Args:
        df (pd.DataFrame): DataFrame containing the 'lemmatized_message' column.
        n_clusters (int): Number of clusters to create.

    Returns:
        tuple:
            df (pd.DataFrame): Input frame with an added 'cluster' column.
            reduced_features (np.ndarray): (n_messages, 2) t-SNE embedding
                for visualization.
            cluster_centers (np.ndarray): K-Means centroids in TF-IDF space.
    """
    # Step 1: Vectorize text using TF-IDF.
    # Consistency fix: use the module's merged EN+FR stop-word list instead of
    # 'english' only, matching the bilingual handling used elsewhere.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words=custom_stop_words)
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])

    # Step 2: Apply K-Means clustering and label each message (Step 3).
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(tfidf_matrix)

    # Step 4: Reduce dimensionality for visualization.
    # Robustness fix: t-SNE requires perplexity < n_samples, so clamp it for
    # small chats (default perplexity is 30, preserved for larger inputs).
    n_samples = tfidf_matrix.shape[0]
    perplexity = min(30, max(1, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())

    return df, reduced_features, kmeans.cluster_centers_
def _hour_period(hour):
    """Return the 'start-end' hour bucket label used for activity heatmaps."""
    if hour == 23:
        return "23-00"
    if hour == 0:
        return "00-1"
    return str(hour) + "-" + str(hour + 1)


def parse_data(data):
    """Parse a raw exported WhatsApp chat into a cleaned DataFrame.

    Each matching line yields 'user' and 'message' columns plus derived time
    features (year, month, day, hour, minute, day_of_week, period). System
    messages, group notifications, and lines with unparseable dates are
    dropped; the raw text is preserved in 'unfiltered_messages'.

    Args:
        data (str): Full chat export, one message per line.

    Returns:
        pd.DataFrame: One row per user message, index reset.
    """
    # Vectorized parsing: one regex extract over all lines at once.
    lines = data.strip().split("\n")
    df = pd.DataFrame({'line': lines})

    # Extract Date, Time, Sender, Message using regex.
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    extracted = df['line'].str.extract(pattern)
    # Drop lines that didn't match (continuation lines, headers, etc.).
    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])

    # NOTE(review): the first replace argument appears to be the narrow
    # no-break space WhatsApp inserts before AM/PM — confirm it is U+202F.
    extracted['Time'] = extracted['Time'].str.replace(' ', ' ', regex=False)
    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']

    # Lines with no "Sender:" prefix are group/system notifications.
    extracted['Sender'] = extracted['Sender'].fillna('group_notification')
    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})
    df = df[df['user'].str.lower() != 'system']

    # Parse dates: try the common US 12-hour export format first, then fall
    # back to pandas' generic parser so 24-hour / 4-digit-year exports are
    # no longer silently dropped.
    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    unparsed = df['date'].isna()
    if unparsed.any():
        df.loc[unparsed, 'date'] = pd.to_datetime(df.loc[unparsed, 'message_date'], errors='coerce')
    df = df.dropna(subset=['date'])

    # Filter out group notifications.
    df = df[df["user"] != "group_notification"]
    df.reset_index(drop=True, inplace=True)

    # Keep the raw text alongside the cleaned version.
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)

    # Time-based features.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.day_name()
    df['minute'] = df['date'].dt.minute
    # Idiom fix: comprehension + helper replaces the manual for/append loop.
    df['period'] = [_hour_period(h) for h in df['hour']]
    return df
def analyze_sentiment_and_topics(df):
    """Run the heavy NLP pipeline: lemmatization, sentiment, and LDA topics.

    Large chats are down-sampled to 5000 messages for speed; the returned
    frame is the analyzed (possibly sampled) subset, which is representative
    enough for the aggregate visualizations downstream.

    Args:
        df (pd.DataFrame): Parsed chat with a cleaned 'message' column.

    Returns:
        tuple:
            df_sample (pd.DataFrame): Analyzed rows with added
                'lemmatized_message', 'sentiment' and (when LDA succeeds)
                'topic' columns.
            topics (list[list[str]]): Top-10 words per LDA topic, or [] when
                the vocabulary is empty after stop-word filtering.
    """
    # Sampling Logic: cap at 5000 messages for deep analysis.
    if len(df) > 5000:
        print(f"Sampling 5000 messages from {len(df)}...")
        df_sample = df.sample(5000, random_state=42).copy()
    else:
        df_sample = df.copy()

    # Optimization: detect the dominant language once on a sample instead of
    # per message; default to English when detection fails.
    sample_size = min(len(df_sample), 500)
    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
    try:
        dominant_lang = detect(sample_text)
    except LangDetectException:
        dominant_lang = 'en'
    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en

    # Batch lemmatization via nlp.pipe; NER/parser disabled since only lemmas
    # are needed. (Fix: removed a duplicate list initialization.)
    lemmatized_messages = []
    for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
        lemmatized_messages.append(" ".join(token.lemma_ for token in doc if not token.is_punct))
    df_sample["lemmatized_message"] = lemmatized_messages

    # Apply sentiment analysis in batches for speed.
    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)

    # Filter out rows with null lemmatized_message.
    df_sample = df_sample.dropna(subset=['lemmatized_message'])

    # Bilingual (EN + FR) stop-word list.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    try:
        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
    except ValueError:
        # Vocabulary can be empty when every token is a stop word.
        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
        return df_sample, []

    # Apply LDA and assign the dominant topic per message. transform() yields
    # one row per dtm row, which matches df_sample exactly, so the old
    # iloc[:topic_results.shape[0]] slice was a no-op and has been removed.
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)
    topic_results = lda.transform(dtm)
    df_sample['topic'] = topic_results.argmax(axis=1)

    # Top-10 words per topic for visualization.
    # Fix: hoist the vocabulary lookup out of the loop (was re-fetched per topic).
    feature_names = vectorizer.get_feature_names_out()
    topics = [[feature_names[i] for i in topic.argsort()[-10:]] for topic in lda.components_]

    # Note: when sampling occurred, callers receive the analyzed sample, not
    # the full frame; for pie charts etc. the sample is representative.
    return df_sample, topics