File size: 7,894 Bytes
737d99f
 
5ab54b7
 
737d99f
5ab54b7
 
737d99f
 
 
5ab54b7
737d99f
 
 
e50b5c0
5ab54b7
 
737d99f
 
5ab54b7
 
737d99f
 
 
 
 
 
 
 
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
737d99f
5ab54b7
737d99f
 
 
5ab54b7
737d99f
5ab54b7
 
737d99f
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
737d99f
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
737d99f
5ab54b7
 
 
 
 
 
737d99f
 
 
 
 
 
 
5ab54b7
 
737d99f
 
 
 
 
 
5ab54b7
 
 
 
737d99f
5ab54b7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import re
import pandas as pd
# from sentiment_train import predict_sentiment
from sentiment import predict_sentiment_bert_batch
import spacy
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np

# Load language models
# Module-level spaCy pipelines shared by lemmatize_text() and
# analyze_sentiment_and_topics(); loading them once here avoids
# re-initializing the models on every call.
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

# Merge English and French stop words
# Combined EN+FR stop-word list used by the CountVectorizer in
# analyze_sentiment_and_topics(); converted to list because sklearn
# vectorizers expect a list, not a frozenset.
custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))

def lemmatize_text(text, lang):
    """Lemmatize *text* with the French pipeline when lang == 'fr',
    otherwise the English one; punctuation tokens are dropped.

    Returns the space-joined lemmas as a single string.
    """
    pipeline = nlp_fr if lang == 'fr' else nlp_en
    lemmas = [tok.lemma_ for tok in pipeline(text) if not tok.is_punct]
    return " ".join(lemmas)

def clean_message(text):
    """Normalize a raw chat message for downstream NLP.

    Lowercases the text, strips WhatsApp export artifacts
    ("<media omitted>", "this message was deleted", the literal word
    "null"), removes URLs, and drops every character except letters
    (including Latin-1 accented ones), digits, and whitespace.

    Args:
        text: Raw message; any non-string input yields "".

    Returns:
        str: The cleaned, lowercased message (may be empty).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<media omitted>", "", text)  # Remove media notifications
    text = re.sub(r"this message was deleted", "", text)
    # Fix: match "null" only as a whole word; the previous bare pattern
    # mangled ordinary words containing the substring ("annulled" -> "aned").
    text = re.sub(r"\bnull\b", "", text)

    # Remove links; "http\S+" already covers "https\S+", so the redundant
    # alternative was dropped.
    text = re.sub(r"http\S+|www\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)  # Remove special characters
    return text

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np

def preprocess_for_clustering(df, n_clusters=5):
    """
    Vectorize lemmatized messages, cluster them, and embed them in 2-D.

    Args:
        df (pd.DataFrame): Must contain a 'lemmatized_message' column.
            NOTE: mutated in place -- a 'cluster' column is added.
        n_clusters (int): Number of K-Means clusters to create.

    Returns:
        tuple:
            df (pd.DataFrame): Input frame with the added 'cluster' column.
            reduced_features (np.ndarray): (n_messages, 2) t-SNE embedding
                for visualization.
            cluster_centers (np.ndarray): K-Means centroids in TF-IDF space.
    """
    # Step 1: Vectorize text using TF-IDF (top 5000 terms).
    # NOTE(review): only English stop words are removed here, while the
    # rest of the file uses a merged EN+FR list (custom_stop_words) --
    # confirm whether French stop words should be filtered here too.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])

    # Step 2: Apply K-Means clustering (fixed seed for reproducibility).
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(tfidf_matrix)

    # Step 3: Add cluster labels to DataFrame (in-place mutation).
    df['cluster'] = clusters

    # Step 4: Reduce dimensionality for visualization.
    # Fix: t-SNE requires perplexity < n_samples; the default (30) raises
    # on fewer than ~31 messages, so cap it for small inputs. Behavior is
    # unchanged for larger datasets.
    n_samples = tfidf_matrix.shape[0]
    perplexity = min(30.0, max(1.0, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())

    return df, reduced_features, kmeans.cluster_centers_

def parse_data(data):
    """
    Parse raw WhatsApp-style chat export text into a cleaned DataFrame.

    Args:
        data (str): Full chat export, one "date, time - sender: message"
            entry per line (continuation lines of multi-line messages do
            not match the pattern and are dropped).

    Returns:
        pd.DataFrame: One row per user message, with columns: date, user,
        message (cleaned via clean_message), unfiltered_messages (original
        text), and time features (year, month, day, hour, minute,
        day_of_week, period).
    """
    # Vectorized parsing: one regex extract over all lines at once instead
    # of a Python loop.
    lines = data.strip().split("\n")
    df = pd.DataFrame({'line': lines})

    # "Sender" is optional so system/group notifications (no "name:") still
    # match and can be filtered below.
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"

    extracted = df['line'].str.extract(pattern)

    # Drop lines that didn't match (e.g. message continuation lines).
    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])

    # Fix: newer WhatsApp exports separate the time from AM/PM with a
    # narrow no-break space (U+202F) or NBSP (U+00A0). Normalize both to a
    # plain space so the strptime format below can parse the timestamp.
    # (The previous code replaced a space with an identical space -- a
    # no-op, presumably a mangled version of this normalization.)
    extracted['Time'] = (extracted['Time']
                         .str.replace('\u202f', ' ', regex=False)
                         .str.replace('\u00a0', ' ', regex=False))
    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']

    # Lines without a "Sender:" prefix are group notifications.
    extracted['Sender'] = extracted['Sender'].fillna('group_notification')

    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})

    # Filter out explicit system messages.
    df = df[df['user'].str.lower() != 'system']

    # NOTE(review): the format is fixed to US-style 12-hour timestamps
    # (m/d/yy, hh:mm AM/PM); rows in any other export format are coerced
    # to NaT and silently dropped just below -- confirm this matches the
    # expected export locale.
    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df = df.dropna(subset=['date'])

    # Keep only real user messages.
    df = df[df["user"] != "group_notification"]
    df.reset_index(drop=True, inplace=True)

    # Preserve the raw text before cleaning overwrites 'message'.
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)

    # Time-based features for the dashboard.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.day_name()
    df['minute'] = df['date'].dt.minute

    def _period_label(hour):
        """Hourly activity bucket label, e.g. '13-14'; midnight wraps to '23-00' / '00-1'."""
        if hour == 23:
            return "23-00"
        if hour == 0:
            return "00-1"
        return f"{hour}-{hour + 1}"

    df['period'] = df['hour'].apply(_period_label)

    return df

def analyze_sentiment_and_topics(df):
    """
    Performs heavy NLP tasks: Lemmatization, Sentiment Analysis, and Topic Modeling.
    Includes sampling for large datasets.

    Args:
        df (pd.DataFrame): Parsed chat frame; must contain a 'message' column
            of cleaned text strings.

    Returns:
        tuple:
            df_sample (pd.DataFrame): The (possibly sampled) frame with added
                'lemmatized_message', 'sentiment', and 'topic' columns
                ('topic' is absent when vectorization fails -- see below).
            topics (list[list[str]]): For each LDA topic, its top-10 terms
                (ascending by weight); empty list on empty vocabulary.
    """
    # Sampling Logic: Cap at 5000 messages for deep analysis
    original_df_len = len(df)
    if len(df) > 5000:
        print(f"Sampling 5000 messages from {len(df)}...")
        # We keep the original index to potentially map back, but for now we just work on the sample
        df_sample = df.sample(5000, random_state=42).copy()
    else:
        df_sample = df.copy()

    # Filter and lemmatize messages
    lemmatized_messages = []
    # Optimization: Detect dominant language on a sample
    # One langdetect call on a concatenated sample instead of per message;
    # the whole chat is then lemmatized with a single pipeline.
    sample_size = min(len(df_sample), 500)
    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
    try:
        dominant_lang = detect(sample_text)
    except LangDetectException:
        # Detection fails on empty/ambiguous text; default to English.
        dominant_lang = 'en'
    
    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en
    
    # Use nlp.pipe for batch processing
    # NER and the dependency parser are disabled: only lemmas are needed.
    lemmatized_messages = []
    for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
        lemmatized_messages.append(" ".join([token.lemma_ for token in doc if not token.is_punct]))

    df_sample["lemmatized_message"] = lemmatized_messages

    # Apply sentiment analysis
    # Use batch processing for speed
    # Sentiment is computed on the cleaned 'message' text, not the lemmas.
    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)

    # Filter out rows with null lemmatized_message
    # NOTE(review): 'lemmatized_message' is always a string (possibly empty)
    # as built above, so this dropna looks like a no-op -- confirm intent.
    df_sample = df_sample.dropna(subset=['lemmatized_message'])

    # **Fix: Use a custom stop word list**
    # Bilingual (EN+FR) stop words; max_df/min_df prune near-universal and
    # near-unique terms before topic modeling.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    try:
        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
    except ValueError:
        # Handle case where vocabulary is empty (e.g. all stop words)
        # Returned frame has sentiment but no 'topic' column in this path.
        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
        return df_sample, []

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)

    # Assign topics to messages
    topic_results = lda.transform(dtm)
    # NOTE(review): transform() returns one row per dtm row, which matches
    # len(df_sample), so this slice appears to be a defensive no-op.
    df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
    df_sample['topic'] = topic_results.argmax(axis=1)

    # Store topics for visualization
    # Top-10 terms per topic (argsort ascending: last entry is the heaviest).
    topics = []
    for topic in lda.components_:
        topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    
    # If we sampled, we return the sampled dataframe with sentiment/topics.
    # The main app will need to handle that 'df' (full) and 'df_analyzed' (sample) might be different.
    # Or we can try to merge back? Merging back 5000 sentiments to 40000 messages leaves 35000 nulls.
    # For visualization purposes (pie charts, etc), using the sample is usually fine as it's representative.
    
    return df_sample, topics