Spaces:

Mars203020
/

bertopic

Sleeping

File size: 5,088 Bytes

b7b041e

import logging

import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired


def analyze_general_topic_evolution(topic_model, docs, timestamps):
    """
    Analyzes general topic evolution over time.

    This is a best-effort wrapper around ``topic_model.topics_over_time``:
    any exception raised by the model is logged (with traceback) and replaced
    by an empty result, so callers always receive a DataFrame-like value.

    Args:
        topic_model: Trained BERTopic model. Only its
            ``topics_over_time(docs, timestamps, global_tuning=True)`` method is used.
        docs (list): List of documents.
        timestamps (list): List of timestamps corresponding to the documents.

    Returns:
        pd.DataFrame: The value returned by ``topic_model.topics_over_time``, or an
            empty DataFrame with the columns 'Topic', 'Words', 'Frequency' and
            'Timestamp' when the computation fails.
    """
    try:
        return topic_model.topics_over_time(docs, timestamps, global_tuning=True)
    except Exception:
        # Fallback for small datasets or cases where evolution can't be computed.
        # The failure is logged rather than silently dropped so that genuine
        # problems (wrong model object, mismatched inputs) remain diagnosable.
        logging.getLogger(__name__).warning(
            "Topic evolution could not be computed; returning an empty frame.",
            exc_info=True,
        )
        return pd.DataFrame(columns=['Topic', 'Words', 'Frequency', 'Timestamp'])


def analyze_user_topic_evolution(df: pd.DataFrame, topic_model):
    """
    Analyzes topic evolution per user.

    For every distinct value in ``df["user_id"]`` the user's posts are sorted by
    timestamp and passed to ``topic_model.topics_over_time`` together with the
    topic ids already assigned to those posts. Users with fewer than two posts
    get an empty frame without the model being called. Failures for one user are
    logged (with traceback) and yield an empty frame for that user only; the
    remaining users are still processed.

    Args:
        df (pd.DataFrame): DataFrame with
            "user_id", "post_content", "timestamp", and "topic_id" columns.
        topic_model: Trained BERTopic model. Only its
            ``topics_over_time(docs, timestamps, topics=..., global_tuning=True)``
            method is used.

    Returns:
        dict: A dictionary where keys are user_ids and values are DataFrames of topic evolution for that user.
            The empty fallback frame has the columns 'Topic', 'Words',
            'Frequency' and 'Timestamp'.
    """
    logger = logging.getLogger(__name__)
    empty_columns = ['Topic', 'Words', 'Frequency', 'Timestamp']

    user_topic_evolution = {}
    for user_id in df["user_id"].unique():
        user_df = df[df["user_id"] == user_id]

        # Fewer than two posts: nothing is computed for this user.
        if len(user_df) < 2:
            user_topic_evolution[user_id] = pd.DataFrame(columns=empty_columns)
            continue

        try:
            # Ensure timestamps are sorted for topics_over_time
            # (sort_values returns a new frame, so the caller's df is untouched).
            user_df = user_df.sort_values(by="timestamp")
            docs = user_df["post_content"].tolist()
            timestamps = user_df["timestamp"].tolist()
            # One topic id per post, aligned with docs/timestamps.
            selected_topics = user_df["topic_id"].tolist()
            user_topic_evolution[user_id] = topic_model.topics_over_time(
                docs, timestamps, topics=selected_topics, global_tuning=True
            )
        except Exception:
            # Best-effort per user: record the failure instead of hiding it,
            # then fall back to an empty frame.
            logger.warning(
                "Topic evolution could not be computed for user %r; "
                "returning an empty frame.",
                user_id,
                exc_info=True,
            )
            user_topic_evolution[user_id] = pd.DataFrame(columns=empty_columns)
    return user_topic_evolution

if __name__ == "__main__":
    # Demo: build a small in-memory set of posts, fit a topic model on it, then
    # print the overall and the per-user topic evolution.
    #
    # NOTE(review): `perform_topic_modeling` is neither defined nor imported in
    # the visible part of this file; it has to be in scope for this demo to
    # run. Confirm where it is expected to come from.

    # One record per post: (user_id, post_content, timestamp).
    sample_posts = [
        ("user1", "This is a great movie, I loved the acting and the plot. It was truly captivating.", "2023-01-01 10:00:00"),
        ("user2", "The new phone has an amazing camera and long battery life. Highly recommend it.", "2023-01-01 11:00:00"),
        ("user1", "I enjoyed the film, especially the special effects and the soundtrack. A must-watch.", "2023-01-02 10:30:00"),
        ("user3", "Learning about AI and machine learning is fascinating. The future is here.", "2023-01-02 14:00:00"),
        ("user2", "My old phone is so slow, I need an upgrade soon. Thinking about the latest model.", "2023-01-03 09:00:00"),
        ("user1", "The best part of the movie was the soundtrack and the stunning visuals. Very immersive.", "2023-01-03 16:00:00"),
        ("user4", "Exploring the vastness of space is a lifelong dream. Astronomy is amazing.", "2023-01-04 08:00:00"),
        ("user3", "Data science is revolutionizing industries. Predictive analytics is key.", "2023-01-04 12:00:00"),
        ("user2", "I need a new laptop for work. Something powerful and portable.", "2023-01-05 10:00:00"),
        ("user1", "Just finished reading a fantastic book on quantum physics. Mind-blowing concepts.", "2023-01-05 15:00:00"),
        ("user5", "Cooking new recipes is my passion. Today, I tried a spicy Thai curry.", "2023-01-06 09:30:00"),
        ("user4", "The universe is full of mysteries. Black holes and dark matter are intriguing.", "2023-01-06 13:00:00"),
        ("user3", "Deep learning models are becoming incredibly sophisticated. Image recognition is impressive.", "2023-01-07 11:00:00"),
        ("user2", "My current laptop is crashing frequently. Time for an upgrade.", "2023-01-07 14:30:00"),
        ("user1", "Science fiction movies always make me think about the future of humanity.", "2023-01-08 10:00:00"),
    ]
    posts = pd.DataFrame(sample_posts, columns=["user_id", "post_content", "timestamp"])
    # Parse the timestamp strings into datetime values before modeling.
    posts["timestamp"] = pd.to_datetime(posts["timestamp"])

    print("Performing topic modeling (English)...")
    model_en, topics_en, _probs_en = perform_topic_modeling(posts, language="english")
    posts["topic_id"] = topics_en

    print("\nAnalyzing general topic evolution...")
    all_docs = posts["post_content"].tolist()
    all_timestamps = posts["timestamp"].tolist()
    overall_evolution = analyze_general_topic_evolution(model_en, all_docs, all_timestamps)
    print(overall_evolution.head())

    print("\nAnalyzing per user topic evolution...")
    per_user_evolution = analyze_user_topic_evolution(posts, model_en)
    for user_id, user_frame in per_user_evolution.items():
        print(f"\nTopic evolution for {user_id}:")
        print(user_frame.head())