# SocialMediaFoci / profile_performance.py
# Bismark
# Update Space
# 5ab54b7
import time
import pandas as pd
import preprocessor
import random
def generate_large_chat(lines=10000, seed=None):
    """Generate a synthetic WhatsApp-style chat log.

    Every generated line has the form
    ``M/D/23, H:MM AM|PM - Sender: message`` where the date, time, sender
    and message text are picked at random.

    Args:
        lines: Number of chat lines to generate. Zero or a negative value
            yields an empty string.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the same log can be regenerated for repeatable
            measurements. When ``None`` the module-level ``random`` state
            is used.

    Returns:
        The generated lines joined with ``"\\n"`` (no trailing newline).
    """
    # The ``random`` module and a ``random.Random`` instance both expose
    # ``randint`` and ``choice``, so either can serve as the source.
    rng = random if seed is None else random.Random(seed)

    senders = ["User1", "User2", "User3"]
    messages = [
        "Hello there, how are you?",
        "I am doing great, thanks for asking! Project update?",
        "This is a test message to simulate a long chat about artificial intelligence.",
        "Meeting is at 10 AM tomorrow to discuss the roadmap.",
        "Check out this link: https://example.com",
        "Haha that is funny 😂",
        "Je parle un peu français aussi. C'est la vie.",
        "Non, je ne crois pas. Il fait beau aujourd'hui.",
        "Ok, see you later. Don't forget the deadline.",
        "Python is a great programming language for data science.",
        "Streamlit makes building apps very easy and fast.",
    ]
    meridiems = ["AM", "PM"]

    chat_data = []
    for _ in range(lines):
        month = rng.randint(1, 12)
        # Day is capped at 28 so the date is valid for every month.
        day = rng.randint(1, 28)
        hour = rng.randint(1, 12)
        # Cover the full 0-59 range; zero-padding below keeps single-digit
        # minutes in the usual "H:MM" shape.
        minute = rng.randint(0, 59)
        ampm = rng.choice(meridiems)
        sender = rng.choice(senders)
        message = rng.choice(messages)
        chat_data.append(
            f"{month}/{day}/23, {hour}:{minute:02d} {ampm} - {sender}: {message}"
        )
    return "\n".join(chat_data)
def profile_preprocessing(lines=10000):
    """Time the preprocessing pipeline on a synthetic chat log.

    Generates a chat log with ``generate_large_chat``, runs
    ``preprocessor.parse_data`` followed by
    ``preprocessor.analyze_sentiment_and_topics`` on it, and prints the
    elapsed time of each step, the total elapsed time and the number of
    rows in the resulting frame.

    Any exception raised while the pipeline runs is caught, printed together
    with its traceback, and not re-raised, so the script always exits
    normally.

    Args:
        lines: Number of synthetic chat lines to generate.
    """
    print(f"Generating synthetic data ({lines:,} lines)...")
    raw_data = generate_large_chat(lines)
    # Measure encoded bytes rather than characters: the sample messages
    # contain non-ASCII text, so the two counts differ.
    size_mb = len(raw_data.encode("utf-8")) / 1024 / 1024
    print(f"Data size: {size_mb:.2f} MB")
    print("\nStarting profiling...")

    # Only per-step wall-clock totals are measured here. Timing the inside
    # of each step would require instrumenting preprocessor.py itself.
    # perf_counter() is used because it is monotonic and high-resolution.
    start_total = time.perf_counter()
    try:
        # Step 1: Parse
        parse_start = time.perf_counter()
        df = preprocessor.parse_data(raw_data)
        print(f"Parsing took: {time.perf_counter() - parse_start:.2f}s")

        # Step 2: Analyze
        analyze_start = time.perf_counter()
        df, _topics = preprocessor.analyze_sentiment_and_topics(df)
        print(f"Analysis took: {time.perf_counter() - analyze_start:.2f}s")

        end_total = time.perf_counter()
        print(f"\nTotal Preprocessing Time: {end_total - start_total:.2f} seconds")
        print(f"Messages processed: {len(df)}")
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
# Run the profiling routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    profile_preprocessing()