# Exploratory data analysis for the book-recommendation dataset.
# (Non-code page artifacts removed from the top of this file.)
# Standard library
import os
import re

# Third-party
import numpy as np
import pandas as pd
def analyze():
    """Run exploratory analysis on the recommendation dataset.

    Loads the ratings CSV (``data/rec/train.csv``) and the processed book
    metadata CSV (``data/books_processed.csv``), then prints:
    1. sparsity and rating-count distribution statistics,
    2. text-quality metrics for the book descriptions (length, missing,
       HTML artifacts),
    3. a temporal breakdown of ratings if a ``timestamp`` column exists.

    Returns None; all output goes to stdout. If either CSV cannot be
    loaded, an error is printed and the function returns early
    (deliberate best-effort behavior for a standalone EDA script).
    """
    print("Loading data...")
    # Load data; bail out early if either file is missing/unreadable.
    try:
        train_df = pd.read_csv('data/rec/train.csv')
        books_df = pd.read_csv('data/books_processed.csv')
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    print(f"Train rows: {len(train_df)}")
    print(f"Books rows: {len(books_df)}")

    # 1. Sparsity & Distributions
    n_users = train_df['user_id'].nunique()
    n_items = train_df['isbn'].nunique()
    # Fraction of the user x item matrix with no observed interaction.
    sparsity = 1 - (len(train_df) / (n_users * n_items))
    print("\n--- Distribution Stats ---")
    print(f"Users: {n_users}, Items: {n_items}")
    print(f"Sparsity: {sparsity:.6f}")

    user_counts = train_df['user_id'].value_counts()
    item_counts = train_df['isbn'].value_counts()
    print(f"User Ratings: Min={user_counts.min()}, Max={user_counts.max()}, Median={user_counts.median()}, Mean={user_counts.mean():.2f}")
    print(f"Item Ratings: Min={item_counts.min()}, Max={item_counts.max()}, Median={item_counts.median()}, Mean={item_counts.mean():.2f}")

    # Head and Tail check: rating-count thresholds separating the top 1%
    # most active users / most rated items from the long tail.
    top_1_percent_users = user_counts.quantile(0.99)
    top_1_percent_items = item_counts.quantile(0.99)
    print(f"Top 1% User Threshold: {top_1_percent_users:.0f} ratings")
    print(f"Top 1% Item Threshold: {top_1_percent_items:.0f} ratings")

    # 2. Text Quality
    print("\n--- Text Quality ---")
    # Vectorized string length (was .apply(len)); missing descriptions -> 0.
    books_df['desc_len'] = books_df['description'].fillna("").str.len()
    print(f"Description Length: Mean={books_df['desc_len'].mean():.0f}, Median={books_df['desc_len'].median():.0f}")
    # Compute the missing-description count once instead of twice.
    n_missing = (books_df['desc_len'] == 0).sum()
    print(f"Missing Descriptions: {n_missing} ({n_missing/len(books_df):.2%})")

    # Hoisted: compile the tag pattern once, reuse for every row.
    tag_re = re.compile(r'<[^>]+>')

    def has_html(text):
        # Non-strings (NaN for missing descriptions) carry no markup.
        if not isinstance(text, str):
            return False
        return bool(tag_re.search(text))

    html_count = books_df['description'].apply(has_html).sum()
    print(f"Rows with HTML artifacts: {html_count}")

    # 3. Temporal
    print("\n--- Temporal Analysis ---")
    if 'timestamp' in train_df.columns:
        try:
            # Assuming timestamp is a unix epoch in seconds; unparseable
            # values become NaT rather than raising (errors='coerce').
            dates = pd.to_datetime(train_df['timestamp'], unit='s', errors='coerce')
            print(f"Date Range: {dates.min()} to {dates.max()}")
            # Interactions per year
            print("Ratings per Year:")
            print(dates.dt.year.value_counts().sort_index())
        except (ValueError, TypeError, OverflowError) as e:
            # Was a bare `except:` — narrowed to conversion errors and
            # the detail is now reported instead of swallowed.
            print(f"Could not parse timestamps: {e}")
| if __name__ == "__main__": | |
| analyze() | |