Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import datetime | |
| def sanitize_dataframe_for_json(df: pd.DataFrame) -> pd.DataFrame: | |
| """Convert non-JSON-serializable values (datetime, date, time, period, timedelta) to strings.""" | |
| df_safe = df.copy() | |
| # Ensure simple integer index and string column names | |
| df_safe = df_safe.reset_index(drop=True) | |
| df_safe.columns = df_safe.columns.map(str) | |
| # Datetime columns: export as ISO 8601 strings with millisecond precision, null-safe | |
| for col in df_safe.select_dtypes(include=["datetime64[ns]"]).columns: | |
| try: | |
| dt = pd.to_datetime(df_safe[col], errors='coerce') | |
| # format to ISO with milliseconds (slice microseconds to 3 digits) | |
| iso = dt.dt.strftime('%Y-%m-%dT%H:%M:%S.%f').str.slice(0, 23) | |
| # set None where NaT | |
| iso = iso.where(dt.notna(), None) | |
| df_safe[col] = iso | |
| except Exception: | |
| # Fallback to plain string | |
| df_safe[col] = df_safe[col].astype(object).where(~df_safe[col].isna(), None) | |
| # Object columns: convert date/time-like objects to strings | |
| def _to_serializable(x): | |
| try: | |
| # Safe null check using a list contact for unhashable types | |
| if x is None or (not isinstance(x, (dict, list)) and pd.isna(x)): | |
| return None | |
| except Exception: | |
| # Fallback if the check itself fails | |
| if x is None: | |
| return None | |
| if isinstance(x, pd.Timestamp): | |
| try: | |
| return x.strftime('%Y-%m-%d %H:%M:%S') | |
| except Exception: | |
| return None | |
| if isinstance(x, (datetime.datetime, datetime.date)): | |
| return x.isoformat() | |
| if isinstance(x, datetime.time): | |
| return x.strftime('%H:%M:%S') | |
| # If it's a dict or list, it's already serializable by json.dumps if types match | |
| # but we might want to ensure it's clean. For now we leave it for the JSON serializer. | |
| return x | |
| for col in df_safe.select_dtypes(include=['object']).columns: | |
| try: | |
| # Try efficient mapping | |
| df_safe[col] = df_safe[col].map(_to_serializable) | |
| except (TypeError, Exception): | |
| # Fallback to slower apply or manual loop | |
| try: | |
| df_safe[col] = df_safe[col].apply(_to_serializable) | |
| except Exception: | |
| # Last resort: convert everything to string if mapping fails entirely | |
| df_safe[col] = df_safe[col].astype(str).where(df_safe[col].notna(), None) | |
| return df_safe | |
| def detect_and_combine_date_columns(df: pd.DataFrame) -> pd.DataFrame: | |
| """ | |
| Detect separate year, month, day, hour columns and combine them into datetime columns. | |
| Removed streamlit logging for API safety. | |
| """ | |
| df_combined = df.copy() | |
| year_cols = [col for col in df.columns if any(pattern in col.lower() for pattern in ['year', 'yr', 'yyyy'])] | |
| month_cols = [col for col in df.columns if any(pattern in col.lower() for pattern in ['month', 'mon', 'mm'])] | |
| day_cols = [col for col in df.columns if any(pattern in col.lower() for pattern in ['day', 'dd', 'date'])] | |
| hour_cols = [col for col in df.columns if any(pattern in col.lower() for pattern in ['hour', 'hr', 'hh', 'time'])] | |
| date_combinations = [] | |
| for year_col in year_cols: | |
| for month_col in month_cols: | |
| for day_col in day_cols: | |
| try: | |
| sample_size = min(10, len(df)) | |
| test_sample = df.iloc[:sample_size] | |
| years = pd.to_numeric(test_sample[year_col], errors='coerce') | |
| months = pd.to_numeric(test_sample[month_col], errors='coerce') | |
| days = pd.to_numeric(test_sample[day_col], errors='coerce') | |
| valid_years = years.between(1900, 2100).all() | |
| valid_months = months.between(1, 12).all() | |
| valid_days = days.between(1, 31).all() | |
| if valid_years and valid_months and valid_days: | |
| combination = {'year': year_col, 'month': month_col, 'day': day_col, 'hour': None} | |
| for hour_col in hour_cols: | |
| try: | |
| hours = pd.to_numeric(test_sample[hour_col], errors='coerce') | |
| if hours.between(0, 23).all(): | |
| combination['hour'] = hour_col | |
| break | |
| except: | |
| continue | |
| date_combinations.append(combination) | |
| except Exception: | |
| continue | |
| for i, combo in enumerate(date_combinations): | |
| try: | |
| datetime_col_name = f"datetime_combined_{i+1}" | |
| year = pd.to_numeric(df_combined[combo['year']], errors='coerce') | |
| month = pd.to_numeric(df_combined[combo['month']], errors='coerce') | |
| day = pd.to_numeric(df_combined[combo['day']], errors='coerce') | |
| if combo['hour']: | |
| hour = pd.to_numeric(df_combined[combo['hour']], errors='coerce') | |
| df_combined[datetime_col_name] = pd.to_datetime({'year': year, 'month': month, 'day': day, 'hour': hour}, errors='coerce') | |
| else: | |
| df_combined[datetime_col_name] = pd.to_datetime({'year': year, 'month': month, 'day': day}, errors='coerce') | |
| except Exception: | |
| continue | |
| return df_combined | |
| def auto_create_date_column(df: pd.DataFrame) -> pd.DataFrame: | |
| """Automatically create a unified datetime column `date_x` for mapping.""" | |
| df_auto = df.copy() | |
| if 'date_x' in df_auto.columns: | |
| try: | |
| parsed = pd.to_datetime(df_auto['date_x'], errors='coerce') | |
| if parsed.notna().sum() > 0: | |
| df_auto['date_x'] = parsed | |
| return df_auto | |
| except Exception: | |
| pass | |
| candidate_cols = [c for c in df_auto.columns if any(k in c.lower() for k in ['date', 'datetime', 'timestamp'])] | |
| candidate_cols.extend([c for c in df_auto.select_dtypes(include=["datetime64[ns]"]).columns if c not in candidate_cols]) | |
| best_col = None | |
| best_valid = -1 | |
| for col in candidate_cols: | |
| try: | |
| parsed = pd.to_datetime(df_auto[col], errors='coerce') | |
| valid = parsed.notna().sum() | |
| if valid > best_valid: | |
| best_valid = valid | |
| best_col = col | |
| except Exception: | |
| continue | |
| if best_col is not None and best_valid > 0: | |
| try: | |
| df_auto['date_x'] = pd.to_datetime(df_auto[best_col], errors='coerce') | |
| try: | |
| df_auto['date_x'] = df_auto['date_x'].dt.tz_localize(None) | |
| except Exception: | |
| pass | |
| df_auto['date_x'] = df_auto['date_x'].dt.floor('ms') | |
| return df_auto | |
| except Exception: | |
| pass | |
| df_combined = detect_and_combine_date_columns(df_auto) | |
| combined_cols = [c for c in df_combined.columns if c.startswith('datetime_combined_')] | |
| best_combined = None | |
| best_combined_valid = -1 | |
| for col in combined_cols: | |
| try: | |
| valid = df_combined[col].notna().sum() | |
| if valid > best_combined_valid: | |
| best_combined_valid = valid | |
| best_combined = col | |
| except Exception: | |
| continue | |
| if best_combined is not None and best_combined_valid > 0: | |
| df_combined['date_x'] = pd.to_datetime(df_combined[best_combined], errors='coerce') | |
| try: | |
| df_combined['date_x'] = df_combined['date_x'].dt.tz_localize(None) | |
| except Exception: | |
| pass | |
| df_combined['date_x'] = df_combined['date_x'].dt.floor('ms') | |
| return df_combined | |
| return df_auto | |