File size: 3,848 Bytes
e5bc014 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
#!/usr/bin/env python3
"""Time Series Analysis Plugin"""
import pandas as pd
import numpy as np
from typing import Dict, Any, Tuple, Optional
class TimeSeriesAnalyzer:
    """Universal data science engine for any analytics task.

    Matches a free-text query against keyword lists and runs the
    corresponding pandas analysis on the supplied DataFrame.  Every
    path returns a ``(message, result)`` pair where ``result`` is a
    DataFrame on success or ``None`` when the data cannot support the
    requested analysis.
    """

    # Quartile labels for segmentation, ordered low -> high.  The list is
    # sliced down when duplicate quantile edges leave fewer than 4 bins.
    _SEGMENT_LABELS = ['Low', 'Medium', 'High', 'Very High']

    def analyze_query(self, df: pd.DataFrame, query: str) -> Tuple[str, Optional[pd.DataFrame]]:
        """
        Process any data science query: summarize, aggregate, group, segment, trends.

        Parameters
        ----------
        df : pd.DataFrame
            Data to analyze.  An empty frame short-circuits to an error message.
        query : str
            Free-text request, matched case-insensitively against keywords.
            Branch order matters: summarization is checked before aggregation
            so 'summarize' is not captured by the 'sum' keyword.

        Returns
        -------
        Tuple[str, Optional[pd.DataFrame]]
            Headline message and result table (``None`` when unavailable).
        """
        query_lower = query.lower()
        if df.empty:
            return "β No data available.", None
        if any(kw in query_lower for kw in ['summarize', 'summary', 'overview', 'describe']):
            return self._summarize(df)
        if any(kw in query_lower for kw in ['aggregate', 'sum', 'total', 'average', 'mean']):
            return self._aggregate(df)
        if 'group' in query_lower:
            return self._group(df)
        if any(kw in query_lower for kw in ['segment', 'cluster', 'categorize']):
            return self._segment(df)
        if any(kw in query_lower for kw in ['trend', 'over time', 'pattern']):
            return self._trend(df)
        if 'correlat' in query_lower:
            return self._correlate(df)
        # Default: basic numeric statistics plus a hint at supported verbs.
        basic_stats = df.describe().transpose()
        return "π Basic Statistics (try: 'summarize', 'group', 'segment', 'trend')", basic_stats

    def _summarize(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Full-column describe() overview (all dtypes), one row per column."""
        summary = df.describe(include='all').transpose()
        return f"π Data Summary: {df.shape[0]} rows Γ {df.shape[1]} columns", summary

    def _aggregate(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Sum/mean/count/min/max over every numeric column."""
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if not numeric_cols:
            return "β οΈ No numeric columns found for aggregation.", None
        agg_result = df[numeric_cols].agg(['sum', 'mean', 'count', 'min', 'max']).transpose()
        return f"π Aggregated Statistics for {len(numeric_cols)} numeric columns", agg_result

    def _group(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Group numeric columns by the first categorical column (top 20 groups)."""
        cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if not (cat_cols and numeric_cols):
            return "β οΈ Need categorical and numeric columns for grouping.", None
        group_col = cat_cols[0]
        grouped = df.groupby(group_col)[numeric_cols].agg(['sum', 'mean', 'count']).head(20)
        return f"π Grouped by `{group_col}` (top 20)", grouped

    def _segment(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Quartile-based segmentation on the first numeric column."""
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if not numeric_cols:
            return "β οΈ Need numeric columns for segmentation.", None
        segment_col = numeric_cols[0]
        df_copy = df.copy()
        # BUG FIX: qcut with duplicates='drop' may produce fewer than 4 bins,
        # and passing a fixed 4-element label list then raises ValueError.
        # Bin first, then attach only as many labels as bins survived.
        try:
            binned = pd.qcut(df_copy[segment_col], q=4, duplicates='drop')
        except ValueError:
            # Too few distinct values to form even a single bin.
            return f"β οΈ Column `{segment_col}` has too few distinct values for segmentation.", None
        labels = self._SEGMENT_LABELS[:len(binned.cat.categories)]
        df_copy['segment'] = binned.cat.rename_categories(labels)
        # observed=True: report only bins that actually received rows and
        # silence the pandas FutureWarning for categorical group keys.
        segment_summary = df_copy.groupby('segment', observed=True).agg(
            {segment_col: ['count', 'mean', 'min', 'max']})
        return f"π― Segmentation based on `{segment_col}`", segment_summary

    def _trend(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """First 50 rows of up to 2 numeric columns, sorted by the first date column."""
        # 'datetimetz' included so timezone-aware datetime columns are detected too.
        date_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if not (date_cols and numeric_cols):
            return "β οΈ Need date and numeric columns for trends.", None
        df_sorted = df.sort_values(date_cols[0])
        trend_data = df_sorted[[date_cols[0]] + numeric_cols[:2]].head(50)
        return "π Trend Analysis (first 50 records)", trend_data

    def _correlate(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Pearson correlation matrix over all numeric columns (needs >= 2)."""
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if len(numeric_cols) < 2:
            return "β οΈ Need at least 2 numeric columns.", None
        corr_matrix = df[numeric_cols].corr()
        return f"π Correlation Matrix for {len(numeric_cols)} columns", corr_matrix
|