File size: 3,848 Bytes
e5bc014
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
"""Time Series Analysis Plugin"""
import pandas as pd
import numpy as np
from typing import Dict, Any, Tuple, Optional

class TimeSeriesAnalyzer:
    """Keyword-routed analytics helper for pandas DataFrames.

    The single public entry point, :meth:`analyze_query`, looks for keywords
    in a free-text query and runs the first matching analysis (summary,
    aggregation, grouping, segmentation, trend, correlation), falling back to
    basic descriptive statistics when nothing matches.
    """

    # Segment labels keyed by the number of quantile bins that remain after
    # duplicate bin edges are dropped. Fewer than two bins is not a
    # meaningful segmentation, so it has no entry.
    _SEGMENT_LABELS = {
        2: ['Low', 'High'],
        3: ['Low', 'Medium', 'High'],
        4: ['Low', 'Medium', 'High', 'Very High'],
    }

    def analyze_query(self, df: pd.DataFrame, query: str) -> Tuple[str, Optional[pd.DataFrame]]:
        """Run the analysis selected by keywords found in *query*.

        Keywords are matched as case-insensitive substrings, and the checks
        run in a fixed order: summary, aggregation, grouping, segmentation,
        trend, correlation, then a default. The first match wins, so a query
        such as "group totals" is handled as an aggregation.

        Args:
            df: Data to analyse.
            query: Free-text request, e.g. ``"summarize"`` or ``"show trend"``.

        Returns:
            A ``(message, result)`` tuple. ``result`` is a DataFrame holding
            the analysis output, or ``None`` when the frame is empty or the
            columns required by the selected analysis are missing.
        """
        query_lower = query.lower()

        if df.empty:
            return "❌ No data available.", None

        def mentions(*keywords: str) -> bool:
            return any(kw in query_lower for kw in keywords)

        if mentions('summarize', 'summary', 'overview', 'describe'):
            return self._summarize(df)
        if mentions('aggregate', 'sum', 'total', 'average', 'mean'):
            return self._aggregate(df)
        if mentions('group'):
            return self._group(df)
        if mentions('segment', 'cluster', 'categorize'):
            return self._segment(df)
        if mentions('trend', 'over time', 'pattern'):
            return self._trend(df)
        if mentions('correlat'):
            return self._correlate(df)

        # DEFAULT
        basic_stats = df.describe().transpose()
        return "📊 Basic Statistics (try: 'summarize', 'group', 'segment', 'trend')", basic_stats

    @staticmethod
    def _numeric_columns(df: pd.DataFrame) -> list:
        """Return the names of the numeric columns of *df*, in column order."""
        return df.select_dtypes(include=[np.number]).columns.tolist()

    @staticmethod
    def _summarize(df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Describe every column (numeric or not), one row per column."""
        summary = df.describe(include='all').transpose()
        rows, cols = df.shape
        return f"📊 Data Summary: {rows} rows × {cols} columns", summary

    def _aggregate(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Compute sum/mean/count/min/max for each numeric column."""
        numeric_cols = self._numeric_columns(df)
        if not numeric_cols:
            return "⚠️ No numeric columns found for aggregation.", None
        agg_result = df[numeric_cols].agg(['sum', 'mean', 'count', 'min', 'max']).transpose()
        return f"📈 Aggregated Statistics for {len(numeric_cols)} numeric columns", agg_result

    def _group(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Group numeric columns by the first object/category column."""
        cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        numeric_cols = self._numeric_columns(df)
        if not (cat_cols and numeric_cols):
            return "⚠️ Need categorical and numeric columns for grouping.", None
        group_col = cat_cols[0]
        # head(20) keeps the first 20 groups in group-key order.
        grouped = df.groupby(group_col)[numeric_cols].agg(['sum', 'mean', 'count']).head(20)
        return f"🔍 Grouped by `{group_col}` (top 20)", grouped

    def _segment(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Split the first numeric column into quantile-based segments."""
        numeric_cols = self._numeric_columns(df)
        if not numeric_cols:
            return "⚠️ Need numeric columns for segmentation.", None

        segment_col = numeric_cols[0]
        values = df[segment_col]
        not_enough = f"⚠️ Not enough distinct values in `{segment_col}` to segment."

        # A constant or all-missing column has no quantile spread to cut on.
        if values.nunique() < 2:
            return not_enough, None

        # Ask for integer bin codes first: with duplicates='drop' the number
        # of bins can be smaller than 4, and passing a fixed 4-label list to
        # qcut would then raise ValueError.
        codes, edges = pd.qcut(values, q=4, labels=False, retbins=True, duplicates='drop')
        labels = self._SEGMENT_LABELS.get(len(edges) - 1)
        if labels is None:
            return not_enough, None

        # Missing values get code -1, which Categorical treats as NaN, so
        # those rows are left out of every segment.
        segments = pd.Series(
            pd.Categorical.from_codes(
                codes.fillna(-1).astype(int), categories=labels, ordered=True
            ),
            index=df.index,
            name='segment',
        )
        # Group by an external key instead of writing a 'segment' column into
        # a copy of the frame: no full copy, and an existing column with that
        # name is not clobbered.
        segment_summary = (
            df[[segment_col]]
            .groupby(segments, observed=False)
            .agg(['count', 'mean', 'min', 'max'])
        )
        return f"🎯 Segmentation based on `{segment_col}`", segment_summary

    def _trend(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Return the earliest 50 rows ordered by the first datetime column."""
        # 'datetimetz' is listed explicitly because 'datetime64' alone does
        # not select timezone-aware datetime columns.
        date_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
        numeric_cols = self._numeric_columns(df)
        if not (date_cols and numeric_cols):
            return "⚠️ Need date and numeric columns for trends.", None
        date_col = date_cols[0]
        # Only the date column and the first two numeric columns are reported.
        trend_data = df[[date_col] + numeric_cols[:2]].sort_values(date_col).head(50)
        return "📉 Trend Analysis (first 50 records)", trend_data

    def _correlate(self, df: pd.DataFrame) -> Tuple[str, Optional[pd.DataFrame]]:
        """Compute the pairwise correlation matrix of the numeric columns."""
        numeric_cols = self._numeric_columns(df)
        if len(numeric_cols) < 2:
            return "⚠️ Need at least 2 numeric columns.", None
        corr_matrix = df[numeric_cols].corr()
        return f"🔗 Correlation Matrix for {len(numeric_cols)} columns", corr_matrix