JatinAutonomousLabs committed on
Commit
e5bc014
·
verified ·
1 Parent(s): bb2b847

Upload 2 files

Browse files
plugins/analyzers/statistical_analyzer.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Statistical Analysis Plugin"""
3
+ import pandas as pd
4
+ from typing import Dict, Any
5
+
6
class StatisticalAnalyzer:
    """Perform statistical analysis on data."""

    def analyze(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate a comprehensive statistical summary of ``df``.

        Args:
            df: Input DataFrame; may be empty or contain missing values.

        Returns:
            Dict with two keys:
              * ``"shape"``: ``{"rows": ..., "columns": ...}`` counts.
              * ``"columns"``: per-column dicts with ``name``, ``dtype`` and
                ``missing_percent``, plus mean/std/min/max/median for numeric
                columns, min/max date strings for datetime columns, or a
                unique-value count for everything else.
        """
        analysis: Dict[str, Any] = {
            "shape": {"rows": len(df), "columns": len(df.columns)},
            "columns": {},
        }
        for col in df.columns:
            # Hoist the column lookup: every stat below reads the same Series.
            series = df[col]
            col_analysis: Dict[str, Any] = {"name": col, "dtype": str(series.dtype)}
            # Fraction of missing values expressed as a percentage (0-100).
            col_analysis["missing_percent"] = float(series.isna().mean() * 100)
            if pd.api.types.is_numeric_dtype(series):
                # NOTE: these come back as NaN for empty or all-NaN columns.
                col_analysis.update({
                    "mean": float(series.mean()),
                    "std": float(series.std()),
                    "min": float(series.min()),
                    "max": float(series.max()),
                    "median": float(series.median()),
                })
            elif pd.api.types.is_datetime64_any_dtype(series):
                # Dates are stringified so the summary stays JSON-friendly.
                col_analysis.update({"min_date": str(series.min()), "max_date": str(series.max())})
            else:
                # Object/categorical columns: report cardinality only.
                col_analysis.update({"unique_values": int(series.nunique())})
            analysis["columns"][col] = col_analysis
        return analysis
plugins/analyzers/time_series_analyzer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Time Series Analysis Plugin"""
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Dict, Any, Tuple, Optional
6
+
7
class TimeSeriesAnalyzer:
    """Universal data science engine for any analytics task."""

    def analyze_query(self, df: pd.DataFrame, query: str) -> Tuple[str, Optional[pd.DataFrame]]:
        """
        Process any data science query: summarize, aggregate, group, segment, trends.

        Args:
            df: Data to analyze; an empty frame short-circuits immediately.
            query: Free-text request; matched case-insensitively against
                keywords, with the first matching branch winning.

        Returns:
            ``(message, result)`` where ``result`` is a DataFrame, or ``None``
            when the query cannot be answered with the available columns.
        """
        query_lower = query.lower()

        if df.empty:
            return "❌ No data available.", None

        # SUMMARIZATION
        if any(kw in query_lower for kw in ['summarize', 'summary', 'overview', 'describe']):
            summary = df.describe(include='all').transpose()
            return f"📊 Data Summary: {df.shape[0]} rows × {df.shape[1]} columns", summary

        # AGGREGATION
        if any(kw in query_lower for kw in ['aggregate', 'sum', 'total', 'average', 'mean']):
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if numeric_cols:
                agg_result = df[numeric_cols].agg(['sum', 'mean', 'count', 'min', 'max']).transpose()
                return f"📈 Aggregated Statistics for {len(numeric_cols)} numeric columns", agg_result
            return "⚠️ No numeric columns found for aggregation.", None

        # GROUPING
        if 'group' in query_lower:
            cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if cat_cols and numeric_cols:
                # First categorical column drives the grouping.
                group_col = cat_cols[0]
                grouped = df.groupby(group_col)[numeric_cols].agg(['sum', 'mean', 'count']).head(20)
                return f"🔍 Grouped by `{group_col}` (top 20)", grouped
            return "⚠️ Need categorical and numeric columns for grouping.", None

        # SEGMENTATION
        if any(kw in query_lower for kw in ['segment', 'cluster', 'categorize']):
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()[:3]
            if numeric_cols:
                segment_col = numeric_cols[0]
                df_copy = df.copy()
                # BUG FIX: qcut raises ValueError when duplicates='drop'
                # removes quantile edges but a fixed 4-label list is supplied
                # (label count must match the surviving bin count). Bin first,
                # then relabel only the bins that survived.
                binned = pd.qcut(df_copy[segment_col], q=4, duplicates='drop')
                labels = ['Low', 'Medium', 'High', 'Very High'][:len(binned.cat.categories)]
                df_copy['segment'] = binned.cat.rename_categories(labels)
                # observed=False pinned explicitly: it was the historical
                # default for categorical groupby and flips in pandas 3.x.
                segment_summary = df_copy.groupby('segment', observed=False).agg(
                    {segment_col: ['count', 'mean', 'min', 'max']}
                )
                return f"🎯 Segmentation based on `{segment_col}`", segment_summary
            return "⚠️ Need numeric columns for segmentation.", None

        # TREND ANALYSIS
        if any(kw in query_lower for kw in ['trend', 'over time', 'pattern']):
            # 'datetimetz' added so tz-aware timestamp columns are detected too.
            date_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns.tolist()
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if date_cols and numeric_cols:
                df_sorted = df.sort_values(date_cols[0])
                trend_data = df_sorted[[date_cols[0]] + numeric_cols[:2]].head(50)
                return "📉 Trend Analysis (first 50 records)", trend_data
            return "⚠️ Need date and numeric columns for trends.", None

        # CORRELATION
        if 'correlat' in query_lower:
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if len(numeric_cols) >= 2:
                corr_matrix = df[numeric_cols].corr()
                return f"🔗 Correlation Matrix for {len(numeric_cols)} columns", corr_matrix
            return "⚠️ Need at least 2 numeric columns.", None

        # DEFAULT: fall back to basic numeric statistics with a usage hint.
        basic_stats = df.describe().transpose()
        return "📊 Basic Statistics (try: 'summarize', 'group', 'segment', 'trend')", basic_stats