Upload 2 files
Browse files
plugins/analyzers/statistical_analyzer.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Statistical Analysis Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
class StatisticalAnalyzer:
    """Perform statistical analysis on data."""

    @staticmethod
    def _safe_float(value):
        """Convert a pandas scalar to ``float``, mapping NaN to ``None``.

        Pandas reductions (``mean``/``std``/``min``/...) return NaN for empty
        or all-missing columns; bare ``float(nan)`` would leak NaN into the
        result dict, which is not JSON-serializable and breaks ``==`` checks.
        """
        try:
            result = float(value)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is unequal to itself.
        return result if result == result else None

    def analyze(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate comprehensive statistical summary.

        Args:
            df: Input frame; may be empty or contain all-NaN columns.

        Returns:
            Dict with overall ``shape`` plus a per-column entry holding the
            dtype, missing-value percentage, and type-appropriate statistics:
            numeric stats, datetime min/max, or categorical cardinality.
            Undefined statistics (e.g. std of a single row) are ``None``.
        """
        analysis: Dict[str, Any] = {
            "shape": {"rows": len(df), "columns": len(df.columns)},
            "columns": {},
        }
        for col in df.columns:
            series = df[col]
            col_analysis: Dict[str, Any] = {"name": col, "dtype": str(series.dtype)}
            col_analysis["missing_percent"] = self._safe_float(series.isna().mean() * 100)
            if pd.api.types.is_numeric_dtype(series):
                col_analysis.update({
                    "mean": self._safe_float(series.mean()),
                    "std": self._safe_float(series.std()),
                    "min": self._safe_float(series.min()),
                    "max": self._safe_float(series.max()),
                    "median": self._safe_float(series.median()),
                })
            elif pd.api.types.is_datetime64_any_dtype(series):
                col_analysis.update({
                    "min_date": str(series.min()),
                    "max_date": str(series.max()),
                })
            else:
                # Non-numeric, non-datetime: report cardinality only.
                col_analysis.update({"unique_values": int(series.nunique())})
            analysis["columns"][col] = col_analysis
        return analysis
|
plugins/analyzers/time_series_analyzer.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Time Series Analysis Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import Dict, Any, Tuple, Optional
|
| 6 |
+
|
| 7 |
+
class TimeSeriesAnalyzer:
    """Universal data science engine for any analytics task."""

    def analyze_query(self, df: pd.DataFrame, query: str) -> Tuple[str, Optional[pd.DataFrame]]:
        """
        Process any data science query: summarize, aggregate, group, segment, trends.

        The query text is matched case-insensitively against keyword lists;
        the first matching branch wins.

        Args:
            df: Input frame.
            query: Free-text request used only for keyword matching.

        Returns:
            Tuple of (human-readable headline, result frame) — the frame is
            ``None`` when the requested analysis cannot be performed.
        """
        query_lower = query.lower()

        if df.empty:
            return "β No data available.", None

        # SUMMARIZATION
        if any(kw in query_lower for kw in ['summarize', 'summary', 'overview', 'describe']):
            summary = df.describe(include='all').transpose()
            return f"π Data Summary: {df.shape[0]} rows Γ {df.shape[1]} columns", summary

        # AGGREGATION
        if any(kw in query_lower for kw in ['aggregate', 'sum', 'total', 'average', 'mean']):
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if numeric_cols:
                agg_result = df[numeric_cols].agg(['sum', 'mean', 'count', 'min', 'max']).transpose()
                return f"π Aggregated Statistics for {len(numeric_cols)} numeric columns", agg_result
            return "β οΈ No numeric columns found for aggregation.", None

        # GROUPING
        if 'group' in query_lower:
            cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if cat_cols and numeric_cols:
                group_col = cat_cols[0]
                grouped = df.groupby(group_col)[numeric_cols].agg(['sum', 'mean', 'count']).head(20)
                return f"π Grouped by `{group_col}` (top 20)", grouped
            return "β οΈ Need categorical and numeric columns for grouping.", None

        # SEGMENTATION
        if any(kw in query_lower for kw in ['segment', 'cluster', 'categorize']):
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()[:3]
            if numeric_cols:
                segment_col = numeric_cols[0]
                df_copy = df.copy()
                # BUG FIX: the original passed a fixed 4-label list together
                # with duplicates='drop'; pd.qcut raises ValueError whenever
                # duplicate quantile edges are dropped (skewed data) because
                # the label count no longer matches the surviving bins.
                # Bin unlabeled first, then rename only the bins that exist.
                binned = pd.qcut(df_copy[segment_col], q=4, duplicates='drop')
                labels = ['Low', 'Medium', 'High', 'Very High'][:len(binned.cat.categories)]
                df_copy['segment'] = binned.cat.rename_categories(labels)
                # observed=False is the current pandas default for categorical
                # keys; stated explicitly to keep behavior stable.
                segment_summary = df_copy.groupby('segment', observed=False).agg(
                    {segment_col: ['count', 'mean', 'min', 'max']})
                return f"π― Segmentation based on `{segment_col}`", segment_summary
            return "β οΈ Need numeric columns for segmentation.", None

        # TREND ANALYSIS
        if any(kw in query_lower for kw in ['trend', 'over time', 'pattern']):
            # 'datetimetz' added so tz-aware datetime columns also qualify.
            date_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if date_cols and numeric_cols:
                df_sorted = df.sort_values(date_cols[0])
                trend_data = df_sorted[[date_cols[0]] + numeric_cols[:2]].head(50)
                return "π Trend Analysis (first 50 records)", trend_data
            return "β οΈ Need date and numeric columns for trends.", None

        # CORRELATION
        if 'correlat' in query_lower:
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            if len(numeric_cols) >= 2:
                corr_matrix = df[numeric_cols].corr()
                return f"π Correlation Matrix for {len(numeric_cols)} columns", corr_matrix
            return "β οΈ Need at least 2 numeric columns.", None

        # DEFAULT: no keyword matched — fall back to basic numeric stats.
        basic_stats = df.describe().transpose()
        return "π Basic Statistics (try: 'summarize', 'group', 'segment', 'trend')", basic_stats
|