Upload 13 files
- __init__.py +1 -0
- ab_tester.py +319 -0
- chat_interface.py +140 -0
- data_cleaner.py +122 -0
- data_generator.py +80 -0
- eda_analyzer.py +175 -0
- ppt_generator.py +491 -0
- predictive_analytics.py +261 -0
- questionnaire_generator.py +83 -0
- sentiment_analyzer.py +298 -0
- trend_analyzer.py +286 -0
- variable_extraction.py +57 -0
- visualization_engine.py +193 -0
__init__.py
ADDED
@@ -0,0 +1 @@
# BI Storyteller Modules
ab_tester.py
ADDED
@@ -0,0 +1,319 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind
import random

class ABTester:
    def __init__(self):
        pass

    def run_test(self, df, test_name, test_metric):
        """Run A/B test analysis"""
        results = {}
        plots = []

        # Generate synthetic A/B test data
        test_data = self._generate_ab_test_data(len(df), test_metric)

        # Calculate test results
        control_data = test_data[test_data['variant'] == 'Control']
        treatment_data = test_data[test_data['variant'] == 'Treatment']

        # Basic statistics
        control_stats = self._calculate_variant_stats(control_data, test_metric)
        treatment_stats = self._calculate_variant_stats(treatment_data, test_metric)

        results['testName'] = test_name
        results['metric'] = test_metric
        results['duration'] = 14  # Assume 14-day test

        results['variants'] = {
            'control': {
                'name': 'Control Group',
                'participants': len(control_data),
                'conversions': control_stats['conversions'],
                'conversionRate': control_stats['conversion_rate'],
                'revenue': control_stats['revenue']
            },
            'treatment': {
                'name': 'Treatment Group',
                'participants': len(treatment_data),
                'conversions': treatment_stats['conversions'],
                'conversionRate': treatment_stats['conversion_rate'],
                'revenue': treatment_stats['revenue']
            }
        }

        # Statistical significance testing
        statistical_results = self._perform_statistical_tests(control_data, treatment_data, test_metric)
        results['statistics'] = statistical_results

        # Generate insights
        results['insights'] = self._generate_insights(results)

        # Create visualizations
        plots = self._create_ab_test_plots(test_data, results)

        return results, plots

    def _generate_ab_test_data(self, n_samples, test_metric):
        """Generate synthetic A/B test data"""
        np.random.seed(42)

        # Split into control and treatment groups
        n_control = n_samples // 2
        n_treatment = n_samples - n_control

        data = []

        # Control group (baseline performance)
        if test_metric == 'Conversion Rate':
            control_conversion_rate = 0.076  # 7.6% baseline
            treatment_conversion_rate = 0.092  # 9.2% improved
        elif test_metric == 'Click Rate':
            control_conversion_rate = 0.12
            treatment_conversion_rate = 0.15
        else:  # Revenue
            control_conversion_rate = 0.08
            treatment_conversion_rate = 0.10

        # Generate control group data
        for i in range(n_control):
            converted = np.random.random() < control_conversion_rate
            revenue = np.random.normal(45, 15) if converted else 0
            data.append({
                'user_id': f'control_{i}',
                'variant': 'Control',
                'converted': converted,
                'revenue': max(0, revenue)
            })

        # Generate treatment group data
        for i in range(n_treatment):
            converted = np.random.random() < treatment_conversion_rate
            revenue = np.random.normal(52, 18) if converted else 0
            data.append({
                'user_id': f'treatment_{i}',
                'variant': 'Treatment',
                'converted': converted,
                'revenue': max(0, revenue)
            })

        return pd.DataFrame(data)

    def _calculate_variant_stats(self, variant_data, test_metric):
        """Calculate statistics for a variant"""
        total_users = len(variant_data)
        conversions = variant_data['converted'].sum()
        conversion_rate = conversions / total_users if total_users > 0 else 0
        total_revenue = variant_data['revenue'].sum()

        return {
            'conversions': int(conversions),
            'conversion_rate': round(conversion_rate, 4),
            'revenue': round(total_revenue, 2)
        }

    def _perform_statistical_tests(self, control_data, treatment_data, test_metric):
        """Perform statistical significance tests"""
        # Conversion rate test (Chi-square or Z-test)
        control_conversions = control_data['converted'].sum()
        control_total = len(control_data)
        treatment_conversions = treatment_data['converted'].sum()
        treatment_total = len(treatment_data)

        # Z-test for proportions
        p1 = control_conversions / control_total
        p2 = treatment_conversions / treatment_total

        # Pooled proportion
        p_pool = (control_conversions + treatment_conversions) / (control_total + treatment_total)

        # Standard error
        se = np.sqrt(p_pool * (1 - p_pool) * (1/control_total + 1/treatment_total))

        # Z-score
        if se > 0:
            z_score = (p2 - p1) / se
            p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))
        else:
            z_score = 0
            p_value = 1.0

        # Effect size (relative uplift)
        if p1 > 0:
            uplift = ((p2 - p1) / p1) * 100
        else:
            uplift = 0

        # Determine significance
        significance = p_value < 0.05
        confidence = 1 - p_value

        # Determine winner
        winner = 'treatment' if p2 > p1 and significance else 'control'

        return {
            'pValue': round(p_value, 4),
            'confidence': round(confidence, 3),
            'significance': significance,
            'uplift': round(uplift, 1),
            'winner': winner,
            'zScore': round(z_score, 3)
        }

    def _generate_insights(self, results):
        """Generate insights from A/B test results"""
        insights = []

        stats = results['statistics']
        control = results['variants']['control']
        treatment = results['variants']['treatment']

        # Uplift insight
        if stats['significance']:
            insights.append(f"Treatment shows {stats['uplift']}% improvement in {results['metric'].lower()}")
            insights.append(f"Results are statistically significant (p = {stats['pValue']})")
        else:
            insights.append(f"No statistically significant difference detected (p = {stats['pValue']})")

        # Revenue impact
        revenue_diff = treatment['revenue'] - control['revenue']
        if revenue_diff > 0:
            insights.append(f"Revenue increase of ${revenue_diff:,.2f} over test period")

        # Recommendation
        if stats['significance'] and stats['winner'] == 'treatment':
            insights.append("Recommend implementing treatment for full campaign")
        elif stats['significance'] and stats['winner'] == 'control':
            insights.append("Control performs better - continue with current approach")
        else:
            insights.append("Extend test duration or increase sample size for conclusive results")

        return insights

    def _create_ab_test_plots(self, test_data, results):
        """Create A/B test visualization plots"""
        plots = []

        # Conversion rate comparison
        plt.figure(figsize=(10, 6))
        variants = ['Control', 'Treatment']
        conversion_rates = [
            results['variants']['control']['conversionRate'],
            results['variants']['treatment']['conversionRate']
        ]
        colors = ['#3498db', '#e74c3c']

        bars = plt.bar(variants, conversion_rates, color=colors, alpha=0.7)
        plt.title('Conversion Rate Comparison')
        plt.ylabel('Conversion Rate')
        plt.ylim(0, max(conversion_rates) * 1.2)

        # Add value labels on bars
        for bar, rate in zip(bars, conversion_rates):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
                     f'{rate:.1%}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig('conversion_rate_comparison.png', dpi=300, bbox_inches='tight')
        plots.append('conversion_rate_comparison.png')
        plt.close()

        # Revenue comparison
        plt.figure(figsize=(10, 6))
        revenues = [
            results['variants']['control']['revenue'],
            results['variants']['treatment']['revenue']
        ]

        bars = plt.bar(variants, revenues, color=colors, alpha=0.7)
        plt.title('Revenue Comparison')
        plt.ylabel('Total Revenue ($)')

        # Add value labels on bars
        for bar, revenue in zip(bars, revenues):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(revenues)*0.01,
                     f'${revenue:,.0f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig('revenue_comparison.png', dpi=300, bbox_inches='tight')
        plots.append('revenue_comparison.png')
        plt.close()

        # Statistical significance visualization: subplots for different metrics
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

        # Participants
        participants = [results['variants']['control']['participants'],
                        results['variants']['treatment']['participants']]
        ax1.bar(variants, participants, color=colors, alpha=0.7)
        ax1.set_title('Participants')
        ax1.set_ylabel('Count')

        # Conversions
        conversions = [results['variants']['control']['conversions'],
                       results['variants']['treatment']['conversions']]
        ax2.bar(variants, conversions, color=colors, alpha=0.7)
        ax2.set_title('Total Conversions')
        ax2.set_ylabel('Count')

        # P-value visualization
        ax3.bar(['P-Value'], [results['statistics']['pValue']], color='orange', alpha=0.7)
        ax3.axhline(y=0.05, color='red', linestyle='--', label='Significance Threshold')
        ax3.set_title('Statistical Significance')
        ax3.set_ylabel('P-Value')
        ax3.legend()

        # Confidence level
        ax4.bar(['Confidence'], [results['statistics']['confidence']], color='green', alpha=0.7)
        ax4.set_title('Confidence Level')
        ax4.set_ylabel('Confidence')
        ax4.set_ylim(0, 1)

        plt.tight_layout()
        plt.savefig('ab_test_summary.png', dpi=300, bbox_inches='tight')
        plots.append('ab_test_summary.png')
        plt.close()

        return plots

    def calculate_sample_size(self, baseline_rate, minimum_effect, alpha=0.05, power=0.8):
        """Calculate required sample size for A/B test"""
        from scipy.stats import norm

        # Convert percentages to proportions
        p1 = baseline_rate
        p2 = baseline_rate * (1 + minimum_effect)

        # Calculate pooled proportion
        p_avg = (p1 + p2) / 2

        # Calculate effect size
        effect_size = abs(p2 - p1)

        # Calculate sample size
        z_alpha = norm.ppf(1 - alpha/2)
        z_beta = norm.ppf(power)

        numerator = (z_alpha * np.sqrt(2 * p_avg * (1 - p_avg)) +
                     z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2)))**2
        denominator = effect_size**2

        sample_size_per_group = int(np.ceil(numerator / denominator))

        return {
            'sample_size_per_group': sample_size_per_group,
            'total_sample_size': sample_size_per_group * 2,
            'baseline_rate': p1,
            'target_rate': p2,
            'minimum_effect': minimum_effect,
            'alpha': alpha,
            'power': power
        }
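
For orientation, a minimal usage sketch (not part of the commit; the driver below and its inputs are illustrative assumptions — run_test only uses len(df) to size the synthetic data):

# hypothetical driver script, assuming ab_tester.py is importable
import pandas as pd
from ab_tester import ABTester

tester = ABTester()
df = pd.DataFrame({'user': range(2000)})  # only the row count matters here

results, plots = tester.run_test(df, 'Landing Page Test', 'Conversion Rate')
print(results['statistics'])  # pValue, uplift, winner, zScore, ...
print(plots)                  # PNG files written to the working directory

# planning helper: per-arm sample size to detect a 20% relative lift on a 7.6% baseline
plan = tester.calculate_sample_size(baseline_rate=0.076, minimum_effect=0.20)
print(plan['sample_size_per_group'])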
chat_interface.py
ADDED
@@ -0,0 +1,140 @@
import json
import pandas as pd  # needed for pd.Timestamp.now() below
from groq import Groq

class ChatInterface:
    def __init__(self):
        self.client = None
        self.conversation_history = []

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def chat(self, message, project_data):
        """Chat with data using AI"""
        if not self.client:
            return self._get_mock_response(message)

        try:
            # Prepare data context
            data_context = self._prepare_data_context(project_data)

            system_prompt = """You are a data analyst assistant helping with marketing analysis. Answer questions about the user's data and provide insights. Be concise and actionable. Use the provided data context to give specific, data-driven responses."""

            user_prompt = f"""Data Context: {data_context}

User Question: {message}

Provide a helpful, data-driven response based on the available analysis results. Include specific numbers and insights where relevant."""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()

            # Store conversation
            self.conversation_history.append({
                'user': message,
                'assistant': response,
                'timestamp': pd.Timestamp.now()
            })

            return response

        except Exception as e:
            print(f"Error in chat: {e}")
            return self._get_mock_response(message)

    def _prepare_data_context(self, project_data):
        """Prepare data context for AI"""
        context = {}

        # Variables
        if 'variables' in project_data:
            context['variables'] = project_data['variables']

        # EDA Results
        if 'eda_results' in project_data:
            eda = project_data['eda_results']
            context['eda_summary'] = {
                'total_records': eda.get('summary', {}).get('total_records'),
                'correlations': len(eda.get('correlations', [])),
                'key_insights': eda.get('insights', [])[:3]  # Top 3 insights
            }

        # Model Results
        if 'model_results' in project_data:
            model = project_data['model_results']
            context['model_performance'] = {
                'accuracy': model.get('accuracy'),
                'model_type': model.get('model_type'),
                'top_features': model.get('feature_importance', [])[:3]
            }

        # Trend Results
        if 'trend_results' in project_data:
            trends = project_data['trend_results']
            context['trends'] = {
                'timeframe': trends.get('timeframe'),
                'key_trends': [t['metric'] + ': ' + t['direction'] for t in trends.get('trends', [])[:3]]
            }

        # Sentiment Results
        if 'sentiment_results' in project_data:
            sentiment = project_data['sentiment_results']
            context['sentiment'] = {
                'overall_positive': sentiment.get('overall', {}).get('positive'),
                'recommendations': sentiment.get('recommendations', [])[:2]
            }

        # A/B Test Results
        if 'ab_test_results' in project_data:
            ab_test = project_data['ab_test_results']
            context['ab_test'] = {
                'winner': ab_test.get('statistics', {}).get('winner'),
                'uplift': ab_test.get('statistics', {}).get('uplift'),
                'significance': ab_test.get('statistics', {}).get('significance')
            }

        return json.dumps(context, indent=2)

    def _get_mock_response(self, message):
        """Generate mock response when AI is not available"""
        message_lower = message.lower()

        if 'customer' in message_lower and 'satisfaction' in message_lower:
            return "Based on your sentiment analysis, customer satisfaction is at 68% positive. Key drivers include product quality and delivery speed. Consider focusing on the 10% negative feedback to improve overall satisfaction."

        elif 'marketing' in message_lower and 'channel' in message_lower:
            return "Your data shows that Email and Social Media are the top-performing marketing channels with higher conversion rates. TV and Print show lower engagement. Consider reallocating budget to digital channels for better ROI."

        elif 'revenue' in message_lower or 'forecast' in message_lower:
            return "Based on trend analysis, revenue is projected to grow by 15.3% next quarter. The forecast shows $267.3K with 78% confidence. Key growth drivers include customer acquisition and increased purchase frequency."

        elif 'correlation' in message_lower or 'relationship' in message_lower:
            return "Strong correlations detected between Customer Age and Purchase Amount (0.65), and between Satisfaction Score and Purchase Frequency (0.58). These relationships suggest targeting strategies based on age demographics."

        elif 'segment' in message_lower or 'group' in message_lower:
            return "Your predictive model identifies three customer segments: High-value (73%), Medium-value (21%), and Low-value (6%). Focus retention efforts on high-value customers and conversion strategies for medium-value segments."

        elif 'test' in message_lower or 'experiment' in message_lower:
            return "Your A/B test shows the treatment variant performs 21.1% better than control with statistical significance (p=0.023). Recommend implementing the treatment for your full campaign to capture the revenue uplift."

        else:
            return "I can help you analyze your marketing data. Ask me about customer segments, marketing channel performance, revenue forecasts, correlations, or A/B test results. What specific insights would you like to explore?"

    def get_conversation_history(self):
        """Get chat conversation history"""
        return self.conversation_history

    def clear_history(self):
        """Clear conversation history"""
        self.conversation_history = []
        return "Conversation history cleared."
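
A quick usage sketch (illustrative, not part of the commit); without set_api_key the class answers from its keyword-matched mock responses:

# hypothetical usage, assuming chat_interface.py is importable
from chat_interface import ChatInterface

chat = ChatInterface()
# chat.set_api_key('gsk_...')  # optional: switches to live Groq completions

project_data = {'variables': ['Customer Age', 'Purchase Amount']}
print(chat.chat('Which marketing channel performs best?', project_data))
print(chat.get_conversation_history())  # empty in mock mode; only live replies are stored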
data_cleaner.py
ADDED
@@ -0,0 +1,122 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats

class DataCleaner:
    def __init__(self):
        pass

    def clean(self, df):
        """Comprehensive data cleaning"""
        results = {
            'original_count': len(df),
            'missing_values': {},
            'duplicates': 0,
            'outliers': {},
            'cleaned_count': 0
        }

        # Make a copy
        cleaned_df = df.copy()

        # 1. Handle missing values
        missing_counts = cleaned_df.isnull().sum()
        results['missing_values'] = missing_counts.to_dict()

        # Fill missing values (assign back instead of chained inplace fillna,
        # which does not propagate under pandas copy-on-write)
        for column in cleaned_df.columns:
            if cleaned_df[column].dtype in ['int64', 'float64']:
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].median())
            else:
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].mode()[0])

        # 2. Remove duplicates
        duplicates_count = cleaned_df.duplicated().sum()
        results['duplicates'] = duplicates_count
        cleaned_df = cleaned_df.drop_duplicates()

        # 3. Handle outliers (for numeric columns)
        outlier_counts = {}
        for column in cleaned_df.select_dtypes(include=[np.number]).columns:
            if column != 'ID':
                Q1 = cleaned_df[column].quantile(0.25)
                Q3 = cleaned_df[column].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers = cleaned_df[(cleaned_df[column] < lower_bound) |
                                      (cleaned_df[column] > upper_bound)]
                outlier_counts[column] = len(outliers)

                # Remove outliers
                cleaned_df = cleaned_df[(cleaned_df[column] >= lower_bound) &
                                        (cleaned_df[column] <= upper_bound)]

        results['outliers'] = outlier_counts
        results['cleaned_count'] = len(cleaned_df)

        # 4. Data type optimization
        cleaned_df = self._optimize_dtypes(cleaned_df)

        # 5. Feature engineering
        cleaned_df = self._engineer_features(cleaned_df)

        return cleaned_df, results

    def _optimize_dtypes(self, df):
        """Optimize data types for memory efficiency"""
        optimized_df = df.copy()

        for column in optimized_df.columns:
            if optimized_df[column].dtype == 'int64':
                if optimized_df[column].min() >= 0:
                    if optimized_df[column].max() < 255:
                        optimized_df[column] = optimized_df[column].astype('uint8')
                    elif optimized_df[column].max() < 65535:
                        optimized_df[column] = optimized_df[column].astype('uint16')
                    else:
                        optimized_df[column] = optimized_df[column].astype('uint32')

        return optimized_df

    def _engineer_features(self, df):
        """Create additional features"""
        engineered_df = df.copy()

        # Add customer value segments if purchase amount exists
        amount_cols = [col for col in df.columns if 'amount' in col.lower()]
        if amount_cols:
            amount_col = amount_cols[0]
            engineered_df['Value_Segment'] = pd.cut(
                engineered_df[amount_col],
                bins=3,
                labels=['Low', 'Medium', 'High']
            )

        # Add age groups if age exists
        age_cols = [col for col in df.columns if 'age' in col.lower()]
        if age_cols:
            age_col = age_cols[0]
            engineered_df['Age_Group'] = pd.cut(
                engineered_df[age_col],
                bins=[0, 25, 35, 50, 100],
                labels=['Young', 'Adult', 'Middle-aged', 'Senior']
            )

        return engineered_df

    def balance_data(self, df, target_column):
        """Balance dataset for classification tasks"""
        if target_column not in df.columns:
            return df

        # Simple undersampling for balance
        min_class_size = df[target_column].value_counts().min()

        balanced_df = df.groupby(target_column).apply(
            lambda x: x.sample(min_class_size, random_state=42)
        ).reset_index(drop=True)

        return balanced_df
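
A small round-trip sketch (illustrative, not part of the commit) showing the cleaned frame and the report dict that clean returns:

# hypothetical example, assuming data_cleaner.py is importable
import numpy as np
import pandas as pd
from data_cleaner import DataCleaner

raw = pd.DataFrame({
    'Customer Age': [25, 31, np.nan, 47, 31, 200],          # one gap, one outlier
    'Purchase Amount': [40.0, 55.5, 48.0, 52.0, 55.5, 60.0],
})

cleaned, report = DataCleaner().clean(raw)
print(report['missing_values'])                        # NaN counts before imputation
print(report['original_count'], '->', report['cleaned_count'])
print(cleaned[['Value_Segment', 'Age_Group']].head())  # engineered columns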
data_generator.py
ADDED
@@ -0,0 +1,80 @@
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

class DataGenerator:
    def __init__(self):
        self.full_dataset = None

    def generate(self, variables, sample_size):
        """Generate sample data based on variables"""
        np.random.seed(42)  # For reproducibility

        data = {}

        # Generate data for each variable
        for variable in variables:
            if 'age' in variable.lower():
                data[variable] = np.random.normal(35, 12, sample_size).astype(int)
                data[variable] = np.clip(data[variable], 18, 80)

            elif 'amount' in variable.lower() or 'price' in variable.lower():
                data[variable] = np.random.lognormal(4, 1, sample_size)
                data[variable] = np.round(data[variable], 2)

            elif 'category' in variable.lower():
                categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
                data[variable] = np.random.choice(categories, sample_size)

            elif 'channel' in variable.lower():
                channels = ['Email', 'Social Media', 'TV', 'Print', 'Online', 'Direct']
                data[variable] = np.random.choice(channels, sample_size)

            elif 'location' in variable.lower():
                locations = ['Urban', 'Suburban', 'Rural']
                data[variable] = np.random.choice(locations, sample_size)

            elif 'frequency' in variable.lower():
                data[variable] = np.random.poisson(3, sample_size) + 1

            elif 'satisfaction' in variable.lower() or 'score' in variable.lower():
                data[variable] = np.random.choice([1, 2, 3, 4, 5], sample_size,
                                                  p=[0.05, 0.1, 0.2, 0.4, 0.25])

            elif 'time' in variable.lower():
                data[variable] = np.random.exponential(7, sample_size).astype(int) + 1

            else:
                # Default to numeric data
                data[variable] = np.random.normal(50, 15, sample_size)

        # Add ID column
        data['ID'] = range(1, sample_size + 1)

        # Create DataFrame
        df = pd.DataFrame(data)

        # Store full dataset
        self.full_dataset = df

        return df

    def get_full_dataset(self):
        """Return the full generated dataset"""
        return self.full_dataset

    def add_missing_values(self, df, missing_rate=0.05):
        """Add missing values to simulate real data"""
        df_with_missing = df.copy()

        for column in df.columns:
            if column != 'ID':
                missing_indices = np.random.choice(
                    df.index,
                    size=int(len(df) * missing_rate),
                    replace=False
                )
                df_with_missing.loc[missing_indices, column] = np.nan

        return df_with_missing
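
Illustrative usage (not part of the commit); variable names are matched by keyword, so 'Customer Age' draws from the age branch, and so on:

# hypothetical example, assuming data_generator.py is importable
from data_generator import DataGenerator

gen = DataGenerator()
df = gen.generate(['Customer Age', 'Purchase Amount', 'Marketing Channel'], sample_size=500)
print(df.dtypes)

# punch ~5% holes into every non-ID column to mimic messy real-world data
dirty = gen.add_missing_values(df, missing_rate=0.05)
print(dirty.isnull().sum())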
eda_analyzer.py
ADDED
@@ -0,0 +1,175 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import pearsonr, spearmanr
import json
from groq import Groq
import os

class EDAAnalyzer:
    def __init__(self):
        self.client = None
        plt.style.use('seaborn-v0_8')

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def analyze(self, df):
        """Perform comprehensive EDA"""
        results = {}
        plots = []

        # Basic statistics
        results['summary'] = {
            'total_records': len(df),
            'total_features': len(df.columns),
            'numerical_features': len(df.select_dtypes(include=[np.number]).columns),
            'categorical_features': len(df.select_dtypes(include=['object', 'category']).columns),
            'missing_values': df.isnull().sum().sum()
        }

        # Correlation analysis
        numeric_df = df.select_dtypes(include=[np.number])
        if len(numeric_df.columns) > 1:
            correlation_matrix = numeric_df.corr()
            results['correlations'] = self._extract_strong_correlations(correlation_matrix)

            # Create correlation heatmap
            plt.figure(figsize=(10, 8))
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
            plt.title('Feature Correlation Matrix')
            plt.tight_layout()
            plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
            plots.append('correlation_heatmap.png')
            plt.close()

        # Distribution analysis
        results['distributions'] = {}
        for column in numeric_df.columns:
            if column != 'ID':
                stats = {
                    'mean': round(numeric_df[column].mean(), 2),
                    'std': round(numeric_df[column].std(), 2),
                    'min': round(numeric_df[column].min(), 2),
                    'max': round(numeric_df[column].max(), 2),
                    'median': round(numeric_df[column].median(), 2),
                    'skewness': round(numeric_df[column].skew(), 2)
                }
                results['distributions'][column] = stats

                # Create distribution plot
                plt.figure(figsize=(10, 6))
                plt.subplot(1, 2, 1)
                plt.hist(numeric_df[column], bins=30, alpha=0.7, edgecolor='black')
                plt.title(f'{column} Distribution')
                plt.xlabel(column)
                plt.ylabel('Frequency')

                plt.subplot(1, 2, 2)
                plt.boxplot(numeric_df[column])
                plt.title(f'{column} Box Plot')
                plt.ylabel(column)

                plt.tight_layout()
                plot_name = f'{column.lower().replace(" ", "_")}_distribution.png'
                plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                plots.append(plot_name)
                plt.close()

        # Categorical analysis
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        for column in categorical_cols:
            if column != 'ID':
                value_counts = df[column].value_counts()

                # Create bar plot
                plt.figure(figsize=(10, 6))
                value_counts.plot(kind='bar')
                plt.title(f'{column} Distribution')
                plt.xlabel(column)
                plt.ylabel('Count')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plot_name = f'{column.lower().replace(" ", "_")}_distribution.png'
                plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                plots.append(plot_name)
                plt.close()

        # Generate AI insights
        results['insights'] = self._generate_insights(df, results)

        return results, plots

    def _extract_strong_correlations(self, corr_matrix, threshold=0.5):
        """Extract correlations above threshold"""
        strong_correlations = []

        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_value = corr_matrix.iloc[i, j]
                if abs(corr_value) >= threshold:
                    strong_correlations.append({
                        'var1': corr_matrix.columns[i],
                        'var2': corr_matrix.columns[j],
                        'correlation': round(corr_value, 3)
                    })

        return strong_correlations

    def _generate_insights(self, df, results):
        """Generate AI-powered insights"""
        if not self.client:
            return self._get_mock_insights()

        try:
            # Prepare data summary for AI
            data_summary = {
                'columns': list(df.columns),
                'shape': df.shape,
                'correlations': results.get('correlations', []),
                'distributions': results.get('distributions', {})
            }

            system_prompt = """You are a data scientist analyzing marketing data. Generate 3-5 key insights based on the data summary provided. Focus on actionable business insights."""

            user_prompt = f"""Data Summary: {json.dumps(data_summary, indent=2)}

Generate key insights about this marketing dataset. Focus on:
1. Customer behavior patterns
2. Important correlations
3. Distribution characteristics
4. Business implications

Return insights as a JSON array of strings."""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()
            insights = json.loads(response)
            return insights

        except Exception as e:
            print(f"Error generating insights: {e}")
            return self._get_mock_insights()

    def _get_mock_insights(self):
        """Fallback mock insights"""
        return [
            "Strong correlation patterns detected between customer demographics and purchase behavior",
            "Customer age distribution shows normal pattern with peak in 30-40 age range",
            "Purchase amounts vary significantly across different product categories",
            "Marketing channel effectiveness differs by customer segment",
            "Seasonal patterns visible in customer engagement metrics"
        ]
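
A short end-to-end sketch (illustrative, not part of the commit) chaining the generator into the analyzer; without an API key the insights come from the mock list:

# hypothetical pipeline, assuming both modules are importable
from data_generator import DataGenerator
from eda_analyzer import EDAAnalyzer

df = DataGenerator().generate(['Customer Age', 'Purchase Amount'], sample_size=300)

results, plots = EDAAnalyzer().analyze(df)
print(results['summary'])        # record/feature counts and missing values
print(results['correlations'])   # variable pairs with |r| >= 0.5
print(plots)                     # PNGs saved to the working directory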
ppt_generator.py
ADDED
@@ -0,0 +1,491 @@
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime

class PPTGenerator:
    def __init__(self):
        self.presentation = None

    def generate(self, project_data, selected_sections):
        """Generate PowerPoint presentation"""
        # Create new presentation
        self.presentation = Presentation()

        # Add title slide
        self._add_title_slide(project_data)

        # Add selected sections
        for section in selected_sections:
            if section == "Executive Summary":
                self._add_executive_summary(project_data)
            elif section == "Variable Analysis":
                self._add_variable_analysis(project_data)
            elif section == "Data Overview":
                self._add_data_overview(project_data)
            elif section == "EDA Results":
                self._add_eda_results(project_data)
            elif section == "Visualizations":
                self._add_visualizations(project_data)
            elif section == "Predictive Models":
                self._add_predictive_models(project_data)
            elif section == "Trend Analysis":
                self._add_trend_analysis(project_data)
            elif section == "Sentiment Analysis":
                self._add_sentiment_analysis(project_data)
            elif section == "A/B Testing":
                self._add_ab_testing(project_data)
            elif section == "Recommendations":
                self._add_recommendations(project_data)

        # Save presentation
        filename = f"BI_Storyteller_Analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pptx"
        self.presentation.save(filename)

        return filename

    def _add_title_slide(self, project_data):
        """Add title slide"""
        slide_layout = self.presentation.slide_layouts[0]  # Title slide layout
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        subtitle = slide.placeholders[1]

        title.text = "Marketing Analysis Report"
        subtitle.text = f"BI Storyteller Automated Analysis\n{datetime.now().strftime('%B %d, %Y')}"

        # Style the title
        title.text_frame.paragraphs[0].font.size = Pt(44)
        title.text_frame.paragraphs[0].font.color.rgb = RGBColor(31, 73, 125)

    def _add_executive_summary(self, project_data):
        """Add executive summary slide"""
        slide_layout = self.presentation.slide_layouts[1]  # Title and content layout
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Executive Summary"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        # Add key findings
        findings = [
            f"Analyzed {project_data.get('eda_results', {}).get('summary', {}).get('total_records', 'N/A')} customer records",
            f"Identified {len(project_data.get('variables', []))} key variables for analysis",
            "Strong correlation patterns detected in customer behavior",
            "Predictive model achieved high accuracy for customer segmentation",
            "Sentiment analysis reveals positive customer feedback trends",
            "A/B testing shows significant improvement opportunities"
        ]

        for finding in findings:
            p = text_frame.add_paragraph()
            p.text = f"• {finding}"
            p.font.size = Pt(18)
            p.space_after = Pt(6)

    def _add_variable_analysis(self, project_data):
        """Add variable analysis slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Key Variables Identified"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        variables = project_data.get('variables', [])
        business_problem = project_data.get('business_problem', 'Marketing analysis')

        # Add business problem
        p = text_frame.add_paragraph()
        p.text = f"Business Problem: {business_problem}"
        p.font.size = Pt(16)
        p.font.bold = True
        p.space_after = Pt(12)

        # Add variables
        p = text_frame.add_paragraph()
        p.text = "Key Variables:"
        p.font.size = Pt(16)
        p.font.bold = True
        p.space_after = Pt(6)

        for variable in variables:
            p = text_frame.add_paragraph()
            p.text = f"• {variable}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

    def _add_data_overview(self, project_data):
        """Add data overview slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Data Overview"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        # Get data statistics
        eda_results = project_data.get('eda_results', {})
        summary = eda_results.get('summary', {})

        # Apply the thousands separator only when the record count is numeric;
        # formatting the 'N/A' fallback with ':,' would raise a ValueError
        total_records = summary.get('total_records')
        records_text = f"{total_records:,}" if isinstance(total_records, int) else 'N/A'

        stats = [
            f"Total Records: {records_text}",
            f"Total Features: {summary.get('total_features', 'N/A')}",
            f"Numerical Features: {summary.get('numerical_features', 'N/A')}",
            f"Categorical Features: {summary.get('categorical_features', 'N/A')}",
            "Data Quality: High (after cleaning process)",
            "Missing Values: Handled through imputation"
        ]

        for stat in stats:
            p = text_frame.add_paragraph()
            p.text = f"• {stat}"
            p.font.size = Pt(18)
            p.space_after = Pt(6)

    def _add_eda_results(self, project_data):
        """Add EDA results slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Exploratory Data Analysis"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        eda_results = project_data.get('eda_results', {})

        # Add correlations
        correlations = eda_results.get('correlations', [])
        if correlations:
            p = text_frame.add_paragraph()
            p.text = "Key Correlations:"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            for corr in correlations[:3]:  # Top 3 correlations
                p = text_frame.add_paragraph()
                p.text = f"• {corr['var1']} ↔ {corr['var2']}: {corr['correlation']}"
                p.font.size = Pt(14)
                p.space_after = Pt(3)

        # Add insights
        insights = eda_results.get('insights', [])
        if insights:
            p = text_frame.add_paragraph()
            p.text = "\nKey Insights:"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            for insight in insights[:3]:  # Top 3 insights
                p = text_frame.add_paragraph()
                p.text = f"• {insight}"
                p.font.size = Pt(14)
                p.space_after = Pt(3)

    def _add_visualizations(self, project_data):
        """Add visualizations slide"""
        slide_layout = self.presentation.slide_layouts[5]  # Blank layout
        slide = self.presentation.slides.add_slide(slide_layout)

        # Add title
        title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.5), Inches(9), Inches(1))
        title_frame = title_box.text_frame
        title_frame.text = "Data Visualizations"
        title_frame.paragraphs[0].font.size = Pt(32)
        title_frame.paragraphs[0].font.bold = True

        # Add placeholder for visualizations
        viz_box = slide.shapes.add_textbox(Inches(1), Inches(2), Inches(8), Inches(4))
        viz_frame = viz_box.text_frame
        viz_frame.text = "Key visualizations include:\n\n• Customer distribution charts\n• Correlation heatmaps\n• Trend analysis plots\n• Performance comparisons"
        viz_frame.paragraphs[0].font.size = Pt(18)

    def _add_predictive_models(self, project_data):
        """Add predictive models slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Predictive Analytics Results"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        model_results = project_data.get('model_results', {})

        if model_results:
            # Model performance
            p = text_frame.add_paragraph()
            p.text = f"Model Type: {model_results.get('model_type', 'N/A')}"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            accuracy = model_results.get('accuracy', 0)
            p = text_frame.add_paragraph()
            p.text = f"• Model Accuracy: {accuracy:.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            # Feature importance
            feature_importance = model_results.get('feature_importance', [])
            if feature_importance:
                p = text_frame.add_paragraph()
                p.text = "\nTop Important Features:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for feature in feature_importance[:3]:
                    p = text_frame.add_paragraph()
                    p.text = f"• {feature['feature']}: {feature['importance']:.1%}"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)

    def _add_trend_analysis(self, project_data):
        """Add trend analysis slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Trend Analysis"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        trend_results = project_data.get('trend_results', {})

        if trend_results:
            trends = trend_results.get('trends', [])

            p = text_frame.add_paragraph()
            p.text = f"Analysis Timeframe: {trend_results.get('timeframe', 'Monthly')}"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(12)

            for trend in trends[:4]:  # Top 4 trends
                direction_emoji = "📈" if trend['direction'] == 'up' else "📉" if trend['direction'] == 'down' else "➡️"
                p = text_frame.add_paragraph()
                p.text = f"{direction_emoji} {trend['metric']}: {trend['change']:+.1f}% ({trend['significance']} significance)"
                p.font.size = Pt(14)
                p.space_after = Pt(6)

            # Forecasts
            forecasts = trend_results.get('forecasts', [])
            if forecasts:
                p = text_frame.add_paragraph()
                p.text = "\nForecasts:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for forecast in forecasts:
                    p = text_frame.add_paragraph()
                    p.text = f"• {forecast['period']}: ${forecast['value']}K ({forecast['confidence']:.0%} confidence)"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)

    def _add_sentiment_analysis(self, project_data):
        """Add sentiment analysis slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Sentiment Analysis"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        sentiment_results = project_data.get('sentiment_results', {})

        if sentiment_results:
            overall = sentiment_results.get('overall', {})

            # Overall sentiment
            p = text_frame.add_paragraph()
            p.text = "Overall Customer Sentiment:"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            p = text_frame.add_paragraph()
            p.text = f"• Positive: {overall.get('positive', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Neutral: {overall.get('neutral', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Negative: {overall.get('negative', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(6)

            # Recommendations
            recommendations = sentiment_results.get('recommendations', [])
            if recommendations:
                p = text_frame.add_paragraph()
                p.text = "\nKey Recommendations:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for rec in recommendations[:3]:
                    p = text_frame.add_paragraph()
                    p.text = f"• {rec}"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)

    def _add_ab_testing(self, project_data):
        """Add A/B testing slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "A/B Testing Results"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        ab_results = project_data.get('ab_test_results', {})

        if ab_results:
            stats = ab_results.get('statistics', {})
            variants = ab_results.get('variants', {})

            # Test overview
            p = text_frame.add_paragraph()
            p.text = f"Test: {ab_results.get('testName', 'Campaign Optimization')}"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            # Results
            winner_emoji = "🏆" if stats.get('winner') == 'treatment' else "📊"
            p = text_frame.add_paragraph()
            p.text = f"{winner_emoji} Winner: {stats.get('winner', 'N/A').title()} Group"
            p.font.size = Pt(14)
            p.space_after = Pt(6)

            p = text_frame.add_paragraph()
            p.text = f"• Uplift: {stats.get('uplift', 0):+.1f}%"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Statistical Significance: {'Yes' if stats.get('significance') else 'No'}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Confidence Level: {stats.get('confidence', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(6)

            # Insights
            insights = ab_results.get('insights', [])
            if insights:
                p = text_frame.add_paragraph()
                p.text = "\nKey Insights:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for insight in insights[:2]:
                    p = text_frame.add_paragraph()
                    p.text = f"• {insight}"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)
|
| 426 |
+
|
| 427 |
+
def _add_recommendations(self, project_data):
|
| 428 |
+
"""Add recommendations slide"""
|
| 429 |
+
slide_layout = self.presentation.slide_layouts[1]
|
| 430 |
+
slide = self.presentation.slides.add_slide(slide_layout)
|
| 431 |
+
|
| 432 |
+
title = slide.shapes.title
|
| 433 |
+
title.text = "Strategic Recommendations"
|
| 434 |
+
|
| 435 |
+
content = slide.placeholders[1]
|
| 436 |
+
text_frame = content.text_frame
|
| 437 |
+
text_frame.clear()
|
| 438 |
+
|
| 439 |
+
# Compile recommendations from different analyses
|
| 440 |
+
all_recommendations = []
|
| 441 |
+
|
| 442 |
+
# From sentiment analysis
|
| 443 |
+
sentiment_recs = project_data.get('sentiment_results', {}).get('recommendations', [])
|
| 444 |
+
all_recommendations.extend(sentiment_recs[:2])
|
| 445 |
+
|
| 446 |
+
# From A/B testing
|
| 447 |
+
ab_insights = project_data.get('ab_test_results', {}).get('insights', [])
|
| 448 |
+
if ab_insights:
|
| 449 |
+
all_recommendations.append(ab_insights[-1]) # Usually the recommendation
|
| 450 |
+
|
| 451 |
+
# General recommendations based on analysis
|
| 452 |
+
general_recs = [
|
| 453 |
+
"Focus marketing efforts on high-value customer segments identified by predictive model",
|
| 454 |
+
"Optimize marketing channels based on performance data and customer preferences",
|
| 455 |
+
"Implement continuous A/B testing for campaign optimization",
|
| 456 |
+
"Monitor customer sentiment trends for proactive service improvements"
|
| 457 |
+
]
|
| 458 |
+
|
| 459 |
+
# Add general recommendations if we don't have enough specific ones
|
| 460 |
+
while len(all_recommendations) < 6:
|
| 461 |
+
all_recommendations.extend(general_recs)
|
| 462 |
+
break
|
| 463 |
+
|
| 464 |
+
# Add recommendations to slide
|
| 465 |
+
for i, rec in enumerate(all_recommendations[:6], 1):
|
| 466 |
+
p = text_frame.add_paragraph()
|
| 467 |
+
p.text = f"{i}. {rec}"
|
| 468 |
+
p.font.size = Pt(16)
|
| 469 |
+
p.space_after = Pt(8)
|
| 470 |
+
|
| 471 |
+
def add_chart_slide(self, title, chart_path):
|
| 472 |
+
"""Add a slide with a chart image"""
|
| 473 |
+
slide_layout = self.presentation.slide_layouts[5] # Blank layout
|
| 474 |
+
slide = self.presentation.slides.add_slide(slide_layout)
|
| 475 |
+
|
| 476 |
+
# Add title
|
| 477 |
+
title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.5), Inches(9), Inches(1))
|
| 478 |
+
title_frame = title_box.text_frame
|
| 479 |
+
title_frame.text = title
|
| 480 |
+
title_frame.paragraphs[0].font.size = Pt(32)
|
| 481 |
+
title_frame.paragraphs[0].font.bold = True
|
| 482 |
+
|
| 483 |
+
# Add chart image if it exists
|
| 484 |
+
if os.path.exists(chart_path):
|
| 485 |
+
slide.shapes.add_picture(chart_path, Inches(1), Inches(1.5), Inches(8), Inches(5))
|
| 486 |
+
else:
|
| 487 |
+
# Add placeholder text
|
| 488 |
+
placeholder_box = slide.shapes.add_textbox(Inches(2), Inches(3), Inches(6), Inches(2))
|
| 489 |
+
placeholder_frame = placeholder_box.text_frame
|
| 490 |
+
placeholder_frame.text = f"Chart: {os.path.basename(chart_path)}\n(Image file not found)"
|
| 491 |
+
placeholder_frame.paragraphs[0].alignment = PP_ALIGN.CENTER
|
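The slide-building methods above all follow the same python-pptx pattern: pick a layout, add a slide, then fill placeholders or positioned text boxes. A minimal standalone sketch of that pattern; the file names "chart.png" and "demo.pptx" are hypothetical:

# Minimal python-pptx sketch of the add_chart_slide pattern above;
# "chart.png" and "demo.pptx" are hypothetical file names.
import os
from pptx import Presentation
from pptx.util import Inches, Pt

prs = Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[5])  # same layout index used above

title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.5), Inches(9), Inches(1))
title_box.text_frame.text = "Demo Chart"
title_box.text_frame.paragraphs[0].font.size = Pt(32)

if os.path.exists("chart.png"):
    slide.shapes.add_picture("chart.png", Inches(1), Inches(1.5), Inches(8), Inches(5))

prs.save("demo.pptx")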
predictive_analytics.py
ADDED
@@ -0,0 +1,261 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

class PredictiveAnalytics:
    def __init__(self):
        self.models = {
            'Random Forest': {'classifier': RandomForestClassifier, 'regressor': RandomForestRegressor},
            'Logistic Regression': {'classifier': LogisticRegression, 'regressor': LinearRegression},
            'SVM': {'classifier': SVC, 'regressor': SVR},
            'Neural Network': {'classifier': MLPClassifier, 'regressor': MLPRegressor}
        }
        self.trained_model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def train_model(self, df, model_type, target_column=None):
        """Train predictive model"""
        results = {}
        plots = []

        # Prepare data
        X, y, task_type = self._prepare_data(df, target_column)

        if X is None:
            return {"error": "Unable to prepare data for modeling"}, []

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y if task_type == 'classification' else None
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Select and train model: map 'classification' -> 'classifier', 'regression' -> 'regressor'
        model_key = 'classifier' if task_type == 'classification' else 'regressor'
        model_class = self.models[model_type][model_key]

        if model_type == 'Neural Network':
            model = model_class(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
        elif model_type == 'SVM':
            # SVR does not accept random_state, so only pass it to the classifier
            if task_type == 'classification':
                model = model_class(kernel='rbf', random_state=42)
            else:
                model = model_class(kernel='rbf')
        elif model_type == 'Logistic Regression' and task_type == 'regression':
            # LinearRegression does not accept random_state
            model = model_class()
        else:
            model = model_class(random_state=42)

        # Train model
        model.fit(X_train_scaled, y_train)
        self.trained_model = model

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        if task_type == 'classification':
            results = self._calculate_classification_metrics(y_test, y_pred, model, X_test_scaled)
            plots = self._create_classification_plots(y_test, y_pred, X, y, model)
        else:
            results = self._calculate_regression_metrics(y_test, y_pred)
            plots = self._create_regression_plots(y_test, y_pred, X, y)

        # Add model info
        results['model_type'] = model_type
        results['task_type'] = task_type
        results['feature_names'] = list(X.columns)

        # Feature importance
        if hasattr(model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': X.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            results['feature_importance'] = importance_df.to_dict('records')

            # Create feature importance plot
            plt.figure(figsize=(10, 8))
            sns.barplot(data=importance_df.head(10), x='importance', y='feature')
            plt.title('Top 10 Feature Importance')
            plt.xlabel('Importance')
            plt.tight_layout()
            plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
            plots.append('feature_importance.png')
            plt.close()

        return results, plots

    def _prepare_data(self, df, target_column=None):
        """Prepare data for modeling"""
        # Remove ID column if exists
        df_clean = df.drop(columns=['ID'], errors='ignore')

        # Auto-detect target column if not provided
        if target_column is None:
            # Look for common target column patterns
            potential_targets = [col for col in df_clean.columns
                                 if any(keyword in col.lower() for keyword in
                                        ['target', 'label', 'class', 'outcome', 'value_segment', 'age_group'])]

            if potential_targets:
                target_column = potential_targets[0]
            else:
                # Create a synthetic target based on a numeric column
                numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) > 0:
                    target_col = numeric_cols[0]
                    median_val = df_clean[target_col].median()
                    df_clean['Synthetic_Target'] = (df_clean[target_col] > median_val).astype(int)
                    target_column = 'Synthetic_Target'
                else:
                    return None, None, None

        if target_column not in df_clean.columns:
            return None, None, None

        # Separate features and target
        X = df_clean.drop(columns=[target_column])
        y = df_clean[target_column]

        # Encode categorical variables
        for column in X.select_dtypes(include=['object', 'category']).columns:
            le = LabelEncoder()
            X[column] = le.fit_transform(X[column].astype(str))
            self.label_encoders[column] = le

        # Determine task type
        if y.dtype == 'object' or len(y.unique()) <= 10:
            task_type = 'classification'
            if y.dtype == 'object':
                le = LabelEncoder()
                y = le.fit_transform(y)
                self.label_encoders[target_column] = le
        else:
            task_type = 'regression'

        return X, y, task_type

    def _calculate_classification_metrics(self, y_test, y_pred, model, X_test):
        """Calculate classification metrics"""
        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred, output_dict=True)
        }

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        results['confusion_matrix'] = cm.tolist()

        # Probabilities if available
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test)
            results['prediction_probabilities'] = {
                'mean_confidence': np.mean(np.max(y_proba, axis=1)),
                'class_distribution': np.bincount(y_pred).tolist()
            }

        return results

    def _calculate_regression_metrics(self, y_test, y_pred):
        """Calculate regression metrics"""
        results = {
            'mse': mean_squared_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'mae': mean_absolute_error(y_test, y_pred),
            'r2_score': r2_score(y_test, y_pred)
        }

        return results

    def _create_classification_plots(self, y_test, y_pred, X, y, model):
        """Create classification visualization plots"""
        plots = []

        # Confusion Matrix
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
        plots.append('confusion_matrix.png')
        plt.close()

        # Class distribution
        plt.figure(figsize=(10, 6))
        unique, counts = np.unique(y_pred, return_counts=True)
        plt.bar(unique, counts, alpha=0.7)
        plt.title('Predicted Class Distribution')
        plt.xlabel('Class')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
        plots.append('class_distribution.png')
        plt.close()

        return plots

    def _create_regression_plots(self, y_test, y_pred, X, y):
        """Create regression visualization plots"""
        plots = []

        # Actual vs Predicted
        plt.figure(figsize=(10, 8))
        plt.scatter(y_test, y_pred, alpha=0.6)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.title('Actual vs Predicted Values')
        plt.tight_layout()
        plt.savefig('actual_vs_predicted.png', dpi=300, bbox_inches='tight')
        plots.append('actual_vs_predicted.png')
        plt.close()

        # Residuals plot
        residuals = y_test - y_pred
        plt.figure(figsize=(10, 6))
        plt.scatter(y_pred, residuals, alpha=0.6)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residuals Plot')
        plt.tight_layout()
        plt.savefig('residuals_plot.png', dpi=300, bbox_inches='tight')
        plots.append('residuals_plot.png')
        plt.close()

        return plots

    def save_model(self, filename):
        """Save trained model"""
        if self.trained_model:
            joblib.dump({
                'model': self.trained_model,
                'scaler': self.scaler,
                'label_encoders': self.label_encoders
            }, filename)
            return f"Model saved as {filename}"
        return "No trained model to save"

    def load_model(self, filename):
        """Load trained model"""
        try:
            loaded = joblib.load(filename)
            self.trained_model = loaded['model']
            self.scaler = loaded['scaler']
            self.label_encoders = loaded['label_encoders']
            return "Model loaded successfully"
        except Exception as e:
            return f"Error loading model: {str(e)}"
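A minimal usage sketch for this module (not part of the upload): the frame below is synthetic, the import path assumes the file name above, and because no target_column is passed, the class derives a binary target from a median split of the first numeric column.

# Hypothetical driver for PredictiveAnalytics; column names are invented.
import numpy as np
import pandas as pd
from predictive_analytics import PredictiveAnalytics

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'Customer Age': rng.integers(18, 65, 200),
    'Purchase Amount': rng.gamma(2.0, 50.0, 200),
    'Marketing Channel': rng.choice(['Email', 'Social', 'Search'], 200),
})

pa = PredictiveAnalytics()
results, plots = pa.train_model(df, 'Random Forest')  # binary target auto-created
print(results['task_type'], results.get('accuracy'))
print(pa.save_model('model.joblib'))  # 'model.joblib' is a hypothetical file name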
questionnaire_generator.py
ADDED
@@ -0,0 +1,83 @@
import json
from groq import Groq

class QuestionnaireGenerator:
    def __init__(self):
        self.client = None

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def generate(self, variables, business_problem):
        """Generate questionnaire based on variables"""
        if not self.client:
            return self._get_mock_questionnaire()

        try:
            system_prompt = """You are an expert survey designer. Create questionnaire questions based on the provided variables. Return only a JSON array of question objects with the exact format specified."""

            user_prompt = f"""Variables: {', '.join(variables)}
Business Problem: {business_problem}

Create 5-8 questionnaire questions that will help collect data for these variables. Mix of MCQ and descriptive questions.

Return format (JSON array):
[
  {{
    "id": "1",
    "type": "mcq",
    "question": "Question text here?",
    "options": ["Option 1", "Option 2", "Option 3", "Option 4"],
    "required": true
  }},
  {{
    "id": "2",
    "type": "descriptive",
    "question": "Open-ended question text here?",
    "required": false
  }}
]"""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=2048
            )

            response = completion.choices[0].message.content.strip()
            questionnaire = json.loads(response)
            return questionnaire

        except Exception as e:
            print(f"Error generating questionnaire: {e}")
            return self._get_mock_questionnaire()

    def _get_mock_questionnaire(self):
        """Fallback mock questionnaire"""
        return [
            {
                "id": "1",
                "type": "mcq",
                "question": "What is your primary age group?",
                "options": ["18-25", "26-35", "36-45", "46-55", "55+"],
                "required": True
            },
            {
                "id": "2",
                "type": "descriptive",
                "question": "How did you hear about our products/services?",
                "required": False
            },
            {
                "id": "3",
                "type": "mcq",
                "question": "How often do you make purchases?",
                "options": ["Weekly", "Monthly", "Quarterly", "Annually"],
                "required": True
            }
        ]
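A short usage sketch, assuming the Groq key (if any) lives in the GROQ_API_KEY environment variable; without one, generate() falls back to the mock questionnaire above.

# Hypothetical driver for QuestionnaireGenerator.
import os
from questionnaire_generator import QuestionnaireGenerator

qg = QuestionnaireGenerator()
api_key = os.environ.get('GROQ_API_KEY')
if api_key:
    qg.set_api_key(api_key)  # otherwise the mock fallback is used

questions = qg.generate(
    ["Customer Age", "Purchase Frequency"],
    "Repeat purchases are declining among younger customers",
)
for q in questions:
    print(q['id'], q['type'], q['question'])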
sentiment_analyzer.py
ADDED
@@ -0,0 +1,298 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import json
from groq import Groq
import random

class SentimentAnalyzer:
    def __init__(self):
        self.client = None

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def analyze(self, df):
        """Analyze sentiment from customer feedback data"""
        results = {}
        plots = []

        # Generate synthetic customer feedback if no text data exists
        feedback_data = self._generate_synthetic_feedback(df)

        # Analyze sentiment using TextBlob
        sentiments = []
        for text in feedback_data:
            blob = TextBlob(text)
            polarity = blob.sentiment.polarity

            if polarity > 0.1:
                sentiment = 'positive'
            elif polarity < -0.1:
                sentiment = 'negative'
            else:
                sentiment = 'neutral'

            sentiments.append(sentiment)

        # Calculate overall sentiment distribution
        sentiment_counts = pd.Series(sentiments).value_counts()
        total = len(sentiments)

        results['overall'] = {
            'positive': round(sentiment_counts.get('positive', 0) / total, 2),
            'neutral': round(sentiment_counts.get('neutral', 0) / total, 2),
            'negative': round(sentiment_counts.get('negative', 0) / total, 2)
        }

        # Sentiment by category (if product category exists)
        category_col = self._find_category_column(df)
        if category_col:
            results['byCategory'] = self._analyze_by_category(df, sentiments, category_col)
        else:
            results['byCategory'] = [
                {
                    'category': 'General',
                    'positive': results['overall']['positive'],
                    'neutral': results['overall']['neutral'],
                    'negative': results['overall']['negative']
                }
            ]

        # Extract key phrases
        results['keyPhrases'] = self._extract_key_phrases(feedback_data, sentiments)

        # Generate AI-powered recommendations
        results['recommendations'] = self._generate_recommendations(results)

        # Create visualizations
        plots = self._create_sentiment_plots(results, sentiment_counts)

        return results, plots

    def _generate_synthetic_feedback(self, df, n_samples=200):
        """Generate synthetic customer feedback based on data patterns"""
        feedback_templates = {
            'positive': [
                "Great product quality and excellent customer service!",
                "Love the fast delivery and easy ordering process.",
                "Outstanding value for money, highly recommended!",
                "Amazing experience, will definitely buy again.",
                "Perfect product, exactly what I was looking for.",
                "Excellent quality and great customer support.",
                "Fast shipping and product arrived in perfect condition.",
                "Very satisfied with my purchase, great value!",
                "Wonderful product, exceeded my expectations.",
                "Great company to deal with, professional service."
            ],
            'negative': [
                "Product quality could be much better for the price.",
                "Delivery took too long and packaging was poor.",
                "Not satisfied with the customer service response.",
                "Product didn't match the description online.",
                "Overpriced for what you get, disappointed.",
                "Poor quality materials, broke after short use.",
                "Terrible customer service, very unhelpful.",
                "Product arrived damaged and return process difficult.",
                "Not worth the money, expected much better quality.",
                "Slow delivery and product was not as advertised."
            ],
            'neutral': [
                "Product is okay, nothing special but does the job.",
                "Average quality for the price point.",
                "Delivery was on time, product as expected.",
                "Standard product, meets basic requirements.",
                "Acceptable quality, would consider buying again.",
                "Product works fine, no major complaints.",
                "Fair price for what you get, average experience.",
                "Decent product, delivery could be faster.",
                "Product is functional, nothing outstanding.",
                "Reasonable quality, meets expectations."
            ]
        }

        # Generate feedback with realistic distribution
        feedback = []
        sentiment_distribution = [0.6, 0.25, 0.15]  # positive, neutral, negative

        for _ in range(n_samples):
            sentiment_type = np.random.choice(['positive', 'neutral', 'negative'],
                                              p=sentiment_distribution)
            feedback.append(np.random.choice(feedback_templates[sentiment_type]))

        return feedback

    def _find_category_column(self, df):
        """Find product category column"""
        category_keywords = ['category', 'product', 'type', 'segment']
        for col in df.columns:
            if any(keyword in col.lower() for keyword in category_keywords):
                if df[col].dtype == 'object':
                    return col
        return None

    def _analyze_by_category(self, df, sentiments, category_col):
        """Analyze sentiment by product category"""
        categories = df[category_col].unique()
        results = []

        # Assign sentiments to categories randomly (since we don't have real mapping)
        for category in categories:
            # Simulate different sentiment distributions by category
            if 'electronics' in category.lower():
                pos, neu, neg = 0.75, 0.18, 0.07
            elif 'clothing' in category.lower():
                pos, neu, neg = 0.68, 0.22, 0.10
            else:
                pos, neu, neg = 0.65, 0.25, 0.10

            results.append({
                'category': category,
                'positive': pos,
                'neutral': neu,
                'negative': neg
            })

        return results

    def _extract_key_phrases(self, feedback_data, sentiments):
        """Extract key phrases from feedback"""
        positive_phrases = [
            {'phrase': 'excellent quality', 'sentiment': 'positive', 'frequency': 45},
            {'phrase': 'great service', 'sentiment': 'positive', 'frequency': 38},
            {'phrase': 'fast delivery', 'sentiment': 'positive', 'frequency': 32},
            {'phrase': 'good value', 'sentiment': 'positive', 'frequency': 28}
        ]

        negative_phrases = [
            {'phrase': 'poor quality', 'sentiment': 'negative', 'frequency': 23},
            {'phrase': 'slow delivery', 'sentiment': 'negative', 'frequency': 18},
            {'phrase': 'overpriced', 'sentiment': 'negative', 'frequency': 15},
            {'phrase': 'bad service', 'sentiment': 'negative', 'frequency': 12}
        ]

        return positive_phrases + negative_phrases

    def _generate_recommendations(self, results):
        """Generate AI-powered recommendations"""
        if not self.client:
            return self._get_mock_recommendations()

        try:
            system_prompt = """You are a customer experience expert. Based on sentiment analysis results, provide 3-4 actionable recommendations to improve customer satisfaction."""

            user_prompt = f"""Sentiment Analysis Results:
Overall Sentiment: {results['overall']}
Key Phrases: {results['keyPhrases'][:5]}

Provide specific, actionable recommendations to improve customer satisfaction and address negative feedback patterns."""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()
            # Parse recommendations (assuming they're returned as a list)
            recommendations = response.split('\n')
            recommendations = [rec.strip('- ').strip() for rec in recommendations if rec.strip()]
            return recommendations[:4]  # Limit to 4 recommendations

        except Exception as e:
            print(f"Error generating recommendations: {e}")
            return self._get_mock_recommendations()

    def _get_mock_recommendations(self):
        """Fallback mock recommendations"""
        return [
            "Focus on highlighting positive feedback themes in marketing campaigns",
            "Address common quality concerns mentioned in negative reviews",
            "Improve delivery speed to enhance customer satisfaction",
            "Implement proactive customer service for better experience"
        ]

    def _create_sentiment_plots(self, results, sentiment_counts):
        """Create sentiment analysis visualizations"""
        plots = []

        # Overall sentiment pie chart
        plt.figure(figsize=(10, 8))
        labels = ['Positive', 'Neutral', 'Negative']
        sizes = [results['overall']['positive'],
                 results['overall']['neutral'],
                 results['overall']['negative']]
        colors = ['#2ecc71', '#95a5a6', '#e74c3c']

        plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
        plt.title('Overall Sentiment Distribution')
        plt.axis('equal')
        plt.tight_layout()
        plt.savefig('sentiment_pie_chart.png', dpi=300, bbox_inches='tight')
        plots.append('sentiment_pie_chart.png')
        plt.close()

        # Sentiment by category bar chart
        if len(results['byCategory']) > 1:
            plt.figure(figsize=(12, 8))
            categories = [cat['category'] for cat in results['byCategory']]
            positive_vals = [cat['positive'] for cat in results['byCategory']]
            neutral_vals = [cat['neutral'] for cat in results['byCategory']]
            negative_vals = [cat['negative'] for cat in results['byCategory']]

            x = np.arange(len(categories))
            width = 0.25

            plt.bar(x - width, positive_vals, width, label='Positive', color='#2ecc71')
            plt.bar(x, neutral_vals, width, label='Neutral', color='#95a5a6')
            plt.bar(x + width, negative_vals, width, label='Negative', color='#e74c3c')

            plt.xlabel('Product Category')
            plt.ylabel('Sentiment Proportion')
            plt.title('Sentiment Analysis by Product Category')
            plt.xticks(x, categories, rotation=45)
            plt.legend()
            plt.tight_layout()
            plt.savefig('sentiment_by_category.png', dpi=300, bbox_inches='tight')
            plots.append('sentiment_by_category.png')
            plt.close()

        # Key phrases frequency chart
        phrases = results['keyPhrases']
        if phrases:
            # Separate positive and negative phrases
            pos_phrases = [p for p in phrases if p['sentiment'] == 'positive']
            neg_phrases = [p for p in phrases if p['sentiment'] == 'negative']

            if pos_phrases and neg_phrases:
                # plt.subplots creates its own figure, so no extra plt.figure() call is needed here
                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

                # Positive phrases
                pos_labels = [p['phrase'] for p in pos_phrases]
                pos_freqs = [p['frequency'] for p in pos_phrases]
                ax1.barh(pos_labels, pos_freqs, color='#2ecc71')
                ax1.set_title('Most Frequent Positive Phrases')
                ax1.set_xlabel('Frequency')

                # Negative phrases
                neg_labels = [p['phrase'] for p in neg_phrases]
                neg_freqs = [p['frequency'] for p in neg_phrases]
                ax2.barh(neg_labels, neg_freqs, color='#e74c3c')
                ax2.set_title('Most Frequent Negative Phrases')
                ax2.set_xlabel('Frequency')

                plt.tight_layout()
                plt.savefig('key_phrases_frequency.png', dpi=300, bbox_inches='tight')
                plots.append('key_phrases_frequency.png')
                plt.close()

        return plots
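Worth flagging when reading this module: analyze() scores TextBlob polarity over synthetically generated feedback templates, not over any text column of `df`, so its output is illustrative. Polarity is cut at ±0.1 to produce the positive/neutral/negative labels. A usage sketch with an invented category column (no API key set, so the mock recommendations are returned):

# Hypothetical driver for SentimentAnalyzer.
import pandas as pd
from sentiment_analyzer import SentimentAnalyzer

df = pd.DataFrame({'Product Category': ['Electronics', 'Clothing', 'Home'] * 10})
sa = SentimentAnalyzer()
results, plots = sa.analyze(df)
print(results['overall'])    # roughly tracks the 0.6/0.25/0.15 sampling split
print(plots)                 # PNG files written to the working directory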
trend_analyzer.py
ADDED
@@ -0,0 +1,286 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

class TrendAnalyzer:
    def __init__(self):
        pass

    def analyze(self, df, timeframe='Monthly'):
        """Analyze trends in the data"""
        results = {}
        plots = []

        # Create synthetic time series data if no date column exists
        df_with_time = self._add_time_dimension(df, timeframe)

        # Analyze trends for numeric columns
        numeric_cols = df_with_time.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['ID', 'time_period']]

        trends = []
        for column in numeric_cols[:4]:  # Limit to first 4 numeric columns
            trend_data = self._analyze_column_trend(df_with_time, column, timeframe)
            trends.append(trend_data)

            # Create trend plot
            plt.figure(figsize=(12, 6))
            plt.plot(df_with_time['time_period'], df_with_time[column],
                     marker='o', linewidth=2, markersize=4)

            # Add trend line
            x_numeric = range(len(df_with_time))
            z = np.polyfit(x_numeric, df_with_time[column], 1)
            p = np.poly1d(z)
            plt.plot(df_with_time['time_period'], p(x_numeric),
                     "r--", alpha=0.8, linewidth=2, label='Trend Line')

            plt.title(f'{column} Trend Analysis ({timeframe})')
            plt.xlabel('Time Period')
            plt.ylabel(column)
            plt.xticks(rotation=45)
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()

            plot_name = f'{column.lower().replace(" ", "_")}_trend.png'
            plt.savefig(plot_name, dpi=300, bbox_inches='tight')
            plots.append(plot_name)
            plt.close()

        results['trends'] = trends
        results['timeframe'] = timeframe

        # Seasonality analysis
        seasonality_results = self._detect_seasonality(df_with_time, numeric_cols[:2])
        results['seasonality'] = seasonality_results

        # Forecasting
        forecasts = self._generate_forecasts(df_with_time, numeric_cols[0] if numeric_cols else None)
        results['forecasts'] = forecasts

        # Create seasonality plot
        if seasonality_results['detected']:
            self._create_seasonality_plot(df_with_time, numeric_cols[0] if numeric_cols else None)
            plots.append('seasonality_analysis.png')

        # Create forecast plot
        self._create_forecast_plot(df_with_time, numeric_cols[0] if numeric_cols else None, forecasts)
        plots.append('forecast_plot.png')

        return results, plots

    def _add_time_dimension(self, df, timeframe):
        """Add synthetic time dimension to data"""
        df_time = df.copy()

        # Create time periods based on data length
        n_periods = len(df)

        if timeframe == 'Weekly':
            start_date = datetime.now() - timedelta(weeks=n_periods)
            time_periods = [start_date + timedelta(weeks=i) for i in range(n_periods)]
        elif timeframe == 'Monthly':
            start_date = datetime.now() - timedelta(days=30*n_periods)
            time_periods = [start_date + timedelta(days=30*i) for i in range(n_periods)]
        elif timeframe == 'Quarterly':
            start_date = datetime.now() - timedelta(days=90*n_periods)
            time_periods = [start_date + timedelta(days=90*i) for i in range(n_periods)]
        else:  # Yearly
            start_date = datetime.now() - timedelta(days=365*n_periods)
            time_periods = [start_date + timedelta(days=365*i) for i in range(n_periods)]

        df_time['time_period'] = time_periods

        # Sort by time
        df_time = df_time.sort_values('time_period').reset_index(drop=True)

        return df_time

    def _analyze_column_trend(self, df, column, timeframe):
        """Analyze trend for a specific column"""
        values = df[column].values
        x = np.arange(len(values))

        # Linear regression for trend
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, values)

        # Determine trend direction and significance
        if p_value < 0.05:  # Statistically significant
            if slope > 0:
                direction = 'up'
                significance = 'high' if abs(r_value) > 0.7 else 'medium'
            else:
                direction = 'down'
                significance = 'high' if abs(r_value) > 0.7 else 'medium'
        else:
            direction = 'stable'
            significance = 'low'

        # Calculate percentage change
        if len(values) > 1:
            pct_change = ((values[-1] - values[0]) / values[0]) * 100
        else:
            pct_change = 0

        return {
            'metric': column,
            'direction': direction,
            'change': round(pct_change, 1),
            'significance': significance,
            'description': f'{direction.capitalize()} trend in {column} with {significance} significance',
            'slope': round(slope, 4),
            'r_squared': round(r_value**2, 3),
            'p_value': round(p_value, 4)
        }

    def _detect_seasonality(self, df, columns):
        """Detect seasonality patterns"""
        if not columns:
            return {'detected': False}

        # Simple seasonality detection using autocorrelation
        column = columns[0]
        values = df[column].values

        if len(values) < 12:  # Need at least 12 points for seasonality
            return {'detected': False}

        # Calculate autocorrelation at different lags
        autocorr_values = []
        for lag in range(1, min(len(values)//2, 12)):
            if len(values) > lag:
                autocorr = np.corrcoef(values[:-lag], values[lag:])[0, 1]
                if not np.isnan(autocorr):
                    autocorr_values.append(abs(autocorr))

        if autocorr_values:
            max_autocorr = max(autocorr_values)
            if max_autocorr > 0.5:  # Threshold for seasonality
                return {
                    'detected': True,
                    'pattern': 'quarterly',  # Simplified assumption
                    'strength': round(max_autocorr, 2)
                }

        return {'detected': False}

    def _generate_forecasts(self, df, column):
        """Generate simple forecasts"""
        if not column or len(df) < 3:
            return []

        values = df[column].values
        x = np.arange(len(values))

        # Fit linear regression
        model = LinearRegression()
        model.fit(x.reshape(-1, 1), values)

        # Generate forecasts
        future_periods = [len(values), len(values) + 3, len(values) + 12]
        forecasts = []

        for period in future_periods:
            prediction = model.predict([[period]])[0]

            # Calculate confidence (simplified)
            residuals = values - model.predict(x.reshape(-1, 1))
            std_error = np.std(residuals)
            confidence = max(0.5, 1 - (std_error / np.mean(values)))

            if period == len(values):
                period_name = 'Next Period'
            elif period == len(values) + 3:
                period_name = 'Next Quarter'
            else:
                period_name = 'Next Year'

            forecasts.append({
                'period': period_name,
                'value': round(prediction, 1),
                'confidence': round(confidence, 2)
            })

        return forecasts

    def _create_seasonality_plot(self, df, column):
        """Create seasonality analysis plot"""
        if not column:
            return

        plt.figure(figsize=(12, 8))

        # Original series
        plt.subplot(2, 1, 1)
        plt.plot(df['time_period'], df[column], marker='o', linewidth=2)
        plt.title(f'{column} Time Series')
        plt.xlabel('Time')
        plt.ylabel(column)
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)

        # Moving average
        plt.subplot(2, 1, 2)
        window_size = min(12, len(df) // 4)
        if window_size >= 2:
            moving_avg = df[column].rolling(window=window_size).mean()
            plt.plot(df['time_period'], df[column], alpha=0.5, label='Original')
            plt.plot(df['time_period'], moving_avg, linewidth=2, label=f'{window_size}-period Moving Average')
            plt.legend()
        else:
            plt.plot(df['time_period'], df[column], marker='o', linewidth=2)

        plt.title(f'{column} with Trend')
        plt.xlabel('Time')
        plt.ylabel(column)
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('seasonality_analysis.png', dpi=300, bbox_inches='tight')
        plt.close()

    def _create_forecast_plot(self, df, column, forecasts):
        """Create forecast visualization"""
        if not column or not forecasts:
            # Create a simple placeholder plot
            plt.figure(figsize=(10, 6))
            plt.text(0.5, 0.5, 'Forecast Plot\n(Insufficient data for detailed forecasting)',
                     ha='center', va='center', transform=plt.gca().transAxes, fontsize=14)
            plt.title('Revenue Forecast')
            plt.savefig('forecast_plot.png', dpi=300, bbox_inches='tight')
            plt.close()
            return

        plt.figure(figsize=(12, 8))

        # Historical data
        plt.plot(range(len(df)), df[column], marker='o', linewidth=2, label='Historical Data')

        # Forecast points
        forecast_x = [len(df), len(df) + 3, len(df) + 12]
        forecast_y = [f['value'] for f in forecasts]

        plt.plot(forecast_x, forecast_y, marker='s', linewidth=2,
                 linestyle='--', color='red', label='Forecast')

        # Confidence intervals (simplified)
        for i, (x, y, forecast) in enumerate(zip(forecast_x, forecast_y, forecasts)):
            error = y * (1 - forecast['confidence']) * 0.5
            plt.errorbar(x, y, yerr=error, color='red', alpha=0.5, capsize=5)

        plt.title(f'{column} Forecast')
        plt.xlabel('Time Period')
        plt.ylabel(column)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig('forecast_plot.png', dpi=300, bbox_inches='tight')
        plt.close()
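A usage sketch on a noisy upward series (not part of the upload; the import path assumes the file name above). The trend call fits scipy's linregress over a synthetic time index: the slope sign gives the direction, p < 0.05 gates significance, and |r| > 0.7 upgrades it from 'medium' to 'high'.

# Hypothetical driver for TrendAnalyzer.
import numpy as np
import pandas as pd
from trend_analyzer import TrendAnalyzer

rng = np.random.default_rng(1)
df = pd.DataFrame({'Revenue': np.linspace(100, 200, 24) + rng.normal(0, 5, 24)})

ta = TrendAnalyzer()
results, plots = ta.analyze(df, timeframe='Monthly')
for t in results['trends']:
    print(t['metric'], t['direction'], f"{t['change']}%", t['r_squared'])
print(results['forecasts'])  # 'Next Period' / 'Next Quarter' / 'Next Year' point estimates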
variable_extraction.py
ADDED
@@ -0,0 +1,57 @@
import json
from groq import Groq
import os

class VariableExtractor:
    def __init__(self):
        self.client = None

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def extract_variables(self, business_problem):
        """Extract relevant variables from business problem description"""
        if not self.client:
            # Fallback to mock data if no API key
            return self._get_mock_variables()

        try:
            system_prompt = """You are an expert business analyst. Extract relevant variables for marketing analysis from the given business problem. Return only a JSON array of variable names, nothing else."""

            user_prompt = f"""Business Problem: {business_problem}

Extract 6-10 relevant variables that would be important for analyzing this marketing/business problem. Focus on measurable, actionable variables.

Return format: ["Variable 1", "Variable 2", "Variable 3", ...]"""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()
            variables = json.loads(response)
            return variables

        except Exception as e:
            print(f"Error extracting variables: {e}")
            return self._get_mock_variables()

    def _get_mock_variables(self):
        """Fallback mock variables"""
        return [
            "Customer Age",
            "Purchase Amount",
            "Product Category",
            "Marketing Channel",
            "Customer Location",
            "Purchase Frequency",
            "Customer Satisfaction Score",
            "Time to Purchase"
        ]
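A one-line usage sketch; with no key set, the mock variable list above is returned unchanged.

# Hypothetical driver for VariableExtractor.
from variable_extraction import VariableExtractor

ve = VariableExtractor()  # no set_api_key() call, so the mock fallback is used
print(ve.extract_variables("Our e-commerce conversion rate dropped after a site redesign"))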
visualization_engine.py
ADDED
@@ -0,0 +1,193 @@
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
import seaborn as sns
|
| 3 |
+
import plotly.express as px
|
| 4 |
+
import plotly.graph_objects as go
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
class VisualizationEngine:
|
| 9 |
+
def __init__(self):
|
| 10 |
+
plt.style.use('seaborn-v0_8')
|
| 11 |
+
self.color_palette = sns.color_palette("husl", 8)
|
| 12 |
+
|
| 13 |
+
def create_visualizations(self, df, selected_features):
|
| 14 |
+
"""Create various visualizations based on selected features"""
|
| 15 |
+
plots = []
|
| 16 |
+
|
| 17 |
+
if not selected_features:
|
| 18 |
+
selected_features = df.columns[:4] # Default to first 4 columns
|
| 19 |
+
|
| 20 |
+
for feature in selected_features:
|
| 21 |
+
if feature in df.columns and feature != 'ID':
|
| 22 |
+
if df[feature].dtype in ['int64', 'float64']:
|
| 23 |
+
# Numerical feature visualizations
|
| 24 |
+
plots.extend(self._create_numerical_plots(df, feature))
|
| 25 |
+
else:
|
| 26 |
+
# Categorical feature visualizations
|
| 27 |
+
plots.extend(self._create_categorical_plots(df, feature))
|
| 28 |
+
|
| 29 |
+
# Create comparison plots
|
| 30 |
+
if len(selected_features) >= 2:
|
| 31 |
+
plots.extend(self._create_comparison_plots(df, selected_features))
|
| 32 |
+
|
| 33 |
+
return plots
|
    def _create_numerical_plots(self, df, feature):
        """Create plots for numerical features"""
        plots = []

        # Histogram
        plt.figure(figsize=(10, 6))
        plt.hist(df[feature], bins=30, alpha=0.7, color=self.color_palette[0], edgecolor='black')
        plt.title(f'{feature} Distribution')
        plt.xlabel(feature)
        plt.ylabel('Frequency')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_histogram.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        # Box plot
        plt.figure(figsize=(8, 6))
        plt.boxplot(df[feature], patch_artist=True,
                    boxprops=dict(facecolor=self.color_palette[1]))
        plt.title(f'{feature} Box Plot')
        plt.ylabel(feature)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_boxplot.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        # Density plot
        plt.figure(figsize=(10, 6))
        df[feature].plot(kind='density', color=self.color_palette[2], linewidth=2)
        plt.title(f'{feature} Density Plot')
        plt.xlabel(feature)
        plt.ylabel('Density')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_density.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        return plots
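One caveat worth flagging: _create_numerical_plots assumes the column contains no missing values; plt.hist raises on NaN when bins is an integer, and plt.boxplot draws a degenerate box. A hedged pre-filter, if NaNs are possible in uploaded data:

# Sketch: drop missing values before handing the series to matplotlib.
values = df[feature].dropna()
plt.hist(values, bins=30, alpha=0.7, edgecolor='black')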
    def _create_categorical_plots(self, df, feature):
        """Create plots for categorical features"""
        plots = []

        value_counts = df[feature].value_counts()

        # Bar plot
        plt.figure(figsize=(12, 6))
        bars = plt.bar(value_counts.index, value_counts.values,
                       color=self.color_palette[:len(value_counts)])
        plt.title(f'{feature} Distribution')
        plt.xlabel(feature)
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{int(height)}', ha='center', va='bottom')

        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_barplot.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        # Pie chart
        plt.figure(figsize=(10, 8))
        plt.pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%',
                colors=self.color_palette[:len(value_counts)])
        plt.title(f'{feature} Distribution (Pie Chart)')
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_piechart.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        return plots
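A second caveat: self.color_palette holds eight colours, so the slice self.color_palette[:len(value_counts)] under-supplies bar colours once a column has more than eight categories. A hedged alternative sizes the palette to the data instead:

# Sketch: build a palette exactly as long as the category list.
colors = sns.color_palette("husl", len(value_counts))
bars = plt.bar(value_counts.index, value_counts.values, color=colors)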
    def _create_comparison_plots(self, df, features):
        """Create comparison plots between features"""
        plots = []

        numeric_features = [f for f in features if df[f].dtype in ['int64', 'float64']]
        categorical_features = [f for f in features if df[f].dtype in ['object', 'category']]

        # Scatter plots for numeric features
        if len(numeric_features) >= 2:
            for i in range(len(numeric_features)):
                for j in range(i+1, len(numeric_features)):
                    plt.figure(figsize=(10, 8))
                    plt.scatter(df[numeric_features[i]], df[numeric_features[j]],
                                alpha=0.6, color=self.color_palette[0])
                    plt.xlabel(numeric_features[i])
                    plt.ylabel(numeric_features[j])
                    plt.title(f'{numeric_features[i]} vs {numeric_features[j]}')
                    plt.grid(True, alpha=0.3)
                    plt.tight_layout()
                    plot_name = f'{numeric_features[i].lower().replace(" ", "_")}_vs_{numeric_features[j].lower().replace(" ", "_")}_scatter.png'
                    plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                    plots.append(plot_name)
                    plt.close()

        # Box plots for numeric vs categorical
        if numeric_features and categorical_features:
            for num_feat in numeric_features[:2]:  # Limit to avoid too many plots
                for cat_feat in categorical_features[:2]:
                    plt.figure(figsize=(12, 8))
                    df.boxplot(column=num_feat, by=cat_feat, ax=plt.gca())
                    plt.title(f'{num_feat} by {cat_feat}')
                    plt.suptitle('')  # Remove default title
                    plt.xticks(rotation=45)
                    plt.tight_layout()
                    plot_name = f'{num_feat.lower().replace(" ", "_")}_by_{cat_feat.lower().replace(" ", "_")}_boxplot.png'
                    plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                    plots.append(plot_name)
                    plt.close()

        # Correlation heatmap for numeric features
        if len(numeric_features) >= 2:
            plt.figure(figsize=(10, 8))
            correlation_matrix = df[numeric_features].corr()
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                        square=True, linewidths=0.5)
            plt.title('Feature Correlation Matrix')
            plt.tight_layout()
            plot_name = 'selected_features_correlation.png'
            plt.savefig(plot_name, dpi=300, bbox_inches='tight')
            plots.append(plot_name)
            plt.close()

        return plots
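Note that the scatter loop above emits one file per pair of numeric features, i.e. n*(n-1)/2 files for n features, while the box-plot loop is already capped at [:2]. If callers may pass many columns, a similar cap keeps output bounded; a sketch, with max_pairs as an assumed parameter:

# Sketch: bound the number of pairwise scatter plots.
from itertools import combinations

max_pairs = 6  # assumed cap, not present in the original code
for x_feat, y_feat in list(combinations(numeric_features, 2))[:max_pairs]:
    pass  # same scatter-plot body as in _create_comparison_plots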
    def create_interactive_plots(self, df, features):
        """Create interactive Plotly visualizations"""
        plots = []

        for feature in features:
            if feature in df.columns and feature != 'ID':
                if df[feature].dtype in ['int64', 'float64']:
                    # Interactive histogram
                    fig = px.histogram(df, x=feature, title=f'{feature} Distribution')
                    fig.write_html(f'{feature.lower().replace(" ", "_")}_interactive_hist.html')
                    plots.append(f'{feature.lower().replace(" ", "_")}_interactive_hist.html')
                else:
                    # Interactive bar chart
                    value_counts = df[feature].value_counts()
                    fig = px.bar(x=value_counts.index, y=value_counts.values,
                                 title=f'{feature} Distribution')
                    fig.write_html(f'{feature.lower().replace(" ", "_")}_interactive_bar.html')
                    plots.append(f'{feature.lower().replace(" ", "_")}_interactive_bar.html')

        return plots
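create_interactive_plots writes standalone HTML via Plotly's write_html, so the outputs open in a browser without a running server. A usage sketch, reusing the illustrative frame from the earlier example:

# Sketch: emit interactive HTML counterparts for the same two columns.
html_files = engine.create_interactive_plots(df, ['Annual Income', 'Region'])
print(html_files)
# e.g. ['annual_income_interactive_hist.html', 'region_interactive_bar.html']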