import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def _save_and_close(fig, filename):
    """Write *fig* to *filename*, then close it to release its resources."""
    fig.savefig(filename)
    # pyplot keeps every figure alive in a global registry until it is
    # explicitly closed; without this, repeated calls accumulate figures.
    plt.close(fig)


def explore_data(file_path):
    """Print a summary of a CSV dataset and save three exploratory plots.

    The CSV at ``file_path`` is loaded into a DataFrame. Its shape, column
    names and per-column missing-value counts are printed, then three PNG
    files are written to the current working directory:

    * ``engine_distribution.png`` -- count plot of the ``engine`` column.
    * ``best_distribution.png`` -- bar chart of ``best`` value counts.
    * ``p_correlation.png`` -- annotated correlation heatmap of the
      columns ``p1`` .. ``p10`` after casting them to ``int``.

    Finally the value counts of ``best`` are printed.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``)
            of the CSV file to explore.

    Returns:
        None. Results are reported via ``print`` and the saved images.

    Raises:
        KeyError: If ``engine``, ``best`` or any of ``p1`` .. ``p10`` is
            missing from the CSV.
    """
    df = pd.read_csv(file_path)

    print("Dataset Shape:", df.shape)
    print("\nColumns:", df.columns.tolist())

    # Check for missing values
    print("\nMissing Values:\n", df.isnull().sum())

    # Engine distribution
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='engine', data=df, ax=ax)
    ax.set_title('Distribution of Chatbot Engines')
    _save_and_close(fig, 'engine_distribution.png')

    # Performance distribution (Best/Worst).
    # Computed once and reused for both the chart and the final printout.
    best_counts = df['best'].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    best_counts.plot(kind='bar', ax=ax)
    ax.set_title('Distribution of "Best" Label')
    _save_and_close(fig, 'best_distribution.png')

    # p1-p10 correlation
    p_cols = [f'p{i}' for i in range(1, 11)]
    fig, ax = plt.subplots(figsize=(12, 10))
    # NOTE(review): astype(int) raises if any p-column contains NaN or
    # non-numeric text -- confirm the dataset guarantees clean values.
    sns.heatmap(df[p_cols].astype(int).corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation between Evaluation Parameters (p1-p10)')
    _save_and_close(fig, 'p_correlation.png')

    print("\nTarget Variable 'best' counts:")
    print(best_counts)


if __name__ == "__main__":
    explore_data('BP_MHS_V1.csv')