| import pandas as pd
|
| import numpy as np
|
| import matplotlib.pyplot as plt
|
| import seaborn as sns
|
|
|
def _render_plot(figsize, draw, title, filename):
    """Draw one plot on a fresh figure, save it to *filename*, then close it.

    Args:
        figsize: ``(width, height)`` tuple passed to ``plt.figure``.
        draw: Zero-argument callable that draws onto the current figure.
        title: Title applied to the current axes after drawing.
        filename: Path the figure is saved to.
    """
    fig = plt.figure(figsize=figsize)
    try:
        draw()
        plt.title(title)
        fig.savefig(filename)
    finally:
        # pyplot keeps every figure registered until it is closed explicitly;
        # closing here stops repeated calls from accumulating open figures,
        # even when drawing or saving fails.
        plt.close(fig)


def explore_data(file_path):
    """Print a quick summary of a CSV dataset and save three diagnostic plots.

    The file is loaded with ``pd.read_csv``. The shape, column names and
    per-column missing-value counts are printed, followed (after plotting)
    by the value counts of the ``best`` column.

    Three PNG files are written using relative paths:

    * ``engine_distribution.png`` -- count plot of the ``engine`` column.
    * ``best_distribution.png`` -- bar chart of ``best`` value counts.
    * ``p_correlation.png`` -- annotated correlation heatmap of ``p1``..``p10``.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
            It must contain the columns ``engine``, ``best`` and ``p1``
            through ``p10``.

    Returns:
        None.

    Raises:
        KeyError: If the ``best`` column or any of ``p1``..``p10`` is missing.
    """
    df = pd.read_csv(file_path)

    print("Dataset Shape:", df.shape)
    print("\nColumns:", df.columns.tolist())
    print("\nMissing Values:\n", df.isnull().sum())

    _render_plot(
        (10, 6),
        lambda: sns.countplot(x='engine', data=df),
        'Distribution of Chatbot Engines',
        'engine_distribution.png',
    )

    # Computed once and reused for both the bar chart and the final printout.
    best_counts = df['best'].value_counts()
    _render_plot(
        (10, 6),
        lambda: best_counts.plot(kind='bar'),
        'Distribution of "Best" Label',
        'best_distribution.png',
    )

    p_cols = [f'p{i}' for i in range(1, 11)]
    # NOTE(review): astype(int) presumably fails if any p-column contains
    # missing values -- confirm the dataset has none in these columns.
    _render_plot(
        (12, 10),
        lambda: sns.heatmap(
            df[p_cols].astype(int).corr(), annot=True, cmap='coolwarm'
        ),
        'Correlation between Evaluation Parameters (p1-p10)',
        'p_correlation.png',
    )

    print("\nTarget Variable 'best' counts:")
    print(best_counts)
|
|
|
if __name__ == "__main__":
    # Script entry point: run the exploration on the bundled dataset file,
    # given as a relative path.
    explore_data('BP_MHS_V1.csv')
|
|
|