# LSTM1 / explore_data.py
# Uploaded by d-e-e-k-11 using huggingface_hub (commit bde793d, verified).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def explore_data(file_path):
    """Print a summary of a CSV dataset and save three exploratory plots.

    The CSV must contain the columns ``engine``, ``best`` and ``p1`` through
    ``p10``; these are read by name below.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        None. Results are reported through stdout and image files.

    Side effects:
        Prints the dataset shape, column names, per-column missing-value
        counts and the value counts of ``best``. Writes
        ``engine_distribution.png``, ``best_distribution.png`` and
        ``p_correlation.png`` into the current working directory.
    """
    df = pd.read_csv(file_path)
    print("Dataset Shape:", df.shape)
    print("\nColumns:", df.columns.tolist())

    # Check for missing values
    print("\nMissing Values:\n", df.isnull().sum())

    # Engine distribution
    # Each plot draws on an explicit Axes and its figure is closed after
    # saving, so repeated calls do not accumulate open figures in pyplot.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='engine', data=df, ax=ax)
    ax.set_title('Distribution of Chatbot Engines')
    fig.savefig('engine_distribution.png')
    plt.close(fig)

    # Performance distribution (Best/Worst)
    # Computed once and reused for both the bar chart and the printed summary.
    best_counts = df['best'].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    best_counts.plot(kind='bar', ax=ax)
    ax.set_title('Distribution of "Best" Label')
    fig.savefig('best_distribution.png')
    plt.close(fig)

    # p1-p10 correlation
    p_cols = [f'p{i}' for i in range(1, 11)]
    # Cast to float rather than int: the correlations are identical for
    # complete data, but float tolerates NaN (missing values are reported
    # above, so they are expected to be possible) where int would raise.
    p_corr = df[p_cols].astype(float).corr()
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(p_corr, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation between Evaluation Parameters (p1-p10)')
    fig.savefig('p_correlation.png')
    plt.close(fig)

    print("\nTarget Variable 'best' counts:")
    print(best_counts)
# Script entry point: when executed directly (not imported), run the
# exploration on the dataset file expected in the working directory.
if __name__ == "__main__":
    dataset_csv = 'BP_MHS_V1.csv'
    explore_data(dataset_csv)