import pandas as pd

# Disease labels to keep; rows carrying any other label are filtered out.
CLASS_NAMES = [
    "Bronchiectasis",
    "Bronchiolitis",
    # NOTE(review): lower-case unlike the other entries — confirm it matches
    # the spelling used in the dataset's label column.
    "chronic obstructive pulmonary disease",
    "Healthy",
    "Pneumonia",
]

# Input/output locations and the column that holds the disease label.
# Update these to match your dataset.
INPUT_CSV = 'respiratory symptoms and treatment.csv'
OUTPUT_CSV = 'filtered_dataset.csv'
LABEL_COLUMN = 'Disease'


def filter_rows_by_label(frame, label_column, class_names):
    """Return the rows of *frame* whose label is one of *class_names*.

    Labels are compared after stripping surrounding whitespace and
    case-folding, so ``" healthy "`` matches ``"Healthy"``. The values in the
    returned frame are left exactly as they appear in the input; only the
    comparison is normalised. Missing labels (NaN/None) never match.

    Args:
        frame: DataFrame to filter. It is not modified.
        label_column: Name of the column holding the class label.
        class_names: Iterable of labels to keep.

    Returns:
        A new DataFrame containing the matching rows, re-indexed from 0.

    Raises:
        KeyError: If *label_column* is not a column of *frame*.
    """
    if label_column not in frame.columns:
        raise KeyError(
            f"Label column {label_column!r} not found; "
            f"available columns: {list(frame.columns)}"
        )

    wanted = sorted({str(name).strip().casefold() for name in class_names})
    # The "string" dtype keeps missing values as <NA> instead of turning them
    # into the literal text "nan", so they can never match a class name.
    normalised = frame[label_column].astype("string").str.strip().str.casefold()
    keep = normalised.isin(wanted)
    return frame[keep].reset_index(drop=True)


# Run the load/filter/save pipeline only when executed as a script, so that
# importing this module does not touch the filesystem.
if __name__ == "__main__":
    df = pd.read_csv(INPUT_CSV)
    filtered_df = filter_rows_by_label(df, LABEL_COLUMN, CLASS_NAMES)
    filtered_df.to_csv(OUTPUT_CSV, index=False)

    print(f"Original rows: {len(df)}")
    print(f"Filtered rows: {len(filtered_df)}")
    print(f"Filtered dataset saved as '{OUTPUT_CSV}'")