"""Filter a respiratory-disease CSV down to a fixed set of disease classes.

Run as a script, this reads ``INPUT_CSV``, keeps only the rows whose
``LABEL_COLUMN`` value is one of ``CLASS_NAMES``, writes the result to
``OUTPUT_CSV`` and prints the row counts before and after filtering.
Importing the module performs no I/O.
"""

from typing import Iterable, Optional

import pandas as pd

# Disease labels to keep. Matching is exact (case- and whitespace-sensitive),
# so each entry must be spelled exactly as it appears in the data.
# NOTE(review): "chronic obstructive pulmonary disease" is lowercase while the
# other labels are capitalised -- confirm this matches the CSV.
# The trailing comma after the last entry is intentional (cleaner diffs).
CLASS_NAMES = [
    "Bronchiectasis",
    "Bronchiolitis",
    "chronic obstructive pulmonary disease",
    "Healthy",
    "Pneumonia",
]

# Input dataset; replace with your actual filename if it differs.
INPUT_CSV = 'respiratory symptoms and treatment.csv'

# Destination for the filtered dataset.
OUTPUT_CSV = 'filtered_dataset.csv'

# Name of the column holding the disease labels. Update this if your column
# is named differently (e.g. 'diagnosis', 'label'); the name is case-sensitive.
LABEL_COLUMN = 'Disease'


def filter_by_class(
    df: pd.DataFrame,
    label_column: str = LABEL_COLUMN,
    class_names: Optional[Iterable[str]] = None,
) -> pd.DataFrame:
    """Return the rows of *df* whose label is one of *class_names*.

    Args:
        df: Dataset to filter. It is not modified.
        label_column: Name of the column containing the labels.
        class_names: Labels to keep. Defaults to ``CLASS_NAMES``.

    Returns:
        A new DataFrame containing only the matching rows, in their original
        order, with the index reset to ``0..n-1``.

    Raises:
        KeyError: If *label_column* is not a column of *df*.
    """
    if label_column not in df.columns:
        raise KeyError(
            f"Label column {label_column!r} not found; "
            f"available columns: {list(df.columns)}"
        )
    wanted = list(CLASS_NAMES if class_names is None else class_names)
    keep_mask = df[label_column].isin(wanted)
    # Reset the index so the filtered rows are numbered contiguously.
    return df[keep_mask].reset_index(drop=True)


def main() -> None:
    """Load the input CSV, filter it, save the result and report row counts."""
    df = pd.read_csv(INPUT_CSV)
    filtered_df = filter_by_class(df, LABEL_COLUMN, CLASS_NAMES)

    # index=False: the reset index carries no information worth persisting.
    filtered_df.to_csv(OUTPUT_CSV, index=False)

    print(f"Original rows: {len(df)}")
    print(f"Filtered rows: {len(filtered_df)}")
    print(f"Filtered dataset saved as '{OUTPUT_CSV}'")


if __name__ == "__main__":
    main()