import os
import sys
from typing import Dict, List, Optional, Tuple

import pandas as pd

# Default dataset roots. They are machine-specific; pass other roots to
# get_all_test_files() to run against a different checkout.
RESP_BASE = r"c:\Users\ASUS\lung_ai_project\data\extracted_cough\Respiratory_Sound_Dataset-main"
COS_BASE = r"c:\Users\ASUS\lung_ai_project\data\coswara"


def _to_label(raw: object) -> Optional[str]:
    """Map a raw diagnosis/status cell to "healthy" or "sick".

    Returns None when the cell carries no usable text. pandas yields a float
    NaN for empty CSV cells, so every non-string value is treated as missing
    instead of being passed to ``str.lower``.
    """
    if not isinstance(raw, str):
        return None
    text = raw.strip().lower()
    if not text:
        return None
    return "healthy" if text == "healthy" else "sick"


def _collect_respiratory(resp_base: str) -> List[Tuple[str, str]]:
    """Collect (wav_path, label) pairs from the respiratory dataset under resp_base."""
    collected: List[Tuple[str, str]] = []

    resp_csv = os.path.join(resp_base, "patient_diagnosis.csv")
    if not os.path.exists(resp_csv):
        print(f"Resp csv {resp_csv} not found")
        return collected

    resp_df = pd.read_csv(resp_csv)
    resp_map = dict(zip(resp_df['Patient_ID'], resp_df['DIAGNOSIS']))

    resp_dir = os.path.join(resp_base, "audio_and_txt_files")
    if not os.path.isdir(resp_dir):
        print(f"Resp dir {resp_dir} not found")
        return collected

    # Sorted so the sample order does not depend on the filesystem.
    resp_files = sorted(f for f in os.listdir(resp_dir) if f.endswith(".wav"))
    print(f"Found {len(resp_files)} resp files")
    for file_name in resp_files:
        try:
            # The patient id is the text before the first underscore.
            pid = int(file_name.split('_')[0])
        except ValueError:
            continue  # no numeric patient-id prefix: not a usable recording
        label = _to_label(resp_map.get(pid))
        if label is not None:
            collected.append((os.path.join(resp_dir, file_name), label))
    return collected


def _load_coswara_status(cos_base: str) -> Dict[object, object]:
    """Build an id -> covid_status mapping from every CSV in <cos_base>/csvs."""
    status_map: Dict[object, object] = {}

    cos_csv_dir = os.path.join(cos_base, "csvs")
    if not os.path.isdir(cos_csv_dir):
        print(f"Coswara csv dir {cos_csv_dir} not found")
        return status_map

    # Sorted so that, for ids present in several CSVs, the winning row is
    # always the one from the alphabetically last file.
    for csv_file in sorted(os.listdir(cos_csv_dir)):
        if not csv_file.endswith(".csv"):
            continue
        csv_path = os.path.join(cos_csv_dir, csv_file)
        try:
            df = pd.read_csv(csv_path)
        except (OSError, ValueError) as exc:
            # pandas' EmptyDataError/ParserError and decoding errors are
            # ValueError subclasses; one bad file must not abort the scan.
            print(f"Skipping unreadable coswara csv {csv_path}: {exc}")
            continue
        if 'id' in df.columns and 'covid_status' in df.columns:
            status_map.update(zip(df['id'], df['covid_status']))

    print(f"Loaded {len(status_map)} coswara status mappings")
    return status_map


def _collect_coswara(cos_base: str) -> List[Tuple[str, str]]:
    """Collect one (cough_wav_path, label) pair per labelled Coswara participant."""
    collected: List[Tuple[str, str]] = []
    status_map = _load_coswara_status(cos_base)

    cos_data_dir = os.path.join(cos_base, "coswara_data", "kaggle_data")
    if not os.path.isdir(cos_data_dir):
        print(f"Coswara data dir {cos_data_dir} not found")
        return collected

    pids = sorted(os.listdir(cos_data_dir))
    print(f"Found {len(pids)} PIDs in coswara data dir")
    for pid in pids:
        label = _to_label(status_map.get(pid))
        if label is None:
            continue  # unknown participant or empty status cell
        pid_dir = os.path.join(cos_data_dir, pid)
        if not os.path.isdir(pid_dir):
            continue
        # Take at most one recording per participant, preferring cough.wav.
        for audio_name in ("cough.wav", "cough-heavy.wav"):
            path = os.path.join(pid_dir, audio_name)
            if os.path.exists(path):
                collected.append((path, label))
                break
    return collected


def get_all_test_files(
    resp_base: Optional[str] = None,
    cos_base: Optional[str] = None,
) -> List[Tuple[str, str]]:
    """Gather labelled audio files from the respiratory and Coswara datasets.

    Args:
        resp_base: Root of the respiratory dataset; must contain
            ``patient_diagnosis.csv`` (columns ``Patient_ID`` and
            ``DIAGNOSIS``) and an ``audio_and_txt_files`` directory.
            Defaults to the module-level ``RESP_BASE``.
        cos_base: Root of the Coswara dataset; must contain a ``csvs``
            directory and ``coswara_data/kaggle_data/<id>/`` folders.
            Defaults to the module-level ``COS_BASE``.

    Returns:
        A list of ``(path, label)`` tuples where ``label`` is ``"healthy"``
        when the diagnosis/status text equals "healthy" (case-insensitive,
        surrounding whitespace ignored) and ``"sick"`` otherwise. Respiratory
        samples come first, then Coswara samples; each group is sorted by
        file or directory name. Entries without a diagnosis/status are
        skipped. Missing files or directories are reported with ``print``
        and contribute no samples.

    Raises:
        KeyError: If ``patient_diagnosis.csv`` exists but lacks the
            ``Patient_ID`` or ``DIAGNOSIS`` column.
    """
    # Resolve defaults at call time so reassigning the module constants
    # still takes effect, as it did when they were read inside the body.
    resp_root = RESP_BASE if resp_base is None else resp_base
    cos_root = COS_BASE if cos_base is None else cos_base
    return _collect_respiratory(resp_root) + _collect_coswara(cos_root)


# Script driver: runs on import as well as on direct execution, as before.
samples = get_all_test_files()
print(f"Total samples collected: {len(samples)}")
if samples:
    print(f"First 5: {samples[:5]}")