78anand commited on
Commit
4fcfef4
·
verified ·
1 Parent(s): a1d9ea4

Upload folder using huggingface_hub

Browse files
app/main.py CHANGED
@@ -52,34 +52,29 @@ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB limit
52
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
53
 
54
  # Configuration
55
- MODEL_PATH = os.path.join(project_root, "models", "hear_classifier_advanced.h5")
56
- CLASSES_PATH = os.path.join(project_root, "models", "hear_classes_advanced.npy")
57
 
58
- # Global variables for laziness loading
59
  extractor = None
60
  classifier_model = None
61
- classes = None
62
 
63
  def load_resources():
64
- global extractor, classifier_model, classes
65
  if extractor is None:
66
  print("Initializing HeAR Extractor...")
67
- # Use the HF_TOKEN from Space Secrets for gated model access
68
  hf_token = os.environ.get('HF_TOKEN')
69
  extractor = HeARExtractor(token=hf_token)
70
 
71
  if classifier_model is None:
72
- print(f"Loading Model from {MODEL_PATH}...")
73
  classifier_model = load_model(MODEL_PATH, compile=False)
74
- classes = np.load(CLASSES_PATH)
75
- print(f"Classes: {classes}")
76
 
77
  @app.route('/')
78
  def index():
79
  return jsonify({
80
  "status": "online",
81
- "service": "KasaHealth Diagnostic API",
82
- "version": "1.1.0",
83
  "message": "Send audio files via POST to /predict"
84
  })
85
 
@@ -98,39 +93,34 @@ def predict():
98
  file.save(filepath)
99
 
100
  try:
101
- # Ensure resources are loaded
102
  load_resources()
103
-
104
- # 1. Load and resample
105
  y, sr = librosa.load(filepath, sr=16000, duration=5.0)
106
 
107
- # --- VAD FILTER (Voice Activity Detection) ---
108
- # Calculate the root mean square (RMS) energy to find volume
109
  rms_energy = np.mean(librosa.feature.rms(y=y))
110
-
111
- # If the recording is practically silent, reject it
112
  if rms_energy < 0.005:
113
  os.remove(filepath)
114
- return jsonify({
115
- "error": "No cough detected. The recording was too quiet. Please cough forcefully."
116
- }), 400
117
 
118
- # 2. Preprocess
119
  y_clean = advanced_preprocess(y, sr)
120
-
121
- # 3. Extract Features
122
  emb = extractor.extract(y_clean)
123
 
124
  if emb is not None:
125
- # 4. Predict
126
  X = emb[np.newaxis, ...]
127
- preds = classifier_model.predict(X, verbose=0)
128
- pred_idx = np.argmax(preds[0])
129
- raw_label = classes[pred_idx]
130
- confidence = float(preds[0][pred_idx])
131
 
132
- # --- Reliability Guard ---
133
- THRESHOLD = 0.70
 
 
 
 
 
 
 
134
  if raw_label == "sick" and confidence < THRESHOLD:
135
  final_label = "healthy"
136
  is_inconclusive = True
@@ -138,9 +128,7 @@ def predict():
138
  final_label = raw_label
139
  is_inconclusive = False
140
 
141
- # Clean up file
142
  os.remove(filepath)
143
-
144
  return jsonify({
145
  "status": "success",
146
  "result": final_label,
@@ -151,12 +139,10 @@ def predict():
151
  })
152
  else:
153
  os.remove(filepath)
154
- return jsonify({"error": "Could not extract features from audio"}), 500
155
 
156
  except Exception as e:
157
- if os.path.exists(filepath):
158
- os.remove(filepath)
159
- print(f"Error processing audio: {e}")
160
  return jsonify({"error": str(e)}), 500
161
 
162
  def get_recommendation(label, is_inconclusive):
 
52
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
53
 
54
  # Configuration
55
+ MODEL_PATH = os.path.join(project_root, "models", "hear_classifier_v8_elite.h5")
 
56
 
57
+ # Global variables for lazy loading
58
  extractor = None
59
  classifier_model = None
 
60
 
61
  def load_resources():
62
+ global extractor, classifier_model
63
  if extractor is None:
64
  print("Initializing HeAR Extractor...")
 
65
  hf_token = os.environ.get('HF_TOKEN')
66
  extractor = HeARExtractor(token=hf_token)
67
 
68
  if classifier_model is None:
69
+ print(f"Loading Elite Model from {MODEL_PATH}...")
70
  classifier_model = load_model(MODEL_PATH, compile=False)
 
 
71
 
72
  @app.route('/')
73
  def index():
74
  return jsonify({
75
  "status": "online",
76
+ "service": "KasaHealth Diagnostic API (Elite V8)",
77
+ "version": "1.2.0",
78
  "message": "Send audio files via POST to /predict"
79
  })
80
 
 
93
  file.save(filepath)
94
 
95
  try:
 
96
  load_resources()
 
 
97
  y, sr = librosa.load(filepath, sr=16000, duration=5.0)
98
 
99
+ # VAD Lite
 
100
  rms_energy = np.mean(librosa.feature.rms(y=y))
 
 
101
  if rms_energy < 0.005:
102
  os.remove(filepath)
103
+ return jsonify({"error": "No cough detected. Please record in a quieter area."}), 400
 
 
104
 
 
105
  y_clean = advanced_preprocess(y, sr)
 
 
106
  emb = extractor.extract(y_clean)
107
 
108
  if emb is not None:
 
109
  X = emb[np.newaxis, ...]
110
+ prob = classifier_model.predict(X, verbose=0)[0][0]
111
+
112
+ # Logic: sigmoid (0=healthy, 1=sick)
113
+ raw_label = "sick" if prob > 0.5 else "healthy"
114
 
115
+ # Confidence is distance from decision boundary
116
+ if raw_label == "sick":
117
+ confidence = float(prob)
118
+ else:
119
+ confidence = float(1.0 - prob)
120
+
121
+ # --- Elite Threshold Check ---
122
+ # Only report 'sick' if we are VERY sure (> 0.8)
123
+ THRESHOLD = 0.80
124
  if raw_label == "sick" and confidence < THRESHOLD:
125
  final_label = "healthy"
126
  is_inconclusive = True
 
128
  final_label = raw_label
129
  is_inconclusive = False
130
 
 
131
  os.remove(filepath)
 
132
  return jsonify({
133
  "status": "success",
134
  "result": final_label,
 
139
  })
140
  else:
141
  os.remove(filepath)
142
+ return jsonify({"error": "Feature extraction failed"}), 500
143
 
144
  except Exception as e:
145
+ if os.path.exists(filepath): os.remove(filepath)
 
 
146
  return jsonify({"error": str(e)}), 500
147
 
148
  def get_recommendation(label, is_inconclusive):
download_new_dataset.py CHANGED
@@ -8,7 +8,7 @@ import shutil
8
  # --- Configuration ---
9
  DATASET_SLUG = "andrewmvd/covid19-cough-audio-classification"
10
  DOWNLOAD_DIR = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public"
11
- EXTRACT_DIR = os.path.join(DOWNLOAD_DIR, "extracted")
12
  ORGANIZED_DIR = os.path.join(DOWNLOAD_DIR, "organized")
13
 
14
  def run_command(cmd):
@@ -18,36 +18,46 @@ def run_command(cmd):
18
  print(f"Error executing command: {e}")
19
  return False
20
  return True
21
-
22
  def main():
23
  if not os.path.exists(DOWNLOAD_DIR):
24
  os.makedirs(DOWNLOAD_DIR)
25
 
26
- # 1. Download dataset using Kaggle CLI
27
- print(f"Step 1: Downloading {DATASET_SLUG}...")
28
- if not run_command(f"kaggle datasets download -d {DATASET_SLUG} -p {DOWNLOAD_DIR}"):
29
- print("Failed to download dataset. Check your Kaggle API key at ~/.kaggle/kaggle.json")
30
- return
31
-
32
- # 2. Extract dataset
33
- zip_filename = [f for f in os.listdir(DOWNLOAD_DIR) if f.endswith(".zip")][0]
34
- zip_path = os.path.join(DOWNLOAD_DIR, zip_filename)
35
-
36
- print(f"Step 2: Extracting {zip_filename} to {EXTRACT_DIR}...")
37
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
38
- zip_ref.extractall(EXTRACT_DIR)
39
-
40
- # 3. Find and load metadata
41
- metadata_file = None
42
- for root, dirs, files in os.walk(EXTRACT_DIR):
43
- for f in files:
44
- if "metadata" in f.lower() and f.endswith(".csv"):
45
- metadata_file = os.path.join(root, f)
46
- break
47
- if metadata_file: break
 
 
48
 
 
 
 
 
 
 
 
 
 
49
  if not metadata_file:
50
- print("Metadata CSV not found. Attempting to locate in subfolders...")
51
  return
52
 
53
  print(f"Step 3: Organizing files based on {metadata_file}...")
@@ -89,9 +99,7 @@ def main():
89
  else: count_s += 1
90
 
91
  print(f"Done! Organized {count_h} healthy and {count_s} sick files.")
92
- print(f"Source Folder: {ORGANIZED_DIR}")
93
-
94
- print(f"Finished organizing {processed_count} files into {ORGANIZED_DIR}")
95
 
96
  if __name__ == "__main__":
97
  main()
 
8
  # --- Configuration ---
9
  DATASET_SLUG = "andrewmvd/covid19-cough-audio-classification"
10
  DOWNLOAD_DIR = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public"
11
+ EXTRACT_DIR = DOWNLOAD_DIR # Use the base folder since user has manually unzipped there
12
  ORGANIZED_DIR = os.path.join(DOWNLOAD_DIR, "organized")
13
 
14
  def run_command(cmd):
 
18
  print(f"Error executing command: {e}")
19
  return False
20
  return True
 
21
  def main():
22
  if not os.path.exists(DOWNLOAD_DIR):
23
  os.makedirs(DOWNLOAD_DIR)
24
 
25
+ # Check if we already have the metadata CSV to skip steps 1 and 2
26
+ metadata_filename = "metadata_compiled.csv"
27
+ potential_metadata = os.path.join(DOWNLOAD_DIR, metadata_filename)
28
+
29
+ if os.path.exists(potential_metadata):
30
+ print(f"Step 1 & 2: Skipping! {metadata_filename} already found in {DOWNLOAD_DIR}")
31
+ metadata_file = potential_metadata
32
+ else:
33
+ # 1. Download dataset using Kaggle CLI
34
+ print(f"Step 1: Downloading {DATASET_SLUG}...")
35
+ if not run_command(f"kaggle datasets download -d {DATASET_SLUG} -p {DOWNLOAD_DIR}"):
36
+ print("Failed to download dataset. Check your Kaggle API key at ~/.kaggle/kaggle.json")
37
+ return
38
+
39
+ # 2. Extract dataset
40
+ zips = [f for f in os.listdir(DOWNLOAD_DIR) if f.endswith(".zip")]
41
+ if not zips:
42
+ print(f"No zip found in {DOWNLOAD_DIR}. Was it already unzipped?")
43
+ else:
44
+ zip_filename = zips[0]
45
+ zip_path = os.path.join(DOWNLOAD_DIR, zip_filename)
46
+ print(f"Step 2: Extracting {zip_filename} to {DOWNLOAD_DIR}...")
47
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
48
+ zip_ref.extractall(DOWNLOAD_DIR)
49
 
50
+ # 3. Find and load metadata again (in case it unzipped somewhere else)
51
+ metadata_file = None
52
+ for root, dirs, files in os.walk(DOWNLOAD_DIR):
53
+ for f in files:
54
+ if "metadata" in f.lower() and f.endswith(".csv"):
55
+ metadata_file = os.path.join(root, f)
56
+ break
57
+ if metadata_file: break
58
+
59
  if not metadata_file:
60
+ print("Metadata CSV not found. Please ensure the dataset is downloaded correctly.")
61
  return
62
 
63
  print(f"Step 3: Organizing files based on {metadata_file}...")
 
99
  else: count_s += 1
100
 
101
  print(f"Done! Organized {count_h} healthy and {count_s} sick files.")
102
+ print(f"Organized datasets location: {ORGANIZED_DIR}")
 
 
103
 
104
  if __name__ == "__main__":
105
  main()
kasahealth-frontend/index.html CHANGED
@@ -18,7 +18,7 @@
18
 
19
  <!-- Styles -->
20
  <link rel="stylesheet" href="static/css/style.css">
21
- <link rel="icon" type="image/png" href="static/images/logo.png">
22
  </head>
23
  <body>
24
 
@@ -26,9 +26,8 @@
26
  <nav class="glass" id="main-nav">
27
  <div class="container nav-container">
28
  <a href="#" class="logo-wrap" onclick="location.reload()">
29
- <img src="static/images/logo.png" alt="KasaHealth Logo">
30
  </a>
31
- <span class="credibility-tag"><i class="fas fa-shield-alt"></i> Clinical AI Edge</span>
32
  </div>
33
  </nav>
34
 
@@ -39,9 +38,6 @@
39
  <!-- ============================================== -->
40
  <section id="screen-landing" class="screen active-screen">
41
  <div class="hero-content">
42
- <div class="badge">
43
- <i class="fas fa-microchip"></i> Powered by Google HeAR API
44
- </div>
45
  <h1>Next-Gen Acoustic<br><span>Lung Screening</span></h1>
46
  <p>Smartphone-based respiratory analysis to detect early signs of pulmonary anomalies with clinical-grade precision.</p>
47
 
@@ -75,7 +71,7 @@
75
  <i class="fas fa-mobile-alt anim-float text-navy"></i>
76
  </div>
77
  <h4>1. Hold Phone Closer</h4>
78
- <p>Hold your smartphone about 15-20cm away from your mouth.</p>
79
  </div>
80
 
81
  <!-- Step 2 -->
 
18
 
19
  <!-- Styles -->
20
  <link rel="stylesheet" href="static/css/style.css">
21
+ <link rel="icon" type="image/jpeg" href="static/images/logo.jpeg">
22
  </head>
23
  <body>
24
 
 
26
  <nav class="glass" id="main-nav">
27
  <div class="container nav-container">
28
  <a href="#" class="logo-wrap" onclick="location.reload()">
29
+ <img src="static/images/logo.jpeg" alt="KasaHealth Logo">
30
  </a>
 
31
  </div>
32
  </nav>
33
 
 
38
  <!-- ============================================== -->
39
  <section id="screen-landing" class="screen active-screen">
40
  <div class="hero-content">
 
 
 
41
  <h1>Next-Gen Acoustic<br><span>Lung Screening</span></h1>
42
  <p>Smartphone-based respiratory analysis to detect early signs of pulmonary anomalies with clinical-grade precision.</p>
43
 
 
71
  <i class="fas fa-mobile-alt anim-float text-navy"></i>
72
  </div>
73
  <h4>1. Hold Phone Closer</h4>
74
+ <p>Keep it 4-5 inches away from your mouth for best acoustic quality.</p>
75
  </div>
76
 
77
  <!-- Step 2 -->
kasahealth-frontend/static/css/style.css CHANGED
@@ -1,14 +1,15 @@
1
  :root {
2
- --primary-navy: #1E4D7B;
3
- --primary-teal: #2BBF9E;
4
- --accent-teal: #e6f7f4;
 
5
  --text-main: #2c3e50;
6
  --text-light: #576574;
7
  --bg-surface: #ffffff;
8
- --bg-page: #f8fafc;
9
  --border-soft: #e2e8f0;
10
 
11
- --color-success: #27ae60;
12
  --color-danger: #e74c3c;
13
  --color-warning: #f59e0b;
14
 
@@ -63,9 +64,14 @@ nav {
63
  border-bottom: 1px solid rgba(0,0,0,0.05);
64
  }
65
 
66
- .nav-container { display: flex; justify-content: space-between; align-items: center; padding: 1rem 1.5rem; }
67
- .logo-wrap img { height: 35px; }
68
- .credibility-tag { font-size: 0.75rem; font-weight: 700; color: var(--primary-teal); border-radius: 20px; background: var(--accent-teal); padding: 0.3rem 0.8rem; }
 
 
 
 
 
69
 
70
  /* Main Container (SPA layout) */
71
  .app-container {
 
1
  :root {
2
+ /* Exact colors sampled from KasaHealth Logo */
3
+ --primary-navy: #184e85; /* From "Kasa" */
4
+ --primary-teal: #26b797; /* From "Health" */
5
+ --accent-teal: #e2f6f2; /* Soft background companion */
6
  --text-main: #2c3e50;
7
  --text-light: #576574;
8
  --bg-surface: #ffffff;
9
+ --bg-page: #f9fbfb; /* Very subtle clinical blue-tint */
10
  --border-soft: #e2e8f0;
11
 
12
+ --color-success: #26b797; /* Re-mapped success to brand teal */
13
  --color-danger: #e74c3c;
14
  --color-warning: #f59e0b;
15
 
 
64
  border-bottom: 1px solid rgba(0,0,0,0.05);
65
  }
66
 
67
+ .nav-container { display: flex; justify-content: space-between; align-items: center; padding: 1rem 1.5rem; max-width: 1200px; margin: 0 auto; width: 100%; }
68
+ .logo-wrap { display: flex; align-items: center; height: 100%; }
69
+ .logo-wrap img { height: 55px; object-fit: contain; } /* Mobile size */
70
+
71
+ @media (min-width: 768px) {
72
+ .logo-wrap img { height: 85px; } /* Crisp, large size for laptops and desktops */
73
+ .nav-container { padding: 1.5rem 2rem; }
74
+ }
75
 
76
  /* Main Container (SPA layout) */
77
  .app-container {
kasahealth-frontend/static/images/logo.jpeg ADDED
kasahealth-frontend/vercel.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 2,
3
+ "name": "kasahealth-elite",
4
+ "builds": [
5
+ { "src": "index.html", "use": "@vercel/static" },
6
+ { "src": "static/**", "use": "@vercel/static" }
7
+ ],
8
+ "routes": [
9
+ { "src": "/(.*)", "dest": "/index.html" }
10
+ ]
11
+ }
models/compare_and_test.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+ import time
7
+
8
+ # Paths
9
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
10
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
11
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
12
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
13
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
14
+
15
+ # Clean y
16
+ def clean_y(y):
17
+ if y.dtype.kind in ['U', 'S']:
18
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
19
+ return y.astype(np.float32)
20
+
21
+ # Load data (Same data we used, but we need an "unseen" set)
22
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
23
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
24
+ X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
25
+ y_all = np.concatenate([y1, y2], axis=0)
26
+
27
+ # Shift validation set for testing (Using a seed NOT used in training)
28
+ # Actually, let's just use the validation split from the 11k to be sure it's consistent.
29
+ # For truly unseen, let's use a 15% split like before but with 40 random samples.
30
+
31
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=999) # New seed 999
32
+
33
+ def run_test_on_model(model_name, model_path):
34
+ if not os.path.exists(model_path):
35
+ print(f"Model {model_name} not found yet. Skipping...")
36
+ return
37
+
38
+ model = tf.keras.models.load_model(model_path)
39
+
40
+ # 20 Healthy
41
+ h_idx = np.where(y_target == 0)[0]
42
+ s_idx = np.where(y_target == 1)[0]
43
+
44
+ rng = np.random.default_rng(2026)
45
+ sel_h = rng.choice(h_idx, 20, replace=False)
46
+ sel_s = rng.choice(s_idx, 20, replace=False)
47
+
48
+ X_h_test = X_target[sel_h]
49
+ X_s_test = X_target[sel_s]
50
+
51
+ # Predictions
52
+ preds_h = (model.predict(X_h_test) > 0.5).astype(int).flatten()
53
+ preds_s = (model.predict(X_s_test) > 0.5).astype(int).flatten()
54
+
55
+ acc_h = (np.sum(preds_h == 0) / 20) * 100
56
+ acc_s = (np.sum(preds_s == 1) / 20) * 100
57
+
58
+ print(f"\n--- Model: {model_name} ---")
59
+ print(f"Healthy Accuracy (20/20 Target): {acc_h:.2f}% ({np.sum(preds_h == 0)}/20)")
60
+ print(f"Sick Accuracy (20/20 Target): {acc_s:.2f}% ({np.sum(preds_s == 1)}/20)")
61
+ print(f"Total Model Accuracy (40 samples): {(acc_h + acc_s)/2:.2f}%")
62
+
63
+ # Models to compare
64
+ models_map = {
65
+ "V3 (Standard)": r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v3.h5",
66
+ "V5 PRO (Balanced)": r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
67
+ }
68
+
69
+ for name, path in models_map.items():
70
+ run_test_on_model(name, path)
models/final_test_50_healthy.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+
14
+ # Clean y
15
+ def clean_y(y):
16
+ if y.dtype.kind in ['U', 'S']:
17
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
18
+ return y.astype(np.float32)
19
+
20
+ # Load data
21
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
22
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
23
+ X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
24
+ y_all = np.concatenate([y1, y2], axis=0)
25
+
26
+ # Unseen set
27
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=777) # Seed 777
28
+
29
+ # Load Model
30
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
31
+ model = tf.keras.models.load_model(model_path)
32
+
33
+ # Filter for HEALTHY samples
34
+ h_idx = np.where(y_target == 0)[0]
35
+ if len(h_idx) < 50:
36
+ print(f"Warning: Only found {len(h_idx)} healthy samples.")
37
+ count = len(h_idx)
38
+ else:
39
+ count = 50
40
+
41
+ rng = np.random.default_rng(2026)
42
+ sel_h = rng.choice(h_idx, count, replace=False)
43
+ X_h_test = X_target[sel_h]
44
+ y_h_test = y_target[sel_h]
45
+
46
+ # Predictions
47
+ preds = model.predict(X_h_test)
48
+ y_pred_bin = (preds > 0.5).astype(int).flatten()
49
+
50
+ # Results (Correct = 0)
51
+ correct = np.sum(y_pred_bin == 0)
52
+ accuracy = (correct / count) * 100
53
+
54
+ print(f"\n--- Final Model: {os.path.basename(model_path)} ---")
55
+ print(f"Total Unseen Healthy Samples Tested: {count}")
56
+ print(f"Correct Identifications: {correct}")
57
+ print(f"Accuracy: {accuracy:.2f}%")
58
+
59
+ print("\n--- Summary of Correctness (Sick Confidence) ---")
60
+ for i in range(count):
61
+ conf = preds[i][0]
62
+ # For healthy, we want low sick confidence
63
+ result = "✅ HEALTHY" if y_pred_bin[i] == 0 else "❌ SICK (False Alarm)"
64
+ print(f"Sample {i+1:2d} | Sick Prob {conf*100:5.2f}% | Result: {result}")
models/final_test_50_sick.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+
14
+ # Clean y
15
+ def clean_y(y):
16
+ if y.dtype.kind in ['U', 'S']:
17
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
18
+ return y.astype(np.float32)
19
+
20
+ # Load data (Includes the new 1000 samples)
21
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
22
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
23
+ X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
24
+ y_all = np.concatenate([y1, y2], axis=0)
25
+
26
+ # Shift validation set for testing (Using a seed NOT used in training)
27
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=777) # Seed 777
28
+
29
+ # Load Model
30
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
31
+ model = tf.keras.models.load_model(model_path)
32
+
33
+ # Filter for SICK samples
34
+ s_idx = np.where(y_target == 1)[0]
35
+ if len(s_idx) < 50:
36
+ print(f"Warning: Only found {len(s_idx)} sick samples in target set.")
37
+ count = len(s_idx)
38
+ else:
39
+ count = 50
40
+
41
+ rng = np.random.default_rng(2026)
42
+ sel_s = rng.choice(s_idx, count, replace=False)
43
+ X_s_test = X_target[sel_s]
44
+ y_s_test = y_target[sel_s]
45
+
46
+ # Predictions
47
+ preds = model.predict(X_s_test)
48
+ y_pred_bin = (preds > 0.5).astype(int).flatten()
49
+
50
+ # Results
51
+ correct = np.sum(y_pred_bin == 1)
52
+ accuracy = (correct / count) * 100
53
+
54
+ print(f"\n--- Final Model: {os.path.basename(model_path)} ---")
55
+ print(f"Total Unseen Sick Samples Tested: {count}")
56
+ print(f"Correct Identifications: {correct}")
57
+ print(f"Accuracy: {accuracy:.2f}%")
58
+
59
+ print("\n--- Summary of Correctness ---")
60
+ for i in range(count):
61
+ conf = preds[i][0]
62
+ result = "✅ SICK" if y_pred_bin[i] == 1 else "❌ HEALTHY (Miss)"
63
+ print(f"Sample {i+1:2d} | Confidence {conf*100:5.2f}% | Result: {result}")
models/healthy_test_result.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Correct Healthy: 19/20, Accuracy: 95.00%
models/hear_classifier_v8_elite.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e65637808333de73d6d389c835639dbe198b50c45ef6c8202604cd4c332d5e3b
3
+ size 14286224
models/test_results.txt ADDED
Binary file (1.88 kB). View file
 
models/v8_final_test.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+ path_elite_x = os.path.join(base_dir, "hear_embeddings_elite", "X_elite.npy")
14
+ path_elite_y = os.path.join(base_dir, "hear_embeddings_elite", "y_elite.npy")
15
+
16
+ # Clean y
17
+ def clean_y(y):
18
+ if y.dtype.kind in ['U', 'S']:
19
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
20
+ return y.astype(np.float32)
21
+
22
+ # Load data (use the FULL set to find "truly unseen" samples)
23
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
24
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
25
+ Xe, ye = np.load(path_elite_x), clean_y(np.load(path_elite_y))
26
+ X_all = np.concatenate([X1, X2, Xe], axis=0).astype(np.float32)
27
+ y_all = np.concatenate([y1, y2, ye], axis=0)
28
+
29
+ # Unseen set (Seed 1000 for the final test)
30
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=1000)
31
+
32
+ # Load V8 Elite Model
33
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v8_elite.h5"
34
+ model = tf.keras.models.load_model(model_path)
35
+
36
+ # Filter for SICK and HEALTHY
37
+ s_idx = np.where(y_target == 1)[0]
38
+ h_idx = np.where(y_target == 0)[0]
39
+
40
+ rng = np.random.default_rng(2027) # Change seed again to ensure randomness
41
+ sel_s = rng.choice(s_idx, 50, replace=False)
42
+ sel_h = rng.choice(h_idx, 50, replace=False)
43
+
44
+ # Predictions - Sick
45
+ preds_s = model.predict(X_target[sel_s])
46
+ y_pred_s = (preds_s > 0.5).astype(int).flatten()
47
+ acc_s = (np.sum(y_pred_s == 1) / 50) * 100
48
+
49
+ # Predictions - Healthy
50
+ preds_h = model.predict(X_target[sel_h])
51
+ y_pred_h = (preds_h > 0.5).astype(int).flatten()
52
+ acc_h = (np.sum(y_pred_h == 0) / 50) * 100
53
+
54
+ print(f"\n--- Model V8: THE ELITE GUARD ---")
55
+ print(f"Dataset: Elite-Merged & Purged (10,342 Balanced Samples)")
56
+ print(f"\nSICK TEST (50 Samples): {acc_s:.2f}% ({np.sum(y_pred_s == 1)}/50)")
57
+ print(f"HEALTHY TEST (50 Samples): {acc_h:.2f}% ({np.sum(y_pred_h == 0)}/50)")
58
+ print(f"OVERALL ACCURACY: {(acc_s + acc_h)/2:.2f}%")
59
+
60
+ # Individual report for Sick misses
61
+ miss_s = 50 - np.sum(y_pred_s == 1)
62
+ print(f"\nNote: Misidentified {miss_s} sick samples.")
63
+ if miss_s > 0:
64
+ for i in range(miss_s):
65
+ miss_idx = np.where(y_pred_s == 0)[0][i]
66
+ print(f"Missed Sick {i+1:2d} | Confidence: {preds_s[miss_idx][0]*100:5.2f}%")
predict_user_file.py CHANGED
@@ -51,6 +51,18 @@ def predict_single_file(file_path):
51
  print("Loading and preprocessing audio...")
52
  y, sr = librosa.load(file_path, sr=16000, duration=5.0)
53
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # Apply Advanced Preprocessing (Critical for correct result!)
55
  y_clean = advanced_preprocess(y, sr)
56
 
 
51
  print("Loading and preprocessing audio...")
52
  y, sr = librosa.load(file_path, sr=16000, duration=5.0)
53
 
54
+ # --- NEW: Audio Validation (Gatekeeper) ---
55
+ from utils.audio_validator import validate_audio_is_cough
56
+ is_valid_cough, reason, val_conf = validate_audio_is_cough(y, sr)
57
+
58
+ if not is_valid_cough:
59
+ print("\n" + "="*50)
60
+ print(f"REJECTED: Audio Validation Failed!")
61
+ print(f"REASON: {reason}")
62
+ print(f"RECOMMENDATION: Please record a clear, loud cough in a quiet room.")
63
+ print("="*50)
64
+ return
65
+
66
  # Apply Advanced Preprocessing (Critical for correct result!)
67
  y_clean = advanced_preprocess(y, sr)
68
 
utils/analyze_misses.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+
14
+ # Load and clean
15
+ def clean_y(y):
16
+ if y.dtype.kind in ['U', 'S']:
17
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
18
+ return y.astype(np.float32)
19
+
20
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
21
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
22
+
23
+ # Merge
24
+ X = np.concatenate([X1, X2], axis=0).astype(np.float32)
25
+ y = np.concatenate([y1, y2], axis=0)
26
+ # No shuffle yet to keep indices traceable if needed, but we'll shuffle after identifying
27
+ # Actually, we just need the samples.
28
+
29
+ # Load Model
30
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v3.h5"
31
+ model = tf.keras.models.load_model(model_path)
32
+
33
+ # Predict all
34
+ preds = model.predict(X, batch_size=128)
35
+ y_pred_bin = (preds > 0.5).astype(int).flatten()
36
+
37
+ # Find misclassified SICK samples
38
+ sick_mask = (y == 1)
39
+ misclassified_sick = (sick_mask & (y_pred_bin == 0))
40
+ correctly_classified_sick = (sick_mask & (y_pred_bin == 1))
41
+
42
+ total_sick = np.sum(sick_mask)
43
+ total_mis_sick = np.sum(misclassified_sick)
44
+
45
+ # Find misclassified HEALTHY samples
46
+ healthy_mask = (y == 0)
47
+ misclassified_healthy = (healthy_mask & (y_pred_bin == 1))
48
+ total_healthy = np.sum(healthy_mask)
49
+ total_mis_healthy = np.sum(misclassified_healthy)
50
+
51
+ print(f"Total Sick: {total_sick}, Misclassified: {total_mis_sick} ({total_mis_sick/total_sick*100:.2f}%)")
52
+ print(f"Total Healthy: {total_healthy}, Misclassified: {total_mis_healthy} ({total_mis_healthy/total_healthy*100:.2f}%)")
53
+
54
+ # Let's see how many samples we have in total
55
+ print(f"Total Samples: {len(X)}")
utils/audio_validator.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+
4
+ def validate_audio_is_cough(y, sr):
5
+ """
6
+ Validates if the audio signal represents a typical cough pattern.
7
+ Returns: (is_valid: bool, reason: str, confidence: float)
8
+ """
9
+ try:
10
+ # Extract features
11
+ rms = librosa.feature.rms(y=y)[0]
12
+ zcr = librosa.feature.zero_crossing_rate(y=y)[0]
13
+ centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
14
+
15
+ # 1. Check for total silence or extremely low volume
16
+ mean_rms = np.mean(rms)
17
+ if mean_rms < 0.001:
18
+ return False, "Audio is too quiet or empty.", 0.0
19
+
20
+ # 2. Check for continuous talking/laughing (high harmonicity/steady energy)
21
+ # Coughs have high energy variance (bursts). Talking/laughing is more continuous.
22
+ rms_variance = np.var(rms)
23
+ rms_mean_ratio = rms_variance / (mean_rms + 1e-6)
24
+
25
+ # 3. Check Spectral Centroid (Pitch/Brightness)
26
+ # Laughing and talking often have a lower, more stable spectral centroid than the harsh burst of a cough.
27
+ mean_centroid = np.mean(centroid)
28
+
29
+ # 4. Check Zero Crossing Rate (Noisiness)
30
+ # Coughs are noisy (high ZCR bursts). Vowels in laughing/talking are harmonic (steady, lower ZCR).
31
+ mean_zcr = np.mean(zcr)
32
+
33
+ # Simple heuristic thresholding (can be tuned based on user files)
34
+ # A typical cough has high variance (spikes) and high noisiness.
35
+ is_continuous_noise = rms_mean_ratio < 0.015 and mean_zcr < 0.05
36
+
37
+ if is_continuous_noise:
38
+ # Looks like steady speech, laughing, or background humming
39
+ return False, "Audio detected as speaking, laughing, or steady noise. Please record a clear cough.", 0.95
40
+
41
+ # Passed basic validation
42
+ return True, "Valid audio signal detected.", 0.85
43
+
44
+ except Exception as e:
45
+ return False, f"Validation error: {str(e)}", 0.0
utils/audit_mislabels.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import tensorflow as tf
import os

# Paths to the two embedding datasets and the Coughvid path-mapping file.
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
mapping_path = os.path.join(base_dir, "hear_embeddings_coughvid", "processed_paths.txt")

def clean_y(y):
    """Normalize labels to float32, mapping the string label 'sick' to 1."""
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)

# Load data (dataset 1 = original merged set, dataset 2 = Coughvid).
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
y_all = np.concatenate([y1, y2], axis=0)

# Coughvid samples start right after dataset 1. Computing the boundary
# (instead of hard-coding 6824) keeps the file mapping correct if the
# merged dataset grows or shrinks.
coughvid_offset = len(X1)

# Load Model used to score every sample.
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
model = tf.keras.models.load_model(model_path)

# Predict all samples in one batched pass.
print(f"Auditing {len(X_all)} samples for mislabels...")
preds = model.predict(X_all, batch_size=128).flatten()

# Flag SICK samples (label=1) where the model is VERY sure they are
# healthy (pred < 0.1) — likely mislabels.
sick_labels = (y_all == 1)
confused_sick_indices = np.where(sick_labels & (preds < 0.1))[0]

# Coughvid index -> original file path mapping (written by the extractor).
coughvid_paths = []
if os.path.exists(mapping_path):
    with open(mapping_path, 'r') as f:
        coughvid_paths = [line.strip() for line in f]

print(f"\nFound {len(confused_sick_indices)} Sick samples the model is SURE are Healthy.")

# Build a human-readable report for the first flagged samples.
audit_report = "--- DATA AUDIT: SICK SAMPLES FLAGGED AS HEALTHY ---\n"
audit_report += f"Total Samples Audited: {len(X_all)}\n"
audit_report += f"Total Flags: {len(confused_sick_indices)}\n\n"

results_to_show = confused_sick_indices[:15]
for idx in results_to_show:
    confidence_as_healthy = (1 - preds[idx]) * 100
    if idx >= coughvid_offset:  # From Coughvid — can resolve the file name.
        sub_idx = idx - coughvid_offset
        if sub_idx < len(coughvid_paths):
            path = coughvid_paths[sub_idx]
            audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: {os.path.basename(path)}\n"
        else:
            audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: Unknown (Coughvid index {sub_idx})\n"
    else:
        audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: Unknown (Original Dataset)\n"

with open("audit_mislabels.txt", "w") as f:
    f.write(audit_report)
print(audit_report)
utils/audit_mislabels.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --- DATA AUDIT: SICK SAMPLES FLAGGED AS HEALTHY ---
2
+ Total Samples Audited: 11092
3
+ Total Flags: 213
4
+
5
+ Index 4 | Model Confidence: 90.3% (Healthy) | File: Unknown (Original Dataset)
6
+ Index 122 | Model Confidence: 91.2% (Healthy) | File: Unknown (Original Dataset)
7
+ Index 165 | Model Confidence: 93.8% (Healthy) | File: Unknown (Original Dataset)
8
+ Index 211 | Model Confidence: 93.1% (Healthy) | File: Unknown (Original Dataset)
9
+ Index 255 | Model Confidence: 90.7% (Healthy) | File: Unknown (Original Dataset)
10
+ Index 270 | Model Confidence: 98.4% (Healthy) | File: Unknown (Original Dataset)
11
+ Index 332 | Model Confidence: 98.1% (Healthy) | File: Unknown (Original Dataset)
12
+ Index 472 | Model Confidence: 90.9% (Healthy) | File: Unknown (Original Dataset)
13
+ Index 475 | Model Confidence: 94.0% (Healthy) | File: Unknown (Original Dataset)
14
+ Index 477 | Model Confidence: 93.2% (Healthy) | File: Unknown (Original Dataset)
15
+ Index 486 | Model Confidence: 92.0% (Healthy) | File: Unknown (Original Dataset)
16
+ Index 501 | Model Confidence: 90.8% (Healthy) | File: Unknown (Original Dataset)
17
+ Index 594 | Model Confidence: 92.1% (Healthy) | File: Unknown (Original Dataset)
18
+ Index 630 | Model Confidence: 90.1% (Healthy) | File: Unknown (Original Dataset)
19
+ Index 631 | Model Confidence: 92.3% (Healthy) | File: Unknown (Original Dataset)
utils/extract_elite_samples.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ import sys
6
+
7
+ # Add project root to path
8
+ PROJECT_ROOT = r"c:\Users\ASUS\lung_ai_project"
9
+ if PROJECT_ROOT not in sys.path:
10
+ sys.path.append(PROJECT_ROOT)
11
+
12
+ from utils.hear_extractor import HeARExtractor
13
+
14
+ # --- Config ---
15
+ META_PATH = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\metadata_compiled.csv"
16
+ AUDIO_ROOT = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized"
17
+ OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hear_embeddings_elite"
18
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
19
+
20
def _locate_audio(folder, uuid):
    """Return the on-disk audio path for a Coughvid uuid, or None if absent.

    Files in the organized folders are normally prefixed with 'cv_', so that
    form is tried first for each known extension, then the bare uuid.
    """
    for prefix in ("cv_", ""):
        for ext in ('.webm', '.wav', '.ogg'):
            path = os.path.join(folder, prefix + uuid + ext)
            if os.path.exists(path):
                return path
    return None

def main():
    """Select high-confidence ('elite') Coughvid samples and extract HeAR embeddings."""
    print("Identifying Elite Samples...")
    df = pd.read_csv(META_PATH)

    # SICK (Elite Match): confident cough detection and acceptable SNR.
    elite_sick_uuids = df[(df['status'].isin(['COVID-19', 'sick'])) & (df['cough_detected'] > 0.8) & (df['SNR'] > 5)]['uuid'].tolist()

    # HEALTHY (Elite Match): very confident cough detection.
    elite_healthy_uuids = df[(df['status'] == 'healthy') & (df['cough_detected'] > 0.95)]['uuid'].tolist()

    print(f"Total Eligible Elite Sick: {len(elite_sick_uuids)}")
    print(f"Total Eligible Elite Healthy: {len(elite_healthy_uuids)}")

    # Cap each class at 1,000 samples to bound extraction time.
    sick_to_process = elite_sick_uuids[:1000]
    healthy_to_process = elite_healthy_uuids[:1000]

    # Map UUIDs to actual audio paths; uuids with no file on disk are
    # skipped silently. Sick tasks are appended first, then healthy,
    # matching the original ordering.
    all_tasks = []
    for uuids, label in ((sick_to_process, 'sick'), (healthy_to_process, 'healthy')):
        folder = os.path.join(AUDIO_ROOT, label)
        for uuid in uuids:
            path = _locate_audio(folder, uuid)
            if path is not None:
                all_tasks.append((path, label))

    print(f"Starting Elite Extraction (Total: {len(all_tasks)} samples)...")

    extractor = HeARExtractor()
    features = []
    labels = []

    for path, label in tqdm(all_tasks):
        try:
            emb = extractor.extract(path)
            if emb is not None:
                features.append(emb)
                labels.append(label)
        except Exception:
            # Corrupt/undecodable files are skipped; extraction continues.
            continue

    np.save(os.path.join(OUTPUT_DIR, "X_elite.npy"), np.array(features))
    np.save(os.path.join(OUTPUT_DIR, "y_elite.npy"), np.array(labels))
    print(f"Elite Data Saved: {len(features)} samples.")

if __name__ == "__main__":
    main()
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import librosa
4
+ from tqdm import tqdm
5
+ import sys
6
+
7
+ # Add project root to path
8
+ PROJECT_ROOT = r"c:\Users\ASUS\lung_ai_project"
9
+ if PROJECT_ROOT not in sys.path:
10
+ sys.path.append(PROJECT_ROOT)
11
+
12
+ from utils.hear_extractor import HeARExtractor
13
+
14
+ # --- Config ---
15
+ AUDIO_SEARCH_PATHS = [
16
+ r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized\sick",
17
+ r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized\healthy",
18
+ r"c:\Users\ASUS\lung_ai_project\data\cough",
19
+ r"c:\Users\ASUS\lung_ai_project\data\coswara\coswara_data\kaggle_data"
20
+ ]
21
+ OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hybrid_features"
22
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
23
+
24
def extract_traditional_features(y, sr):
    """Build a 16-dim classic-DSP feature vector for an audio signal:
    13 mean MFCC coefficients followed by the mean spectral centroid,
    zero-crossing rate and RMS energy."""
    mfcc_means = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    scalar_means = [
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),  # brightness
        np.mean(librosa.feature.zero_crossing_rate(y)),          # noisiness
        np.mean(librosa.feature.rms(y=y)),                       # energy
    ]
    return np.concatenate([mfcc_means, scalar_means])
42
+
43
def main():
    """Extract combined HeAR + traditional features for a balanced subset of files."""
    print("Finding audio files...")
    all_files = []
    for path in AUDIO_SEARCH_PATHS:
        if not os.path.exists(path):
            continue
        if "kaggle_data" in path:
            # Coswara needs its metadata CSV to resolve labels; until that is
            # wired in, this source is intentionally skipped. (The previous
            # version scanned the folder but discarded every result.)
            continue
        # Coughvid/cough folders encode the label in the directory name.
        label = 'sick' if 'sick' in path else 'healthy'
        for f in os.listdir(path):
            if f.endswith(('.wav', '.webm', '.ogg')):
                all_files.append((os.path.join(path, f), label))

    print(f"Total files found (Coughvid + Cough): {len(all_files)}")

    # Shuffle, then cap at 500 per class for a balanced, fast run.
    import random
    random.shuffle(all_files)

    sick_list = [f for f in all_files if f[1] == 'sick'][:500]
    healthy_list = [f for f in all_files if f[1] == 'healthy'][:500]
    balanced_files = sick_list + healthy_list

    print(f"Processing {len(balanced_files)} balanced samples for Hybrid model...")

    extractor = HeARExtractor()

    hybrid_features = []
    labels = []

    for audio_path, label in tqdm(balanced_files):
        try:
            # 1. HeAR embedding (deep feature).
            emb = extractor.extract(audio_path)
            if emb is None:
                continue

            # 2. Traditional DSP features at the same 16 kHz rate.
            y, sr = librosa.load(audio_path, sr=16000)
            trad = extract_traditional_features(y, sr)

            # Concatenate deep + traditional into one hybrid vector.
            hybrid_features.append(np.concatenate([emb, trad]))
            labels.append(label)

        except Exception:
            # Undecodable/corrupt audio is skipped; extraction continues.
            continue

    X_hybrid = np.array(hybrid_features)
    y_hybrid = np.array(labels)

    np.save(os.path.join(OUTPUT_DIR, "X_hybrid.npy"), X_hybrid)
    np.save(os.path.join(OUTPUT_DIR, "y_hybrid.npy"), y_hybrid)
    print(f"Saved {len(X_hybrid)} hybrid samples to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()
utils/hear_extractor.py CHANGED
@@ -1,5 +1,20 @@
1
  import os
2
  import sys
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import numpy as np
4
  import librosa
5
  import tensorflow as tf
@@ -26,8 +41,8 @@ class HeARExtractor:
26
  from huggingface_hub import login
27
  login(token=token)
28
 
29
- # Use /tmp for the model cache on the server to avoid permission errors
30
- model_cache_path = os.path.join("/tmp", "hear_model_cache")
31
 
32
  # Download model files manually to avoid symlink issues on Windows
33
  # and ignore unrelated folders (like event_detector) to speed up download
@@ -94,7 +109,8 @@ class HeARExtractor:
94
  return np.mean(embeddings, axis=0)
95
 
96
  except Exception as e:
97
- print(f"Extraction error: {e}")
 
98
  return None
99
 
100
  if __name__ == "__main__":
 
1
  import os
2
  import sys
3
+ import glob
4
+
5
# --- FFmpeg Path Fix for Windows ---
# The audio loaders used below need the ffmpeg executable on PATH to decode
# compressed formats (.webm/.ogg); on this machine ffmpeg was installed via
# WinGet and is not on PATH by default.
# 1. Try common WinGet location found on this machine
FFMPEG_DIR = r"C:\Users\ASUS\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-8.1-full_build\bin"
if not os.path.exists(FFMPEG_DIR):
    # 2. Try dynamic search in WinGet folder as fallback — scan the packages
    #    directory for any ffmpeg "bin" folder and take the first match.
    winget_base = os.path.join(os.environ.get("LOCALAPPDATA", ""), "Microsoft", "WinGet", "Packages")
    ffmpeg_bins = glob.glob(os.path.join(winget_base, "*ffmpeg*", "**", "bin"), recursive=True)
    if ffmpeg_bins:
        FFMPEG_DIR = ffmpeg_bins[0]

# Append once; the substring check avoids growing PATH on repeated imports.
if os.path.exists(FFMPEG_DIR) and FFMPEG_DIR not in os.environ["PATH"]:
    os.environ["PATH"] += os.pathsep + FFMPEG_DIR
18
  import numpy as np
19
  import librosa
20
  import tensorflow as tf
 
41
  from huggingface_hub import login
42
  login(token=token)
43
 
44
+ # Use a local folder in the project for the model cache
45
+ model_cache_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "hear_model_cache")
46
 
47
  # Download model files manually to avoid symlink issues on Windows
48
  # and ignore unrelated folders (like event_detector) to speed up download
 
109
  return np.mean(embeddings, axis=0)
110
 
111
  except Exception as e:
112
+ error_msg = str(e) if str(e) else "Unknown error (check if FFmpeg is working or file is corrupted)"
113
+ print(f"Extraction error ({os.path.basename(audio_input) if isinstance(audio_input, str) else 'array'}): {error_msg}")
114
  return None
115
 
116
  if __name__ == "__main__":
utils/inspect_labels.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import os

# Diagnostic script: show label dtypes/uniques of both datasets before and
# after normalizing to binary int labels, then verify the merged result.
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")

def to_binary_labels(y, name):
    """Return labels as int32, mapping string labels ('sick' -> 1, else 0).

    Prints the converted dtype/uniques only when a string conversion actually
    happened, matching the script's original diagnostic output. Deduplicates
    the conversion logic that was previously repeated per dataset.
    """
    if y.dtype.kind in ['U', 'S']:
        converted = np.where(y == 'sick', 1, 0).astype(np.int32)
        print(f"{name} converted dtype: {converted.dtype}, unique: {np.unique(converted)}")
        return converted
    return y.astype(np.int32)

y1 = np.load(path1_y)
y2 = np.load(path2_y)

print(f"y1 dtype: {y1.dtype}, unique: {np.unique(y1)}")
print(f"y2 dtype: {y2.dtype}, unique: {np.unique(y2)}")

y1_converted = to_binary_labels(y1, "y1")
y2_converted = to_binary_labels(y2, "y2")

y_merged = np.concatenate([y1_converted, y2_converted])
print(f"y_merged dtype: {y_merged.dtype}, unique: {np.unique(y_merged)}")
utils/process_coughvid.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ import sys
5
+ import ctypes # To keep Windows awake
6
+
7
+ # Add project root to path to allow absolute imports
8
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9
+ if PROJECT_ROOT not in sys.path:
10
+ sys.path.append(PROJECT_ROOT)
11
+
12
+ from utils.hear_extractor import HeARExtractor
13
+
14
+ # --- Configuration ---
15
+ DATA_ROOT = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized"
16
+ OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hear_embeddings_coughvid"
17
+ CHECKPOINT_EVERY = 50
18
+ TARGET_SICK_COUNT = 2500 # Extracting 1,000 more sick samples
19
+
20
def run_extraction():
    """Extract HeAR embeddings for 'sick' Coughvid files, resumable via checkpoints.

    Appends to the existing X/y .npy arrays until TARGET_SICK_COUNT sick
    samples exist. A text tracker file records every processed audio path so
    re-runs never duplicate work, and periodic checkpoints bound data loss
    on crash or Ctrl+C.
    """
    # Keep Windows awake during extraction (long-running job).
    try:
        # ES_CONTINUOUS (0x80000000) | ES_SYSTEM_REQUIRED (0x00000001)
        ctypes.windll.kernel32.SetThreadExecutionState(0x80000000 | 0x00000001)
        print(">>> Windows 'Stay Awake' mode enabled.")
    except Exception:
        # Non-Windows or restricted environment: proceed without it.
        print(">>> Warning: Could not enable 'Stay Awake' mode.")

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    features_path = os.path.join(OUTPUT_DIR, "X_coughvid.npy")
    labels_path = os.path.join(OUTPUT_DIR, "y_coughvid.npy")

    # Load existing arrays if available so the run resumes rather than
    # restarting from scratch.
    features = []
    labels = []
    if os.path.exists(features_path) and os.path.exists(labels_path):
        print("Loading existing embeddings...")
        features = list(np.load(features_path))
        labels = list(np.load(labels_path))

    current_sick_count = sum(1 for l in labels if l == 'sick')
    print(f"Current Sick Samples: {current_sick_count}")

    if current_sick_count >= TARGET_SICK_COUNT:
        print(f"Goal reached! You already have {current_sick_count} sick samples.")
        return

    # Tracker file: one processed audio path per line, used to skip duplicates.
    tracker_path = os.path.join(OUTPUT_DIR, "processed_paths.txt")
    processed_paths = set()
    if os.path.exists(tracker_path):
        with open(tracker_path, 'r') as f:
            processed_paths = set(line.strip() for line in f)

    # Collect only SICK files that have not been processed yet.
    folder = os.path.join(DATA_ROOT, 'sick')
    all_sick_files = []
    if os.path.exists(folder):
        for f in os.listdir(folder):
            full_path = os.path.join(folder, f)
            if f.endswith(('.webm', '.ogg', '.wav')) and full_path not in processed_paths:
                all_sick_files.append(full_path)

    # Only process as many files as needed to reach the target.
    remaining_to_goal = TARGET_SICK_COUNT - current_sick_count
    files_to_process = all_sick_files[:remaining_to_goal]

    print(f"Extraction Target: {len(files_to_process)} more sick samples.")

    if not files_to_process:
        print("No more unique sick files found to process.")
        return

    # Initialize Extractor (loads/downloads the HeAR model).
    print("Initializing HeAR Extractor...")
    extractor = HeARExtractor()

    try:
        count = 0
        with open(tracker_path, 'a') as tracker:
            for path in tqdm(files_to_process, desc="Extracting Sick"):
                emb = extractor.extract(path)
                if emb is not None:
                    features.append(emb)
                    labels.append('sick')
                    tracker.write(path + "\n")
                    count += 1

                    # Periodic checkpoint: a crash loses at most
                    # CHECKPOINT_EVERY samples of work.
                    if count % CHECKPOINT_EVERY == 0 and count > 0:
                        np.save(features_path, np.array(features))
                        np.save(labels_path, np.array(labels))

        # Final save after a complete run.
        np.save(features_path, np.array(features))
        np.save(labels_path, np.array(labels))
        print(f"Success! Now you have {sum(1 for l in labels if l == 'sick')} sick samples in total.")

    except KeyboardInterrupt:
        # Ctrl+C: persist whatever was extracted so the next run resumes.
        print("\nStopping and saving progress...")
        np.save(features_path, np.array(features))
        np.save(labels_path, np.array(labels))
        print("Progress saved.")
    finally:
        # Reset Windows sleep settings to normal (ES_CONTINUOUS alone).
        try:
            ctypes.windll.kernel32.SetThreadExecutionState(0x80000000)
        except Exception:
            pass
110
+
111
# Bug fix: this guard (and the call) was duplicated, so running the script
# directly would execute the entire extraction twice back-to-back.
if __name__ == "__main__":
    run_extraction()
utils/purge_and_retrain.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import tensorflow as tf
import os
# NOTE(review): StratifiedKFold and shuffle are imported but never used below.
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils import shuffle

# Paths to the two embedding datasets (original merged set + Coughvid).
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")

def clean_y(y):
    # Datasets store labels either as strings ('sick'/...) or numbers;
    # normalize both to float32 with sick=1, everything else=0.
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)

# Load and merge both datasets.
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
X = np.concatenate([X1, X2], axis=0).astype(np.float32)
y = np.concatenate([y1, y2], axis=0)

# Load current best model (v5); it is used only to SCORE samples so that
# label/prediction contradictions can be purged before retraining.
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
model = tf.keras.models.load_model(model_path)

# Predict every sample in one batched pass.
print("Finding unreliable samples for the GREAT PURGE...")
preds = model.predict(X, batch_size=128).flatten()

# PURGE CRITERIA:
# 1. Label says SICK (1), but model is >85% sure it is HEALTHY (pred < 0.15)
# 2. Label says HEALTHY (0), but model is >85% sure it is SICK (pred > 0.85)
purge_mask = ((y == 1) & (preds < 0.15)) | ((y == 0) & (preds > 0.85))

# KEEP ONLY RELIABLE SAMPLES (everything not flagged as contradictory).
X_clean = X[~purge_mask]
y_clean = y[~purge_mask]

print(f"Purged {np.sum(purge_mask)} 'Contradictory' samples.")
print(f"Clean Dataset size: {len(X_clean)}")

# STRICT BALANCING (match minority class size): random undersampling of the
# majority class so both classes contribute equally to training.
h_idx = np.where(y_clean == 0)[0]
s_idx = np.where(y_clean == 1)[0]
min_size = min(len(h_idx), len(s_idx))
print(f"Balancing Clean Set to {min_size} samples per class.")

np.random.seed(42)  # reproducible undersampling and shuffle
bal_h = np.random.choice(h_idx, min_size, replace=False)
bal_s = np.random.choice(s_idx, min_size, replace=False)
idx = np.concatenate([bal_h, bal_s])
np.random.shuffle(idx)

X_final = X_clean[idx]
y_final = y_clean[idx]

# Final training with higher capacity: 1024 -> 512 -> 256 MLP head with
# batch-norm + dropout regularization, sigmoid output for binary labels.
final_model = tf.keras.Sequential([
    tf.keras.layers.Dense(1024, activation='relu', input_shape=(X_final.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),

    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),

    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

final_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])
# Early stopping restores the weights from the best validation epoch.
cb = [tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)]

# Train on an 85/15 split of the purged, balanced data.
X_t, X_v, y_t, y_v = train_test_split(X_final, y_final, test_size=0.15, random_state=42)

print("Starting FINAL TRAINING on Purged & Balanced dataset...")
history = final_model.fit(X_t, y_t, validation_data=(X_v, y_v), epochs=150, batch_size=32, callbacks=cb)

# Save the retrained model as v6.
save_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v6_final.h5"
final_model.save(save_path)
print(f"Purged Model saved to {save_path}")

best_acc = max(history.history['val_accuracy'])
print(f"Best Purged-Set Accuracy: {best_acc*100:.2f}%")
utils/test_overlap.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import os
from scipy.spatial.distance import cdist

# Diagnostic script: estimate how much the sick and healthy embedding
# clusters overlap, using cosine distances on random 500-sample subsets.

# Paths
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")

def clean_y(y):
    """Normalize labels to float32, mapping the string label 'sick' to 1."""
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)

def mean_offdiag(dist_matrix):
    """Mean of a square self-distance matrix EXCLUDING the diagonal.

    The diagonal of cdist(A, A) is all zeros (distance of each point to
    itself), so including it biases the within-class average downward;
    divide the total by n*(n-1) rather than n*n.
    """
    n = dist_matrix.shape[0]
    return dist_matrix.sum() / (n * (n - 1))

X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))

X = np.concatenate([X1, X2], axis=0).astype(np.float32)
y = np.concatenate([y1, y2], axis=0)

# Fixed seed so the reported overlap numbers are reproducible between runs.
np.random.seed(0)

sick_indices = np.where(y == 1)[0]
healthy_indices = np.where(y == 0)[0]

# Small subsets: a full ~11k x 11k cdist would be too slow.
subs_s = np.random.choice(sick_indices, 500, replace=False)
subs_h = np.random.choice(healthy_indices, 500, replace=False)

X_s = X[subs_s]
X_h = X[subs_h]

# Cross-class cosine distances (500 sick vs 500 healthy).
dist_matrix = cdist(X_s, X_h, 'cosine')

# Count sick samples lying extremely close to healthy ones.
very_close = np.where(dist_matrix < 0.05)
print(f"Overlap Analysis (Cosine Distance < 0.05): {len(very_close[0])} pairs found.")

avg_dist_sick_to_healthy = np.mean(dist_matrix)
print(f"Average Distance (Sick to Healthy): {avg_dist_sick_to_healthy:.4f}")

# Within-class averages, excluding the zero self-distance diagonal.
dist_within_sick = cdist(X_s, X_s, 'cosine')
avg_dist_within_sick = mean_offdiag(dist_within_sick)
print(f"Average Distance (Within Sick): {avg_dist_within_sick:.4f}")

dist_within_healthy = cdist(X_h, X_h, 'cosine')
avg_dist_within_healthy = mean_offdiag(dist_within_healthy)
print(f"Average Distance (Within Healthy): {avg_dist_within_healthy:.4f}")
+ print(f"Average Distance (Within Healthy): {avg_dist_within_healthy:.4f}")