78anand commited on
Commit
4fcfef4
·
verified ·
1 Parent(s): a1d9ea4

Upload folder using huggingface_hub

Browse files
app/main.py CHANGED
@@ -52,34 +52,29 @@ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB limit
52
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
53
 
54
  # Configuration
55
- MODEL_PATH = os.path.join(project_root, "models", "hear_classifier_advanced.h5")
56
- CLASSES_PATH = os.path.join(project_root, "models", "hear_classes_advanced.npy")
57
 
58
- # Global variables for laziness loading
59
  extractor = None
60
  classifier_model = None
61
- classes = None
62
 
63
  def load_resources():
64
- global extractor, classifier_model, classes
65
  if extractor is None:
66
  print("Initializing HeAR Extractor...")
67
- # Use the HF_TOKEN from Space Secrets for gated model access
68
  hf_token = os.environ.get('HF_TOKEN')
69
  extractor = HeARExtractor(token=hf_token)
70
 
71
  if classifier_model is None:
72
- print(f"Loading Model from {MODEL_PATH}...")
73
  classifier_model = load_model(MODEL_PATH, compile=False)
74
- classes = np.load(CLASSES_PATH)
75
- print(f"Classes: {classes}")
76
 
77
  @app.route('/')
78
  def index():
79
  return jsonify({
80
  "status": "online",
81
- "service": "KasaHealth Diagnostic API",
82
- "version": "1.1.0",
83
  "message": "Send audio files via POST to /predict"
84
  })
85
 
@@ -98,39 +93,34 @@ def predict():
98
  file.save(filepath)
99
 
100
  try:
101
- # Ensure resources are loaded
102
  load_resources()
103
-
104
- # 1. Load and resample
105
  y, sr = librosa.load(filepath, sr=16000, duration=5.0)
106
 
107
- # --- VAD FILTER (Voice Activity Detection) ---
108
- # Calculate the root mean square (RMS) energy to find volume
109
  rms_energy = np.mean(librosa.feature.rms(y=y))
110
-
111
- # If the recording is practically silent, reject it
112
  if rms_energy < 0.005:
113
  os.remove(filepath)
114
- return jsonify({
115
- "error": "No cough detected. The recording was too quiet. Please cough forcefully."
116
- }), 400
117
 
118
- # 2. Preprocess
119
  y_clean = advanced_preprocess(y, sr)
120
-
121
- # 3. Extract Features
122
  emb = extractor.extract(y_clean)
123
 
124
  if emb is not None:
125
- # 4. Predict
126
  X = emb[np.newaxis, ...]
127
- preds = classifier_model.predict(X, verbose=0)
128
- pred_idx = np.argmax(preds[0])
129
- raw_label = classes[pred_idx]
130
- confidence = float(preds[0][pred_idx])
131
 
132
- # --- Reliability Guard ---
133
- THRESHOLD = 0.70
 
 
 
 
 
 
 
134
  if raw_label == "sick" and confidence < THRESHOLD:
135
  final_label = "healthy"
136
  is_inconclusive = True
@@ -138,9 +128,7 @@ def predict():
138
  final_label = raw_label
139
  is_inconclusive = False
140
 
141
- # Clean up file
142
  os.remove(filepath)
143
-
144
  return jsonify({
145
  "status": "success",
146
  "result": final_label,
@@ -151,12 +139,10 @@ def predict():
151
  })
152
  else:
153
  os.remove(filepath)
154
- return jsonify({"error": "Could not extract features from audio"}), 500
155
 
156
  except Exception as e:
157
- if os.path.exists(filepath):
158
- os.remove(filepath)
159
- print(f"Error processing audio: {e}")
160
  return jsonify({"error": str(e)}), 500
161
 
162
  def get_recommendation(label, is_inconclusive):
 
52
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
53
 
54
  # Configuration
55
+ MODEL_PATH = os.path.join(project_root, "models", "hear_classifier_v8_elite.h5")
 
56
 
57
+ # Global variables for lazy loading
58
  extractor = None
59
  classifier_model = None
 
60
 
61
  def load_resources():
62
+ global extractor, classifier_model
63
  if extractor is None:
64
  print("Initializing HeAR Extractor...")
 
65
  hf_token = os.environ.get('HF_TOKEN')
66
  extractor = HeARExtractor(token=hf_token)
67
 
68
  if classifier_model is None:
69
+ print(f"Loading Elite Model from {MODEL_PATH}...")
70
  classifier_model = load_model(MODEL_PATH, compile=False)
 
 
71
 
72
  @app.route('/')
73
  def index():
74
  return jsonify({
75
  "status": "online",
76
+ "service": "KasaHealth Diagnostic API (Elite V8)",
77
+ "version": "1.2.0",
78
  "message": "Send audio files via POST to /predict"
79
  })
80
 
 
93
  file.save(filepath)
94
 
95
  try:
 
96
  load_resources()
 
 
97
  y, sr = librosa.load(filepath, sr=16000, duration=5.0)
98
 
99
+ # VAD Lite
 
100
  rms_energy = np.mean(librosa.feature.rms(y=y))
 
 
101
  if rms_energy < 0.005:
102
  os.remove(filepath)
103
+ return jsonify({"error": "No cough detected. Please record in a quieter area."}), 400
 
 
104
 
 
105
  y_clean = advanced_preprocess(y, sr)
 
 
106
  emb = extractor.extract(y_clean)
107
 
108
  if emb is not None:
 
109
  X = emb[np.newaxis, ...]
110
+ prob = classifier_model.predict(X, verbose=0)[0][0]
111
+
112
+ # Logic: sigmoid (0=healthy, 1=sick)
113
+ raw_label = "sick" if prob > 0.5 else "healthy"
114
 
115
+ # Confidence is distance from decision boundary
116
+ if raw_label == "sick":
117
+ confidence = float(prob)
118
+ else:
119
+ confidence = float(1.0 - prob)
120
+
121
+ # --- Elite Threshold Check ---
122
+ # Only report 'sick' if we are VERY sure (> 0.8)
123
+ THRESHOLD = 0.80
124
  if raw_label == "sick" and confidence < THRESHOLD:
125
  final_label = "healthy"
126
  is_inconclusive = True
 
128
  final_label = raw_label
129
  is_inconclusive = False
130
 
 
131
  os.remove(filepath)
 
132
  return jsonify({
133
  "status": "success",
134
  "result": final_label,
 
139
  })
140
  else:
141
  os.remove(filepath)
142
+ return jsonify({"error": "Feature extraction failed"}), 500
143
 
144
  except Exception as e:
145
+ if os.path.exists(filepath): os.remove(filepath)
 
 
146
  return jsonify({"error": str(e)}), 500
147
 
148
  def get_recommendation(label, is_inconclusive):
download_new_dataset.py CHANGED
@@ -8,7 +8,7 @@ import shutil
8
  # --- Configuration ---
9
  DATASET_SLUG = "andrewmvd/covid19-cough-audio-classification"
10
  DOWNLOAD_DIR = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public"
11
- EXTRACT_DIR = os.path.join(DOWNLOAD_DIR, "extracted")
12
  ORGANIZED_DIR = os.path.join(DOWNLOAD_DIR, "organized")
13
 
14
  def run_command(cmd):
@@ -18,36 +18,46 @@ def run_command(cmd):
18
  print(f"Error executing command: {e}")
19
  return False
20
  return True
21
-
22
  def main():
23
  if not os.path.exists(DOWNLOAD_DIR):
24
  os.makedirs(DOWNLOAD_DIR)
25
 
26
- # 1. Download dataset using Kaggle CLI
27
- print(f"Step 1: Downloading {DATASET_SLUG}...")
28
- if not run_command(f"kaggle datasets download -d {DATASET_SLUG} -p {DOWNLOAD_DIR}"):
29
- print("Failed to download dataset. Check your Kaggle API key at ~/.kaggle/kaggle.json")
30
- return
31
-
32
- # 2. Extract dataset
33
- zip_filename = [f for f in os.listdir(DOWNLOAD_DIR) if f.endswith(".zip")][0]
34
- zip_path = os.path.join(DOWNLOAD_DIR, zip_filename)
35
-
36
- print(f"Step 2: Extracting {zip_filename} to {EXTRACT_DIR}...")
37
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
38
- zip_ref.extractall(EXTRACT_DIR)
39
-
40
- # 3. Find and load metadata
41
- metadata_file = None
42
- for root, dirs, files in os.walk(EXTRACT_DIR):
43
- for f in files:
44
- if "metadata" in f.lower() and f.endswith(".csv"):
45
- metadata_file = os.path.join(root, f)
46
- break
47
- if metadata_file: break
 
 
48
 
 
 
 
 
 
 
 
 
 
49
  if not metadata_file:
50
- print("Metadata CSV not found. Attempting to locate in subfolders...")
51
  return
52
 
53
  print(f"Step 3: Organizing files based on {metadata_file}...")
@@ -89,9 +99,7 @@ def main():
89
  else: count_s += 1
90
 
91
  print(f"Done! Organized {count_h} healthy and {count_s} sick files.")
92
- print(f"Source Folder: {ORGANIZED_DIR}")
93
-
94
- print(f"Finished organizing {processed_count} files into {ORGANIZED_DIR}")
95
 
96
  if __name__ == "__main__":
97
  main()
 
8
  # --- Configuration ---
9
  DATASET_SLUG = "andrewmvd/covid19-cough-audio-classification"
10
  DOWNLOAD_DIR = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public"
11
+ EXTRACT_DIR = DOWNLOAD_DIR # Use the base folder since user has manually unzipped there
12
  ORGANIZED_DIR = os.path.join(DOWNLOAD_DIR, "organized")
13
 
14
  def run_command(cmd):
 
18
  print(f"Error executing command: {e}")
19
  return False
20
  return True
 
21
  def main():
22
  if not os.path.exists(DOWNLOAD_DIR):
23
  os.makedirs(DOWNLOAD_DIR)
24
 
25
+ # Check if we already have the metadata CSV to skip steps 1 and 2
26
+ metadata_filename = "metadata_compiled.csv"
27
+ potential_metadata = os.path.join(DOWNLOAD_DIR, metadata_filename)
28
+
29
+ if os.path.exists(potential_metadata):
30
+ print(f"Step 1 & 2: Skipping! {metadata_filename} already found in {DOWNLOAD_DIR}")
31
+ metadata_file = potential_metadata
32
+ else:
33
+ # 1. Download dataset using Kaggle CLI
34
+ print(f"Step 1: Downloading {DATASET_SLUG}...")
35
+ if not run_command(f"kaggle datasets download -d {DATASET_SLUG} -p {DOWNLOAD_DIR}"):
36
+ print("Failed to download dataset. Check your Kaggle API key at ~/.kaggle/kaggle.json")
37
+ return
38
+
39
+ # 2. Extract dataset
40
+ zips = [f for f in os.listdir(DOWNLOAD_DIR) if f.endswith(".zip")]
41
+ if not zips:
42
+ print(f"No zip found in {DOWNLOAD_DIR}. Was it already unzipped?")
43
+ else:
44
+ zip_filename = zips[0]
45
+ zip_path = os.path.join(DOWNLOAD_DIR, zip_filename)
46
+ print(f"Step 2: Extracting {zip_filename} to {DOWNLOAD_DIR}...")
47
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
48
+ zip_ref.extractall(DOWNLOAD_DIR)
49
 
50
+ # 3. Find and load metadata again (in case it unzipped somewhere else)
51
+ metadata_file = None
52
+ for root, dirs, files in os.walk(DOWNLOAD_DIR):
53
+ for f in files:
54
+ if "metadata" in f.lower() and f.endswith(".csv"):
55
+ metadata_file = os.path.join(root, f)
56
+ break
57
+ if metadata_file: break
58
+
59
  if not metadata_file:
60
+ print("Metadata CSV not found. Please ensure the dataset is downloaded correctly.")
61
  return
62
 
63
  print(f"Step 3: Organizing files based on {metadata_file}...")
 
99
  else: count_s += 1
100
 
101
  print(f"Done! Organized {count_h} healthy and {count_s} sick files.")
102
+ print(f"Organized datasets location: {ORGANIZED_DIR}")
 
 
103
 
104
  if __name__ == "__main__":
105
  main()
kasahealth-frontend/index.html CHANGED
@@ -18,7 +18,7 @@
18
 
19
  <!-- Styles -->
20
  <link rel="stylesheet" href="static/css/style.css">
21
- <link rel="icon" type="image/png" href="static/images/logo.png">
22
  </head>
23
  <body>
24
 
@@ -26,9 +26,8 @@
26
  <nav class="glass" id="main-nav">
27
  <div class="container nav-container">
28
  <a href="#" class="logo-wrap" onclick="location.reload()">
29
- <img src="static/images/logo.png" alt="KasaHealth Logo">
30
  </a>
31
- <span class="credibility-tag"><i class="fas fa-shield-alt"></i> Clinical AI Edge</span>
32
  </div>
33
  </nav>
34
 
@@ -39,9 +38,6 @@
39
  <!-- ============================================== -->
40
  <section id="screen-landing" class="screen active-screen">
41
  <div class="hero-content">
42
- <div class="badge">
43
- <i class="fas fa-microchip"></i> Powered by Google HeAR API
44
- </div>
45
  <h1>Next-Gen Acoustic<br><span>Lung Screening</span></h1>
46
  <p>Smartphone-based respiratory analysis to detect early signs of pulmonary anomalies with clinical-grade precision.</p>
47
 
@@ -75,7 +71,7 @@
75
  <i class="fas fa-mobile-alt anim-float text-navy"></i>
76
  </div>
77
  <h4>1. Hold Phone Closer</h4>
78
- <p>Hold your smartphone about 15-20cm away from your mouth.</p>
79
  </div>
80
 
81
  <!-- Step 2 -->
 
18
 
19
  <!-- Styles -->
20
  <link rel="stylesheet" href="static/css/style.css">
21
+ <link rel="icon" type="image/jpeg" href="static/images/logo.jpeg">
22
  </head>
23
  <body>
24
 
 
26
  <nav class="glass" id="main-nav">
27
  <div class="container nav-container">
28
  <a href="#" class="logo-wrap" onclick="location.reload()">
29
+ <img src="static/images/logo.jpeg" alt="KasaHealth Logo">
30
  </a>
 
31
  </div>
32
  </nav>
33
 
 
38
  <!-- ============================================== -->
39
  <section id="screen-landing" class="screen active-screen">
40
  <div class="hero-content">
 
 
 
41
  <h1>Next-Gen Acoustic<br><span>Lung Screening</span></h1>
42
  <p>Smartphone-based respiratory analysis to detect early signs of pulmonary anomalies with clinical-grade precision.</p>
43
 
 
71
  <i class="fas fa-mobile-alt anim-float text-navy"></i>
72
  </div>
73
  <h4>1. Hold Phone Closer</h4>
74
+ <p>Keep it 4-5 inches away from your mouth for best acoustic quality.</p>
75
  </div>
76
 
77
  <!-- Step 2 -->
kasahealth-frontend/static/css/style.css CHANGED
@@ -1,14 +1,15 @@
1
  :root {
2
- --primary-navy: #1E4D7B;
3
- --primary-teal: #2BBF9E;
4
- --accent-teal: #e6f7f4;
 
5
  --text-main: #2c3e50;
6
  --text-light: #576574;
7
  --bg-surface: #ffffff;
8
- --bg-page: #f8fafc;
9
  --border-soft: #e2e8f0;
10
 
11
- --color-success: #27ae60;
12
  --color-danger: #e74c3c;
13
  --color-warning: #f59e0b;
14
 
@@ -63,9 +64,14 @@ nav {
63
  border-bottom: 1px solid rgba(0,0,0,0.05);
64
  }
65
 
66
- .nav-container { display: flex; justify-content: space-between; align-items: center; padding: 1rem 1.5rem; }
67
- .logo-wrap img { height: 35px; }
68
- .credibility-tag { font-size: 0.75rem; font-weight: 700; color: var(--primary-teal); border-radius: 20px; background: var(--accent-teal); padding: 0.3rem 0.8rem; }
 
 
 
 
 
69
 
70
  /* Main Container (SPA layout) */
71
  .app-container {
 
1
  :root {
2
+ /* Exact colors sampled from KasaHealth Logo */
3
+ --primary-navy: #184e85; /* From "Kasa" */
4
+ --primary-teal: #26b797; /* From "Health" */
5
+ --accent-teal: #e2f6f2; /* Soft background companion */
6
  --text-main: #2c3e50;
7
  --text-light: #576574;
8
  --bg-surface: #ffffff;
9
+ --bg-page: #f9fbfb; /* Very subtle clinical blue-tint */
10
  --border-soft: #e2e8f0;
11
 
12
+ --color-success: #26b797; /* Re-mapped success to brand teal */
13
  --color-danger: #e74c3c;
14
  --color-warning: #f59e0b;
15
 
 
64
  border-bottom: 1px solid rgba(0,0,0,0.05);
65
  }
66
 
67
+ .nav-container { display: flex; justify-content: space-between; align-items: center; padding: 1rem 1.5rem; max-width: 1200px; margin: 0 auto; width: 100%; }
68
+ .logo-wrap { display: flex; align-items: center; height: 100%; }
69
+ .logo-wrap img { height: 55px; object-fit: contain; } /* Mobile size */
70
+
71
+ @media (min-width: 768px) {
72
+ .logo-wrap img { height: 85px; } /* Crisp, large size for laptops and desktops */
73
+ .nav-container { padding: 1.5rem 2rem; }
74
+ }
75
 
76
  /* Main Container (SPA layout) */
77
  .app-container {
kasahealth-frontend/static/images/logo.jpeg ADDED
kasahealth-frontend/vercel.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 2,
3
+ "name": "kasahealth-elite",
4
+ "builds": [
5
+ { "src": "index.html", "use": "@vercel/static" },
6
+ { "src": "static/**", "use": "@vercel/static" }
7
+ ],
8
+ "routes": [
9
+ { "src": "/(.*)", "dest": "/index.html" }
10
+ ]
11
+ }
models/compare_and_test.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+ import time
7
+
8
+ # Paths
9
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
10
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
11
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
12
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
13
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
14
+
15
+ # Clean y
16
+ def clean_y(y):
17
+ if y.dtype.kind in ['U', 'S']:
18
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
19
+ return y.astype(np.float32)
20
+
21
+ # Load data (Same data we used, but we need an "unseen" set)
22
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
23
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
24
+ X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
25
+ y_all = np.concatenate([y1, y2], axis=0)
26
+
27
+ # Shift validation set for testing (Using a seed NOT used in training)
28
+ # Actually, let's just use the validation split from the 11k to be sure it's consistent.
29
+ # For truly unseen, let's use a 15% split like before but with 40 random samples.
30
+
31
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=999) # New seed 999
32
+
33
+ def run_test_on_model(model_name, model_path):
34
+ if not os.path.exists(model_path):
35
+ print(f"Model {model_name} not found yet. Skipping...")
36
+ return
37
+
38
+ model = tf.keras.models.load_model(model_path)
39
+
40
+ # 20 Healthy
41
+ h_idx = np.where(y_target == 0)[0]
42
+ s_idx = np.where(y_target == 1)[0]
43
+
44
+ rng = np.random.default_rng(2026)
45
+ sel_h = rng.choice(h_idx, 20, replace=False)
46
+ sel_s = rng.choice(s_idx, 20, replace=False)
47
+
48
+ X_h_test = X_target[sel_h]
49
+ X_s_test = X_target[sel_s]
50
+
51
+ # Predictions
52
+ preds_h = (model.predict(X_h_test) > 0.5).astype(int).flatten()
53
+ preds_s = (model.predict(X_s_test) > 0.5).astype(int).flatten()
54
+
55
+ acc_h = (np.sum(preds_h == 0) / 20) * 100
56
+ acc_s = (np.sum(preds_s == 1) / 20) * 100
57
+
58
+ print(f"\n--- Model: {model_name} ---")
59
+ print(f"Healthy Accuracy (20/20 Target): {acc_h:.2f}% ({np.sum(preds_h == 0)}/20)")
60
+ print(f"Sick Accuracy (20/20 Target): {acc_s:.2f}% ({np.sum(preds_s == 1)}/20)")
61
+ print(f"Total Model Accuracy (40 samples): {(acc_h + acc_s)/2:.2f}%")
62
+
63
+ # Models to compare
64
+ models_map = {
65
+ "V3 (Standard)": r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v3.h5",
66
+ "V5 PRO (Balanced)": r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
67
+ }
68
+
69
+ for name, path in models_map.items():
70
+ run_test_on_model(name, path)
models/final_test_50_healthy.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+
14
+ # Clean y
15
+ def clean_y(y):
16
+ if y.dtype.kind in ['U', 'S']:
17
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
18
+ return y.astype(np.float32)
19
+
20
+ # Load data
21
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
22
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
23
+ X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
24
+ y_all = np.concatenate([y1, y2], axis=0)
25
+
26
+ # Unseen set
27
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=777) # Seed 777
28
+
29
+ # Load Model
30
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
31
+ model = tf.keras.models.load_model(model_path)
32
+
33
+ # Filter for HEALTHY samples
34
+ h_idx = np.where(y_target == 0)[0]
35
+ if len(h_idx) < 50:
36
+ print(f"Warning: Only found {len(h_idx)} healthy samples.")
37
+ count = len(h_idx)
38
+ else:
39
+ count = 50
40
+
41
+ rng = np.random.default_rng(2026)
42
+ sel_h = rng.choice(h_idx, count, replace=False)
43
+ X_h_test = X_target[sel_h]
44
+ y_h_test = y_target[sel_h]
45
+
46
+ # Predictions
47
+ preds = model.predict(X_h_test)
48
+ y_pred_bin = (preds > 0.5).astype(int).flatten()
49
+
50
+ # Results (Correct = 0)
51
+ correct = np.sum(y_pred_bin == 0)
52
+ accuracy = (correct / count) * 100
53
+
54
+ print(f"\n--- Final Model: {os.path.basename(model_path)} ---")
55
+ print(f"Total Unseen Healthy Samples Tested: {count}")
56
+ print(f"Correct Identifications: {correct}")
57
+ print(f"Accuracy: {accuracy:.2f}%")
58
+
59
+ print("\n--- Summary of Correctness (Sick Confidence) ---")
60
+ for i in range(count):
61
+ conf = preds[i][0]
62
+ # For healthy, we want low sick confidence
63
+ result = "✅ HEALTHY" if y_pred_bin[i] == 0 else "❌ SICK (False Alarm)"
64
+ print(f"Sample {i+1:2d} | Sick Prob {conf*100:5.2f}% | Result: {result}")
models/final_test_50_sick.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+
14
+ # Clean y
15
+ def clean_y(y):
16
+ if y.dtype.kind in ['U', 'S']:
17
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
18
+ return y.astype(np.float32)
19
+
20
+ # Load data (Includes the new 1000 samples)
21
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
22
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
23
+ X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
24
+ y_all = np.concatenate([y1, y2], axis=0)
25
+
26
+ # Shift validation set for testing (Using a seed NOT used in training)
27
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=777) # Seed 777
28
+
29
+ # Load Model
30
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
31
+ model = tf.keras.models.load_model(model_path)
32
+
33
+ # Filter for SICK samples
34
+ s_idx = np.where(y_target == 1)[0]
35
+ if len(s_idx) < 50:
36
+ print(f"Warning: Only found {len(s_idx)} sick samples in target set.")
37
+ count = len(s_idx)
38
+ else:
39
+ count = 50
40
+
41
+ rng = np.random.default_rng(2026)
42
+ sel_s = rng.choice(s_idx, count, replace=False)
43
+ X_s_test = X_target[sel_s]
44
+ y_s_test = y_target[sel_s]
45
+
46
+ # Predictions
47
+ preds = model.predict(X_s_test)
48
+ y_pred_bin = (preds > 0.5).astype(int).flatten()
49
+
50
+ # Results
51
+ correct = np.sum(y_pred_bin == 1)
52
+ accuracy = (correct / count) * 100
53
+
54
+ print(f"\n--- Final Model: {os.path.basename(model_path)} ---")
55
+ print(f"Total Unseen Sick Samples Tested: {count}")
56
+ print(f"Correct Identifications: {correct}")
57
+ print(f"Accuracy: {accuracy:.2f}%")
58
+
59
+ print("\n--- Summary of Correctness ---")
60
+ for i in range(count):
61
+ conf = preds[i][0]
62
+ result = "✅ SICK" if y_pred_bin[i] == 1 else "❌ HEALTHY (Miss)"
63
+ print(f"Sample {i+1:2d} | Confidence {conf*100:5.2f}% | Result: {result}")
models/healthy_test_result.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Correct Healthy: 19/20, Accuracy: 95.00%
models/hear_classifier_v8_elite.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e65637808333de73d6d389c835639dbe198b50c45ef6c8202604cd4c332d5e3b
3
+ size 14286224
models/test_results.txt ADDED
Binary file (1.88 kB). View file
 
models/v8_final_test.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+ path_elite_x = os.path.join(base_dir, "hear_embeddings_elite", "X_elite.npy")
14
+ path_elite_y = os.path.join(base_dir, "hear_embeddings_elite", "y_elite.npy")
15
+
16
+ # Clean y
17
+ def clean_y(y):
18
+ if y.dtype.kind in ['U', 'S']:
19
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
20
+ return y.astype(np.float32)
21
+
22
+ # Load data (use the FULL set to find "truly unseen" samples)
23
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
24
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
25
+ Xe, ye = np.load(path_elite_x), clean_y(np.load(path_elite_y))
26
+ X_all = np.concatenate([X1, X2, Xe], axis=0).astype(np.float32)
27
+ y_all = np.concatenate([y1, y2, ye], axis=0)
28
+
29
+ # Unseen set (Seed 1000 for the final test)
30
+ X_v, X_target, y_v, y_target = train_test_split(X_all, y_all, test_size=0.15, random_state=1000)
31
+
32
+ # Load V8 Elite Model
33
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v8_elite.h5"
34
+ model = tf.keras.models.load_model(model_path)
35
+
36
+ # Filter for SICK and HEALTHY
37
+ s_idx = np.where(y_target == 1)[0]
38
+ h_idx = np.where(y_target == 0)[0]
39
+
40
+ rng = np.random.default_rng(2027) # Change seed again to ensure randomness
41
+ sel_s = rng.choice(s_idx, 50, replace=False)
42
+ sel_h = rng.choice(h_idx, 50, replace=False)
43
+
44
+ # Predictions - Sick
45
+ preds_s = model.predict(X_target[sel_s])
46
+ y_pred_s = (preds_s > 0.5).astype(int).flatten()
47
+ acc_s = (np.sum(y_pred_s == 1) / 50) * 100
48
+
49
+ # Predictions - Healthy
50
+ preds_h = model.predict(X_target[sel_h])
51
+ y_pred_h = (preds_h > 0.5).astype(int).flatten()
52
+ acc_h = (np.sum(y_pred_h == 0) / 50) * 100
53
+
54
+ print(f"\n--- Model V8: THE ELITE GUARD ---")
55
+ print(f"Dataset: Elite-Merged & Purged (10,342 Balanced Samples)")
56
+ print(f"\nSICK TEST (50 Samples): {acc_s:.2f}% ({np.sum(y_pred_s == 1)}/50)")
57
+ print(f"HEALTHY TEST (50 Samples): {acc_h:.2f}% ({np.sum(y_pred_h == 0)}/50)")
58
+ print(f"OVERALL ACCURACY: {(acc_s + acc_h)/2:.2f}%")
59
+
60
+ # Individual report for Sick misses
61
+ miss_s = 50 - np.sum(y_pred_s == 1)
62
+ print(f"\nNote: Misidentified {miss_s} sick samples.")
63
+ if miss_s > 0:
64
+ for i in range(miss_s):
65
+ miss_idx = np.where(y_pred_s == 0)[0][i]
66
+ print(f"Missed Sick {i+1:2d} | Confidence: {preds_s[miss_idx][0]*100:5.2f}%")
predict_user_file.py CHANGED
@@ -51,6 +51,18 @@ def predict_single_file(file_path):
51
  print("Loading and preprocessing audio...")
52
  y, sr = librosa.load(file_path, sr=16000, duration=5.0)
53
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # Apply Advanced Preprocessing (Critical for correct result!)
55
  y_clean = advanced_preprocess(y, sr)
56
 
 
51
  print("Loading and preprocessing audio...")
52
  y, sr = librosa.load(file_path, sr=16000, duration=5.0)
53
 
54
+ # --- NEW: Audio Validation (Gatekeeper) ---
55
+ from utils.audio_validator import validate_audio_is_cough
56
+ is_valid_cough, reason, val_conf = validate_audio_is_cough(y, sr)
57
+
58
+ if not is_valid_cough:
59
+ print("\n" + "="*50)
60
+ print(f"REJECTED: Audio Validation Failed!")
61
+ print(f"REASON: {reason}")
62
+ print(f"RECOMMENDATION: Please record a clear, loud cough in a quiet room.")
63
+ print("="*50)
64
+ return
65
+
66
  # Apply Advanced Preprocessing (Critical for correct result!)
67
  y_clean = advanced_preprocess(y, sr)
68
 
utils/analyze_misses.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.utils import shuffle
5
+ import os
6
+
7
+ # Paths
8
+ base_dir = r"c:\Users\ASUS\lung_ai_project\data"
9
+ path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
10
+ path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
11
+ path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
12
+ path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
13
+
14
+ # Load and clean
15
+ def clean_y(y):
16
+ if y.dtype.kind in ['U', 'S']:
17
+ return np.where(y == 'sick', 1, 0).astype(np.float32)
18
+ return y.astype(np.float32)
19
+
20
+ X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
21
+ X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
22
+
23
+ # Merge
24
+ X = np.concatenate([X1, X2], axis=0).astype(np.float32)
25
+ y = np.concatenate([y1, y2], axis=0)
26
+ # No shuffle yet to keep indices traceable if needed, but we'll shuffle after identifying
27
+ # Actually, we just need the samples.
28
+
29
+ # Load Model
30
+ model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v3.h5"
31
+ model = tf.keras.models.load_model(model_path)
32
+
33
+ # Predict all
34
+ preds = model.predict(X, batch_size=128)
35
+ y_pred_bin = (preds > 0.5).astype(int).flatten()
36
+
37
+ # Find misclassified SICK samples
38
+ sick_mask = (y == 1)
39
+ misclassified_sick = (sick_mask & (y_pred_bin == 0))
40
+ correctly_classified_sick = (sick_mask & (y_pred_bin == 1))
41
+
42
+ total_sick = np.sum(sick_mask)
43
+ total_mis_sick = np.sum(misclassified_sick)
44
+
45
+ # Find misclassified HEALTHY samples
46
+ healthy_mask = (y == 0)
47
+ misclassified_healthy = (healthy_mask & (y_pred_bin == 1))
48
+ total_healthy = np.sum(healthy_mask)
49
+ total_mis_healthy = np.sum(misclassified_healthy)
50
+
51
+ print(f"Total Sick: {total_sick}, Misclassified: {total_mis_sick} ({total_mis_sick/total_sick*100:.2f}%)")
52
+ print(f"Total Healthy: {total_healthy}, Misclassified: {total_mis_healthy} ({total_mis_healthy/total_healthy*100:.2f}%)")
53
+
54
+ # Let's see how many samples we have in total
55
+ print(f"Total Samples: {len(X)}")
utils/audio_validator.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+
4
+ def validate_audio_is_cough(y, sr):
5
+ """
6
+ Validates if the audio signal represents a typical cough pattern.
7
+ Returns: (is_valid: bool, reason: str, confidence: float)
8
+ """
9
+ try:
10
+ # Extract features
11
+ rms = librosa.feature.rms(y=y)[0]
12
+ zcr = librosa.feature.zero_crossing_rate(y=y)[0]
13
+ centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
14
+
15
+ # 1. Check for total silence or extremely low volume
16
+ mean_rms = np.mean(rms)
17
+ if mean_rms < 0.001:
18
+ return False, "Audio is too quiet or empty.", 0.0
19
+
20
+ # 2. Check for continuous talking/laughing (high harmonicity/steady energy)
21
+ # Coughs have high energy variance (bursts). Talking/laughing is more continuous.
22
+ rms_variance = np.var(rms)
23
+ rms_mean_ratio = rms_variance / (mean_rms + 1e-6)
24
+
25
+ # 3. Check Spectral Centroid (Pitch/Brightness)
26
+ # Laughing and talking often have a lower, more stable spectral centroid than the harsh burst of a cough.
27
+ mean_centroid = np.mean(centroid)
28
+
29
+ # 4. Check Zero Crossing Rate (Noisiness)
30
+ # Coughs are noisy (high ZCR bursts). Vowels in laughing/talking are harmonic (steady, lower ZCR).
31
+ mean_zcr = np.mean(zcr)
32
+
33
+ # Simple heuristic thresholding (can be tuned based on user files)
34
+ # A typical cough has high variance (spikes) and high noisiness.
35
+ is_continuous_noise = rms_mean_ratio < 0.015 and mean_zcr < 0.05
36
+
37
+ if is_continuous_noise:
38
+ # Looks like steady speech, laughing, or background humming
39
+ return False, "Audio detected as speaking, laughing, or steady noise. Please record a clear cough.", 0.95
40
+
41
+ # Passed basic validation
42
+ return True, "Valid audio signal detected.", 0.85
43
+
44
+ except Exception as e:
45
+ return False, f"Validation error: {str(e)}", 0.0
utils/audit_mislabels.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import tensorflow as tf
import os

# Paths to the two embedding datasets and the Coughvid path-mapping file.
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")
mapping_path = os.path.join(base_dir, "hear_embeddings_coughvid", "processed_paths.txt")

def clean_y(y):
    """Normalize labels to float32, mapping the string label 'sick' to 1."""
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)

# Load data (dataset 1 = original merged set, dataset 2 = Coughvid).
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
X_all = np.concatenate([X1, X2], axis=0).astype(np.float32)
y_all = np.concatenate([y1, y2], axis=0)

# Coughvid samples start right after dataset 1. Computing the boundary
# (instead of hard-coding 6824) keeps the file mapping correct if the
# merged dataset grows or shrinks.
coughvid_offset = len(X1)

# Load Model used to score every sample.
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
model = tf.keras.models.load_model(model_path)

# Predict all samples in one batched pass.
print(f"Auditing {len(X_all)} samples for mislabels...")
preds = model.predict(X_all, batch_size=128).flatten()

# Flag SICK samples (label=1) where the model is VERY sure they are
# healthy (pred < 0.1) — likely mislabels.
sick_labels = (y_all == 1)
confused_sick_indices = np.where(sick_labels & (preds < 0.1))[0]

# Coughvid index -> original file path mapping (written by the extractor).
coughvid_paths = []
if os.path.exists(mapping_path):
    with open(mapping_path, 'r') as f:
        coughvid_paths = [line.strip() for line in f]

print(f"\nFound {len(confused_sick_indices)} Sick samples the model is SURE are Healthy.")

# Build a human-readable report for the first flagged samples.
audit_report = "--- DATA AUDIT: SICK SAMPLES FLAGGED AS HEALTHY ---\n"
audit_report += f"Total Samples Audited: {len(X_all)}\n"
audit_report += f"Total Flags: {len(confused_sick_indices)}\n\n"

results_to_show = confused_sick_indices[:15]
for idx in results_to_show:
    confidence_as_healthy = (1 - preds[idx]) * 100
    if idx >= coughvid_offset:  # From Coughvid — can resolve the file name.
        sub_idx = idx - coughvid_offset
        if sub_idx < len(coughvid_paths):
            path = coughvid_paths[sub_idx]
            audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: {os.path.basename(path)}\n"
        else:
            audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: Unknown (Coughvid index {sub_idx})\n"
    else:
        audit_report += f"Index {idx:5d} | Model Confidence: {confidence_as_healthy:.1f}% (Healthy) | File: Unknown (Original Dataset)\n"

with open("audit_mislabels.txt", "w") as f:
    f.write(audit_report)
print(audit_report)
utils/audit_mislabels.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --- DATA AUDIT: SICK SAMPLES FLAGGED AS HEALTHY ---
2
+ Total Samples Audited: 11092
3
+ Total Flags: 213
4
+
5
+ Index 4 | Model Confidence: 90.3% (Healthy) | File: Unknown (Original Dataset)
6
+ Index 122 | Model Confidence: 91.2% (Healthy) | File: Unknown (Original Dataset)
7
+ Index 165 | Model Confidence: 93.8% (Healthy) | File: Unknown (Original Dataset)
8
+ Index 211 | Model Confidence: 93.1% (Healthy) | File: Unknown (Original Dataset)
9
+ Index 255 | Model Confidence: 90.7% (Healthy) | File: Unknown (Original Dataset)
10
+ Index 270 | Model Confidence: 98.4% (Healthy) | File: Unknown (Original Dataset)
11
+ Index 332 | Model Confidence: 98.1% (Healthy) | File: Unknown (Original Dataset)
12
+ Index 472 | Model Confidence: 90.9% (Healthy) | File: Unknown (Original Dataset)
13
+ Index 475 | Model Confidence: 94.0% (Healthy) | File: Unknown (Original Dataset)
14
+ Index 477 | Model Confidence: 93.2% (Healthy) | File: Unknown (Original Dataset)
15
+ Index 486 | Model Confidence: 92.0% (Healthy) | File: Unknown (Original Dataset)
16
+ Index 501 | Model Confidence: 90.8% (Healthy) | File: Unknown (Original Dataset)
17
+ Index 594 | Model Confidence: 92.1% (Healthy) | File: Unknown (Original Dataset)
18
+ Index 630 | Model Confidence: 90.1% (Healthy) | File: Unknown (Original Dataset)
19
+ Index 631 | Model Confidence: 92.3% (Healthy) | File: Unknown (Original Dataset)
utils/extract_elite_samples.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ import sys
6
+
7
+ # Add project root to path
8
+ PROJECT_ROOT = r"c:\Users\ASUS\lung_ai_project"
9
+ if PROJECT_ROOT not in sys.path:
10
+ sys.path.append(PROJECT_ROOT)
11
+
12
+ from utils.hear_extractor import HeARExtractor
13
+
14
+ # --- Config ---
15
+ META_PATH = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\metadata_compiled.csv"
16
+ AUDIO_ROOT = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized"
17
+ OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hear_embeddings_elite"
18
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
19
+
20
def _locate_audio(folder, uuid):
    """Return the on-disk audio path for a Coughvid uuid, or None if absent.

    Files in the organized folders are normally prefixed with 'cv_', so that
    form is tried first for each known extension, then the bare uuid.
    """
    for prefix in ("cv_", ""):
        for ext in ('.webm', '.wav', '.ogg'):
            path = os.path.join(folder, prefix + uuid + ext)
            if os.path.exists(path):
                return path
    return None

def main():
    """Select high-confidence ('elite') Coughvid samples and extract HeAR embeddings."""
    print("Identifying Elite Samples...")
    df = pd.read_csv(META_PATH)

    # SICK (Elite Match): confident cough detection and acceptable SNR.
    elite_sick_uuids = df[(df['status'].isin(['COVID-19', 'sick'])) & (df['cough_detected'] > 0.8) & (df['SNR'] > 5)]['uuid'].tolist()

    # HEALTHY (Elite Match): very confident cough detection.
    elite_healthy_uuids = df[(df['status'] == 'healthy') & (df['cough_detected'] > 0.95)]['uuid'].tolist()

    print(f"Total Eligible Elite Sick: {len(elite_sick_uuids)}")
    print(f"Total Eligible Elite Healthy: {len(elite_healthy_uuids)}")

    # Cap each class at 1,000 samples to bound extraction time.
    sick_to_process = elite_sick_uuids[:1000]
    healthy_to_process = elite_healthy_uuids[:1000]

    # Map UUIDs to actual audio paths; uuids with no file on disk are
    # skipped silently. Sick tasks are appended first, then healthy,
    # matching the original ordering.
    all_tasks = []
    for uuids, label in ((sick_to_process, 'sick'), (healthy_to_process, 'healthy')):
        folder = os.path.join(AUDIO_ROOT, label)
        for uuid in uuids:
            path = _locate_audio(folder, uuid)
            if path is not None:
                all_tasks.append((path, label))

    print(f"Starting Elite Extraction (Total: {len(all_tasks)} samples)...")

    extractor = HeARExtractor()
    features = []
    labels = []

    for path, label in tqdm(all_tasks):
        try:
            emb = extractor.extract(path)
            if emb is not None:
                features.append(emb)
                labels.append(label)
        except Exception:
            # Corrupt/undecodable files are skipped; extraction continues.
            continue

    np.save(os.path.join(OUTPUT_DIR, "X_elite.npy"), np.array(features))
    np.save(os.path.join(OUTPUT_DIR, "y_elite.npy"), np.array(labels))
    print(f"Elite Data Saved: {len(features)} samples.")

if __name__ == "__main__":
    main()
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import librosa
4
+ from tqdm import tqdm
5
+ import sys
6
+
7
+ # Add project root to path
8
+ PROJECT_ROOT = r"c:\Users\ASUS\lung_ai_project"
9
+ if PROJECT_ROOT not in sys.path:
10
+ sys.path.append(PROJECT_ROOT)
11
+
12
+ from utils.hear_extractor import HeARExtractor
13
+
14
+ # --- Config ---
15
+ AUDIO_SEARCH_PATHS = [
16
+ r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized\sick",
17
+ r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized\healthy",
18
+ r"c:\Users\ASUS\lung_ai_project\data\cough",
19
+ r"c:\Users\ASUS\lung_ai_project\data\coswara\coswara_data\kaggle_data"
20
+ ]
21
+ OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hybrid_features"
22
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
23
+
24
def extract_traditional_features(y, sr):
    """Build a 16-dim classic-DSP feature vector for an audio signal:
    13 mean MFCC coefficients followed by the mean spectral centroid,
    zero-crossing rate and RMS energy."""
    mfcc_means = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    scalar_means = [
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),  # brightness
        np.mean(librosa.feature.zero_crossing_rate(y)),          # noisiness
        np.mean(librosa.feature.rms(y=y)),                       # energy
    ]
    return np.concatenate([mfcc_means, scalar_means])
42
+
43
def main():
    """Extract combined HeAR + traditional features for a balanced subset of files."""
    print("Finding audio files...")
    all_files = []
    for path in AUDIO_SEARCH_PATHS:
        if not os.path.exists(path):
            continue
        if "kaggle_data" in path:
            # Coswara needs its metadata CSV to resolve labels; until that is
            # wired in, this source is intentionally skipped. (The previous
            # version scanned the folder but discarded every result.)
            continue
        # Coughvid/cough folders encode the label in the directory name.
        label = 'sick' if 'sick' in path else 'healthy'
        for f in os.listdir(path):
            if f.endswith(('.wav', '.webm', '.ogg')):
                all_files.append((os.path.join(path, f), label))

    print(f"Total files found (Coughvid + Cough): {len(all_files)}")

    # Shuffle, then cap at 500 per class for a balanced, fast run.
    import random
    random.shuffle(all_files)

    sick_list = [f for f in all_files if f[1] == 'sick'][:500]
    healthy_list = [f for f in all_files if f[1] == 'healthy'][:500]
    balanced_files = sick_list + healthy_list

    print(f"Processing {len(balanced_files)} balanced samples for Hybrid model...")

    extractor = HeARExtractor()

    hybrid_features = []
    labels = []

    for audio_path, label in tqdm(balanced_files):
        try:
            # 1. HeAR embedding (deep feature).
            emb = extractor.extract(audio_path)
            if emb is None:
                continue

            # 2. Traditional DSP features at the same 16 kHz rate.
            y, sr = librosa.load(audio_path, sr=16000)
            trad = extract_traditional_features(y, sr)

            # Concatenate deep + traditional into one hybrid vector.
            hybrid_features.append(np.concatenate([emb, trad]))
            labels.append(label)

        except Exception:
            # Undecodable/corrupt audio is skipped; extraction continues.
            continue

    X_hybrid = np.array(hybrid_features)
    y_hybrid = np.array(labels)

    np.save(os.path.join(OUTPUT_DIR, "X_hybrid.npy"), X_hybrid)
    np.save(os.path.join(OUTPUT_DIR, "y_hybrid.npy"), y_hybrid)
    print(f"Saved {len(X_hybrid)} hybrid samples to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()
utils/hear_extractor.py CHANGED
@@ -1,5 +1,20 @@
1
  import os
2
  import sys
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import numpy as np
4
  import librosa
5
  import tensorflow as tf
@@ -26,8 +41,8 @@ class HeARExtractor:
26
  from huggingface_hub import login
27
  login(token=token)
28
 
29
- # Use /tmp for the model cache on the server to avoid permission errors
30
- model_cache_path = os.path.join("/tmp", "hear_model_cache")
31
 
32
  # Download model files manually to avoid symlink issues on Windows
33
  # and ignore unrelated folders (like event_detector) to speed up download
@@ -94,7 +109,8 @@ class HeARExtractor:
94
  return np.mean(embeddings, axis=0)
95
 
96
  except Exception as e:
97
- print(f"Extraction error: {e}")
 
98
  return None
99
 
100
  if __name__ == "__main__":
 
1
  import os
2
  import sys
3
+ import glob
4
+
5
# --- FFmpeg Path Fix for Windows ---
# The audio loaders used below need the ffmpeg executable on PATH to decode
# compressed formats (.webm/.ogg); on this machine ffmpeg was installed via
# WinGet and is not on PATH by default.
# 1. Try common WinGet location found on this machine
FFMPEG_DIR = r"C:\Users\ASUS\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-8.1-full_build\bin"
if not os.path.exists(FFMPEG_DIR):
    # 2. Try dynamic search in WinGet folder as fallback — scan the packages
    #    directory for any ffmpeg "bin" folder and take the first match.
    winget_base = os.path.join(os.environ.get("LOCALAPPDATA", ""), "Microsoft", "WinGet", "Packages")
    ffmpeg_bins = glob.glob(os.path.join(winget_base, "*ffmpeg*", "**", "bin"), recursive=True)
    if ffmpeg_bins:
        FFMPEG_DIR = ffmpeg_bins[0]

# Append once; the substring check avoids growing PATH on repeated imports.
if os.path.exists(FFMPEG_DIR) and FFMPEG_DIR not in os.environ["PATH"]:
    os.environ["PATH"] += os.pathsep + FFMPEG_DIR
18
  import numpy as np
19
  import librosa
20
  import tensorflow as tf
 
41
  from huggingface_hub import login
42
  login(token=token)
43
 
44
+ # Use a local folder in the project for the model cache
45
+ model_cache_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "hear_model_cache")
46
 
47
  # Download model files manually to avoid symlink issues on Windows
48
  # and ignore unrelated folders (like event_detector) to speed up download
 
109
  return np.mean(embeddings, axis=0)
110
 
111
  except Exception as e:
112
+ error_msg = str(e) if str(e) else "Unknown error (check if FFmpeg is working or file is corrupted)"
113
+ print(f"Extraction error ({os.path.basename(audio_input) if isinstance(audio_input, str) else 'array'}): {error_msg}")
114
  return None
115
 
116
  if __name__ == "__main__":
utils/inspect_labels.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import os

# Diagnostic script: show label dtypes/uniques of both datasets before and
# after normalizing to binary int labels, then verify the merged result.
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")

def to_binary_labels(y, name):
    """Return labels as int32, mapping string labels ('sick' -> 1, else 0).

    Prints the converted dtype/uniques only when a string conversion actually
    happened, matching the script's original diagnostic output. Deduplicates
    the conversion logic that was previously repeated per dataset.
    """
    if y.dtype.kind in ['U', 'S']:
        converted = np.where(y == 'sick', 1, 0).astype(np.int32)
        print(f"{name} converted dtype: {converted.dtype}, unique: {np.unique(converted)}")
        return converted
    return y.astype(np.int32)

y1 = np.load(path1_y)
y2 = np.load(path2_y)

print(f"y1 dtype: {y1.dtype}, unique: {np.unique(y1)}")
print(f"y2 dtype: {y2.dtype}, unique: {np.unique(y2)}")

y1_converted = to_binary_labels(y1, "y1")
y2_converted = to_binary_labels(y2, "y2")

y_merged = np.concatenate([y1_converted, y2_converted])
print(f"y_merged dtype: {y_merged.dtype}, unique: {np.unique(y_merged)}")
utils/process_coughvid.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ import sys
5
+ import ctypes # To keep Windows awake
6
+
7
+ # Add project root to path to allow absolute imports
8
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9
+ if PROJECT_ROOT not in sys.path:
10
+ sys.path.append(PROJECT_ROOT)
11
+
12
+ from utils.hear_extractor import HeARExtractor
13
+
14
+ # --- Configuration ---
15
+ DATA_ROOT = r"c:\Users\ASUS\lung_ai_project\data\coughvid_public\organized"
16
+ OUTPUT_DIR = r"c:\Users\ASUS\lung_ai_project\data\hear_embeddings_coughvid"
17
+ CHECKPOINT_EVERY = 50
18
+ TARGET_SICK_COUNT = 2500 # Extracting 1,000 more sick samples
19
+
20
def run_extraction():
    """Extract HeAR embeddings for 'sick' Coughvid files, resumable via checkpoints.

    Appends to the existing X/y .npy arrays until TARGET_SICK_COUNT sick
    samples exist. A text tracker file records every processed audio path so
    re-runs never duplicate work, and periodic checkpoints bound data loss
    on crash or Ctrl+C.
    """
    # Keep Windows awake during extraction (long-running job).
    try:
        # ES_CONTINUOUS (0x80000000) | ES_SYSTEM_REQUIRED (0x00000001)
        ctypes.windll.kernel32.SetThreadExecutionState(0x80000000 | 0x00000001)
        print(">>> Windows 'Stay Awake' mode enabled.")
    except Exception:
        # Non-Windows or restricted environment: proceed without it.
        print(">>> Warning: Could not enable 'Stay Awake' mode.")

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    features_path = os.path.join(OUTPUT_DIR, "X_coughvid.npy")
    labels_path = os.path.join(OUTPUT_DIR, "y_coughvid.npy")

    # Load existing arrays if available so the run resumes rather than
    # restarting from scratch.
    features = []
    labels = []
    if os.path.exists(features_path) and os.path.exists(labels_path):
        print("Loading existing embeddings...")
        features = list(np.load(features_path))
        labels = list(np.load(labels_path))

    current_sick_count = sum(1 for l in labels if l == 'sick')
    print(f"Current Sick Samples: {current_sick_count}")

    if current_sick_count >= TARGET_SICK_COUNT:
        print(f"Goal reached! You already have {current_sick_count} sick samples.")
        return

    # Tracker file: one processed audio path per line, used to skip duplicates.
    tracker_path = os.path.join(OUTPUT_DIR, "processed_paths.txt")
    processed_paths = set()
    if os.path.exists(tracker_path):
        with open(tracker_path, 'r') as f:
            processed_paths = set(line.strip() for line in f)

    # Collect only SICK files that have not been processed yet.
    folder = os.path.join(DATA_ROOT, 'sick')
    all_sick_files = []
    if os.path.exists(folder):
        for f in os.listdir(folder):
            full_path = os.path.join(folder, f)
            if f.endswith(('.webm', '.ogg', '.wav')) and full_path not in processed_paths:
                all_sick_files.append(full_path)

    # Only process as many files as needed to reach the target.
    remaining_to_goal = TARGET_SICK_COUNT - current_sick_count
    files_to_process = all_sick_files[:remaining_to_goal]

    print(f"Extraction Target: {len(files_to_process)} more sick samples.")

    if not files_to_process:
        print("No more unique sick files found to process.")
        return

    # Initialize Extractor (loads/downloads the HeAR model).
    print("Initializing HeAR Extractor...")
    extractor = HeARExtractor()

    try:
        count = 0
        with open(tracker_path, 'a') as tracker:
            for path in tqdm(files_to_process, desc="Extracting Sick"):
                emb = extractor.extract(path)
                if emb is not None:
                    features.append(emb)
                    labels.append('sick')
                    tracker.write(path + "\n")
                    count += 1

                    # Periodic checkpoint: a crash loses at most
                    # CHECKPOINT_EVERY samples of work.
                    if count % CHECKPOINT_EVERY == 0 and count > 0:
                        np.save(features_path, np.array(features))
                        np.save(labels_path, np.array(labels))

        # Final save after a complete run.
        np.save(features_path, np.array(features))
        np.save(labels_path, np.array(labels))
        print(f"Success! Now you have {sum(1 for l in labels if l == 'sick')} sick samples in total.")

    except KeyboardInterrupt:
        # Ctrl+C: persist whatever was extracted so the next run resumes.
        print("\nStopping and saving progress...")
        np.save(features_path, np.array(features))
        np.save(labels_path, np.array(labels))
        print("Progress saved.")
    finally:
        # Reset Windows sleep settings to normal (ES_CONTINUOUS alone).
        try:
            ctypes.windll.kernel32.SetThreadExecutionState(0x80000000)
        except Exception:
            pass
110
+
111
# Bug fix: this guard (and the call) was duplicated, so running the script
# directly would execute the entire extraction twice back-to-back.
if __name__ == "__main__":
    run_extraction()
utils/purge_and_retrain.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import tensorflow as tf
import os
# NOTE(review): StratifiedKFold and shuffle are imported but never used below.
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils import shuffle

# Paths to the two embedding datasets (original merged set + Coughvid).
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")

def clean_y(y):
    # Datasets store labels either as strings ('sick'/...) or numbers;
    # normalize both to float32 with sick=1, everything else=0.
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)

# Load and merge both datasets.
X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))
X = np.concatenate([X1, X2], axis=0).astype(np.float32)
y = np.concatenate([y1, y2], axis=0)

# Load current best model (v5); it is used only to SCORE samples so that
# label/prediction contradictions can be purged before retraining.
model_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v5_pro.h5"
model = tf.keras.models.load_model(model_path)

# Predict every sample in one batched pass.
print("Finding unreliable samples for the GREAT PURGE...")
preds = model.predict(X, batch_size=128).flatten()

# PURGE CRITERIA:
# 1. Label says SICK (1), but model is >85% sure it is HEALTHY (pred < 0.15)
# 2. Label says HEALTHY (0), but model is >85% sure it is SICK (pred > 0.85)
purge_mask = ((y == 1) & (preds < 0.15)) | ((y == 0) & (preds > 0.85))

# KEEP ONLY RELIABLE SAMPLES (everything not flagged as contradictory).
X_clean = X[~purge_mask]
y_clean = y[~purge_mask]

print(f"Purged {np.sum(purge_mask)} 'Contradictory' samples.")
print(f"Clean Dataset size: {len(X_clean)}")

# STRICT BALANCING (match minority class size): random undersampling of the
# majority class so both classes contribute equally to training.
h_idx = np.where(y_clean == 0)[0]
s_idx = np.where(y_clean == 1)[0]
min_size = min(len(h_idx), len(s_idx))
print(f"Balancing Clean Set to {min_size} samples per class.")

np.random.seed(42)  # reproducible undersampling and shuffle
bal_h = np.random.choice(h_idx, min_size, replace=False)
bal_s = np.random.choice(s_idx, min_size, replace=False)
idx = np.concatenate([bal_h, bal_s])
np.random.shuffle(idx)

X_final = X_clean[idx]
y_final = y_clean[idx]

# Final training with higher capacity: 1024 -> 512 -> 256 MLP head with
# batch-norm + dropout regularization, sigmoid output for binary labels.
final_model = tf.keras.Sequential([
    tf.keras.layers.Dense(1024, activation='relu', input_shape=(X_final.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),

    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),

    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

final_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])
# Early stopping restores the weights from the best validation epoch.
cb = [tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)]

# Train on an 85/15 split of the purged, balanced data.
X_t, X_v, y_t, y_v = train_test_split(X_final, y_final, test_size=0.15, random_state=42)

print("Starting FINAL TRAINING on Purged & Balanced dataset...")
history = final_model.fit(X_t, y_t, validation_data=(X_v, y_v), epochs=150, batch_size=32, callbacks=cb)

# Save the retrained model as v6.
save_path = r"c:\Users\ASUS\lung_ai_project\models\hear_classifier_v6_final.h5"
final_model.save(save_path)
print(f"Purged Model saved to {save_path}")

best_acc = max(history.history['val_accuracy'])
print(f"Best Purged-Set Accuracy: {best_acc*100:.2f}%")
utils/test_overlap.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import os
from scipy.spatial.distance import cdist

# Diagnostic script: estimate how much the sick and healthy embedding
# clusters overlap, using cosine distances on random 500-sample subsets.

# Paths
base_dir = r"c:\Users\ASUS\lung_ai_project\data"
path1_x = os.path.join(base_dir, "hear_embeddings_optimized", "X_hear_opt_merged.npy")
path1_y = os.path.join(base_dir, "hear_embeddings_optimized", "y_hear_opt_merged.npy")
path2_x = os.path.join(base_dir, "hear_embeddings_coughvid", "X_coughvid.npy")
path2_y = os.path.join(base_dir, "hear_embeddings_coughvid", "y_coughvid.npy")

def clean_y(y):
    """Normalize labels to float32, mapping the string label 'sick' to 1."""
    if y.dtype.kind in ['U', 'S']:
        return np.where(y == 'sick', 1, 0).astype(np.float32)
    return y.astype(np.float32)

def mean_offdiag(dist_matrix):
    """Mean of a square self-distance matrix EXCLUDING the diagonal.

    The diagonal of cdist(A, A) is all zeros (distance of each point to
    itself), so including it biases the within-class average downward;
    divide the total by n*(n-1) rather than n*n.
    """
    n = dist_matrix.shape[0]
    return dist_matrix.sum() / (n * (n - 1))

X1, y1 = np.load(path1_x), clean_y(np.load(path1_y))
X2, y2 = np.load(path2_x), clean_y(np.load(path2_y))

X = np.concatenate([X1, X2], axis=0).astype(np.float32)
y = np.concatenate([y1, y2], axis=0)

# Fixed seed so the reported overlap numbers are reproducible between runs.
np.random.seed(0)

sick_indices = np.where(y == 1)[0]
healthy_indices = np.where(y == 0)[0]

# Small subsets: a full ~11k x 11k cdist would be too slow.
subs_s = np.random.choice(sick_indices, 500, replace=False)
subs_h = np.random.choice(healthy_indices, 500, replace=False)

X_s = X[subs_s]
X_h = X[subs_h]

# Cross-class cosine distances (500 sick vs 500 healthy).
dist_matrix = cdist(X_s, X_h, 'cosine')

# Count sick samples lying extremely close to healthy ones.
very_close = np.where(dist_matrix < 0.05)
print(f"Overlap Analysis (Cosine Distance < 0.05): {len(very_close[0])} pairs found.")

avg_dist_sick_to_healthy = np.mean(dist_matrix)
print(f"Average Distance (Sick to Healthy): {avg_dist_sick_to_healthy:.4f}")

# Within-class averages, excluding the zero self-distance diagonal.
dist_within_sick = cdist(X_s, X_s, 'cosine')
avg_dist_within_sick = mean_offdiag(dist_within_sick)
print(f"Average Distance (Within Sick): {avg_dist_within_sick:.4f}")

dist_within_healthy = cdist(X_h, X_h, 'cosine')
avg_dist_within_healthy = mean_offdiag(dist_within_healthy)
print(f"Average Distance (Within Healthy): {avg_dist_within_healthy:.4f}")
+ print(f"Average Distance (Within Healthy): {avg_dist_within_healthy:.4f}")