Spaces:

thomasjvu
/

lisper-zerogpu

Running on Zero

App Files Files Community

thomasjvu committed on 14 days ago

Commit

203078f

verified ·

1 Parent(s): 424ad4c

Deploy Lisper ZeroGPU Space

Browse files

Files changed (4) hide show

README.md +2 -1
acoustic_extratrees_v18.joblib +3 -0
app.py +66 -1
requirements.txt +2 -0

README.md CHANGED Viewed

@@ -31,7 +31,7 @@ Input handling:
 - The app rejects silent, empty, too-short, or very low-energy recordings before calling Gemma. This prevents confident but falsified coaching on empty microphone captures.
 - After stopping a microphone recording, wait for the clip status to change to `Clip ready` before pressing Analyze. Gradio may need a moment to finalize and upload the browser recording.
-- Live analysis uses a lightweight acoustic hint model from `acoustic_model.json` before Gemma generation. Gemma still receives the raw audio, but the displayed `Detected class` line is anchored to the acoustic hint instead of letting Gemma freely guess the class.
 Set these Space variables/secrets:
@@ -41,6 +41,7 @@ Set these Space variables/secrets:
 - `LISPER_ZERO_GPU_AUDIO_DTYPE`: optional override for Gemma audio features. Adapter deployments default to `bfloat16`.
 - `LISPER_ZERO_GPU_LOAD_IN_4BIT`: defaults to `1` when `LISPER_ZERO_GPU_ADAPTER_ID` is set, otherwise `0`.
 - `LISPER_ZERO_GPU_ACOUSTIC_HINT`: defaults to `1`. Set to `0` only when intentionally testing direct Gemma audio classification.
 - `LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`: defaults to `0` for adapter deployments and `1` for merged-model deployments.
 - `LISPER_ZERO_GPU_MAX_SEQ_LENGTH`: defaults to `2048`.
 - `LISPER_ZERO_GPU_SIZE`: `large` or `xlarge`. Defaults to `large`.

 - The app rejects silent, empty, too-short, or very low-energy recordings before calling Gemma. This prevents confident but falsified coaching on empty microphone captures.
 - After stopping a microphone recording, wait for the clip status to change to `Clip ready` before pressing Analyze. Gradio may need a moment to finalize and upload the browser recording.
+- Live analysis prefers the v18 ExtraTrees acoustic hint artifact in `acoustic_extratrees_v18.joblib` before Gemma generation. Gemma still receives the raw audio, but the displayed `Detected class` line is anchored to the acoustic hint instead of letting Gemma freely guess the class. If that artifact is missing, the app falls back to the smaller `acoustic_model.json` KNN demo hint.
 Set these Space variables/secrets:
 - `LISPER_ZERO_GPU_AUDIO_DTYPE`: optional override for Gemma audio features. Adapter deployments default to `bfloat16`.
 - `LISPER_ZERO_GPU_LOAD_IN_4BIT`: defaults to `1` when `LISPER_ZERO_GPU_ADAPTER_ID` is set, otherwise `0`.
 - `LISPER_ZERO_GPU_ACOUSTIC_HINT`: defaults to `1`. Set to `0` only when intentionally testing direct Gemma audio classification.
+- `LISPER_ZERO_GPU_ACOUSTIC_MODEL`: `auto`, `extratrees`, or `knn`. Defaults to `auto`, which uses v18 ExtraTrees when `acoustic_extratrees_v18.joblib` is present.
 - `LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`: defaults to `0` for adapter deployments and `1` for merged-model deployments.
 - `LISPER_ZERO_GPU_MAX_SEQ_LENGTH`: defaults to `2048`.
 - `LISPER_ZERO_GPU_SIZE`: `large` or `xlarge`. Defaults to `large`.

acoustic_extratrees_v18.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80664b1d8159aad9a3f7217e104627cad0b8c1d89b1ebaca8732d19e498c2fc2
+size 1066798

app.py CHANGED Viewed

@@ -29,6 +29,7 @@ DEFAULT_MODEL_ID = "thomasjvu/lisper-gemma4-e2b-audio-full"
 DEFAULT_ADAPTER_ID = ""
 SPACE_ROOT = Path(__file__).resolve().parent
 ACOUSTIC_MODEL_PATH = SPACE_ROOT / "acoustic_model.json"
 ACOUSTIC_K = 5
 ACOUSTIC_MIN_CONFIDENCE = 0.42
 MIN_AUDIO_SECONDS = 0.45
@@ -116,6 +117,13 @@ def acoustic_hint_enabled() -> bool:
     return os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_HINT", "1").strip() != "0"
 def audio_alignment_enabled() -> bool:
     default = "0" if adapter_id() else "1"
     return os.environ.get("LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS", default).strip() != "0"
@@ -399,6 +407,7 @@ def extract_acoustic_features(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
 ACOUSTIC_MODEL: dict[str, Any] | None = None
 def load_acoustic_model() -> dict[str, Any] | None:
@@ -412,7 +421,52 @@ def load_acoustic_model() -> dict[str, Any] | None:
     return ACOUSTIC_MODEL
-def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
     model = load_acoustic_model()
     if model is None:
         return None
@@ -450,6 +504,17 @@ def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
     }
 def enforce_acoustic_response(response: str, acoustic_result: dict[str, Any] | None) -> tuple[str, dict[str, Any]]:
     parsed = parse_response(response)
     if not acoustic_result:

 DEFAULT_ADAPTER_ID = ""
 SPACE_ROOT = Path(__file__).resolve().parent
 ACOUSTIC_MODEL_PATH = SPACE_ROOT / "acoustic_model.json"
+ACOUSTIC_EXTRATREES_MODEL_PATH = SPACE_ROOT / "acoustic_extratrees_v18.joblib"
 ACOUSTIC_K = 5
 ACOUSTIC_MIN_CONFIDENCE = 0.42
 MIN_AUDIO_SECONDS = 0.45
     return os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_HINT", "1").strip() != "0"
+def acoustic_model_preference() -> str:
+    requested = os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_MODEL", "auto").strip().lower()
+    if requested in {"extratrees", "knn"}:
+        return requested
+    return "auto"
 def audio_alignment_enabled() -> bool:
     default = "0" if adapter_id() else "1"
     return os.environ.get("LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS", default).strip() != "0"
 ACOUSTIC_MODEL: dict[str, Any] | None = None
+ACOUSTIC_EXTRATREES_MODEL: dict[str, Any] | None = None
 def load_acoustic_model() -> dict[str, Any] | None:
     return ACOUSTIC_MODEL
+def load_acoustic_extratrees_model() -> dict[str, Any] | None:
+    global ACOUSTIC_EXTRATREES_MODEL
+    if not acoustic_hint_enabled():
+        return None
+    if not ACOUSTIC_EXTRATREES_MODEL_PATH.exists():
+        return None
+    if ACOUSTIC_EXTRATREES_MODEL is None:
+        import joblib
+        ACOUSTIC_EXTRATREES_MODEL = joblib.load(ACOUSTIC_EXTRATREES_MODEL_PATH)
+    return ACOUSTIC_EXTRATREES_MODEL
+def classify_acoustic_extratrees(waveform: np.ndarray) -> dict[str, Any] | None:
+    model = load_acoustic_extratrees_model()
+    if model is None:
+        return None
+    features = extract_acoustic_features(waveform, sr=int(model.get("sample_rate", 16000))).reshape(1, -1)
+    classifier = model["classifier"]
+    prediction = str(classifier.predict(features)[0])
+    confidence = 1.0
+    class_scores: dict[str, float] = {}
+    if hasattr(classifier, "predict_proba"):
+        probabilities = classifier.predict_proba(features)[0]
+        classes = [str(label) for label in classifier.classes_]
+        class_scores = {
+            label: round(float(probability), 6)
+            for label, probability in sorted(zip(classes, probabilities), key=lambda item: item[1], reverse=True)
+        }
+        confidence = float(class_scores.get(prediction, 0.0))
+    return {
+        "detected_class": prediction,
+        "raw_class": prediction,
+        "confidence": confidence,
+        "class_scores": class_scores,
+        "model_name": model.get("name", "lisper_v18_extratrees_acoustic_hint"),
+        "train_rows": model.get("train_rows"),
+        "feature_count": model.get("feature_count"),
+        "holdout_accuracy": model.get("holdout_accuracy"),
+        "low_confidence_defaulted_to_clear": False,
+    }
+def classify_acoustic_knn(waveform: np.ndarray) -> dict[str, Any] | None:
     model = load_acoustic_model()
     if model is None:
         return None
     }
+def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
+    preference = acoustic_model_preference()
+    if preference in {"auto", "extratrees"}:
+        extratrees_result = classify_acoustic_extratrees(waveform)
+        if extratrees_result is not None:
+            return extratrees_result
+        if preference == "extratrees":
+            return None
+    return classify_acoustic_knn(waveform)
 def enforce_acoustic_response(response: str, acoustic_result: dict[str, Any] | None) -> tuple[str, dict[str, Any]]:
     parsed = parse_response(response)
     if not acoustic_result:

requirements.txt CHANGED Viewed

@@ -1,9 +1,11 @@
 accelerate>=1.8.0
 bitsandbytes>=0.46.0
 gradio==5.29.1
 librosa>=0.10.2
 numpy>=2.0.0
 peft>=0.15.0
 spaces>=0.32.0
 soundfile>=0.13.1
 torch==2.8.0

 accelerate>=1.8.0
 bitsandbytes>=0.46.0
 gradio==5.29.1
+joblib>=1.4.2
 librosa>=0.10.2
 numpy>=2.0.0
 peft>=0.15.0
+scikit-learn>=1.5.0
 spaces>=0.32.0
 soundfile>=0.13.1
 torch==2.8.0