thomasjvu committed on
Commit
203078f
·
verified ·
1 Parent(s): 424ad4c

Deploy Lisper ZeroGPU Space

Browse files
Files changed (4) hide show
  1. README.md +2 -1
  2. acoustic_extratrees_v18.joblib +3 -0
  3. app.py +66 -1
  4. requirements.txt +2 -0
README.md CHANGED
@@ -31,7 +31,7 @@ Input handling:
31
 
32
  - The app rejects silent, empty, too-short, or very low-energy recordings before calling Gemma. This prevents confident but falsified coaching on empty microphone captures.
33
  - After stopping a microphone recording, wait for the clip status to change to `Clip ready` before pressing Analyze. Gradio may need a moment to finalize and upload the browser recording.
34
- - Live analysis uses a lightweight acoustic hint model from `acoustic_model.json` before Gemma generation. Gemma still receives the raw audio, but the displayed `Detected class` line is anchored to the acoustic hint instead of letting Gemma freely guess the class.
35
 
36
  Set these Space variables/secrets:
37
 
@@ -41,6 +41,7 @@ Set these Space variables/secrets:
41
  - `LISPER_ZERO_GPU_AUDIO_DTYPE`: optional override for Gemma audio features. Adapter deployments default to `bfloat16`.
42
  - `LISPER_ZERO_GPU_LOAD_IN_4BIT`: defaults to `1` when `LISPER_ZERO_GPU_ADAPTER_ID` is set, otherwise `0`.
43
  - `LISPER_ZERO_GPU_ACOUSTIC_HINT`: defaults to `1`. Set to `0` only when intentionally testing direct Gemma audio classification.
 
44
  - `LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`: defaults to `0` for adapter deployments and `1` for merged-model deployments.
45
  - `LISPER_ZERO_GPU_MAX_SEQ_LENGTH`: defaults to `2048`.
46
  - `LISPER_ZERO_GPU_SIZE`: `large` or `xlarge`. Defaults to `large`.
 
31
 
32
  - The app rejects silent, empty, too-short, or very low-energy recordings before calling Gemma. This prevents confident but falsified coaching on empty microphone captures.
33
  - After stopping a microphone recording, wait for the clip status to change to `Clip ready` before pressing Analyze. Gradio may need a moment to finalize and upload the browser recording.
34
+ - Before Gemma generation, live analysis runs an acoustic hint model, preferring the v18 ExtraTrees artifact in `acoustic_extratrees_v18.joblib`. Gemma still receives the raw audio, but the displayed `Detected class` line is anchored to the acoustic hint instead of letting Gemma freely guess the class. In the default `auto` mode, if that artifact is missing, the app falls back to the smaller `acoustic_model.json` KNN demo hint.
35
 
36
  Set these Space variables/secrets:
37
 
 
41
  - `LISPER_ZERO_GPU_AUDIO_DTYPE`: optional override for Gemma audio features. Adapter deployments default to `bfloat16`.
42
  - `LISPER_ZERO_GPU_LOAD_IN_4BIT`: defaults to `1` when `LISPER_ZERO_GPU_ADAPTER_ID` is set, otherwise `0`.
43
  - `LISPER_ZERO_GPU_ACOUSTIC_HINT`: defaults to `1`. Set to `0` only when intentionally testing direct Gemma audio classification.
44
+ - `LISPER_ZERO_GPU_ACOUSTIC_MODEL`: `auto`, `extratrees`, or `knn`. Defaults to `auto`, which uses v18 ExtraTrees when `acoustic_extratrees_v18.joblib` is present and otherwise falls back to the KNN hint. Unrecognized values are treated as `auto`.
45
  - `LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`: defaults to `0` for adapter deployments and `1` for merged-model deployments.
46
  - `LISPER_ZERO_GPU_MAX_SEQ_LENGTH`: defaults to `2048`.
47
  - `LISPER_ZERO_GPU_SIZE`: `large` or `xlarge`. Defaults to `large`.
acoustic_extratrees_v18.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80664b1d8159aad9a3f7217e104627cad0b8c1d89b1ebaca8732d19e498c2fc2
3
+ size 1066798
app.py CHANGED
@@ -29,6 +29,7 @@ DEFAULT_MODEL_ID = "thomasjvu/lisper-gemma4-e2b-audio-full"
29
  DEFAULT_ADAPTER_ID = ""
30
  SPACE_ROOT = Path(__file__).resolve().parent
31
  ACOUSTIC_MODEL_PATH = SPACE_ROOT / "acoustic_model.json"
 
32
  ACOUSTIC_K = 5
33
  ACOUSTIC_MIN_CONFIDENCE = 0.42
34
  MIN_AUDIO_SECONDS = 0.45
@@ -116,6 +117,13 @@ def acoustic_hint_enabled() -> bool:
116
  return os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_HINT", "1").strip() != "0"
117
 
118
 
 
 
 
 
 
 
 
119
  def audio_alignment_enabled() -> bool:
120
  default = "0" if adapter_id() else "1"
121
  return os.environ.get("LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS", default).strip() != "0"
@@ -399,6 +407,7 @@ def extract_acoustic_features(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
399
 
400
 
401
  ACOUSTIC_MODEL: dict[str, Any] | None = None
 
402
 
403
 
404
  def load_acoustic_model() -> dict[str, Any] | None:
@@ -412,7 +421,52 @@ def load_acoustic_model() -> dict[str, Any] | None:
412
  return ACOUSTIC_MODEL
413
 
414
 
415
- def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  model = load_acoustic_model()
417
  if model is None:
418
  return None
@@ -450,6 +504,17 @@ def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
450
  }
451
 
452
 
 
 
 
 
 
 
 
 
 
 
 
453
  def enforce_acoustic_response(response: str, acoustic_result: dict[str, Any] | None) -> tuple[str, dict[str, Any]]:
454
  parsed = parse_response(response)
455
  if not acoustic_result:
 
29
  DEFAULT_ADAPTER_ID = ""
30
  SPACE_ROOT = Path(__file__).resolve().parent
31
  ACOUSTIC_MODEL_PATH = SPACE_ROOT / "acoustic_model.json"
32
+ ACOUSTIC_EXTRATREES_MODEL_PATH = SPACE_ROOT / "acoustic_extratrees_v18.joblib"
33
  ACOUSTIC_K = 5
34
  ACOUSTIC_MIN_CONFIDENCE = 0.42
35
  MIN_AUDIO_SECONDS = 0.45
 
117
  return os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_HINT", "1").strip() != "0"
118
 
119
 
120
def acoustic_model_preference() -> str:
    """Return which acoustic hint backend the environment asks for.

    Reads ``LISPER_ZERO_GPU_ACOUSTIC_MODEL``, trims surrounding whitespace and
    lowercases it.

    Returns:
        ``"extratrees"`` or ``"knn"`` when explicitly requested; ``"auto"``
        when the variable is unset or holds any other value.
    """
    raw_value = os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_MODEL", "auto")
    choice = raw_value.strip().lower()
    # Anything that is not an explicit backend name degrades to "auto".
    return choice if choice in ("extratrees", "knn") else "auto"
125
+
126
+
127
  def audio_alignment_enabled() -> bool:
128
  default = "0" if adapter_id() else "1"
129
  return os.environ.get("LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS", default).strip() != "0"
 
407
 
408
 
409
  ACOUSTIC_MODEL: dict[str, Any] | None = None
410
+ ACOUSTIC_EXTRATREES_MODEL: dict[str, Any] | None = None
411
 
412
 
413
  def load_acoustic_model() -> dict[str, Any] | None:
 
421
  return ACOUSTIC_MODEL
422
 
423
 
424
def load_acoustic_extratrees_model() -> dict[str, Any] | None:
    """Return the ExtraTrees acoustic hint bundle, loading it on first use.

    Returns:
        ``None`` when the acoustic hint is disabled or the artifact file at
        ``ACOUSTIC_EXTRATREES_MODEL_PATH`` does not exist; otherwise the object
        read by ``joblib.load``, cached in ``ACOUSTIC_EXTRATREES_MODEL`` so the
        file is only deserialized once per process.
    """
    global ACOUSTIC_EXTRATREES_MODEL
    # Both guards run on every call, even when a bundle is already cached.
    if not acoustic_hint_enabled() or not ACOUSTIC_EXTRATREES_MODEL_PATH.exists():
        return None
    if ACOUSTIC_EXTRATREES_MODEL is not None:
        return ACOUSTIC_EXTRATREES_MODEL

    # Imported lazily so joblib is only needed when the artifact is used.
    # NOTE: joblib.load unpickles the file; only ship trusted artifacts.
    import joblib

    ACOUSTIC_EXTRATREES_MODEL = joblib.load(ACOUSTIC_EXTRATREES_MODEL_PATH)
    return ACOUSTIC_EXTRATREES_MODEL
435
+
436
+
437
def classify_acoustic_extratrees(waveform: np.ndarray) -> dict[str, Any] | None:
    """Classify a waveform with the ExtraTrees acoustic hint bundle.

    Args:
        waveform: Audio samples passed to ``extract_acoustic_features``.

    Returns:
        ``None`` when no ExtraTrees bundle is available. Otherwise a dict with
        the predicted label (as both ``detected_class`` and ``raw_class``),
        its confidence, per-class scores ordered from most to least probable,
        and metadata copied from the bundle. ``low_confidence_defaulted_to_clear``
        is always ``False`` for this backend.
    """
    bundle = load_acoustic_extratrees_model()
    if bundle is None:
        return None

    sample_rate = int(bundle.get("sample_rate", 16000))
    # The classifier is fed a single-row 2-D feature matrix.
    feature_row = extract_acoustic_features(waveform, sr=sample_rate).reshape(1, -1)
    estimator = bundle["classifier"]
    predicted_label = str(estimator.predict(feature_row)[0])

    # Without predict_proba there are no scores and confidence stays at 1.0.
    scores: dict[str, float] = {}
    score = 1.0
    if hasattr(estimator, "predict_proba"):
        probability_row = estimator.predict_proba(feature_row)[0]
        labels = [str(label) for label in estimator.classes_]
        ranked = sorted(zip(labels, probability_row), key=lambda pair: pair[1], reverse=True)
        for name, probability in ranked:
            scores[name] = round(float(probability), 6)
        score = float(scores.get(predicted_label, 0.0))

    result: dict[str, Any] = {
        "detected_class": predicted_label,
        "raw_class": predicted_label,
        "confidence": score,
        "class_scores": scores,
        "model_name": bundle.get("name", "lisper_v18_extratrees_acoustic_hint"),
        "train_rows": bundle.get("train_rows"),
        "feature_count": bundle.get("feature_count"),
        "holdout_accuracy": bundle.get("holdout_accuracy"),
        "low_confidence_defaulted_to_clear": False,
    }
    return result
467
+
468
+
469
+ def classify_acoustic_knn(waveform: np.ndarray) -> dict[str, Any] | None:
470
  model = load_acoustic_model()
471
  if model is None:
472
  return None
 
504
  }
505
 
506
 
507
def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
    """Produce an acoustic hint using the backend chosen by the environment.

    Args:
        waveform: Audio samples forwarded to the selected classifier.

    Returns:
        The ExtraTrees result when the preference is ``"auto"`` or
        ``"extratrees"`` and that classifier yields one. With ``"extratrees"``
        and no result, ``None`` is returned without trying KNN. In every other
        case the KNN classifier's result (which may itself be ``None``).
    """
    mode = acoustic_model_preference()
    wants_extratrees = mode in ("auto", "extratrees")
    hint = classify_acoustic_extratrees(waveform) if wants_extratrees else None
    if hint is not None:
        return hint
    # An explicit "extratrees" request never falls back to the KNN hint.
    if mode == "extratrees":
        return None
    return classify_acoustic_knn(waveform)
516
+
517
+
518
  def enforce_acoustic_response(response: str, acoustic_result: dict[str, Any] | None) -> tuple[str, dict[str, Any]]:
519
  parsed = parse_response(response)
520
  if not acoustic_result:
requirements.txt CHANGED
@@ -1,9 +1,11 @@
1
  accelerate>=1.8.0
2
  bitsandbytes>=0.46.0
3
  gradio==5.29.1
 
4
  librosa>=0.10.2
5
  numpy>=2.0.0
6
  peft>=0.15.0
 
7
  spaces>=0.32.0
8
  soundfile>=0.13.1
9
  torch==2.8.0
 
1
  accelerate>=1.8.0
2
  bitsandbytes>=0.46.0
3
  gradio==5.29.1
4
+ joblib>=1.4.2
5
  librosa>=0.10.2
6
  numpy>=2.0.0
7
  peft>=0.15.0
8
+ scikit-learn>=1.5.0
9
  spaces>=0.32.0
10
  soundfile>=0.13.1
11
  torch==2.8.0