Spaces:
Running on Zero
Running on Zero
Deploy Lisper ZeroGPU Space
Browse files
- README.md +2 -1
- acoustic_extratrees_v18.joblib +3 -0
- app.py +66 -1
- requirements.txt +2 -0
README.md
CHANGED
|
@@ -31,7 +31,7 @@ Input handling:
|
|
| 31 |
|
| 32 |
- The app rejects silent, empty, too-short, or very low-energy recordings before calling Gemma. This prevents confident but falsified coaching on empty microphone captures.
|
| 33 |
- After stopping a microphone recording, wait for the clip status to change to `Clip ready` before pressing Analyze. Gradio may need a moment to finalize and upload the browser recording.
|
| 34 |
-
- Live analysis
|
| 35 |
|
| 36 |
Set these Space variables/secrets:
|
| 37 |
|
|
@@ -41,6 +41,7 @@ Set these Space variables/secrets:
|
|
| 41 |
- `LISPER_ZERO_GPU_AUDIO_DTYPE`: optional override for Gemma audio features. Adapter deployments default to `bfloat16`.
|
| 42 |
- `LISPER_ZERO_GPU_LOAD_IN_4BIT`: defaults to `1` when `LISPER_ZERO_GPU_ADAPTER_ID` is set, otherwise `0`.
|
| 43 |
- `LISPER_ZERO_GPU_ACOUSTIC_HINT`: defaults to `1`. Set to `0` only when intentionally testing direct Gemma audio classification.
|
|
|
|
| 44 |
- `LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`: defaults to `0` for adapter deployments and `1` for merged-model deployments.
|
| 45 |
- `LISPER_ZERO_GPU_MAX_SEQ_LENGTH`: defaults to `2048`.
|
| 46 |
- `LISPER_ZERO_GPU_SIZE`: `large` or `xlarge`. Defaults to `large`.
|
|
|
|
| 31 |
|
| 32 |
- The app rejects silent, empty, too-short, or very low-energy recordings before calling Gemma. This prevents confident but falsified coaching on empty microphone captures.
|
| 33 |
- After stopping a microphone recording, wait for the clip status to change to `Clip ready` before pressing Analyze. Gradio may need a moment to finalize and upload the browser recording.
|
| 34 |
+
- Live analysis prefers the v18 ExtraTrees acoustic hint artifact in `acoustic_extratrees_v18.joblib` before Gemma generation. Gemma still receives the raw audio, but the displayed `Detected class` line is anchored to the acoustic hint instead of letting Gemma freely guess the class. If that artifact is missing, the app falls back to the smaller `acoustic_model.json` KNN demo hint.
|
| 35 |
|
| 36 |
Set these Space variables/secrets:
|
| 37 |
|
|
|
|
| 41 |
- `LISPER_ZERO_GPU_AUDIO_DTYPE`: optional override for Gemma audio features. Adapter deployments default to `bfloat16`.
|
| 42 |
- `LISPER_ZERO_GPU_LOAD_IN_4BIT`: defaults to `1` when `LISPER_ZERO_GPU_ADAPTER_ID` is set, otherwise `0`.
|
| 43 |
- `LISPER_ZERO_GPU_ACOUSTIC_HINT`: defaults to `1`. Set to `0` only when intentionally testing direct Gemma audio classification.
|
| 44 |
+
- `LISPER_ZERO_GPU_ACOUSTIC_MODEL`: `auto`, `extratrees`, or `knn`. Defaults to `auto`, which uses v18 ExtraTrees when `acoustic_extratrees_v18.joblib` is present.
|
| 45 |
- `LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`: defaults to `0` for adapter deployments and `1` for merged-model deployments.
|
| 46 |
- `LISPER_ZERO_GPU_MAX_SEQ_LENGTH`: defaults to `2048`.
|
| 47 |
- `LISPER_ZERO_GPU_SIZE`: `large` or `xlarge`. Defaults to `large`.
|
acoustic_extratrees_v18.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80664b1d8159aad9a3f7217e104627cad0b8c1d89b1ebaca8732d19e498c2fc2
|
| 3 |
+
size 1066798
|
app.py
CHANGED
|
@@ -29,6 +29,7 @@ DEFAULT_MODEL_ID = "thomasjvu/lisper-gemma4-e2b-audio-full"
|
|
| 29 |
DEFAULT_ADAPTER_ID = ""
|
| 30 |
SPACE_ROOT = Path(__file__).resolve().parent
|
| 31 |
ACOUSTIC_MODEL_PATH = SPACE_ROOT / "acoustic_model.json"
|
|
|
|
| 32 |
ACOUSTIC_K = 5
|
| 33 |
ACOUSTIC_MIN_CONFIDENCE = 0.42
|
| 34 |
MIN_AUDIO_SECONDS = 0.45
|
|
@@ -116,6 +117,13 @@ def acoustic_hint_enabled() -> bool:
|
|
| 116 |
return os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_HINT", "1").strip() != "0"
|
| 117 |
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def audio_alignment_enabled() -> bool:
    """Report whether audio-token alignment is switched on.

    ``LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`` is used when set. Otherwise the
    default is ``"0"`` (off) when ``adapter_id()`` is truthy and ``"1"`` (on)
    when it is not. Any value other than ``"0"`` after trimming whitespace
    counts as enabled.
    """
    fallback = "1"
    if adapter_id():
        fallback = "0"
    setting = os.environ.get("LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS", fallback)
    return setting.strip() != "0"
|
|
@@ -399,6 +407,7 @@ def extract_acoustic_features(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
|
|
| 399 |
|
| 400 |
|
| 401 |
ACOUSTIC_MODEL: dict[str, Any] | None = None
|
|
|
|
| 402 |
|
| 403 |
|
| 404 |
def load_acoustic_model() -> dict[str, Any] | None:
|
|
@@ -412,7 +421,52 @@ def load_acoustic_model() -> dict[str, Any] | None:
|
|
| 412 |
return ACOUSTIC_MODEL
|
| 413 |
|
| 414 |
|
| 415 |
-
def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
model = load_acoustic_model()
|
| 417 |
if model is None:
|
| 418 |
return None
|
|
@@ -450,6 +504,17 @@ def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
|
|
| 450 |
}
|
| 451 |
|
| 452 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
def enforce_acoustic_response(response: str, acoustic_result: dict[str, Any] | None) -> tuple[str, dict[str, Any]]:
|
| 454 |
parsed = parse_response(response)
|
| 455 |
if not acoustic_result:
|
|
|
|
| 29 |
DEFAULT_ADAPTER_ID = ""
|
| 30 |
SPACE_ROOT = Path(__file__).resolve().parent
|
| 31 |
ACOUSTIC_MODEL_PATH = SPACE_ROOT / "acoustic_model.json"
|
| 32 |
+
ACOUSTIC_EXTRATREES_MODEL_PATH = SPACE_ROOT / "acoustic_extratrees_v18.joblib"
|
| 33 |
ACOUSTIC_K = 5
|
| 34 |
ACOUSTIC_MIN_CONFIDENCE = 0.42
|
| 35 |
MIN_AUDIO_SECONDS = 0.45
|
|
|
|
| 117 |
return os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_HINT", "1").strip() != "0"
|
| 118 |
|
| 119 |
|
| 120 |
+
def acoustic_model_preference() -> str:
    """Return which acoustic hint backend the deployment asked for.

    Reads ``LISPER_ZERO_GPU_ACOUSTIC_MODEL``, trimmed of whitespace and
    compared case-insensitively. ``"extratrees"`` and ``"knn"`` are passed
    through; an unset, empty, or unrecognized value yields ``"auto"``.
    """
    raw_value = os.environ.get("LISPER_ZERO_GPU_ACOUSTIC_MODEL", "auto")
    normalized = raw_value.strip().lower()
    return normalized if normalized in ("extratrees", "knn") else "auto"
|
| 125 |
+
|
| 126 |
+
|
| 127 |
def audio_alignment_enabled() -> bool:
    """Report whether audio-token alignment is switched on.

    ``LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS`` is used when set. Otherwise the
    default is ``"0"`` (off) when ``adapter_id()`` is truthy and ``"1"`` (on)
    when it is not. Any value other than ``"0"`` after trimming whitespace
    counts as enabled.
    """
    fallback = "1"
    if adapter_id():
        fallback = "0"
    setting = os.environ.get("LISPER_ZERO_GPU_ALIGN_AUDIO_TOKENS", fallback)
    return setting.strip() != "0"
|
|
|
|
| 407 |
|
| 408 |
|
| 409 |
ACOUSTIC_MODEL: dict[str, Any] | None = None
|
| 410 |
+
ACOUSTIC_EXTRATREES_MODEL: dict[str, Any] | None = None
|
| 411 |
|
| 412 |
|
| 413 |
def load_acoustic_model() -> dict[str, Any] | None:
|
|
|
|
| 421 |
return ACOUSTIC_MODEL
|
| 422 |
|
| 423 |
|
| 424 |
+
def load_acoustic_extratrees_model() -> dict[str, Any] | None:
    """Load and memoize the ExtraTrees acoustic hint bundle.

    Returns:
        The object deserialized from ``ACOUSTIC_EXTRATREES_MODEL_PATH``
        (cached in ``ACOUSTIC_EXTRATREES_MODEL`` after the first successful
        load), or ``None`` when acoustic hints are disabled, the artifact
        file is absent, or the artifact cannot be deserialized.
    """
    global ACOUSTIC_EXTRATREES_MODEL
    if not acoustic_hint_enabled():
        return None
    if not ACOUSTIC_EXTRATREES_MODEL_PATH.exists():
        return None
    if ACOUSTIC_EXTRATREES_MODEL is None:
        # The artifact is stored through Git LFS, so a checkout without LFS
        # leaves a small text pointer that exists on disk but is not a valid
        # joblib file. Treat any load failure like a missing artifact so the
        # caller can still fall back to the KNN hint instead of crashing.
        try:
            import joblib

            # NOTE: joblib.load unpickles arbitrary objects; only ever point
            # it at the trusted artifact bundled with the Space.
            ACOUSTIC_EXTRATREES_MODEL = joblib.load(ACOUSTIC_EXTRATREES_MODEL_PATH)
        except Exception:
            import logging

            logging.getLogger(__name__).exception(
                "Failed to load ExtraTrees acoustic model from %s",
                ACOUSTIC_EXTRATREES_MODEL_PATH,
            )
            return None
    return ACOUSTIC_EXTRATREES_MODEL
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def classify_acoustic_extratrees(waveform: np.ndarray) -> dict[str, Any] | None:
    """Classify a waveform with the ExtraTrees acoustic hint bundle.

    Returns ``None`` when ``load_acoustic_extratrees_model()`` yields no
    bundle. Otherwise returns a dict holding the predicted label (as both
    ``detected_class`` and ``raw_class``), its confidence, per-class scores
    ordered from most to least probable, and metadata copied from the bundle.
    ``low_confidence_defaulted_to_clear`` is always ``False`` on this path.
    """
    bundle = load_acoustic_extratrees_model()
    if bundle is None:
        return None

    sample_rate = int(bundle.get("sample_rate", 16000))
    # The classifier is handed a single-row 2D matrix.
    feature_row = extract_acoustic_features(waveform, sr=sample_rate).reshape(1, -1)
    estimator = bundle["classifier"]
    predicted_label = str(estimator.predict(feature_row)[0])

    # Without predict_proba there are no scores and confidence stays at 1.0.
    scores: dict[str, float] = {}
    confidence = 1.0
    if hasattr(estimator, "predict_proba"):
        probabilities = estimator.predict_proba(feature_row)[0]
        labels = [str(label) for label in estimator.classes_]
        ranked = sorted(zip(labels, probabilities), key=lambda pair: pair[1], reverse=True)
        for label, probability in ranked:
            scores[label] = round(float(probability), 6)
        confidence = float(scores.get(predicted_label, 0.0))

    result: dict[str, Any] = {
        "detected_class": predicted_label,
        "raw_class": predicted_label,
        "confidence": confidence,
        "class_scores": scores,
        "model_name": bundle.get("name", "lisper_v18_extratrees_acoustic_hint"),
        "train_rows": bundle.get("train_rows"),
        "feature_count": bundle.get("feature_count"),
        "holdout_accuracy": bundle.get("holdout_accuracy"),
        "low_confidence_defaulted_to_clear": False,
    }
    return result
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
def classify_acoustic_knn(waveform: np.ndarray) -> dict[str, Any] | None:
|
| 470 |
model = load_acoustic_model()
|
| 471 |
if model is None:
|
| 472 |
return None
|
|
|
|
| 504 |
}
|
| 505 |
|
| 506 |
|
| 507 |
+
def classify_acoustic(waveform: np.ndarray) -> dict[str, Any] | None:
    """Produce an acoustic hint using the configured backend preference.

    With preference ``"auto"`` or ``"extratrees"`` the ExtraTrees classifier
    is tried first and its result is returned when available. If it yields
    nothing, ``"extratrees"`` returns ``None`` while every other preference
    falls through to the KNN classifier.
    """
    mode = acoustic_model_preference()
    wants_extratrees = mode in ("auto", "extratrees")
    hint = classify_acoustic_extratrees(waveform) if wants_extratrees else None
    if hint is not None:
        return hint
    if mode == "extratrees":
        # An explicit ExtraTrees request never silently downgrades to KNN.
        return None
    return classify_acoustic_knn(waveform)
|
| 516 |
+
|
| 517 |
+
|
| 518 |
def enforce_acoustic_response(response: str, acoustic_result: dict[str, Any] | None) -> tuple[str, dict[str, Any]]:
|
| 519 |
parsed = parse_response(response)
|
| 520 |
if not acoustic_result:
|
requirements.txt
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
accelerate>=1.8.0
|
| 2 |
bitsandbytes>=0.46.0
|
| 3 |
gradio==5.29.1
|
|
|
|
| 4 |
librosa>=0.10.2
|
| 5 |
numpy>=2.0.0
|
| 6 |
peft>=0.15.0
|
|
|
|
| 7 |
spaces>=0.32.0
|
| 8 |
soundfile>=0.13.1
|
| 9 |
torch==2.8.0
|
|
|
|
| 1 |
accelerate>=1.8.0
|
| 2 |
bitsandbytes>=0.46.0
|
| 3 |
gradio==5.29.1
|
| 4 |
+
joblib>=1.4.2
|
| 5 |
librosa>=0.10.2
|
| 6 |
numpy>=2.0.0
|
| 7 |
peft>=0.15.0
|
| 8 |
+
scikit-learn>=1.5.0
|
| 9 |
spaces>=0.32.0
|
| 10 |
soundfile>=0.13.1
|
| 11 |
torch==2.8.0
|