Spaces:
Sleeping
Sleeping
added docker config and optimized model for HF Spaces
Browse files- .gitattributes +1 -0
- .gitignore +5 -1
- Dockerfile +48 -0
- README.md +16 -0
- models/onnx_quantized/config.json +127 -0
- models/onnx_quantized/model_quantized.onnx +3 -0
- models/onnx_quantized/ort_config.json +32 -0
- models/onnx_quantized/preprocessor_config.json +9 -0
.gitattributes
CHANGED
|
@@ -4,3 +4,4 @@
|
|
| 4 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 5 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 6 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 4 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 5 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 6 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -5,13 +5,17 @@ __pycache__/
|
|
| 5 |
*.pyd
|
| 6 |
.venv/
|
| 7 |
|
| 8 |
-
# Model Files (DO NOT COMMIT)
|
| 9 |
models/hub/
|
| 10 |
models/wav2vec2-finetuned/
|
| 11 |
models/*.pth
|
| 12 |
models/*.pt
|
| 13 |
models/*.safetensors
|
| 14 |
models/*.bin
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
models/
|
| 16 |
# Derived Data (re-generate with scripts)
|
| 17 |
data/embeddings/
|
|
|
|
| 5 |
*.pyd
|
| 6 |
.venv/
|
| 7 |
|
| 8 |
+
# Model Files (DO NOT COMMIT heavy weights)
|
| 9 |
models/hub/
|
| 10 |
models/wav2vec2-finetuned/
|
| 11 |
models/*.pth
|
| 12 |
models/*.pt
|
| 13 |
models/*.safetensors
|
| 14 |
models/*.bin
|
| 15 |
+
|
| 16 |
+
# Allow ONLY the optimized engine for deployment
|
| 17 |
+
!models/onnx_quantized/
|
| 18 |
+
|
| 19 |
models/
|
| 20 |
# Derived Data (re-generate with scripts)
|
| 21 |
data/embeddings/
|
Dockerfile
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# syntax=docker/dockerfile:1
# Image for the VigilAudio Streamlit app on Hugging Face Spaces (sdk: docker).
# Serves on port 8501 — this MUST match app_port in README.md front matter.

# Use a modern, minimal Python base image
FROM python:3.12-slim

# PYTHONUNBUFFERED: stream logs straight to the Spaces console
# PYTHONDONTWRITEBYTECODE: keep .pyc files out of the image layers
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=8501

WORKDIR /app

# System dependencies: ffmpeg is critical for librosa audio decoding,
# curl is required by the HEALTHCHECK below. --no-install-recommends and
# the list cleanup in the same layer keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        ffmpeg \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install uv for fast dependency management
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Copy only the manifests first so the dependency layer is cached
# until pyproject.toml / uv.lock actually change.
COPY pyproject.toml uv.lock ./
# Install into the system site-packages (simpler for Docker)
RUN uv pip install --system -r pyproject.toml

# Create a non-root user for security (UID 1000 is the HF Spaces convention)
RUN useradd -m -u 1000 user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Run the app from the user's home; copy the project once, owned by the
# runtime user (the previous root-owned copies into /app were redundant).
WORKDIR $HOME/app
COPY --chown=user . $HOME/app

USER user

# Expose the port Streamlit serves on (documentation for operators/tooling)
EXPOSE 8501

# Probe Streamlit's built-in health endpoint on the port it actually serves
# (8501, not the HF default 7860 — the app overrides --server.port below).
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
  CMD curl --fail http://localhost:8501/_stcore/health || exit 1

# Launch the standalone app (exec form: streamlit runs as PID 1)
ENTRYPOINT ["streamlit", "run", "src/ui/app_standalone.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md
CHANGED
|
@@ -1,3 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# VigilAudio: AI-Powered Audio Moderation Engine
|
| 2 |
|
| 3 |
**A production-ready audio emotion classification system built for content moderation.**
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VigilAudio
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8501
|
| 8 |
+
tags:
|
| 9 |
+
- streamlit
|
| 10 |
+
- onnx
|
| 11 |
+
- audio-classification
|
| 12 |
+
pinned: false
|
| 13 |
+
short_description: Emotional audio classification
|
| 14 |
+
license: mit
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
# VigilAudio: AI-Powered Audio Moderation Engine
|
| 18 |
|
| 19 |
**A production-ready audio emotion classification system built for content moderation.**
|
models/onnx_quantized/config.json
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_dropout": 0.1,
|
| 3 |
+
"adapter_attn_dim": null,
|
| 4 |
+
"adapter_kernel_size": 3,
|
| 5 |
+
"adapter_stride": 2,
|
| 6 |
+
"add_adapter": false,
|
| 7 |
+
"apply_spec_augment": true,
|
| 8 |
+
"architectures": [
|
| 9 |
+
"Wav2Vec2ForSequenceClassification"
|
| 10 |
+
],
|
| 11 |
+
"attention_dropout": 0.1,
|
| 12 |
+
"bos_token_id": 1,
|
| 13 |
+
"classifier_proj_size": 256,
|
| 14 |
+
"codevector_dim": 256,
|
| 15 |
+
"contrastive_logits_temperature": 0.1,
|
| 16 |
+
"conv_bias": false,
|
| 17 |
+
"conv_dim": [
|
| 18 |
+
512,
|
| 19 |
+
512,
|
| 20 |
+
512,
|
| 21 |
+
512,
|
| 22 |
+
512,
|
| 23 |
+
512,
|
| 24 |
+
512
|
| 25 |
+
],
|
| 26 |
+
"conv_kernel": [
|
| 27 |
+
10,
|
| 28 |
+
3,
|
| 29 |
+
3,
|
| 30 |
+
3,
|
| 31 |
+
3,
|
| 32 |
+
2,
|
| 33 |
+
2
|
| 34 |
+
],
|
| 35 |
+
"conv_stride": [
|
| 36 |
+
5,
|
| 37 |
+
2,
|
| 38 |
+
2,
|
| 39 |
+
2,
|
| 40 |
+
2,
|
| 41 |
+
2,
|
| 42 |
+
2
|
| 43 |
+
],
|
| 44 |
+
"ctc_loss_reduction": "sum",
|
| 45 |
+
"ctc_zero_infinity": false,
|
| 46 |
+
"diversity_loss_weight": 0.1,
|
| 47 |
+
"do_stable_layer_norm": false,
|
| 48 |
+
"dtype": "float32",
|
| 49 |
+
"eos_token_id": 2,
|
| 50 |
+
"feat_extract_activation": "gelu",
|
| 51 |
+
"feat_extract_dropout": 0.0,
|
| 52 |
+
"feat_extract_norm": "group",
|
| 53 |
+
"feat_proj_dropout": 0.1,
|
| 54 |
+
"feat_quantizer_dropout": 0.0,
|
| 55 |
+
"final_dropout": 0.1,
|
| 56 |
+
"gradient_checkpointing": false,
|
| 57 |
+
"hidden_act": "gelu",
|
| 58 |
+
"hidden_dropout": 0.1,
|
| 59 |
+
"hidden_dropout_prob": 0.1,
|
| 60 |
+
"hidden_size": 768,
|
| 61 |
+
"id2label": {
|
| 62 |
+
"0": "angry",
|
| 63 |
+
"1": "disgusted",
|
| 64 |
+
"2": "fearful",
|
| 65 |
+
"3": "happy",
|
| 66 |
+
"4": "neutral",
|
| 67 |
+
"5": "sad",
|
| 68 |
+
"6": "surprised"
|
| 69 |
+
},
|
| 70 |
+
"initializer_range": 0.02,
|
| 71 |
+
"intermediate_size": 3072,
|
| 72 |
+
"label2id": {
|
| 73 |
+
"angry": 0,
|
| 74 |
+
"disgusted": 1,
|
| 75 |
+
"fearful": 2,
|
| 76 |
+
"happy": 3,
|
| 77 |
+
"neutral": 4,
|
| 78 |
+
"sad": 5,
|
| 79 |
+
"surprised": 6
|
| 80 |
+
},
|
| 81 |
+
"layer_norm_eps": 1e-05,
|
| 82 |
+
"layerdrop": 0.1,
|
| 83 |
+
"mask_feature_length": 10,
|
| 84 |
+
"mask_feature_min_masks": 0,
|
| 85 |
+
"mask_feature_prob": 0.0,
|
| 86 |
+
"mask_time_length": 10,
|
| 87 |
+
"mask_time_min_masks": 2,
|
| 88 |
+
"mask_time_prob": 0.05,
|
| 89 |
+
"model_type": "wav2vec2",
|
| 90 |
+
"num_adapter_layers": 3,
|
| 91 |
+
"num_attention_heads": 12,
|
| 92 |
+
"num_codevector_groups": 2,
|
| 93 |
+
"num_codevectors_per_group": 320,
|
| 94 |
+
"num_conv_pos_embedding_groups": 16,
|
| 95 |
+
"num_conv_pos_embeddings": 128,
|
| 96 |
+
"num_feat_extract_layers": 7,
|
| 97 |
+
"num_hidden_layers": 12,
|
| 98 |
+
"num_negatives": 100,
|
| 99 |
+
"output_hidden_size": 768,
|
| 100 |
+
"pad_token_id": 0,
|
| 101 |
+
"proj_codevector_dim": 256,
|
| 102 |
+
"tdnn_dilation": [
|
| 103 |
+
1,
|
| 104 |
+
2,
|
| 105 |
+
3,
|
| 106 |
+
1,
|
| 107 |
+
1
|
| 108 |
+
],
|
| 109 |
+
"tdnn_dim": [
|
| 110 |
+
512,
|
| 111 |
+
512,
|
| 112 |
+
512,
|
| 113 |
+
512,
|
| 114 |
+
1500
|
| 115 |
+
],
|
| 116 |
+
"tdnn_kernel": [
|
| 117 |
+
5,
|
| 118 |
+
3,
|
| 119 |
+
3,
|
| 120 |
+
1,
|
| 121 |
+
1
|
| 122 |
+
],
|
| 123 |
+
"transformers_version": "4.57.3",
|
| 124 |
+
"use_weighted_layer_sum": false,
|
| 125 |
+
"vocab_size": 32,
|
| 126 |
+
"xvector_output_dim": 512
|
| 127 |
+
}
|
models/onnx_quantized/model_quantized.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a7fbc3bca7e8210afacc795c3913e3293117e083d50979b59df1850c965ee77
|
| 3 |
+
size 122141352
|
models/onnx_quantized/ort_config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"one_external_file": true,
|
| 3 |
+
"opset": null,
|
| 4 |
+
"optimization": {},
|
| 5 |
+
"quantization": {
|
| 6 |
+
"activations_dtype": "QUInt8",
|
| 7 |
+
"activations_symmetric": false,
|
| 8 |
+
"format": "QOperator",
|
| 9 |
+
"is_static": false,
|
| 10 |
+
"mode": "IntegerOps",
|
| 11 |
+
"nodes_to_exclude": [],
|
| 12 |
+
"nodes_to_quantize": [],
|
| 13 |
+
"operators_to_quantize": [
|
| 14 |
+
"MatMul",
|
| 15 |
+
"Attention",
|
| 16 |
+
"LSTM",
|
| 17 |
+
"Gather",
|
| 18 |
+
"Transpose",
|
| 19 |
+
"EmbedLayerNormalization"
|
| 20 |
+
],
|
| 21 |
+
"per_channel": false,
|
| 22 |
+
"qdq_add_pair_to_weight": false,
|
| 23 |
+
"qdq_dedicated_pair": false,
|
| 24 |
+
"qdq_op_type_per_channel_support_to_axis": {
|
| 25 |
+
"MatMul": 1
|
| 26 |
+
},
|
| 27 |
+
"reduce_range": false,
|
| 28 |
+
"weights_dtype": "QInt8",
|
| 29 |
+
"weights_symmetric": true
|
| 30 |
+
},
|
| 31 |
+
"use_external_data_format": false
|
| 32 |
+
}
|
models/onnx_quantized/preprocessor_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"padding_side": "right",
|
| 6 |
+
"padding_value": 0.0,
|
| 7 |
+
"return_attention_mask": false,
|
| 8 |
+
"sampling_rate": 16000
|
| 9 |
+
}
|