Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,6 +13,8 @@ from transformers import pipeline
|
|
| 13 |
import numpy as np
|
| 14 |
import soundfile as sf # librosa depends on this; good for wav/flac/ogg
|
| 15 |
import librosa # fallback / resampling
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# Optional: modest thread hints for CPU Spaces
|
| 18 |
try:
|
|
@@ -25,50 +27,24 @@ except Exception:
|
|
| 25 |
# Basic logging so we can verify which model is loaded per inference
|
| 26 |
logging.basicConfig(level=logging.INFO)
|
| 27 |
|
| 28 |
-
# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
|
| 29 |
-
from datasets import Dataset, Features, Value, Audio, load_dataset
|
| 30 |
-
|
| 31 |
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
|
| 32 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
|
| 33 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 34 |
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
|
| 35 |
|
| 36 |
-
HF_FEATURES = Features({
|
| 37 |
-
"timestamp": Value("string"),
|
| 38 |
-
"session_id": Value("string"),
|
| 39 |
-
"language_display": Value("string"),
|
| 40 |
-
"model_id": Value("string"),
|
| 41 |
-
"model_revision": Value("string"),
|
| 42 |
-
|
| 43 |
-
"audio": Audio(sampling_rate=None), # uploaded only if user consents
|
| 44 |
-
"audio_duration_s": Value("float32"),
|
| 45 |
-
"sample_rate": Value("int32"),
|
| 46 |
-
"source": Value("string"),
|
| 47 |
-
"decode_params": Value("string"),
|
| 48 |
-
|
| 49 |
-
"transcript_hyp": Value("string"),
|
| 50 |
-
"corrected_text": Value("string"),
|
| 51 |
-
|
| 52 |
-
"latency_ms": Value("int32"),
|
| 53 |
-
"rtf": Value("float32"),
|
| 54 |
-
|
| 55 |
-
"score_out_of_10": Value("int32"),
|
| 56 |
-
"share_publicly": Value("bool"),
|
| 57 |
-
})
|
| 58 |
-
|
| 59 |
def _push_row_to_hf_dataset(row, audio_file_path):
|
| 60 |
"""
|
| 61 |
-
Append a single example to the HF dataset repo
|
| 62 |
-
|
| 63 |
"""
|
| 64 |
if not PUSH_TO_HF:
|
| 65 |
return "HF push disabled (missing HF_TOKEN or repo)."
|
| 66 |
|
| 67 |
example = dict(row)
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
example["
|
| 71 |
-
|
| 72 |
# Normalize types
|
| 73 |
def _to_int(v):
|
| 74 |
try:
|
|
@@ -86,23 +62,62 @@ def _push_row_to_hf_dataset(row, audio_file_path):
|
|
| 86 |
for k in ["rtf", "audio_duration_s"]:
|
| 87 |
example[k] = _to_float(example.get(k))
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
try:
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# --- Map display names to your HF Hub model IDs ---
|
| 108 |
language_models = {
|
|
@@ -137,7 +152,7 @@ language_models = {
|
|
| 137 |
"Pidgin": "FarmerlineML/pidgin_nigerian",
|
| 138 |
"Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
|
| 139 |
"Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
|
| 140 |
-
"Krio":
|
| 141 |
}
|
| 142 |
|
| 143 |
# -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
|
|
@@ -310,6 +325,7 @@ def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, au
|
|
| 310 |
status = f"Feedback saved. {hf_status}"
|
| 311 |
except Exception as e:
|
| 312 |
status = f"Failed to push to HF Dataset: {e}"
|
|
|
|
| 313 |
|
| 314 |
return {
|
| 315 |
"status": status,
|
|
|
|
| 13 |
import numpy as np
|
| 14 |
import soundfile as sf # librosa depends on this; good for wav/flac/ogg
|
| 15 |
import librosa # fallback / resampling
|
| 16 |
+
import pandas as pd
|
| 17 |
+
from huggingface_hub import HfApi
|
| 18 |
|
| 19 |
# Optional: modest thread hints for CPU Spaces
|
| 20 |
try:
|
|
|
|
| 27 |
# Basic logging so we can verify which model is loaded per inference
|
| 28 |
logging.basicConfig(level=logging.INFO)
|
| 29 |
|
|
|
|
|
|
|
|
|
|
| 30 |
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
|
| 31 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
|
| 32 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 33 |
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
def _push_row_to_hf_dataset(row, audio_file_path):
|
| 36 |
"""
|
| 37 |
+
Append a single example to the HF dataset repo using Parquet files.
|
| 38 |
+
This approach is more robust for incremental updates.
|
| 39 |
"""
|
| 40 |
if not PUSH_TO_HF:
|
| 41 |
return "HF push disabled (missing HF_TOKEN or repo)."
|
| 42 |
|
| 43 |
example = dict(row)
|
| 44 |
|
| 45 |
+
# Record whether an audio file exists on disk for this submission (boolean flag only; the path itself is not stored)
|
| 46 |
+
example["audio_stored"] = bool(audio_file_path and os.path.isfile(audio_file_path))
|
| 47 |
+
|
| 48 |
# Normalize types
|
| 49 |
def _to_int(v):
|
| 50 |
try:
|
|
|
|
| 62 |
for k in ["rtf", "audio_duration_s"]:
|
| 63 |
example[k] = _to_float(example.get(k))
|
| 64 |
|
| 65 |
+
# Create a unique filename for this submission
|
| 66 |
+
timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
|
| 67 |
+
unique_id = str(uuid.uuid4())[:8]
|
| 68 |
+
parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
|
| 69 |
+
|
| 70 |
+
# Convert to DataFrame and save as Parquet
|
| 71 |
+
df = pd.DataFrame([example])
|
| 72 |
+
|
| 73 |
+
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
|
| 74 |
+
df.to_parquet(tmp_file.name, engine='pyarrow')
|
| 75 |
+
tmp_path = tmp_file.name
|
| 76 |
+
|
| 77 |
try:
|
| 78 |
+
# Upload the Parquet file to the dataset repo
|
| 79 |
+
api = HfApi()
|
| 80 |
+
|
| 81 |
+
# Upload to a data/ directory in the repo
|
| 82 |
+
api.upload_file(
|
| 83 |
+
path_or_fileobj=tmp_path,
|
| 84 |
+
path_in_repo=f"data/{parquet_filename}",
|
| 85 |
+
repo_id=HF_DATASET_REPO,
|
| 86 |
+
repo_type="dataset",
|
| 87 |
+
token=HF_TOKEN,
|
| 88 |
+
commit_message=f"Add feedback row {timestamp}"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Clean up temp file
|
| 92 |
+
os.remove(tmp_path)
|
| 93 |
+
|
| 94 |
+
# Upload the audio file only when it exists on disk and the row's share_publicly flag is set
|
| 95 |
+
if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly"):
|
| 96 |
+
try:
|
| 97 |
+
audio_ext = os.path.splitext(audio_file_path)[1] or ".wav"
|
| 98 |
+
audio_filename = f"audio_{timestamp}_{unique_id}{audio_ext}"
|
| 99 |
+
api.upload_file(
|
| 100 |
+
path_or_fileobj=audio_file_path,
|
| 101 |
+
path_in_repo=f"audio/{audio_filename}",
|
| 102 |
+
repo_id=HF_DATASET_REPO,
|
| 103 |
+
repo_type="dataset",
|
| 104 |
+
token=HF_TOKEN,
|
| 105 |
+
commit_message=f"Add audio for feedback {timestamp}"
|
| 106 |
+
)
|
| 107 |
+
example["audio_filename"] = audio_filename
|
| 108 |
+
except Exception as audio_error:
|
| 109 |
+
logging.warning(f"Failed to upload audio: {audio_error}")
|
| 110 |
+
|
| 111 |
+
return f"Pushed to HF Dataset as {parquet_filename}"
|
| 112 |
+
|
| 113 |
+
except Exception as e:
|
| 114 |
+
# Clean up temp file on error
|
| 115 |
+
if os.path.exists(tmp_path):
|
| 116 |
+
try:
|
| 117 |
+
os.remove(tmp_path)
|
| 118 |
+
except:
|
| 119 |
+
pass
|
| 120 |
+
return f"Failed to push to HF Dataset: {e}"
|
| 121 |
|
| 122 |
# --- Map display names to your HF Hub model IDs ---
|
| 123 |
language_models = {
|
|
|
|
| 152 |
"Pidgin": "FarmerlineML/pidgin_nigerian",
|
| 153 |
"Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
|
| 154 |
"Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
|
| 155 |
+
"Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
|
| 156 |
}
|
| 157 |
|
| 158 |
# -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
|
|
|
|
| 325 |
status = f"Feedback saved. {hf_status}"
|
| 326 |
except Exception as e:
|
| 327 |
status = f"Failed to push to HF Dataset: {e}"
|
| 328 |
+
logging.error(f"Push error: {e}")
|
| 329 |
|
| 330 |
return {
|
| 331 |
"status": status,
|