Update app.py
app.py CHANGED
@@ -13,8 +13,6 @@ from transformers import pipeline
 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
-import pandas as pd
-from huggingface_hub import HfApi
 
 # Optional: modest thread hints for CPU Spaces
 try:

@@ -27,24 +25,52 @@ except Exception:
 # Basic logging so we can verify which model is loaded per inference
 logging.basicConfig(level=logging.INFO)
 
+# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
+from datasets import Dataset, Features, Value, Audio, load_dataset
+from huggingface_hub import HfApi
+
 # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
 
+HF_FEATURES = Features({
+    "timestamp": Value("string"),
+    "session_id": Value("string"),
+    "language_display": Value("string"),
+    "model_id": Value("string"),
+    "model_revision": Value("string"),
+
+    "audio": Audio(sampling_rate=None),  # uploaded only if user consents
+    "audio_duration_s": Value("float32"),
+    "sample_rate": Value("int32"),
+    "source": Value("string"),
+    "decode_params": Value("string"),
+
+    "transcript_hyp": Value("string"),
+    "corrected_text": Value("string"),
+
+    "latency_ms": Value("int32"),
+    "rtf": Value("float32"),
+
+    "score_out_of_10": Value("int32"),
+    "share_publicly": Value("bool"),
+})
+
 def _push_row_to_hf_dataset(row, audio_file_path):
     """
-    Append a single example to the HF dataset repo
-
+    Append a single example to the HF dataset repo (train split).
+    If user didn't consent or no audio path, 'audio' field is None.
+    Uses the modern datasets library approach with proper appending.
     """
     if not PUSH_TO_HF:
         return "HF push disabled (missing HF_TOKEN or repo)."
 
     example = dict(row)
 
-    #
-    example["
-
+    # Audio: only include if user consented and file exists
+    example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
+
     # Normalize types
     def _to_int(v):
         try:

@@ -62,62 +88,78 @@ def _push_row_to_hf_dataset(row, audio_file_path):
     for k in ["rtf", "audio_duration_s"]:
         example[k] = _to_float(example.get(k))
 
-    # Create a
-
-
-    parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
-
-    # Convert to DataFrame and save as Parquet
-    df = pd.DataFrame([example])
-
-    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
-        df.to_parquet(tmp_file.name, engine='pyarrow')
-        tmp_path = tmp_file.name
-
     try:
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Try to load existing dataset and append
+        try:
+            # Load existing dataset
+            ds_existing = load_dataset(
+                HF_DATASET_REPO,
+                split="train",
+                token=HF_TOKEN,
+                download_mode="force_redownload"  # Ensure we get the latest version
+            )
+
+            # Concatenate with new data
+            ds_combined = ds_existing.add_item(example)
+
+            # Push the combined dataset
+            ds_combined.push_to_hub(
+                HF_DATASET_REPO,
+                split="train",
+                private=True,
+                token=HF_TOKEN,
+                commit_message=f"Append feedback row at {example['timestamp']}"
+            )
+
+            return "Successfully appended to existing HF Dataset."
+
+        except Exception as e:
+            # If dataset doesn't exist or error loading, create new
+            if "404" in str(e) or "doesn't exist" in str(e) or "EmptyDatasetError" in str(e):
+                # Dataset doesn't exist, create it
+                ds_new.push_to_hub(
+                    HF_DATASET_REPO,
+                    split="train",
+                    private=True,
                     token=HF_TOKEN,
-                    commit_message=
+                    commit_message="Initialize dataset with first feedback row"
                 )
-
-
-
-
-
-
+                return "Created new HF Dataset with first row."
+            else:
+                # Try alternative approach: push with create_pr=True to avoid conflicts
+                ds_new.push_to_hub(
+                    HF_DATASET_REPO,
+                    split="train",
+                    private=True,
+                    token=HF_TOKEN,
+                    commit_message=f"Append feedback row at {example['timestamp']}",
+                    create_pr=True  # Create a PR to avoid conflicts
+                )
+                return "Pushed to HF Dataset via PR (will auto-merge)."
+
     except Exception as e:
-
-
-
-
-
-
-
+        logging.error(f"Failed to push to HF Dataset: {e}")
+
+        # Final fallback: try using HfApi to check if repo exists
+        try:
+            api = HfApi()
+            api.dataset_info(HF_DATASET_REPO, token=HF_TOKEN)
+
+            # Repo exists, try one more time with force push
+            ds_new.push_to_hub(
+                HF_DATASET_REPO,
+                split=f"train_{int(time.time())}",  # Use unique split name as last resort
+                private=True,
+                token=HF_TOKEN,
+                commit_message=f"Append feedback row at {example['timestamp']}"
+            )
+            return f"Pushed to HF Dataset with unique split."
+
+        except Exception as final_error:
+            return f"Failed to push to HF Dataset: {final_error}"
 
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
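
Note on usage: the new logger expects a flat row dict whose keys mirror HF_FEATURES (everything except "audio", which _push_row_to_hf_dataset fills in from audio_file_path). Below is a minimal caller sketch for orientation only; the handler name, the placeholder metric values, and the consent handling are assumptions for illustration, not part of this commit.

# Hypothetical caller sketch (not part of the commit): build a row matching
# HF_FEATURES and hand it to _push_row_to_hf_dataset() defined in app.py.
import uuid
from datetime import datetime, timezone

def on_submit(audio_path, hyp_text, corrected_text, score, share_publicly):
    row = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "session_id": str(uuid.uuid4()),
        "language_display": "English",            # placeholder value
        "model_id": "DarliAI/example-asr-model",  # hypothetical model ID
        "model_revision": "main",
        "audio_duration_s": 3.2,                  # placeholder; measured elsewhere in app.py
        "sample_rate": 16000,
        "source": "microphone",
        "decode_params": "{}",
        "transcript_hyp": hyp_text,
        "corrected_text": corrected_text,
        "latency_ms": 850,                        # placeholder
        "rtf": 0.27,                              # placeholder
        "score_out_of_10": int(score),
        "share_publicly": bool(share_publicly),
    }
    # Assumed consent handling: only pass the audio path when the user agreed to share.
    return _push_row_to_hf_dataset(row, audio_path if share_publicly else None)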
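Once rows have been pushed, the log can be pulled back with the same datasets call the Space itself uses when appending. This is an assumed review snippet, not code from the commit; the repo name is the HF_DATASET_REPO default and the token is a placeholder for one with read access to the private repo.

# Hypothetical review script: load the private feedback log and print recent rows.
from datasets import load_dataset

ds = load_dataset(
    "DarliAI/asr-feedback-logs",  # HF_DATASET_REPO default from the Space
    split="train",
    token="hf_...",               # placeholder read token
)
print(ds)  # schema should mirror HF_FEATURES
for ex in ds.select(range(max(0, len(ds) - 5), len(ds))):
    print(ex["timestamp"], ex["model_id"], ex["score_out_of_10"], ex["corrected_text"])

Keep in mind that the fallback branches in the diff write either a pull request (create_pr=True) or a uniquely named split, so rows logged through those paths would need to be merged or loaded from their own split separately.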