Spaces:

DarliAI
/

Evaluation

Sleeping

App Files Community

FarmerlineML committed on Aug 14, 2025

Commit

fbcc780

verified ·

1 Parent(s): 9f891d6

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -48

app.py CHANGED Viewed

@@ -13,6 +13,8 @@ from transformers import pipeline
 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
 # Optional: modest thread hints for CPU Spaces
 try:
@@ -25,50 +27,24 @@ except Exception:
 # Basic logging so we can verify which model is loaded per inference
 logging.basicConfig(level=logging.INFO)
-# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
-from datasets import Dataset, Features, Value, Audio, load_dataset
 # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
-HF_FEATURES = Features({
-    "timestamp":        Value("string"),
-    "session_id":       Value("string"),
-    "language_display": Value("string"),
-    "model_id":         Value("string"),
-    "model_revision":   Value("string"),
-    "audio":            Audio(sampling_rate=None),   # uploaded only if user consents
-    "audio_duration_s": Value("float32"),
-    "sample_rate":      Value("int32"),
-    "source":           Value("string"),
-    "decode_params":    Value("string"),
-    "transcript_hyp":   Value("string"),
-    "corrected_text":   Value("string"),
-    "latency_ms":       Value("int32"),
-    "rtf":              Value("float32"),
-    "score_out_of_10":  Value("int32"),
-    "share_publicly":   Value("bool"),
-})
 def _push_row_to_hf_dataset(row, audio_file_path):
     """
-    Append a single example to the HF dataset repo (train split).
-    If user didn't consent or no audio path, 'audio' field is None.
     """
     if not PUSH_TO_HF:
         return "HF push disabled (missing HF_TOKEN or repo)."
     example = dict(row)
-    # Audio: only include if user consented and file exists
-    example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
     # Normalize types
     def _to_int(v):
         try:
@@ -86,23 +62,62 @@ def _push_row_to_hf_dataset(row, audio_file_path):
     for k in ["rtf", "audio_duration_s"]:
         example[k] = _to_float(example.get(k))
-    ds = Dataset.from_list([example], features=HF_FEATURES)
-    # Load existing split if present, then append
     try:
-        existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
-        merged = existing.concatenate(ds)
-    except Exception:
-        merged = ds
-    merged.push_to_hub(
-        HF_DATASET_REPO,
-        split="train",
-        private=True,
-        token=HF_TOKEN,
-        commit_message="append feedback row"
-    )
-    return "Pushed to HF Dataset."
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
@@ -137,7 +152,7 @@ language_models = {
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
-    "Krio":                   "FarmerlineML/w2v-bert-2.0_krio_v3",
 }
 # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
@@ -310,6 +325,7 @@ def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, au
         status = f"Feedback saved. {hf_status}"
     except Exception as e:
         status = f"Failed to push to HF Dataset: {e}"
     return {
         "status": status,

 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
+import pandas as pd
+from huggingface_hub import HfApi
 # Optional: modest thread hints for CPU Spaces
 try:
 # Basic logging so we can verify which model is loaded per inference
 logging.basicConfig(level=logging.INFO)
 # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
 def _push_row_to_hf_dataset(row, audio_file_path):
     """
+    Append a single example to the HF dataset repo using Parquet files.
+    This approach is more robust for incremental updates.
     """
     if not PUSH_TO_HF:
         return "HF push disabled (missing HF_TOKEN or repo)."
     example = dict(row)
+    # Store audio path reference if audio should be saved
+    example["audio_stored"] = bool(audio_file_path and os.path.isfile(audio_file_path))
     # Normalize types
     def _to_int(v):
         try:
     for k in ["rtf", "audio_duration_s"]:
         example[k] = _to_float(example.get(k))
+    # Create a unique filename for this submission
+    timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
+    unique_id = str(uuid.uuid4())[:8]
+    parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
+    # Convert to DataFrame and save as Parquet
+    df = pd.DataFrame([example])
+    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
+        df.to_parquet(tmp_file.name, engine='pyarrow')
+        tmp_path = tmp_file.name
     try:
+        # Upload the Parquet file to the dataset repo
+        api = HfApi()
+        # Upload to a data/ directory in the repo
+        api.upload_file(
+            path_or_fileobj=tmp_path,
+            path_in_repo=f"data/{parquet_filename}",
+            repo_id=HF_DATASET_REPO,
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message=f"Add feedback row {timestamp}"
+        )
+        # Clean up temp file
+        os.remove(tmp_path)
+        # If audio file should be stored
+        if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly"):
+            try:
+                audio_ext = os.path.splitext(audio_file_path)[1] or ".wav"
+                audio_filename = f"audio_{timestamp}_{unique_id}{audio_ext}"
+                api.upload_file(
+                    path_or_fileobj=audio_file_path,
+                    path_in_repo=f"audio/{audio_filename}",
+                    repo_id=HF_DATASET_REPO,
+                    repo_type="dataset",
+                    token=HF_TOKEN,
+                    commit_message=f"Add audio for feedback {timestamp}"
+                )
+                example["audio_filename"] = audio_filename
+            except Exception as audio_error:
+                logging.warning(f"Failed to upload audio: {audio_error}")
+        return f"Pushed to HF Dataset as {parquet_filename}"
+    except Exception as e:
+        # Clean up temp file on error
+        if os.path.exists(tmp_path):
+            try:
+                os.remove(tmp_path)
+            except:
+                pass
+        return f"Failed to push to HF Dataset: {e}"
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
+    "Krio":                     "FarmerlineML/w2v-bert-2.0_krio_v3",
 }
 # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
         status = f"Feedback saved. {hf_status}"
     except Exception as e:
         status = f"Failed to push to HF Dataset: {e}"
+        logging.error(f"Push error: {e}")
     return {
         "status": status,