Update app.py
app.py CHANGED
@@ -13,8 +13,6 @@ from transformers import pipeline
 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
-import pandas as pd
-from huggingface_hub import HfApi
 
 # Optional: modest thread hints for CPU Spaces
 try:

@@ -27,24 +25,52 @@ except Exception:
 # Basic logging so we can verify which model is loaded per inference
 logging.basicConfig(level=logging.INFO)
 
+# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
+from datasets import Dataset, Features, Value, Audio, load_dataset
+from huggingface_hub import HfApi
+
 # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
 
+HF_FEATURES = Features({
+    "timestamp": Value("string"),
+    "session_id": Value("string"),
+    "language_display": Value("string"),
+    "model_id": Value("string"),
+    "model_revision": Value("string"),
+
+    "audio": Audio(sampling_rate=None),  # uploaded only if user consents
+    "audio_duration_s": Value("float32"),
+    "sample_rate": Value("int32"),
+    "source": Value("string"),
+    "decode_params": Value("string"),
+
+    "transcript_hyp": Value("string"),
+    "corrected_text": Value("string"),
+
+    "latency_ms": Value("int32"),
+    "rtf": Value("float32"),
+
+    "score_out_of_10": Value("int32"),
+    "share_publicly": Value("bool"),
+})
+
 def _push_row_to_hf_dataset(row, audio_file_path):
     """
-    Append a single example to the HF dataset repo
-
+    Append a single example to the HF dataset repo (train split).
+    If user didn't consent or no audio path, 'audio' field is None.
+    Uses the modern datasets library approach with proper appending.
     """
     if not PUSH_TO_HF:
         return "HF push disabled (missing HF_TOKEN or repo)."
 
     example = dict(row)
 
-    #
-    example["
-
+    # Audio: only include if user consented and file exists
+    example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
+
     # Normalize types
     def _to_int(v):
         try:

@@ -62,62 +88,78 @@ def _push_row_to_hf_dataset(row, audio_file_path):
     for k in ["rtf", "audio_duration_s"]:
         example[k] = _to_float(example.get(k))
 
-    # Create a
-
-
-    parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
-
-    # Convert to DataFrame and save as Parquet
-    df = pd.DataFrame([example])
-
-    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
-        df.to_parquet(tmp_file.name, engine='pyarrow')
-        tmp_path = tmp_file.name
-
     try:
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Try to load existing dataset and append
+        try:
+            # Load existing dataset
+            ds_existing = load_dataset(
+                HF_DATASET_REPO,
+                split="train",
+                token=HF_TOKEN,
+                download_mode="force_redownload"  # Ensure we get the latest version
+            )
+
+            # Concatenate with new data
+            ds_combined = ds_existing.add_item(example)
+
+            # Push the combined dataset
+            ds_combined.push_to_hub(
+                HF_DATASET_REPO,
+                split="train",
+                private=True,
+                token=HF_TOKEN,
+                commit_message=f"Append feedback row at {example['timestamp']}"
+            )
+
+            return "Successfully appended to existing HF Dataset."
+
+        except Exception as e:
+            # If dataset doesn't exist or error loading, create new
+            if "404" in str(e) or "doesn't exist" in str(e) or "EmptyDatasetError" in str(e):
+                # Dataset doesn't exist, create it
+                ds_new.push_to_hub(
+                    HF_DATASET_REPO,
+                    split="train",
+                    private=True,
                     token=HF_TOKEN,
-                    commit_message=
+                    commit_message="Initialize dataset with first feedback row"
                 )
-
-
-
-
-
-
+                return "Created new HF Dataset with first row."
+            else:
+                # Try alternative approach: push with create_pr=True to avoid conflicts
+                ds_new.push_to_hub(
+                    HF_DATASET_REPO,
+                    split="train",
+                    private=True,
+                    token=HF_TOKEN,
+                    commit_message=f"Append feedback row at {example['timestamp']}",
+                    create_pr=True  # Create a PR to avoid conflicts
+                )
+                return "Pushed to HF Dataset via PR (will auto-merge)."
+
     except Exception as e:
-
-
-
-
-
-
-
+        logging.error(f"Failed to push to HF Dataset: {e}")
+
+        # Final fallback: try using HfApi to check if repo exists
+        try:
+            api = HfApi()
+            api.dataset_info(HF_DATASET_REPO, token=HF_TOKEN)
+
+            # Repo exists, try one more time with force push
+            ds_new.push_to_hub(
+                HF_DATASET_REPO,
+                split=f"train_{int(time.time())}",  # Use unique split name as last resort
+                private=True,
+                token=HF_TOKEN,
+                commit_message=f"Append feedback row at {example['timestamp']}"
+            )
+            return f"Pushed to HF Dataset with unique split."
+
+        except Exception as final_error:
+            return f"Failed to push to HF Dataset: {final_error}"
 
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
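
Note on usage: the new logger expects a flat row dict whose keys mirror HF_FEATURES (everything except "audio", which _push_row_to_hf_dataset fills in from audio_file_path). Below is a minimal caller sketch for orientation only; the handler name, the placeholder metric values, and the consent handling are assumptions for illustration, not part of this commit.

# Hypothetical caller sketch (not part of the commit): build a row matching
# HF_FEATURES and hand it to _push_row_to_hf_dataset() defined in app.py.
import uuid
from datetime import datetime, timezone

def on_submit(audio_path, hyp_text, corrected_text, score, share_publicly):
    row = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "session_id": str(uuid.uuid4()),
        "language_display": "English",            # placeholder value
        "model_id": "DarliAI/example-asr-model",  # hypothetical model ID
        "model_revision": "main",
        "audio_duration_s": 3.2,                  # placeholder; measured elsewhere in app.py
        "sample_rate": 16000,
        "source": "microphone",
        "decode_params": "{}",
        "transcript_hyp": hyp_text,
        "corrected_text": corrected_text,
        "latency_ms": 850,                        # placeholder
        "rtf": 0.27,                              # placeholder
        "score_out_of_10": int(score),
        "share_publicly": bool(share_publicly),
    }
    # Assumed consent handling: only pass the audio path when the user agreed to share.
    return _push_row_to_hf_dataset(row, audio_path if share_publicly else None)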
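Once rows have been pushed, the log can be pulled back with the same datasets call the Space itself uses when appending. This is an assumed review snippet, not code from the commit; the repo name is the HF_DATASET_REPO default and the token is a placeholder for one with read access to the private repo.

# Hypothetical review script: load the private feedback log and print recent rows.
from datasets import load_dataset

ds = load_dataset(
    "DarliAI/asr-feedback-logs",  # HF_DATASET_REPO default from the Space
    split="train",
    token="hf_...",               # placeholder read token
)
print(ds)  # schema should mirror HF_FEATURES
for ex in ds.select(range(max(0, len(ds) - 5), len(ds))):
    print(ex["timestamp"], ex["model_id"], ex["score_out_of_10"], ex["corrected_text"])

Keep in mind that the fallback branches in the diff write either a pull request (create_pr=True) or a uniquely named split, so rows logged through those paths would need to be merged or loaded from their own split separately.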