Spaces:
Sleeping
Sleeping
File size: 6,150 Bytes
30b1c80 956820f 30b1c80 0fc99d7 103ea6f 30b1c80 956820f 30b1c80 ff43104 30b1c80 0fc99d7 30b1c80 50f75fc 30b1c80 50f75fc 30b1c80 0fc99d7 30b1c80 0fc99d7 103ea6f 0fc99d7 103ea6f 0fc99d7 30b1c80 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
from huggingface_hub import HfApi, CommitScheduler
from dotenv import load_dotenv
from datetime import datetime
from pathlib import Path
import gradio as gr
import logfire
import json
import os
import re
import random
# Read API keys (and any other settings) from a local .env file
load_dotenv()

# Hugging Face dataset repository that receives the user feedback
REPO_ID = "jedick/noteworthy-differences-feedback"

# Local folder where feedback files are staged before being uploaded.
# Pattern adapted from:
# https://huggingface.co/spaces/Wauplin/space_to_dataset_saver
# https://huggingface.co/docs/huggingface_hub/v0.16.3/en/guides/upload#scheduled-uploads
USER_FEEDBACK_DIR = Path("user_feedback")
USER_FEEDBACK_DIR.mkdir(exist_ok=True, parents=True)

# One-time setup, guarded by gr.NO_RELOAD (Gradio's reload-mode convention for
# code that should not be re-executed when the module is reloaded).
if gr.NO_RELOAD:
    api = HfApi()

    # Make sure the target dataset repository exists
    if not api.repo_exists(repo_id=REPO_ID, repo_type="dataset"):
        api.create_repo(repo_id=REPO_ID, repo_type="dataset")

    # The commit scheduler is only created when running on Hugging Face
    # Spaces (detected through the SPACE_ID environment variable); it pushes
    # the contents of USER_FEEDBACK_DIR to the "data" folder of the dataset.
    if "SPACE_ID" in os.environ:
        scheduler = CommitScheduler(
            folder_path=USER_FEEDBACK_DIR,
            path_in_repo="data",
            repo_id=REPO_ID,
            repo_type="dataset",
        )
# Matches an ISO 8601 UTC timestamp such as 2013-12-09T04:48:24Z
_ISO_TIMESTAMP_RE = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z")
# Matches the "N revision(s) behind" note, e.g. "35 revisions behind"
_REVISIONS_BEHIND_RE = re.compile(r"(\d+)\s+revisions?\s+behind")


def _display_text(value) -> str:
    """Return *value* when it is a string, otherwise an empty string."""
    # Component values may arrive as None (or another non-string type);
    # regex searches need a str, so anything else is treated as "no text".
    return value if isinstance(value, str) else ""


def _read_latest_feedback(path: Path) -> "dict | None":
    """Return the record on the first line of *path*, or None if unusable."""
    # Files are written as JSON Lines, but currently contain a single line
    with path.open("r", encoding="utf-8") as latest_f:
        first_line = latest_f.readline().strip()
    if not first_line:
        return None
    try:
        record = json.loads(first_line)
    except json.JSONDecodeError:
        # A damaged file must not block every later submission
        return None
    return record if isinstance(record, dict) else None


def save_feedback(*args, feedback_value: str) -> None:
    """
    Save complete app state and user feedback to a
    JSON Lines file for upload to a Hugging Face dataset.

    Args:
        *args: App-state values, matched by position to the keys listed in
            ``keys`` below. Surplus values are ignored and missing trailing
            values are simply absent from the saved record.
        feedback_value: The feedback label to store under ``"feedback"``.

    Behavior:
        - Timestamps are reduced to their ISO form when one is found in the
          display string, and ``revisions_behind`` is extracted from the old
          timestamp string (``None`` when absent).
        - The record is compared with the most recently modified ``*.json``
          file in ``USER_FEEDBACK_DIR``. If everything except the feedback
          value matches, that file is deleted; an identical feedback value
          then acts as a removal (warning shown, nothing written), while a
          different value is saved as an update.
        - Otherwise the record is written to a new file whose name starts
          with a randomly chosen ``train``/``test`` split.
    """
    # Assign dict keys to positional arguments
    keys = [
        "page_title",
        "number_behind",
        "units_behind",
        "old_timestamp",
        "new_timestamp",
        "old_revision",
        "new_revision",
        "heuristic_rationale",
        "fewshot_rationale",
        "judge_reasoning",
        "heuristic_noteworthy",
        "fewshot_noteworthy",
        "judge_noteworthy",
        "confidence_score",
    ]
    feedback_dict = dict(zip(keys, args))

    # Data cleanup
    # Split dictionary in two so that "revisions_behind" can be inserted
    # right after the leading keys while preserving the overall key order
    keys_start = {"page_title", "number_behind", "units_behind"}
    dict_start = {k: v for k, v in feedback_dict.items() if k in keys_start}
    dict_end = {k: v for k, v in feedback_dict.items() if k not in keys_start}

    # Normalize both timestamps: keep only the ISO part of the display string.
    # When no ISO timestamp is found the original value is left untouched.
    raw_old_timestamp = _display_text(dict_end.get("old_timestamp"))
    raw_new_timestamp = _display_text(dict_end.get("new_timestamp"))
    for key, raw_text in (
        ("new_timestamp", raw_new_timestamp),
        ("old_timestamp", raw_old_timestamp),
    ):
        ts_match = _ISO_TIMESTAMP_RE.search(raw_text)
        if ts_match:
            dict_end[key] = ts_match.group(0)

    # Extract revisions behind (e.g. "35 revisions behind")
    rev_match = _REVISIONS_BEHIND_RE.search(raw_old_timestamp)
    dict_start["revisions_behind"] = int(rev_match.group(1)) if rev_match else None

    # Merge the start and end dictionaries and add the feedback value
    feedback_dict = dict_start | dict_end
    feedback_dict["feedback"] = feedback_value

    feedback_action = "Saved"

    # Check for duplicate feedback against the most recent feedback file
    latest_feedback_path = max(
        USER_FEEDBACK_DIR.glob("*.json"),
        key=lambda p: p.stat().st_mtime,
        default=None,
    )
    latest_feedback = (
        _read_latest_feedback(latest_feedback_path)
        if latest_feedback_path is not None
        else None
    )
    if latest_feedback is not None:
        # Swap in the new feedback value so that the comparison below only
        # looks at the rest of the record (for detecting resubmission)
        old_feedback = latest_feedback.pop("feedback", None)
        latest_feedback["feedback"] = feedback_value
        if latest_feedback == feedback_dict:
            # The user resubmitted feedback: remove the previous feedback
            latest_feedback_path.unlink()
            if feedback_value == old_feedback:
                # The user submitted the same feedback: treat it as a removal
                gr.Warning(
                    f"Removed feedback: <strong>{old_feedback}</strong>",
                    duration=5,
                )
                return
            # The user changed the feedback: proceed to write the new value
            feedback_action = "Updated"

    # Randomly assign to train or test split (40% probability for test)
    split = "test" if random.random() < 0.4 else "train"

    # Save feedback to a new, timestamp-named file
    feedback_file = f"{split}-{datetime.now().isoformat()}.json"
    feedback_path = USER_FEEDBACK_DIR / feedback_file
    with feedback_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(feedback_dict))
        f.write("\n")

    if feedback_action == "Updated":
        gr.Info(f"Updated feedback: <strong>{feedback_value}</strong>", duration=5)
    else:
        gr.Success(f"Saved feedback: <strong>{feedback_value}</strong>", duration=5)
@logfire.instrument("Save feedback: agree")
def save_feedback_agree(*args) -> None:
    """Record the current app state with the feedback value 'agree'.

    All positional arguments are forwarded unchanged to ``save_feedback``.
    """
    # Outside Hugging Face Spaces there is no commit scheduler: save directly
    if "SPACE_ID" not in os.environ:
        save_feedback(*args, feedback_value="agree")
        return

    # On Spaces the feedback is scheduled for commit; hold the scheduler's
    # lock while writing to avoid concurrent writes from different users
    with scheduler.lock:
        save_feedback(*args, feedback_value="agree")
@logfire.instrument("Save feedback: disagree")
def save_feedback_disagree(*args) -> None:
    """Record the current app state with the feedback value 'disagree'.

    All positional arguments are forwarded unchanged to ``save_feedback``.
    """
    # Outside Hugging Face Spaces there is no commit scheduler: save directly
    if "SPACE_ID" not in os.environ:
        save_feedback(*args, feedback_value="disagree")
        return

    # On Spaces, write while holding the commit scheduler's lock
    with scheduler.lock:
        save_feedback(*args, feedback_value="disagree")
|