Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -67,6 +67,74 @@ def list_samples(samples_dir):
|
|
| 67 |
return files
|
| 68 |
|
| 69 |
def save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
"""
|
| 71 |
Push a list of dict rows to a private HF dataset, one JSON file per row.
|
| 72 |
|
|
@@ -108,7 +176,7 @@ def save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = N
|
|
| 108 |
json_bytes = json.dumps(row_dict, indent=2).encode("utf-8")
|
| 109 |
|
| 110 |
api.upload_file(
|
| 111 |
-
|
| 112 |
path_in_repo=path_in_repo,
|
| 113 |
repo_id=repo_id,
|
| 114 |
repo_type="dataset",
|
|
|
|
| 67 |
return files
|
| 68 |
|
| 69 |
def save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = None):
    """
    Append new rows to a CSV file in a private Hugging Face dataset.

    - Reads the existing CSV (if present).
    - Appends new rows.
    - Uploads the updated file back to the repo.

    Args:
        rows: List of dicts with consistent keys; each dict becomes one CSV row.
        repo_id: Target dataset repo. Falls back to the ``HF_DATASET_ID``
            environment variable when omitted.
        token: Hugging Face access token. Falls back to the ``HF_TOKEN``
            environment variable when omitted.

    Returns:
        A dict whose ``"status"`` key is one of:

        - ``"hf_unavailable"``: the Hugging Face client is not importable.
        - ``"hf_skipped"``: token/repo missing, or there were no rows to add.
        - ``"hf_push_error"``: the existing CSV could not be fetched (for a
          reason other than "file not found") or the upload failed.
        - ``"hf_pushed"``: the combined CSV was uploaded.

    NOTE:
    - Replaces the entire CSV on each update (no true append on the server side).
    - Use small/medium datasets; large ones should use the `datasets` library instead.
    """
    if HfApi is None:
        return {"status": "hf_unavailable", "reason": "missing_packages"}

    token = token or os.environ.get("HF_TOKEN")
    repo_id = repo_id or os.environ.get("HF_DATASET_ID")
    if not token or not repo_id:
        return {"status": "hf_skipped", "reason": "missing_token_or_repo_env"}

    # Nothing to append: avoid a pointless download/upload round trip (and
    # avoid creating an empty CSV when the file does not exist yet).
    if not rows:
        return {"status": "hf_skipped", "reason": "no_rows"}

    # Imported locally: huggingface_hub is optional for this module (see the
    # `HfApi is None` guard above), and we only get here when it is available.
    from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError

    api = HfApi(token=token)
    path_in_repo = "data/responses.csv"  # fixed CSV location in repo
    repo_err = None

    # Ensure dataset exists (best effort; the error is reported in the result).
    try:
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
    except Exception as e:
        repo_err = str(e)

    # Fetch the existing CSV. Only a genuine "file not found on the Hub" means
    # "start fresh". Because the upload below REPLACES the whole file, treating
    # a network/auth failure as "no file" would silently wipe existing rows.
    try:
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=path_in_repo,
            repo_type="dataset",
            token=token,
        )
    except LocalEntryNotFoundError as e:
        # Hub unreachable and nothing cached: existence is unknown, so abort.
        return {"status": "hf_push_error", "error": str(e), "repo_error": repo_err}
    except EntryNotFoundError:
        local_path = None  # first write: the CSV does not exist yet
    except Exception as e:
        return {"status": "hf_push_error", "error": str(e), "repo_error": repo_err}

    existing_df = None
    if local_path is not None:
        try:
            existing_df = pd.read_csv(local_path)
        except Exception:
            # File exists but is empty/unreadable — start fresh (original intent).
            existing_df = None

    # Convert new rows to a DataFrame and append to whatever was already stored.
    new_df = pd.DataFrame(rows)
    if existing_df is None:
        combined_df = new_df
    else:
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)

    # Serialize in memory; to_csv returns the CSV text when no path is given.
    csv_bytes = combined_df.to_csv(index=False).encode("utf-8")

    # Upload the updated CSV
    try:
        api.upload_file(
            path_or_fileobj=csv_bytes,
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            repo_type="dataset",
        )
    except Exception as e:
        return {"status": "hf_push_error", "error": str(e), "repo_error": repo_err}

    return {"status": "hf_pushed", "rows_added": len(rows), "repo": repo_id, "repo_error": repo_err}
|
| 136 |
+
|
| 137 |
+
def _save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = None):
|
| 138 |
"""
|
| 139 |
Push a list of dict rows to a private HF dataset, one JSON file per row.
|
| 140 |
|
|
|
|
| 176 |
json_bytes = json.dumps(row_dict, indent=2).encode("utf-8")
|
| 177 |
|
| 178 |
api.upload_file(
|
| 179 |
+
path_or_obj=json_bytes,
|
| 180 |
path_in_repo=path_in_repo,
|
| 181 |
repo_id=repo_id,
|
| 182 |
repo_type="dataset",
|