asuni committed · Commit a09b358 (verified) · Parent: 16e12da

Upload app.py

Files changed (1): app.py +69 -1
app.py CHANGED
@@ -67,6 +67,74 @@ def list_samples(samples_dir):
     return files
 
 def save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = None):
+    """
+    Append new rows to a CSV file in a private Hugging Face dataset.
+
+    - Reads the existing CSV (if present).
+    - Appends the new rows.
+    - Uploads the updated file back to the repo.
+
+    Each 'row' should be a dict with consistent keys.
+
+    NOTE:
+    - Replaces the entire CSV on each update (no true append on the server side).
+    - Best for small/medium datasets; for large ones, use the `datasets` library instead.
+    """
+    if HfApi is None:
+        return {"status": "hf_unavailable", "reason": "missing_packages"}
+
+    token = token or os.environ.get("HF_TOKEN")
+    repo_id = repo_id or os.environ.get("HF_DATASET_ID")
+    if not token or not repo_id:
+        return {"status": "hf_skipped", "reason": "missing_token_or_repo_env"}
+
+    api = HfApi(token=token)
+    path_in_repo = "data/responses.csv"  # fixed CSV location in repo
+    repo_err = None
+
+    # Ensure the dataset repo exists
+    try:
+        api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
+    except Exception as e:
+        repo_err = str(e)
+
+    # Try downloading the existing CSV
+    existing_df = pd.DataFrame()
+    try:
+        local_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=path_in_repo,
+            repo_type="dataset",
+            token=token,
+        )
+        existing_df = pd.read_csv(local_path)
+    except Exception:
+        # File doesn't exist or is unreadable; start fresh
+        pass
+
+    # Convert new rows to a DataFrame and append
+    new_df = pd.DataFrame(rows)
+    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
+
+    # Save to memory as CSV
+    csv_buffer = io.StringIO()
+    combined_df.to_csv(csv_buffer, index=False)
+    csv_bytes = csv_buffer.getvalue().encode("utf-8")
+
+    # Upload the updated CSV
+    try:
+        api.upload_file(
+            path_or_fileobj=csv_bytes,
+            path_in_repo=path_in_repo,
+            repo_id=repo_id,
+            repo_type="dataset",
+        )
+    except Exception as e:
+        return {"status": "hf_push_error", "error": str(e), "repo_error": repo_err}
+
+    return {"status": "hf_pushed", "rows_added": len(rows), "repo": repo_id, "repo_error": repo_err}
+
+def _save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = None):
     """
     Push a list of dict rows to a private HF dataset, one JSON file per row.
 
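As a quick sanity check of the new CSV flow, the function can be called with any list of dicts that share keys. A minimal sketch (the field names below are hypothetical, not from app.py; it assumes the module-level imports of os, io, pandas as pd, HfApi, and hf_hub_download that the function body relies on, plus HF_TOKEN and HF_DATASET_ID set in the environment):

    rows = [
        {"sample_id": "s1", "response": "good", "rating": 4},
        {"sample_id": "s2", "response": "bad", "rating": 2},
    ]
    result = save_responses_to_hf(rows)
    # On success: {"status": "hf_pushed", "rows_added": 2, "repo": "<repo_id>", "repo_error": None}
    print(result)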
@@ -108,7 +176,7 @@ def save_responses_to_hf(rows, repo_id: str | None = None, token: str | None = N
         json_bytes = json.dumps(row_dict, indent=2).encode("utf-8")
 
         api.upload_file(
-            path_or_fileobj=json_bytes,
+            path_or_obj=json_bytes,
             path_in_repo=path_in_repo,
             repo_id=repo_id,
             repo_type="dataset",
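For reference, the payload keyword that huggingface_hub's HfApi.upload_file actually accepts is path_or_fileobj (a local path, raw bytes, or a file object), so the path_or_obj spelling introduced in the renamed _save_responses_to_hf helper would raise a TypeError if that code path were ever called. A minimal sketch of the JSON-per-row upload with the documented keyword (the token, repo, and file names are hypothetical):

    import json
    from huggingface_hub import HfApi

    api = HfApi(token="hf_...")  # hypothetical token
    row_dict = {"sample_id": "s1", "response": "good"}  # hypothetical row
    api.upload_file(
        path_or_fileobj=json.dumps(row_dict, indent=2).encode("utf-8"),  # raw bytes are accepted
        path_in_repo="data/row_0001.json",
        repo_id="user/private-responses",
        repo_type="dataset",
    )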