deepmage121 committed on
Commit
e4c8b1d
·
1 Parent(s): 8718486

added post_merge

Browse files
Files changed (2) hide show
  1. app.py +15 -0
  2. post_merge.py +175 -0
app.py CHANGED
@@ -15,6 +15,7 @@ from huggingface_hub import hf_hub_download
15
  from huggingface_hub.utils import EntryNotFoundError
16
 
17
  from dedup import DATASET_REPO_ID, DedupReport, check_duplicates, load_manifest
 
18
  from validate_data import FileValidationResult, validate_with_pydantic
19
 
20
  logging.basicConfig(
@@ -218,6 +219,11 @@ def process_pr(pr_num: int) -> dict:
218
  """Run full validation + dedup on a PR and post results as a comment."""
219
  logger.info("Processing PR #%d", pr_num)
220
 
 
 
 
 
 
221
  # Find changed data files by comparing PR tree to main
222
  changed_files = find_changed_files(pr_num)
223
  if not changed_files:
@@ -343,6 +349,15 @@ async def validate(payload):
343
 
344
  pr_num = payload.discussion.num
345
 
 
 
 
 
 
 
 
 
 
346
  try:
347
  return process_pr(pr_num)
348
  except Exception:
 
15
  from huggingface_hub.utils import EntryNotFoundError
16
 
17
  from dedup import DATASET_REPO_ID, DedupReport, check_duplicates, load_manifest
18
+ from post_merge import handle_merge
19
  from validate_data import FileValidationResult, validate_with_pydantic
20
 
21
  logging.basicConfig(
 
219
  """Run full validation + dedup on a PR and post results as a comment."""
220
  logger.info("Processing PR #%d", pr_num)
221
 
222
+ # Guard: skip if already validated for the current state
223
+ if not pr_needs_validation(pr_num):
224
+ logger.info("PR #%d already validated for current state, skipping", pr_num)
225
+ return {"status": "skipped", "reason": "already validated"}
226
+
227
  # Find changed data files by comparing PR tree to main
228
  changed_files = find_changed_files(pr_num)
229
  if not changed_files:
 
349
 
350
  pr_num = payload.discussion.num
351
 
352
+ # Handle merged PRs — update manifest + dataset card
353
+ if payload.discussion.status == "merged":
354
+ try:
355
+ return handle_merge(pr_num)
356
+ except Exception:
357
+ logger.exception("Post-merge failed for PR #%d", pr_num)
358
+ return {"status": "error", "reason": "post-merge failed"}
359
+
360
+ # Handle open PRs — validate + dedup
361
  try:
362
  return process_pr(pr_num)
363
  except Exception:
post_merge.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Post-merge actions: update manifest.json and dataset card on PR merge."""
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ from datetime import datetime, timezone
7
+
8
+ from huggingface_hub import HfApi, hf_hub_download
9
+ from huggingface_hub.utils import EntryNotFoundError
10
+
11
+ from dedup import DATASET_REPO_ID, compute_fingerprint, compute_sha256
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def _list_data_dirs(api: HfApi) -> list[str]:
    """List top-level directory names under data/ (these become config/split names).

    Args:
        api: HfApi client used to browse the dataset repo.

    Returns:
        Sorted directory names (e.g. ["global-mmlu-lite", ...]).
    """
    dirs: list[str] = []
    for entry in api.list_repo_tree(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        revision="main",
        path_in_repo="data",
    ):
        # Bug fix: list_repo_tree yields RepoFile/RepoFolder objects; neither
        # exposes an `rfilename` attribute, so the old
        # `not hasattr(entry, "rfilename")` check classified *every* entry as a
        # directory. Use the documented isinstance check instead.
        if isinstance(entry, RepoFolder):
            # entry.path is like "data/global-mmlu-lite"
            dirs.append(entry.path.split("/", 1)[-1])
    return sorted(dirs)
30
+
31
+
32
+ def _build_dataset_card(configs: list[str]) -> str:
33
+ """Build a dataset card README.md with YAML frontmatter for the viewer."""
34
+ yaml_configs = []
35
+ for config in configs:
36
+ yaml_configs.append(f" - config_name: {config}")
37
+ yaml_configs.append(f" data_files:")
38
+ yaml_configs.append(f" - split: train")
39
+ yaml_configs.append(f" path: data/{config}/**/*.json")
40
+
41
+ yaml_block = "\n".join(yaml_configs)
42
+
43
+ return f"""---
44
+ configs:
45
+ {yaml_block}
46
+ license: mit
47
+ ---
48
+
49
+ # EEE Datastore
50
+
51
+ Evaluation data for the EEE project.
52
+ """
53
+
54
+
55
def update_manifest(api: HfApi, merged_files: list[str]) -> None:
    """Download merged files from main, compute hashes, and update manifest.json.

    Args:
        api: Authenticated HfApi client used to upload the new manifest.
        merged_files: Repo-relative paths of data files newly merged to main.
    """
    # Load the existing manifest; a missing file just means nothing is tracked yet.
    try:
        manifest_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename="manifest.json",
            repo_type="dataset",
            revision="main",
        )
        with open(manifest_path, "r") as f:
            manifest = json.load(f)
    except EntryNotFoundError:
        manifest = {"files": {}}
    except (OSError, json.JSONDecodeError):
        # Bug fix: `except (EntryNotFoundError, Exception)` silently swallowed
        # *every* error. Narrow to expected failures and log them so real
        # problems stay visible; a corrupt manifest is rebuilt from scratch.
        logger.exception("Could not load manifest.json; starting from empty")
        manifest = {"files": {}}
    manifest.setdefault("files", {})

    now = datetime.now(timezone.utc).isoformat()

    for file_path in merged_files:
        try:
            local_path = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=file_path,
                repo_type="dataset",
                revision="main",
            )
            with open(local_path, "rb") as f:
                content = f.read()

            sha256 = compute_sha256(content)

            # JSON files get a content fingerprint (for semantic dedup);
            # everything else falls back to the raw sha256.
            if file_path.endswith(".json"):
                fingerprint = compute_fingerprint(content)
            else:
                fingerprint = sha256

            manifest["files"][file_path] = {
                "sha256": sha256,
                "fingerprint": fingerprint,
                "added_at": now,
            }
            logger.info("Added %s to manifest", file_path)
        except Exception:
            # Deliberate best-effort per file: one bad file must not block
            # the rest of the batch; the failure is logged with traceback.
            logger.exception("Failed to process %s for manifest", file_path)

    # Upload the updated manifest in a single commit.
    manifest_bytes = json.dumps(manifest, indent=2, sort_keys=True).encode()
    api.upload_file(
        path_or_fileobj=io.BytesIO(manifest_bytes),
        path_in_repo="manifest.json",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        commit_message="Update manifest.json after PR merge",
    )
    logger.info("Uploaded updated manifest.json (%d files)", len(manifest["files"]))
109
+
110
+
111
def update_dataset_card(api: HfApi) -> None:
    """Regenerate the dataset card README.md with configs for all data/ subdirs.

    Args:
        api: Authenticated HfApi client used to upload the card.
    """
    config_names = _list_data_dirs(api)
    if not config_names:
        logger.warning("No data directories found, skipping dataset card update")
        return

    # Rebuild the card from scratch and push it as README.md so the dataset
    # viewer picks up the current set of configs.
    readme_text = _build_dataset_card(config_names)
    api.upload_file(
        path_or_fileobj=io.BytesIO(readme_text.encode()),
        path_in_repo="README.md",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        commit_message="Update dataset card with configs for viewer",
    )
    logger.info("Updated dataset card with %d configs: %s", len(config_names), config_names)
127
+
128
+
129
def handle_merge(pr_num: int) -> dict:
    """Run all post-merge actions for a PR.

    After merge everything is on main, so we list all data files there,
    add any the manifest does not yet track, and refresh the dataset card.

    Args:
        pr_num: Number of the merged PR (used for logging/reporting only).

    Returns:
        Status dict with the number of files newly added to the manifest.
    """
    logger.info("Handling merge for PR #%d", pr_num)

    # Bug fix: `api` was referenced below but never defined anywhere in the
    # module, so this function raised NameError on every call.
    api = HfApi()

    # Load the current manifest; a missing file means nothing is tracked yet.
    try:
        manifest_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename="manifest.json",
            repo_type="dataset",
            revision="main",
        )
        with open(manifest_path, "r") as f:
            manifest = json.load(f)
    except EntryNotFoundError:
        manifest = {"files": {}}
    except (OSError, json.JSONDecodeError):
        # Bug fix: `except (EntryNotFoundError, Exception)` swallowed every
        # error; narrow and log so real failures remain visible.
        logger.exception("Could not load manifest.json; starting from empty")
        manifest = {"files": {}}

    # List all data files on main and find untracked ones.
    # Bug fix: entries were filtered with hasattr(entry, "rfilename") /
    # entry.rfilename, but list_repo_tree entries expose `path`, not
    # `rfilename`, so the old loop collected nothing. Directory paths never
    # end in .json/.jsonl, so the suffix filter alone suffices.
    # NOTE(review): confirm against the installed huggingface_hub version.
    all_data_files: list[str] = []
    for entry in api.list_repo_tree(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        revision="main",
        recursive=True,
    ):
        path = entry.path
        if path.startswith("data/") and path.endswith((".json", ".jsonl")):
            all_data_files.append(path)

    tracked = manifest.get("files", {})
    untracked = [f for f in all_data_files if f not in tracked]
    logger.info("Found %d untracked data files after merge", len(untracked))

    if untracked:
        update_manifest(api, untracked)

    update_dataset_card(api)

    return {
        "status": "ok",
        "action": "post_merge",
        "pr": pr_num,
        "files_added_to_manifest": len(untracked),
    }