rafmacalaba committed on
Commit
fb404c5
Β·
1 Parent(s): 7c5c449

feat: prepare_data.py script + use has_revalidation filter in documents API

Browse files

- prepare_data.py: scans local docs, uploads missing _direct_judged.jsonl
to HF, generates updated wbg_pdf_links.json with src_docname and
has_revalidation fields
- documents API: only lists docs with has_revalidation=true
- Fixed misleading error message in storage.js

Files changed (3) hide show
  1. app/api/documents/route.js +4 -2
  2. prepare_data.py +242 -0
  3. utils/storage.js +1 -1
app/api/documents/route.js CHANGED
@@ -21,8 +21,10 @@ export async function GET() {
21
 
22
  const links = await linksRes.json();
23
 
24
- // Filter to successful links and take the first N
25
- const successLinks = links.filter(l => l.status === 'success').slice(0, MAX_DOCS_TO_SCAN);
 
 
26
 
27
  // Parallel fetch β€” much faster than sequential scanning
28
  const results = await Promise.allSettled(
 
21
 
22
  const links = await linksRes.json();
23
 
24
+ // Filter to docs with revalidation data and take the first N
25
+ const successLinks = links
26
+ .filter(l => l.status === 'success' && l.has_revalidation === true)
27
+ .slice(0, MAX_DOCS_TO_SCAN);
28
 
29
  // Parallel fetch β€” much faster than sequential scanning
30
  const results = await Promise.allSettled(
prepare_data.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ prepare_data.py
4
+
5
+ Scans local wbg_extractions, identifies docs with real _direct_judged.jsonl
6
+ (not dummy), uploads them to HF, and generates/uploads an updated wbg_pdf_links.json.
7
+
8
+ Usage:
9
+ # Dry run (scan only, no uploads):
10
+ python prepare_data.py --dry-run
11
+
12
+ # Upload missing docs + generate new pdf_links:
13
+ python prepare_data.py
14
+
15
+ # Only generate pdf_links without uploading docs:
16
+ python prepare_data.py --links-only
17
+
18
+ Requires: pip install huggingface_hub requests
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import sys
25
+ import requests
26
+ from pathlib import Path
27
+ from huggingface_hub import HfApi
28
+
29
# ─── Configuration ───────────────────────────────
# Token read once at import time; get_hf_token() adds .env / cached-token fallbacks.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Target Hugging Face dataset repo for all uploads.
REPO_ID = "ai4data/annotation_data"
# Local root that contains the doc_* extraction folders.
LOCAL_BASE = Path(__file__).parent / "annotation_data" / "wbg_extractions"
# Path of the links JSON inside the HF repo.
LINKS_REPO_PATH = "annotation_data/wbg_data/wbg_pdf_links.json"
34
+
35
+
36
def get_hf_token():
    """Resolve a Hugging Face token: env var, then .env file, then cached token.

    Returns the token string, or None when no source provides one.
    """
    if HF_TOKEN:
        return HF_TOKEN

    # Fall back to a .env file sitting next to this script.
    env_file = Path(__file__).parent / ".env"
    if env_file.exists():
        for raw_line in env_file.read_text().splitlines():
            if raw_line.startswith("HF_TOKEN="):
                return raw_line.split("=", 1)[1].strip()

    # Finally, the token cached by `huggingface-cli login`.
    token_cache = Path.home() / ".cache" / "huggingface" / "token"
    if token_cache.exists():
        return token_cache.read_text().strip()

    return None
51
+
52
+
53
def scan_local_docs():
    """Walk LOCAL_BASE and bucket each doc_* folder by which judged file it holds.

    Returns a dict with three lists:
      - "real":    folders with a genuine <doc>_direct_judged.jsonl (entry includes path)
      - "dummy":   folders with only the dummy variant
      - "no_file": folders with neither file
    """
    doc_names = [name for name in os.listdir(LOCAL_BASE) if name.startswith("doc_")]
    # Numeric sort on the index embedded in the folder name (doc_<N>).
    doc_names.sort(key=lambda name: int(name.split("_")[1]))

    buckets = {"real": [], "dummy": [], "no_file": []}

    for name in doc_names:
        index = int(name.split("_")[1])
        raw_dir = LOCAL_BASE / name / "raw"

        judged = raw_dir / f"{name}_direct_judged.jsonl"
        dummy = raw_dir / f"{name}_dummy_direct_judged.jsonl"

        if judged.exists():
            buckets["real"].append({"name": name, "index": index, "path": str(judged)})
        elif dummy.exists():
            buckets["dummy"].append({"name": name, "index": index})
        else:
            buckets["no_file"].append({"name": name, "index": index})

    return buckets
77
+
78
+
79
def get_existing_hf_docs(api):
    """Return the set of doc folder names already present on the HF dataset repo.

    Best-effort: any failure (network, auth, missing path) prints a warning and
    yields an empty set, so the caller simply treats every doc as missing.
    """
    try:
        entries = api.list_repo_tree(
            REPO_ID,
            repo_type="dataset",
            path_in_repo="annotation_data/wbg_extractions",
        )
        return {entry.path.split("/")[-1] for entry in entries if hasattr(entry, "path")}
    except Exception as e:
        print(f" Warning: Could not list HF repo: {e}")
        return set()
91
+
92
+
93
def upload_docs(api, docs_to_upload, dry_run=False):
    """Push each doc's _direct_judged.jsonl to the HF repo.

    In dry-run mode only prints what would happen. Returns a tuple
    (uploaded, skipped) counting successes and failures.
    """
    uploaded = 0
    skipped = 0

    for doc in docs_to_upload:
        repo_path = f"annotation_data/wbg_extractions/{doc['name']}/raw/{doc['name']}_direct_judged.jsonl"

        if dry_run:
            print(f" [DRY RUN] Would upload: {doc['name']}")
            continue

        try:
            api.upload_file(
                path_or_fileobj=doc["path"],
                path_in_repo=repo_path,
                repo_id=REPO_ID,
                repo_type="dataset",
                commit_message=f"Upload {doc['name']}_direct_judged.jsonl",
            )
        except Exception as e:
            # Per-doc failures are logged and counted, not fatal.
            print(f" ❌ Failed {doc['name']}: {e}")
            skipped += 1
        else:
            print(f" ✅ Uploaded: {doc['name']}")
            uploaded += 1

    return uploaded, skipped
120
+
121
+
122
def fetch_current_links(api, token):
    """Fetch the current wbg_pdf_links.json from the HF dataset repo.

    ``api`` is unused here (kept for signature symmetry with the other
    helpers); the file is fetched over plain HTTPS with a bearer token.
    Returns the parsed JSON list, or [] when the file cannot be retrieved.
    """
    url = f"https://huggingface.co/datasets/{REPO_ID}/raw/main/{LINKS_REPO_PATH}"
    # Explicit timeout: requests has none by default, so a stalled
    # connection would otherwise hang the whole script indefinitely.
    resp = requests.get(url, headers={"Authorization": f"Bearer {token}"}, timeout=30)
    if resp.status_code == 200:
        return resp.json()
    print(f" Warning: Could not fetch existing links (HTTP {resp.status_code})")
    return []
130
+
131
+
132
def generate_updated_links(current_links, local_docs, token):
    """
    Generate updated wbg_pdf_links.json entries with:
    - src_docname: doc_{index}
    - has_revalidation: True iff a real _direct_judged.jsonl exists locally
      (dummy files do not count)

    ``token`` is accepted for signature compatibility but is not used.
    Fix: dropped the dead locals the original built but never read
    (``links_by_index`` lookup dict and ``dummy_indices`` set).
    """
    real_indices = {d["index"] for d in local_docs["real"]}

    updated_links = []
    for link in current_links:
        idx = link["index"]
        updated_links.append({
            "index": idx,
            "src_docname": f"doc_{idx}",
            "landing_page_url": link.get("landing_page_url", ""),
            "direct_pdf_url": link.get("direct_pdf_url", ""),
            "status": link.get("status", "unknown"),
            "has_revalidation": idx in real_indices,
        })

    return updated_links
158
+
159
+
160
def upload_links(api, links, dry_run=False):
    """Serialize the links list, keep a local copy, then push it to the HF repo.

    In dry-run mode only prints the entry count and returns without writing.
    """
    content = json.dumps(links, indent=2)

    if dry_run:
        print(f" [DRY RUN] Would upload updated wbg_pdf_links.json ({len(links)} entries)")
        return

    # Write a local copy first so the generated file can be inspected/diffed.
    out_dir = Path(__file__).parent / "annotation_data" / "wbg_data"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / "wbg_pdf_links.json"
    out_file.write_text(content)
    print(f" 💾 Saved locally: {out_file}")

    api.upload_file(
        path_or_fileobj=str(out_file),
        path_in_repo=LINKS_REPO_PATH,
        repo_id=REPO_ID,
        repo_type="dataset",
        commit_message="Update wbg_pdf_links.json with src_docname and has_revalidation",
    )
    print(f" ✅ Uploaded wbg_pdf_links.json to HF")
183
+
184
+
185
def main():
    """CLI entry point: scan local docs, sync missing ones to HF, refresh pdf_links.

    Exits with status 1 when no HF token can be resolved. Honors --dry-run
    (no uploads) and --links-only (skip doc uploads, only refresh the links file).
    """
    parser = argparse.ArgumentParser(description="Prepare and upload annotation data")
    parser.add_argument("--dry-run", action="store_true", help="Scan only, don't upload")
    parser.add_argument("--links-only", action="store_true",
                        help="Only generate/upload updated pdf_links, skip doc uploads")
    args = parser.parse_args()

    token = get_hf_token()
    if not token:
        print("❌ No HF_TOKEN found. Set it via environment variable or .env file.")
        sys.exit(1)

    api = HfApi(token=token)

    # 1. Scan local docs and classify them (real / dummy / missing).
    print("\n📂 Scanning local wbg_extractions...")
    local_docs = scan_local_docs()
    print(f" Real _direct_judged.jsonl: {len(local_docs['real'])}")
    print(f" Dummy (skipped): {len(local_docs['dummy'])}")
    print(f" No file: {len(local_docs['no_file'])}")

    if not args.links_only:
        # 2. Check what's already on HF (best-effort; empty set on failure).
        print("\n🔍 Checking existing docs on HF...")
        existing = get_existing_hf_docs(api)
        print(f" Found {len(existing)} doc folders on HF")

        # 3. Find docs to upload (real but not yet on HF, or need update).
        to_upload = [d for d in local_docs["real"] if d["name"] not in existing]
        already_on_hf = [d for d in local_docs["real"] if d["name"] in existing]
        print(f"\n📤 Docs to upload: {len(to_upload)}")
        print(f" Already on HF: {len(already_on_hf)}")

        if to_upload:
            print("\n🚀 Uploading missing docs...")
            uploaded, skipped = upload_docs(api, to_upload, dry_run=args.dry_run)
            if not args.dry_run:
                print(f" Uploaded: {uploaded}, Skipped: {skipped}")

    # 4. Generate updated pdf_links with src_docname + has_revalidation flags.
    print("\n📋 Generating updated wbg_pdf_links.json...")
    current_links = fetch_current_links(api, token)
    updated_links = generate_updated_links(current_links, local_docs, token)

    with_revalidation = sum(1 for l in updated_links if l["has_revalidation"])
    print(f" Total entries: {len(updated_links)}")
    print(f" With revalidation: {with_revalidation}")
    print(f" Without: {len(updated_links) - with_revalidation}")

    # 5. Upload the regenerated links file (saved locally first).
    print("\n📤 Uploading updated wbg_pdf_links.json...")
    upload_links(api, updated_links, dry_run=args.dry_run)

    print("\n✅ Done!")


if __name__ == "__main__":
    main()
utils/storage.js CHANGED
@@ -132,7 +132,7 @@ export async function saveAnnotation(annotation) {
132
  } else {
133
  // Local: read, modify, write
134
  const pagesData = readDocLocal(docIndex);
135
- if (!pagesData) throw new Error(`doc_${docIndex}_raw.json not found locally`);
136
 
137
  const pageIdx = findPageIndex(pagesData, pageNumber);
138
  if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
 
132
  } else {
133
  // Local: read, modify, write
134
  const pagesData = readDocLocal(docIndex);
135
+ if (!pagesData) throw new Error(`doc_${docIndex}_direct_judged.jsonl not found locally`);
136
 
137
  const pageIdx = findPageIndex(pagesData, pageNumber);
138
  if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);