rafmacalaba committed on
Commit
85eb22c
·
1 Parent(s): e7f7858

feat: add language detection to prepare_data.py, exclude non-English docs

Browse files

- Uses langdetect to identify document language (samples pages 2-5)
- 12 non-English docs excluded (8 Arabic, 4 French)
- 99 English docs with has_revalidation=true
- Added 'language' field to wbg_pdf_links.json entries

Files changed (1) hide show
  1. prepare_data.py +64 -30
prepare_data.py CHANGED
@@ -3,19 +3,20 @@
3
  prepare_data.py
4
 
5
  Scans local wbg_extractions, identifies docs with real _direct_judged.jsonl
6
- (not dummy), uploads them to HF, and generates/uploads an updated wbg_pdf_links.json.
 
7
 
8
  Usage:
9
  # Dry run (scan only, no uploads):
10
- python prepare_data.py --dry-run
11
 
12
  # Upload missing docs + generate new pdf_links:
13
- python prepare_data.py
14
 
15
  # Only generate pdf_links without uploading docs:
16
- python prepare_data.py --links-only
17
 
18
- Requires: pip install huggingface_hub requests
19
  """
20
 
21
  import argparse
@@ -25,6 +26,7 @@ import sys
25
  import requests
26
  from pathlib import Path
27
  from huggingface_hub import HfApi
 
28
 
29
  # ─── Configuration ───────────────────────────────
30
  HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -37,27 +39,43 @@ def get_hf_token():
37
  """Get HF token from env, .env file, or cached token."""
38
  if HF_TOKEN:
39
  return HF_TOKEN
40
- # Try .env file
41
  env_path = Path(__file__).parent / ".env"
42
  if env_path.exists():
43
  for line in env_path.read_text().splitlines():
44
  if line.startswith("HF_TOKEN="):
45
  return line.split("=", 1)[1].strip()
46
- # Try cached token
47
  cached = Path.home() / ".cache" / "huggingface" / "token"
48
  if cached.exists():
49
  return cached.read_text().strip()
50
  return None
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def scan_local_docs():
54
- """Scan local wbg_extractions and classify docs."""
55
  docs = sorted(
56
  [d for d in os.listdir(LOCAL_BASE) if d.startswith("doc_")],
57
  key=lambda x: int(x.split("_")[1]),
58
  )
59
 
60
- results = {"real": [], "dummy": [], "no_file": []}
61
 
62
  for doc in docs:
63
  idx = int(doc.split("_")[1])
@@ -67,7 +85,12 @@ def scan_local_docs():
67
  dummy_file = raw_dir / f"{doc}_dummy_direct_judged.jsonl"
68
 
69
  if real_file.exists():
70
- results["real"].append({"name": doc, "index": idx, "path": str(real_file)})
 
 
 
 
 
71
  elif dummy_file.exists():
72
  results["dummy"].append({"name": doc, "index": idx})
73
  else:
@@ -133,13 +156,17 @@ def generate_updated_links(current_links, local_docs, token):
133
  """
134
  Generate updated wbg_pdf_links.json with:
135
  - src_docname: doc_{index}
136
- - has_revalidation: true if _direct_judged.jsonl exists (not dummy)
 
137
  """
138
- real_indices = {d["index"] for d in local_docs["real"]}
139
- dummy_indices = {d["index"] for d in local_docs["dummy"]}
 
 
 
 
140
 
141
- # Build a lookup from current links
142
- links_by_index = {link["index"]: link for link in current_links}
143
 
144
  updated_links = []
145
  for link in current_links:
@@ -150,7 +177,8 @@ def generate_updated_links(current_links, local_docs, token):
150
  "landing_page_url": link.get("landing_page_url", ""),
151
  "direct_pdf_url": link.get("direct_pdf_url", ""),
152
  "status": link.get("status", "unknown"),
153
- "has_revalidation": idx in real_indices,
 
154
  }
155
  updated_links.append(entry)
156
 
@@ -165,7 +193,6 @@ def upload_links(api, links, dry_run=False):
165
  print(f" [DRY RUN] Would upload updated wbg_pdf_links.json ({len(links)} entries)")
166
  return
167
 
168
- # Save locally first
169
  local_path = Path(__file__).parent / "annotation_data" / "wbg_data"
170
  local_path.mkdir(parents=True, exist_ok=True)
171
  local_file = local_path / "wbg_pdf_links.json"
@@ -177,7 +204,7 @@ def upload_links(api, links, dry_run=False):
177
  path_in_repo=LINKS_REPO_PATH,
178
  repo_id=REPO_ID,
179
  repo_type="dataset",
180
- commit_message="Update wbg_pdf_links.json with src_docname and has_revalidation",
181
  )
182
  print(f" βœ… Uploaded wbg_pdf_links.json to HF")
183
 
@@ -196,12 +223,18 @@ def main():
196
 
197
  api = HfApi(token=token)
198
 
199
- # 1. Scan local docs
200
- print("\nπŸ“‚ Scanning local wbg_extractions...")
201
  local_docs = scan_local_docs()
202
- print(f" Real _direct_judged.jsonl: {len(local_docs['real'])}")
203
- print(f" Dummy (skipped): {len(local_docs['dummy'])}")
204
- print(f" No file: {len(local_docs['no_file'])}")
 
 
 
 
 
 
205
 
206
  if not args.links_only:
207
  # 2. Check what's already on HF
@@ -209,14 +242,14 @@ def main():
209
  existing = get_existing_hf_docs(api)
210
  print(f" Found {len(existing)} doc folders on HF")
211
 
212
- # 3. Find docs to upload (real but not yet on HF, or need update)
213
  to_upload = [d for d in local_docs["real"] if d["name"] not in existing]
214
  already_on_hf = [d for d in local_docs["real"] if d["name"] in existing]
215
- print(f"\nπŸ“€ Docs to upload: {len(to_upload)}")
216
- print(f" Already on HF: {len(already_on_hf)}")
217
 
218
  if to_upload:
219
- print("\nπŸš€ Uploading missing docs...")
220
  uploaded, skipped = upload_docs(api, to_upload, dry_run=args.dry_run)
221
  if not args.dry_run:
222
  print(f" Uploaded: {uploaded}, Skipped: {skipped}")
@@ -227,9 +260,10 @@ def main():
227
  updated_links = generate_updated_links(current_links, local_docs, token)
228
 
229
  with_revalidation = sum(1 for l in updated_links if l["has_revalidation"])
230
- print(f" Total entries: {len(updated_links)}")
231
- print(f" With revalidation: {with_revalidation}")
232
- print(f" Without: {len(updated_links) - with_revalidation}")
 
233
 
234
  # 5. Upload
235
  print("\nπŸ“€ Uploading updated wbg_pdf_links.json...")
 
3
  prepare_data.py
4
 
5
  Scans local wbg_extractions, identifies docs with real _direct_judged.jsonl
6
+ (not dummy), detects language, uploads English docs to HF, and generates/uploads
7
+ an updated wbg_pdf_links.json.
8
 
9
  Usage:
10
  # Dry run (scan only, no uploads):
11
+ uv run --with huggingface_hub,requests,langdetect python3 prepare_data.py --dry-run
12
 
13
  # Upload missing docs + generate new pdf_links:
14
+ uv run --with huggingface_hub,requests,langdetect python3 prepare_data.py
15
 
16
  # Only generate pdf_links without uploading docs:
17
+ uv run --with huggingface_hub,requests,langdetect python3 prepare_data.py --links-only
18
 
19
+ Requires: huggingface_hub, requests, langdetect
20
  """
21
 
22
  import argparse
 
26
  import requests
27
  from pathlib import Path
28
  from huggingface_hub import HfApi
29
+ from langdetect import detect, LangDetectException
30
 
31
  # ─── Configuration ───────────────────────────────
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
39
  """Get HF token from env, .env file, or cached token."""
40
  if HF_TOKEN:
41
  return HF_TOKEN
 
42
  env_path = Path(__file__).parent / ".env"
43
  if env_path.exists():
44
  for line in env_path.read_text().splitlines():
45
  if line.startswith("HF_TOKEN="):
46
  return line.split("=", 1)[1].strip()
 
47
  cached = Path.home() / ".cache" / "huggingface" / "token"
48
  if cached.exists():
49
  return cached.read_text().strip()
50
  return None
51
 
52
 
53
def detect_language(doc_path):
    """
    Detect the language of an extracted document.

    Samples up to 500 chars of "input_text" from pages 2-5 (skipping the
    first page, which often contains abbreviation tables / currency
    equivalents that confuse detection), falling back to the first 3 pages
    when the sampled text is too short.

    Args:
        doc_path: Path to a *_direct_judged.jsonl file whose per-page
            records carry an "input_text" field.

    Returns:
        ISO 639-1 language code (e.g. 'en', 'fr', 'ar'), or 'unknown' when
        the file is missing, unparseable, or detection fails.
    """
    try:
        raw = Path(doc_path).read_text()
        try:
            # Whole-file parse first: preserves behavior for extractions
            # stored as a single JSON array of pages.
            data = json.loads(raw)
        except json.JSONDecodeError:
            # BUGFIX: the file is .jsonl (one JSON object per line); a
            # single json.loads() on multi-line JSONL raises and the
            # original code silently returned "unknown" for such docs.
            data = [json.loads(line) for line in raw.splitlines() if line.strip()]
        if not isinstance(data, list):
            data = [data]
        # Sample pages 2-5 to avoid abbreviation-heavy first pages.
        texts = " ".join(p.get("input_text", "")[:500] for p in data[1:5])
        if len(texts.strip()) < 50:
            # Fallback to first 3 pages if later pages are empty.
            texts = " ".join(p.get("input_text", "")[:500] for p in data[:3])
        return detect(texts)
    except (LangDetectException, json.JSONDecodeError, FileNotFoundError):
        return "unknown"
70
+
71
  def scan_local_docs():
72
+ """Scan local wbg_extractions and classify docs with language detection."""
73
  docs = sorted(
74
  [d for d in os.listdir(LOCAL_BASE) if d.startswith("doc_")],
75
  key=lambda x: int(x.split("_")[1]),
76
  )
77
 
78
+ results = {"real": [], "real_non_english": [], "dummy": [], "no_file": []}
79
 
80
  for doc in docs:
81
  idx = int(doc.split("_")[1])
 
85
  dummy_file = raw_dir / f"{doc}_dummy_direct_judged.jsonl"
86
 
87
  if real_file.exists():
88
+ lang = detect_language(str(real_file))
89
+ entry = {"name": doc, "index": idx, "path": str(real_file), "language": lang}
90
+ if lang == "en":
91
+ results["real"].append(entry)
92
+ else:
93
+ results["real_non_english"].append(entry)
94
  elif dummy_file.exists():
95
  results["dummy"].append({"name": doc, "index": idx})
96
  else:
 
156
  """
157
  Generate updated wbg_pdf_links.json with:
158
  - src_docname: doc_{index}
159
+ - has_revalidation: true if English _direct_judged.jsonl exists
160
+ - language: detected language code
161
  """
162
+ # Build lookup: index β†’ language
163
+ lang_map = {}
164
+ for d in local_docs["real"]:
165
+ lang_map[d["index"]] = d.get("language", "en")
166
+ for d in local_docs["real_non_english"]:
167
+ lang_map[d["index"]] = d.get("language", "unknown")
168
 
169
+ real_english_indices = {d["index"] for d in local_docs["real"]}
 
170
 
171
  updated_links = []
172
  for link in current_links:
 
177
  "landing_page_url": link.get("landing_page_url", ""),
178
  "direct_pdf_url": link.get("direct_pdf_url", ""),
179
  "status": link.get("status", "unknown"),
180
+ "has_revalidation": idx in real_english_indices,
181
+ "language": lang_map.get(idx, "unknown"),
182
  }
183
  updated_links.append(entry)
184
 
 
193
  print(f" [DRY RUN] Would upload updated wbg_pdf_links.json ({len(links)} entries)")
194
  return
195
 
 
196
  local_path = Path(__file__).parent / "annotation_data" / "wbg_data"
197
  local_path.mkdir(parents=True, exist_ok=True)
198
  local_file = local_path / "wbg_pdf_links.json"
 
204
  path_in_repo=LINKS_REPO_PATH,
205
  repo_id=REPO_ID,
206
  repo_type="dataset",
207
+ commit_message="Update wbg_pdf_links.json with language field, exclude non-English",
208
  )
209
  print(f" βœ… Uploaded wbg_pdf_links.json to HF")
210
 
 
223
 
224
  api = HfApi(token=token)
225
 
226
+ # 1. Scan local docs with language detection
227
+ print("\nπŸ“‚ Scanning local wbg_extractions (with language detection)...")
228
  local_docs = scan_local_docs()
229
+ print(f" Real (English): {len(local_docs['real'])}")
230
+ print(f" Real (non-English): {len(local_docs['real_non_english'])}")
231
+ print(f" Dummy (skipped): {len(local_docs['dummy'])}")
232
+ print(f" No file: {len(local_docs['no_file'])}")
233
+
234
+ if local_docs["real_non_english"]:
235
+ print("\n Non-English docs excluded:")
236
+ for d in local_docs["real_non_english"]:
237
+ print(f" {d['name']}: {d['language']}")
238
 
239
  if not args.links_only:
240
  # 2. Check what's already on HF
 
242
  existing = get_existing_hf_docs(api)
243
  print(f" Found {len(existing)} doc folders on HF")
244
 
245
+ # 3. Upload only English docs not yet on HF
246
  to_upload = [d for d in local_docs["real"] if d["name"] not in existing]
247
  already_on_hf = [d for d in local_docs["real"] if d["name"] in existing]
248
+ print(f"\nπŸ“€ English docs to upload: {len(to_upload)}")
249
+ print(f" Already on HF: {len(already_on_hf)}")
250
 
251
  if to_upload:
252
+ print("\nπŸš€ Uploading missing English docs...")
253
  uploaded, skipped = upload_docs(api, to_upload, dry_run=args.dry_run)
254
  if not args.dry_run:
255
  print(f" Uploaded: {uploaded}, Skipped: {skipped}")
 
260
  updated_links = generate_updated_links(current_links, local_docs, token)
261
 
262
  with_revalidation = sum(1 for l in updated_links if l["has_revalidation"])
263
+ non_english = sum(1 for l in updated_links if l["language"] not in ("en", "unknown"))
264
+ print(f" Total entries: {len(updated_links)}")
265
+ print(f" English with revalidation: {with_revalidation}")
266
+ print(f" Non-English (excluded): {non_english}")
267
 
268
  # 5. Upload
269
  print("\nπŸ“€ Uploading updated wbg_pdf_links.json...")