JackSparrow89 committed on
Commit
019f095
·
verified ·
1 Parent(s): 7eb0a48

Upload download_beir_datasets.py

Browse files
Files changed (1) hide show
  1. download_beir_datasets.py +75 -0
download_beir_datasets.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ import shutil
4
+ import tempfile
5
+ import urllib.request
6
+ import zipfile
7
+ from pathlib import Path
8
+
9
+
10
# Directory that contains this script; datasets are stored in its "data" subdirectory.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")

# Archive URL pattern; "{name}" is replaced with a dataset name from DATASETS.
URL_TEMPLATE = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{name}.zip"

# Dataset name -> MD5 hex digest the downloaded zip archive is expected to have.
DATASETS = {
    "scifact": "5f7d1de60b170fc8027bb7898e2efca1",
    "nfcorpus": "a89dba18a62ef92f7d323ec890a0d38d",
}

# Files that must exist inside a dataset directory for it to count as present.
REQUIRED_FILES = (
    "corpus.jsonl",
    "queries.jsonl",
)
20
+
21
+
22
def has_dataset(dataset_dir: Path) -> bool:
    """Report whether ``dataset_dir`` already holds a usable dataset.

    Args:
        dataset_dir: Directory expected to contain the dataset files.

    Returns:
        True when ``dataset_dir`` is a directory and every entry of
        ``REQUIRED_FILES`` exists inside it; False otherwise.
    """
    if not dataset_dir.is_dir():
        return False
    for filename in REQUIRED_FILES:
        if not (dataset_dir / filename).exists():
            return False
    return True
24
+
25
+
26
def md5sum(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is read in 1 MiB pieces, so it is never loaded into memory
    all at once.

    Args:
        path: File to hash; opened in binary mode.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.
    """
    block_size = 1024 * 1024
    hasher = hashlib.md5()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                # An empty read marks end of file.
                break
            hasher.update(block)
    return hasher.hexdigest()
32
+
33
+
34
def download_file(url: str, destination: Path, timeout: float = 60.0) -> None:
    """Stream the resource at ``url`` into the file ``destination``.

    Args:
        url: Location to fetch; passed directly to ``urllib.request.urlopen``.
        destination: File to write; created or truncated, opened in binary mode.
        timeout: Seconds to wait on blocking network operations (such as the
            connection attempt) before giving up. Without a timeout a stalled
            server would block the script indefinitely.

    Raises:
        urllib.error.URLError: If the resource cannot be opened (this includes
            HTTP error responses).
        OSError: If a network operation times out or ``destination`` cannot be
            written.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response, destination.open("wb") as out_file:
        # copyfileobj streams in chunks, so the payload is never held in memory whole.
        shutil.copyfileobj(response, out_file)
37
+
38
+
39
def ensure_dataset(name: str, expected_md5: str) -> None:
    """Make sure dataset ``name`` is available under ``DATA_DIR``.

    When the dataset directory already contains the required files nothing is
    downloaded. Otherwise the zip archive is fetched into a temporary
    directory, its MD5 digest is compared with ``expected_md5``, and the
    archive is extracted into ``DATA_DIR``.

    Args:
        name: Dataset name; substituted into ``URL_TEMPLATE`` and used as the
            subdirectory name under ``DATA_DIR``.
        expected_md5: Hex MD5 digest the downloaded archive must match.

    Raises:
        RuntimeError: If the archive's checksum differs from ``expected_md5``,
            or if the required files are still missing after extraction.
    """
    dataset_dir = DATA_DIR / name
    if has_dataset(dataset_dir):
        print(f"[Dataset] {name} already present at {dataset_dir}")
        return

    url = URL_TEMPLATE.format(name=name)
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # The archive only lives in a scratch directory that is removed afterwards.
    with tempfile.TemporaryDirectory() as scratch:
        archive_path = Path(scratch, f"{name}.zip")
        print(f"[Dataset] Downloading {name} from {url}")
        download_file(url, archive_path)

        # Verify integrity before anything is unpacked into DATA_DIR.
        actual_md5 = md5sum(archive_path)
        if actual_md5 != expected_md5:
            raise RuntimeError(
                f"{name} checksum mismatch: expected {expected_md5}, got {actual_md5}"
            )

        print(f"[Dataset] Extracting {name} into {DATA_DIR}")
        with zipfile.ZipFile(archive_path, "r") as bundle:
            bundle.extractall(DATA_DIR)

    if has_dataset(dataset_dir):
        print(f"[Dataset] {name} ready at {dataset_dir}")
        return
    raise RuntimeError(f"{name} download finished, but required files are missing in {dataset_dir}")
67
+
68
+
69
def main() -> None:
    """Ensure every dataset listed in ``DATASETS`` is present locally."""
    for dataset_name in DATASETS:
        ensure_dataset(dataset_name, DATASETS[dataset_name])


# Run the download only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()