# taraky's picture
# Upload folder using huggingface_hub
# b7f3196 verified
import json, jsonlines, pathlib
import concurrent.futures
from tqdm import tqdm
from datasets import load_dataset
from math import ceil
from pubmed import download_pubmed
# Approximate article count per PubMed XML file; build_pubmed divides by this
# to turn a record budget into a number of files.
PUBMED_ARTICLES_PER_XML_FILE = 30_000

# Directory that receives every generated corpus file. It is created at import
# time (including missing parents) so the builders can write into it directly.
OUT = pathlib.Path("data", "corpora")
OUT.mkdir(exist_ok=True, parents=True)
def write_jsonl(path, rows):
    """Write ``rows`` to ``path`` as JSON Lines, reporting progress on stdout.

    Args:
        path: Destination file; it is opened in write mode, so any existing
            content is replaced.
        rows: Sized collection of records (``len(rows)`` is taken for the
            progress message) passed to ``jsonlines`` for serialisation.
    """
    record_count = len(rows)
    print(f"Writing {record_count} records to {path}")
    with jsonlines.open(path, mode="w") as writer:
        writer.write_all(rows)
    print(f"Finished writing {path}")
# 1) LasseRegin medical Q&A
def build_lasseregin():
    """Download the LasseRegin iCliniq Q&A dump and write ``medical_qa.jsonl``.

    The JSON file is fetched from GitHub, decoded as UTF-8 and each entry is
    converted to a row with ``id``, ``title``, ``question``, ``answer`` and
    ``source`` keys; missing fields default to an empty string. The rows are
    written to ``OUT / "medical_qa.jsonl"``.

    This is a best-effort step: if the download or JSON decoding fails, the
    error is printed and the function returns without writing anything.
    """
    print("Starting LasseRegin build...")
    import urllib.request

    url = "https://raw.githubusercontent.com/LasseRegin/medical-question-answer-data/master/icliniqQAs.json"
    try:
        # Without a timeout a stalled connection would block this worker (and
        # therefore the whole thread pool shutdown) indefinitely.
        with urllib.request.urlopen(url, timeout=60) as response:
            data = json.loads(response.read().decode("utf-8"))
    except Exception as e:
        print(f"Failed to download LasseRegin data: {e}")
        return

    # NOTE(review): assumes the payload is a list of dicts - confirm upstream.
    rows = [
        {
            "id": f"icliniq:{i}",
            "title": r.get("title", ""),
            "question": r.get("question", ""),
            "answer": r.get("answer", ""),
            "source": "icliniq",
        }
        for i, r in enumerate(tqdm(data, desc="LasseRegin", leave=False))
    ]
    write_jsonl(OUT / "medical_qa.jsonl", rows)
    print("Completed LasseRegin build.")
# 2) MIRIAD-4.4M-split
def build_miriad(sample_size=200_000):
    """Sample the MIRIAD train split and write it to ``miriad_text.jsonl``.

    The ``miriad/miriad-4.4M`` train split is loaded, shuffled with a fixed
    seed (42) and truncated to at most ``sample_size`` examples. Each example
    becomes a row with ``id``, ``title``, ``question``, ``answer``, ``year``
    and ``specialty`` keys, with missing fields defaulting to ``""``.

    If loading or sampling raises, the error is printed and nothing is
    written.

    Args:
        sample_size: Upper bound on the number of examples kept.
    """
    print(f"Starting MIRIAD build (sample_size={sample_size})...")
    try:
        full_split = load_dataset("miriad/miriad-4.4M", num_proc=4, split="train")
        keep = min(sample_size, len(full_split))
        sampled = full_split.shuffle(seed=42).select(range(keep))
    except Exception as err:
        print(f"Failed to load MIRIAD dataset: {err}")
        return

    progress = tqdm(sampled, desc="miriad", leave=False)
    records = [
        {
            "id": f"miriad:{index}",
            "title": example.get("paper_title", ""),
            "question": example.get("question", ""),
            "answer": example.get("passage_text", ""),
            "year": example.get("year", ""),
            "specialty": example.get("specialty", ""),
        }
        for index, example in enumerate(progress)
    ]
    write_jsonl(OUT / "miriad_text.jsonl", records)
    print("Completed MIRIAD build.")
# 3) PubMed abstracts
def build_pubmed(max_records=500_000):
    """Download PubMed data into ``pubmed.jsonl`` via ``download_pubmed``.

    ``max_records`` is only used to derive how many XML files to request: it
    is divided by ``PUBMED_ARTICLES_PER_XML_FILE`` and rounded up. The record
    budget itself is not passed on to ``download_pubmed``.

    Args:
        max_records: Approximate number of articles wanted.
    """
    files_needed = int(ceil(max_records / PUBMED_ARTICLES_PER_XML_FILE))
    print(
        "Starting PubMed build "
        f"(num_files={files_needed}, max_records={max_records})..."
    )
    destination = OUT / "pubmed.jsonl"
    download_pubmed(destination, files_needed)
    print("Completed PubMed build.")
# 4) UniDoc-Bench (QA)
def build_unidoc(max_items=1000):
    """Export up to ``max_items`` UniDoc-Bench healthcare QA pairs.

    The ``healthcare`` split of ``Salesforce/UniDoc-Bench`` is loaded and each
    example is converted to a row with ``id``, ``title``, ``question``,
    ``answer`` and ``pdf_path`` keys, written to ``OUT / "unidoc_qa.jsonl"``.
    The question falls back to the ``query`` field and the PDF path to
    ``document_path`` when the primary field is missing or empty.

    If the dataset cannot be loaded, the error is printed and nothing is
    written.

    Args:
        max_items: Maximum number of rows to emit. Zero or a negative value
            produces an empty file.
    """
    print(f"Starting UniDoc build (max_items={max_items})...")
    try:
        ds = load_dataset("Salesforce/UniDoc-Bench", split="healthcare")
    except Exception as e:
        print(f"Failed to load UniDoc dataset: {e}")
        return

    rows = []
    for i, ex in enumerate(tqdm(ds, desc="unidoc", leave=False)):
        # Enforce the cap before appending so that max_items <= 0 yields no
        # rows instead of always emitting the first example.
        if i >= max_items:
            break
        # Trailing `or ""` keeps the field a string even when both source
        # fields are present but hold None.
        q = ex.get("question", "") or ex.get("query", "") or ""
        a = ex.get("answer", "") or ""
        pdf = ex.get("pdf_path") or ex.get("document_path") or ""
        domain = ex.get("domain", "")
        rows.append({
            "id": f"unidoc:{i}",
            "title": f"{domain} PDF",
            "question": q,
            "answer": a,
            "pdf_path": pdf,
        })
    write_jsonl(OUT / "unidoc_qa.jsonl", rows)
    print("Completed UniDoc build.")
def main():
    """Run every corpus builder concurrently and report the outcome.

    Each builder is submitted to a thread pool together with its positional
    arguments. Exceptions raised by a builder are caught, reported with the
    builder's name, and do not stop the remaining builders. The success
    banner is printed only when no builder raised; otherwise a summary of the
    failed builders is printed instead.

    Builders that handle their own errors internally and return normally are
    not counted as failures here.
    """
    print("Starting parallel corpora build...")

    # Each entry is (builder, positional arguments for that builder).
    tasks = [
        (build_lasseregin, []),
        (build_miriad, [1000]),
        (build_pubmed, [500_000]),
        (build_unidoc, [1000]),
    ]

    failed = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future back to its builder name so failures are attributable.
        futures = {
            executor.submit(func, *args): func.__name__ for func, args in tasks
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                name = futures[future]
                failed.append(name)
                print(f"A task failed ({name}): {e}")

    if failed:
        print(
            f"❌ {len(failed)} of {len(tasks)} corpora builds failed: "
            f"{', '.join(sorted(failed))}"
        )
    else:
        print("✅ All corpora built successfully in data/corpora/")


if __name__ == "__main__":
    main()