meet4150/alive_pine / scripts/download_dataset.py
download
raw
2.02 kB
from __future__ import annotations
import json
import subprocess
from pathlib import Path
from datasets import load_dataset
# Resolve paths relative to the project root (one level above scripts/).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
DATA_DIR = PROJECT_ROOT / "data"
# Destination for the normalized JSONL export produced by main().
OUTPUT_PATH = DATA_DIR / "raw_medquad.jsonl"
# Local cache of the raw CSV, used when the Hugging Face hub load fails.
FALLBACK_CSV_PATH = DATA_DIR / "medDataset_processed.csv"
# Direct download URL for the same dataset's processed CSV file.
FALLBACK_CSV_URL = (
    "https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset/"
    "resolve/main/medDataset_processed.csv"
)
def load_medquad_dataset():
    """Load the MedQuad Q&A dataset, falling back to a direct CSV download.

    Tries the Hugging Face hub first; on any failure, downloads the raw CSV
    with the standard library's ``urllib`` (no dependency on an external
    ``curl`` binary, unlike the previous implementation) and loads it via the
    ``csv`` builder. A previously downloaded CSV is reused instead of being
    fetched again.

    Returns:
        The ``train`` split as returned by :func:`datasets.load_dataset`.
    """
    try:
        return load_dataset("keivalya/MedQuad-MedicalQnADataset", split="train")
    except Exception as exc:  # any hub failure (network, auth, schema) triggers the fallback
        print(f"Primary Hugging Face dataset load failed: {exc}")
        print("Falling back to direct CSV download, then loading it via datasets.")
        if not FALLBACK_CSV_PATH.exists():
            # Local import keeps the dependency confined to the fallback path.
            import urllib.request

            FALLBACK_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(FALLBACK_CSV_URL, FALLBACK_CSV_PATH)
        return load_dataset("csv", data_files=str(FALLBACK_CSV_PATH), split="train")
def main() -> None:
    """Fetch the MedQuad dataset, preview a few rows, and dump it to JSONL."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    dataset = load_medquad_dataset()
    total = len(dataset)
    print(f"Total rows loaded: {total}")
    print("First 3 rows:")
    # Preview at most three rows so the console output stays short.
    for row in dataset.select(range(min(3, total))):
        preview = {key: row.get(key, "") for key in ("qtype", "Question", "Answer")}
        print(json.dumps(preview, ensure_ascii=False, indent=2))
    # Write one normalized JSON object per line, lowercasing the field names.
    with OUTPUT_PATH.open("w", encoding="utf-8") as sink:
        for row in dataset:
            record = {
                "qtype": row.get("qtype", ""),
                "question": row.get("Question", ""),
                "answer": row.get("Answer", ""),
            }
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Saved raw dataset to: {OUTPUT_PATH}")
# Run the download/export pipeline only when executed as a script.
if __name__ == "__main__":
    main()

Xet Storage Details

Size:
2.02 kB
·
Xet hash:
b7c6b32519f12de7266b6002f388ea6769e3c271e24ec1a0034d7e5e315b5d59

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.