Buckets:
| from __future__ import annotations | |
| import json | |
| import subprocess | |
| from pathlib import Path | |
| from datasets import load_dataset | |
# Project root: the parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Every dataset artifact this script reads or writes lives under <root>/data.
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_PATH = DATA_DIR.joinpath("raw_medquad.jsonl")
FALLBACK_CSV_PATH = DATA_DIR.joinpath("medDataset_processed.csv")

# Direct download link for the CSV, used when the regular Hugging Face
# dataset load fails.
FALLBACK_CSV_URL = (
    "https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset/"
    "resolve/main/medDataset_processed.csv"
)
def load_medquad_dataset():
    """Load the MedQuAD question/answer dataset, with a CSV download fallback.

    The primary path asks ``datasets.load_dataset`` for the ``train`` split of
    ``keivalya/MedQuad-MedicalQnADataset``. If that raises any ``Exception``,
    the error is printed, the processed CSV is downloaded with ``curl`` to
    ``FALLBACK_CSV_PATH``, and that file is loaded through the ``csv`` builder.

    Returns:
        The ``train`` split returned by ``load_dataset``.

    Raises:
        subprocess.CalledProcessError: If ``curl`` exits with a non-zero
            status (``--fail`` makes HTTP errors count as failures).
        FileNotFoundError: If the ``curl`` executable cannot be found.
    """
    try:
        return load_dataset("keivalya/MedQuad-MedicalQnADataset", split="train")
    except Exception as exc:
        print(f"Primary Hugging Face dataset load failed: {exc}")
        print("Falling back to direct CSV download with curl, then loading it via datasets.")

        # curl's -o does not create missing directories, so make sure the
        # target folder exists even when the caller has not created it.
        FALLBACK_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)

        subprocess.run(
            [
                "curl",
                "-L",  # follow redirects
                "--fail",  # non-zero exit on HTTP errors instead of saving an error page
                FALLBACK_CSV_URL,
                "-o",
                str(FALLBACK_CSV_PATH),
            ],
            check=True,
        )
        return load_dataset("csv", data_files=str(FALLBACK_CSV_PATH), split="train")
def main() -> None:
    """Load the dataset, print a short preview, and export every row as JSONL.

    Creates ``DATA_DIR`` if needed, loads the data via
    ``load_medquad_dataset``, prints the row count and up to the first three
    rows, then writes one JSON object per line to ``OUTPUT_PATH`` with the
    keys ``qtype``, ``question`` and ``answer``.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    dataset = load_medquad_dataset()
    total_rows = len(dataset)
    print(f"Total rows loaded: {total_rows}")

    print("First 3 rows:")
    # The preview keeps the dataset's own column names.
    preview_columns = ("qtype", "Question", "Answer")
    for sample in dataset.select(range(min(3, total_rows))):
        shown = {column: sample.get(column, "") for column in preview_columns}
        print(json.dumps(shown, ensure_ascii=False, indent=2))

    # (output key, source column): the export lower-cases the column names.
    export_fields = (
        ("qtype", "qtype"),
        ("question", "Question"),
        ("answer", "Answer"),
    )
    with OUTPUT_PATH.open("w", encoding="utf-8") as sink:
        for sample in dataset:
            entry = {
                out_key: sample.get(column, "")
                for out_key, column in export_fields
            }
            line = json.dumps(entry, ensure_ascii=False)
            sink.write(line + "\n")

    print(f"Saved raw dataset to: {OUTPUT_PATH}")
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Xet Storage Details
- Size: 2.02 kB
- Xet hash: b7c6b32519f12de7266b6002f388ea6769e3c271e24ec1a0034d7e5e315b5d59
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.