meet4150/alive_pine / scripts/download_dataset.py
download
raw
2.02 kB
from __future__ import annotations
import json
import subprocess
from pathlib import Path
from datasets import load_dataset
# Resolve paths relative to the project root (one level above scripts/).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
DATA_DIR = PROJECT_ROOT / "data"
# Destination for the normalized JSONL export produced by main().
OUTPUT_PATH = DATA_DIR / "raw_medquad.jsonl"
# Local cache of the raw CSV, used when the Hugging Face hub load fails.
FALLBACK_CSV_PATH = DATA_DIR / "medDataset_processed.csv"
# Direct download URL for the same dataset's processed CSV file.
FALLBACK_CSV_URL = (
    "https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset/"
    "resolve/main/medDataset_processed.csv"
)
def load_medquad_dataset():
    """Load the MedQuad Q&A dataset, falling back to a direct CSV download.

    Tries the Hugging Face hub first; on any failure, downloads the raw CSV
    with the standard library's ``urllib`` (no dependency on an external
    ``curl`` binary, unlike the previous implementation) and loads it via the
    ``csv`` builder. A previously downloaded CSV is reused instead of being
    fetched again.

    Returns:
        The ``train`` split as returned by :func:`datasets.load_dataset`.
    """
    try:
        return load_dataset("keivalya/MedQuad-MedicalQnADataset", split="train")
    except Exception as exc:  # any hub failure (network, auth, schema) triggers the fallback
        print(f"Primary Hugging Face dataset load failed: {exc}")
        print("Falling back to direct CSV download, then loading it via datasets.")
        if not FALLBACK_CSV_PATH.exists():
            # Local import keeps the dependency confined to the fallback path.
            import urllib.request

            FALLBACK_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
            urllib.request.urlretrieve(FALLBACK_CSV_URL, FALLBACK_CSV_PATH)
        return load_dataset("csv", data_files=str(FALLBACK_CSV_PATH), split="train")
def main() -> None:
    """Fetch the MedQuad dataset, preview a few rows, and dump it to JSONL."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    dataset = load_medquad_dataset()
    total = len(dataset)
    print(f"Total rows loaded: {total}")
    print("First 3 rows:")
    # Preview at most three rows so the console output stays short.
    for row in dataset.select(range(min(3, total))):
        preview = {key: row.get(key, "") for key in ("qtype", "Question", "Answer")}
        print(json.dumps(preview, ensure_ascii=False, indent=2))
    # Write one normalized JSON object per line, lowercasing the field names.
    with OUTPUT_PATH.open("w", encoding="utf-8") as sink:
        for row in dataset:
            record = {
                "qtype": row.get("qtype", ""),
                "question": row.get("Question", ""),
                "answer": row.get("Answer", ""),
            }
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Saved raw dataset to: {OUTPUT_PATH}")
# Run the download/export pipeline only when executed as a script.
if __name__ == "__main__":
    main()

Xet Storage Details

Size:
2.02 kB
·
Xet hash:
b7c6b32519f12de7266b6002f388ea6769e3c271e24ec1a0034d7e5e315b5d59

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.