File size: 1,391 Bytes
d0c827a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json
import os
import xml.etree.ElementTree as ET


def clean_text(text):
	"""Return *text* without leading/trailing whitespace.

	``None`` (e.g. the result of looking up a missing XML element) is
	mapped to the empty string so callers always receive a ``str``.
	"""
	return "" if text is None else text.strip()


def parse_medquad(medquad_dir, *, min_answer_length=10):
	"""Collect question/answer pairs from every XML file under *medquad_dir*.

	The directory tree is walked recursively in sorted order, so the
	returned list is the same from run to run regardless of the order in
	which the filesystem happens to list its entries.

	Files whose name does not end in ".xml" (case-insensitive) are ignored,
	and files that cannot be parsed as XML are skipped.

	Args:
		medquad_dir: Root directory to search.
		min_answer_length: Minimum number of characters (after stripping
			surrounding whitespace) an answer must have for its pair to be
			kept. Defaults to 10.

	Returns:
		A list of dicts with the keys "question", "answer" and "source".
		"source" is the bare name of the file the pair was read from.
		Only the answer is length-checked; the question may be empty.
	"""
	qa_pairs = []

	for root_dir, dir_names, files in os.walk(medquad_dir):
		# os.walk reports entries in arbitrary (filesystem) order. Sorting the
		# sub-directory list in place fixes the order of the descent, and
		# sorting the file names fixes the order within each directory.
		dir_names.sort()

		for file_name in sorted(files):
			if not file_name.lower().endswith(".xml"):
				continue

			file_path = os.path.join(root_dir, file_name)

			try:
				root = ET.parse(file_path).getroot()
			except ET.ParseError:
				# Best effort: one malformed file must not abort the whole run.
				continue

			for qa_pair in root.findall(".//QAPair"):
				question = clean_text(qa_pair.findtext("Question"))
				answer = clean_text(qa_pair.findtext("Answer"))

				# Drop pairs whose answer is missing or too short to be useful.
				if len(answer) < min_answer_length:
					continue

				qa_pairs.append(
					{
						"question": question,
						"answer": answer,
						"source": file_name,
					}
				)

	return qa_pairs


def main():
	"""Convert the MedQuAD folder next to this script into data/medquad.json.

	Looks for a "MedQuAD" directory beside this file. If it is missing, a
	message is printed and nothing is written. Otherwise the "data"
	directory is created if needed, all QA pairs are parsed and dumped as
	UTF-8 JSON, and the number of saved pairs is printed.
	"""
	script_dir = os.path.dirname(os.path.abspath(__file__))
	source_root = os.path.join(script_dir, "MedQuAD")

	if os.path.isdir(source_root):
		target_dir = os.path.join(script_dir, "data")
		target_file = os.path.join(target_dir, "medquad.json")

		# Create the output folder before the (potentially long) parse.
		os.makedirs(target_dir, exist_ok=True)
		records = parse_medquad(source_root)

		with open(target_file, "w", encoding="utf-8") as handle:
			json.dump(records, handle, ensure_ascii=False, indent=2)

		print(f"Total QA pairs saved: {len(records)}")
	else:
		print(f"MedQuAD folder not found: {source_root}")


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
	main()