"""Convert a MedQuAD XML dump into a single JSON file of question/answer pairs."""

import json
import os
import xml.etree.ElementTree as ET


def clean_text(text):
    """Return *text* with surrounding whitespace removed.

    ``None`` (what ``Element.findtext`` returns for a missing element) is
    mapped to the empty string so callers can always treat the result as
    a ``str``.
    """
    if text is None:
        return ""
    return text.strip()


def parse_medquad(medquad_dir, min_answer_len=10):
    """Collect question/answer pairs from every XML file under *medquad_dir*.

    The directory tree is walked recursively in sorted order so that the
    resulting list is reproducible regardless of filesystem enumeration
    order.

    Args:
        medquad_dir: Root directory to search for ``.xml`` files
            (extension matched case-insensitively).
        min_answer_len: Minimum length, in characters, that a stripped
            answer must have for the pair to be kept. Defaults to 10.

    Returns:
        A list of dicts with the keys ``"question"``, ``"answer"`` and
        ``"source"`` (the bare file name the pair was read from).

    Files that are not well-formed XML are skipped, as are pairs whose
    question is empty or whose answer is shorter than *min_answer_len*.
    """
    qa_pairs = []

    for root_dir, dir_names, files in os.walk(medquad_dir):
        # Sorting in place steers os.walk's (top-down) traversal order,
        # making the output deterministic across platforms.
        dir_names.sort()

        for file_name in sorted(files):
            if not file_name.lower().endswith(".xml"):
                continue

            file_path = os.path.join(root_dir, file_name)
            try:
                root = ET.parse(file_path).getroot()
            except ET.ParseError:
                # Best effort: one malformed file must not abort the run.
                continue

            for qa_pair in root.findall(".//QAPair"):
                question = clean_text(qa_pair.findtext("Question"))
                answer = clean_text(qa_pair.findtext("Answer"))

                # A pair without a question cannot be used as a QA record.
                if not question:
                    continue
                # Very short answers are treated as missing/placeholder text.
                if len(answer) < min_answer_len:
                    continue

                qa_pairs.append(
                    {
                        "question": question,
                        "answer": answer,
                        "source": file_name,
                    }
                )

    return qa_pairs


def main():
    """Parse ``MedQuAD/`` next to this script and write ``data/medquad.json``.

    Prints a message and returns without writing anything when the
    ``MedQuAD`` folder does not exist.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    medquad_dir = os.path.join(base_dir, "MedQuAD")
    data_dir = os.path.join(base_dir, "data")
    output_path = os.path.join(data_dir, "medquad.json")

    if not os.path.isdir(medquad_dir):
        print(f"MedQuAD folder not found: {medquad_dir}")
        return

    os.makedirs(data_dir, exist_ok=True)
    qa_pairs = parse_medquad(medquad_dir)

    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

    print(f"Total QA pairs saved: {len(qa_pairs)}")


if __name__ == "__main__":
    main()