Spaces:
Running
Running
| import json | |
| import os | |
| import xml.etree.ElementTree as ET | |
def clean_text(text):
    """Return ``text`` without leading/trailing whitespace.

    ``None`` (e.g. the result of a lookup that found nothing) is mapped to
    the empty string so callers can always treat the result as ``str``.
    """
    return "" if text is None else text.strip()
def parse_medquad(medquad_dir, *, min_answer_length=10):
    """Collect question/answer pairs from every XML file under ``medquad_dir``.

    The directory tree is walked recursively in sorted order, so the
    returned list has the same order regardless of the filesystem. Files
    that cannot be read or are not well-formed XML are skipped. A pair is
    skipped when its question is empty or its answer is shorter than
    ``min_answer_length`` characters (both measured after stripping
    surrounding whitespace).

    Args:
        medquad_dir: Root directory to search for files ending in ``.xml``
            (case-insensitive).
        min_answer_length: Minimum length of a stripped answer for the pair
            to be kept. Defaults to 10.

    Returns:
        A list of dicts with the keys ``"question"``, ``"answer"`` and
        ``"source"``; ``"source"`` is the base name of the XML file the
        pair was read from.
    """
    qa_pairs = []
    for root_dir, dir_names, files in os.walk(medquad_dir):
        # os.walk yields entries in arbitrary filesystem order. Sorting
        # dir_names in place steers the descent; sorting files fixes the
        # per-directory order. Together they make the output reproducible.
        dir_names.sort()
        for file_name in sorted(files):
            if not file_name.lower().endswith(".xml"):
                continue
            file_path = os.path.join(root_dir, file_name)
            try:
                root = ET.parse(file_path).getroot()
            except (ET.ParseError, OSError):
                # Best effort: a single malformed or unreadable file must
                # not abort the whole conversion.
                continue
            for qa_pair in root.findall(".//QAPair"):
                question = clean_text(qa_pair.findtext("Question"))
                answer = clean_text(qa_pair.findtext("Answer"))
                # A pair without a question is unusable, and very short
                # answers are treated as missing.
                if not question or len(answer) < min_answer_length:
                    continue
                qa_pairs.append(
                    {
                        "question": question,
                        "answer": answer,
                        "source": file_name,
                    }
                )
    return qa_pairs
def main():
    """Convert the ``MedQuAD`` folder beside this script into a JSON file.

    Looks for a ``MedQuAD`` directory in the directory containing this
    file. If it is missing, a message is printed and nothing is written.
    Otherwise the ``data`` directory is created if needed, the parsed pairs
    are written to ``data/medquad.json`` as UTF-8, and the number of saved
    pairs is printed.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    medquad_dir = os.path.join(script_dir, "MedQuAD")

    # Bail out early: without the input folder there is nothing to convert.
    if not os.path.isdir(medquad_dir):
        print(f"MedQuAD folder not found: {medquad_dir}")
        return

    data_dir = os.path.join(script_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    pairs = parse_medquad(medquad_dir)

    target = os.path.join(data_dir, "medquad.json")
    with open(target, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(pairs, out_file, ensure_ascii=False, indent=2)

    print(f"Total QA pairs saved: {len(pairs)}")
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()