# medimind-api / parse_dataset.py
# Page metadata captured with this file: "Manikantaperla's picture",
# commit "initial medimind backend" (d0c827a).
import json
import os
import xml.etree.ElementTree as ET
def clean_text(text):
    """Return *text* without leading/trailing whitespace; ``None`` becomes ``""``."""
    return "" if text is None else text.strip()
def parse_medquad(medquad_dir, min_answer_length=10):
    """Collect question/answer pairs from every XML file under *medquad_dir*.

    The folder is walked recursively. Files whose name does not end in
    ``.xml`` (case-insensitive) are ignored, and files that fail to parse
    are skipped with a message on stderr instead of aborting the run.

    Args:
        medquad_dir: Root folder to search for XML files.
        min_answer_length: Minimum length, after stripping whitespace, an
            answer must have to be kept. Defaults to 10.

    Returns:
        A list of dicts with the keys ``"question"``, ``"answer"`` and
        ``"source"`` (the bare file name the pair came from). Directories
        and files are visited in sorted order, so the result is the same
        on every run over the same folder.
    """
    import sys  # local import: only needed to report skipped files

    qa_pairs = []
    for root_dir, dir_names, files in os.walk(medquad_dir):
        # os.walk yields entries in arbitrary order. Sorting dir_names in
        # place steers the descent; sorting files fixes the per-folder order.
        dir_names.sort()
        for file_name in sorted(files):
            if not file_name.lower().endswith(".xml"):
                continue
            file_path = os.path.join(root_dir, file_name)
            try:
                root = ET.parse(file_path).getroot()
            except ET.ParseError as exc:
                # Best effort: one broken file must not stop the whole import,
                # but the skip should be visible to whoever runs the script.
                print(
                    f"Skipping malformed XML file {file_path}: {exc}",
                    file=sys.stderr,
                )
                continue
            for qa_pair in root.findall(".//QAPair"):
                # default="" covers a missing child element; an element with
                # no text already yields "" from findtext.
                question = qa_pair.findtext("Question", default="").strip()
                answer = qa_pair.findtext("Answer", default="").strip()
                # A pair without a question cannot be matched against later,
                # and very short answers carry no usable content.
                if not question or len(answer) < min_answer_length:
                    continue
                qa_pairs.append(
                    {
                        "question": question,
                        "answer": answer,
                        "source": file_name,
                    }
                )
    return qa_pairs
def main():
    """Export the MedQuAD folder beside this script to ``data/medquad.json``.

    Prints a message and returns without writing anything when the
    ``MedQuAD`` folder does not exist. Otherwise the ``data`` folder is
    created if needed, the parsed pairs are written as UTF-8 JSON, and the
    number of saved pairs is printed.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    medquad_dir = os.path.join(script_dir, "MedQuAD")
    if not os.path.isdir(medquad_dir):
        print(f"MedQuAD folder not found: {medquad_dir}")
        return

    target_dir = os.path.join(script_dir, "data")
    os.makedirs(target_dir, exist_ok=True)

    qa_pairs = parse_medquad(medquad_dir)
    with open(
        os.path.join(target_dir, "medquad.json"), "w", encoding="utf-8"
    ) as out_file:
        json.dump(qa_pairs, out_file, ensure_ascii=False, indent=2)
    print(f"Total QA pairs saved: {len(qa_pairs)}")


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()