# SecureEdge-GPT / data_manager.py
# ShivaKiranKunchala: Initial project commit for SecureEdge-GPT (118e209)
import os
from datasets import load_dataset
import pandas as pd
import json
def prepare_medical_data(
    *,
    hospitals=None,
    samples_per_hospital: int = 50,
    max_chars: int = 500,
    output_dir: str = "data",
    seed=None,
) -> None:
    """Download medical transcriptions and write one JSONL file per hospital.

    The "tchebonenko/MedicalTranscriptions" train split is loaded, rows with a
    missing specialty or transcription are dropped, and the remaining rows are
    partitioned by medical specialty to simulate separate hospitals. Each
    hospital's sample is written to ``{output_dir}/{hospital}_data.jsonl`` as
    instruction/output pairs for specialty classification.

    Called with no arguments, the function behaves as it always has: two
    hospitals, up to 50 random samples each, 500-character excerpts, files
    under ``data/``.

    Args:
        hospitals: Mapping of hospital name to an iterable of specialty names.
            Names are compared after stripping surrounding whitespace.
            Defaults to the built-in East/West split.
        samples_per_hospital: Maximum number of transcripts kept per hospital.
        max_chars: Number of leading transcript characters put in the prompt.
        output_dir: Directory that receives the JSONL files (created if
            missing).
        seed: Optional ``random_state`` for the sampling step; ``None`` keeps
            the sampling non-deterministic.

    Raises:
        ValueError: If ``samples_per_hospital`` or ``max_chars`` is negative.
    """
    if samples_per_hospital < 0:
        raise ValueError("samples_per_hospital must be non-negative")
    if max_chars < 0:
        # A negative slice bound would silently trim from the end instead.
        raise ValueError("max_chars must be non-negative")

    if hospitals is None:
        # Four distinct specialties spread over two simulated hospitals.
        hospitals = {
            "Hospital-East": [" Cardiovascular / Pulmonary", " Neurology"],
            "Hospital-West": [" Orthopedic", " Surgery"],
        }

    print("--- DOWNLOADING REAL MEDICAL DATA (MTSamples) ---")

    # Load dataset from Hugging Face. This is the script's outer boundary, so
    # any download/parsing failure is reported and the run ends best-effort.
    try:
        ds = load_dataset("tchebonenko/MedicalTranscriptions", split="train")
        df = pd.DataFrame(ds)
    except Exception as e:
        print(f"Error downloading dataset: {e}")
        return

    # Clean data (ensure specialty and transcription aren't null).
    df = df.dropna(subset=["medical_specialty", "transcription"])

    # Compare on whitespace-stripped labels so that a stray leading/trailing
    # space in either the dataset or the configuration cannot silently
    # produce an empty selection.
    specialty_key = df["medical_specialty"].astype(str).str.strip()

    os.makedirs(output_dir, exist_ok=True)

    prompt_prefix = "Determine the medical specialty of this transcript: "

    for hospital_name, specialties in hospitals.items():
        wanted = {str(name).strip() for name in specialties}

        # Filter for the specialties assigned to this hospital.
        hospital_df = df[specialty_key.isin(wanted)]

        if hospital_df.empty:
            # Still write the (empty) file below, but make the cause visible.
            print(
                f"--- WARNING: no transcripts matched {sorted(wanted)} "
                f"for {hospital_name} ---"
            )
        else:
            # Take a subset to keep it fast for the PoC.
            hospital_df = hospital_df.sample(
                n=min(len(hospital_df), samples_per_hospital),
                random_state=seed,
            )

        # Format for LLM categorization:
        #   instruction -> prompt containing the transcript excerpt
        #   output      -> the specialty label
        # The trailing "..." is part of the prompt template and is appended
        # whether or not the transcript was actually truncated.
        data_list = [
            {
                "instruction": f"{prompt_prefix}{str(text)[:max_chars]}...",
                "output": str(specialty).strip(),
            }
            for text, specialty in zip(
                hospital_df["transcription"], hospital_df["medical_specialty"]
            )
        ]

        file_path = f"{output_dir}/{hospital_name}_data.jsonl"
        with open(file_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(entry) + "\n" for entry in data_list)

        print(f"--- Saved {len(data_list)} samples for {hospital_name} to {file_path} ---")
# Script entry point: generate the per-hospital JSONL files only when this
# module is executed directly, not when it is imported.
if __name__ == "__main__":
    prepare_medical_data()