File size: 1,980 Bytes
0f755ec | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | import os
import pandas as pd
# Folder that holds the extracted (unzipped) dataset files
data_dir = "data"
def process_tokamak_data(directory):
    """Merge the usable CSV files in *directory* into one shuffled DataFrame.

    Every non-hidden entry whose name ends in ``.csv`` and does not contain
    ``'Sample'`` is read with ``pd.read_csv`` and tagged with a ``machine``
    column holding the part of the file name before the first dot. The
    frames are concatenated, shuffled with a fixed seed, written to
    ``real_tokamak_data_merged.csv`` in the current working directory, and
    returned. Progress and problems are reported with ``print``.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        The merged, shuffled DataFrame, or ``None`` when the folder does not
        exist or no CSV could be loaded from it.
    """
    output_file = 'real_tokamak_data_merged.csv'

    # Check what files we actually have
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        print(f"❌ Error: Folder '{directory}' not found. Please create it and unzip files there.")
        return None

    # os.listdir returns entries in arbitrary order; sort them so the
    # concatenation order (and therefore the seeded shuffle below) is the
    # same on every run and every platform.
    files = sorted(f for f in entries if not f.startswith('.'))
    print(f"📂 Files found in '{directory}': {files}")

    output_path = os.path.abspath(output_file)
    data_frames = []
    for f in files:
        file_path = os.path.join(directory, f)

        # A previous run's merged output must not be fed back in, otherwise
        # every row is duplicated when `directory` is the working directory.
        if os.path.abspath(file_path) == output_path:
            print(f" ℹ️ Skipping previous output {f}.")
            continue

        # Case A: It's a CSV (Likely feature vectors)
        if f.endswith('.csv') and 'Sample' not in f:
            # Best effort: one unreadable file is reported and skipped
            # rather than aborting the whole merge.
            try:
                print(f" Reading {f}...")
                df = pd.read_csv(file_path)
                # Add a column to track which machine this came from (crucial for analysis)
                df['machine'] = f.split('.')[0]
                data_frames.append(df)
                print(f" ✅ Loaded {f}: {df.shape}")
            except Exception as e:
                print(f" ⚠️ Could not read {f}: {e}")
        # Case B: It's a ZIP inside a ZIP (common in Zindi)
        elif f.endswith('.zip'):
            print(f" ℹ️ Found nested zip {f}. Please unzip this one too.")

    if not data_frames:
        print("\n❌ No usable CSV data found. Check if the zips contained sub-folders or HDF5 files.")
        return None

    # Merge everything
    full_df = pd.concat(data_frames, ignore_index=True)
    # Shuffle the data to mix HL-2A and J-TEXT
    full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)
    full_df.to_csv(output_file, index=False)
    print(f"\n🎉 Success! Combined dataset saved to '{output_file}'")
    print(f"Total shape: {full_df.shape}")
    return full_df
# Run the merge on the configured data folder. `df` is the combined
# DataFrame, or None when the folder is missing or holds no usable CSVs.
df = process_tokamak_data(data_dir)