QGAN_Project / vG.0.1 /data-detect
1bnjmn3's picture
Add files using upload-large-folder tool
0f755ec verified
import os
import pandas as pd
# Folder (relative to the current working directory) that holds the
# unzipped dataset files to be merged.
data_dir = "data"
def process_tokamak_data(directory: str) -> "pd.DataFrame | None":
    """Merge every usable CSV file in *directory* into one shuffled DataFrame.

    A file is used when its name ends with ``.csv`` and does not contain
    ``Sample``. Each loaded frame gets a ``machine`` column holding the part
    of the file name before the first dot, so rows can be traced back to the
    file they came from. The frames are concatenated, shuffled with a fixed
    seed, written to ``real_tokamak_data_merged.csv`` in the current working
    directory, and returned. Progress is reported with ``print``.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        The merged, shuffled DataFrame, or ``None`` when the folder does not
        exist or no CSV file could be loaded.
    """
    output_file = 'real_tokamak_data_merged.csv'

    # Check what files we actually have
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        print(f"❌ Error: Folder '{directory}' not found. Please create it and unzip files there.")
        return None

    # os.listdir() returns names in arbitrary order. Sorting fixes the merge
    # order, which is what makes the seeded shuffle below reproducible.
    files = sorted(f for f in entries if not f.startswith('.'))
    print(f"📂 Files found in '{directory}': {files}")

    output_path = os.path.abspath(output_file)
    data_frames = []
    for f in files:
        file_path = os.path.join(directory, f)

        # Case B: It's a ZIP inside a ZIP -- only reported, never extracted.
        if f.endswith('.zip'):
            print(f" ℹ️ Found nested zip {f}. Please unzip this one too.")
            continue

        # Case A: It's a CSV; anything else (and 'Sample' files) is ignored.
        if not f.endswith('.csv') or 'Sample' in f:
            continue

        # When `directory` is the working directory, the merged file written
        # by a previous run sits among the inputs; reading it back would
        # duplicate every row.
        if os.path.abspath(file_path) == output_path:
            print(f" ⏭️ Skipping {f}: it is this function's own previous output.")
            continue

        # Best effort: one unreadable file must not abort the whole merge.
        try:
            print(f" Reading {f}...")
            df = pd.read_csv(file_path)
            # Track which file (presumably one per machine) each row came from.
            df['machine'] = f.split('.')[0]
            data_frames.append(df)
            print(f" ✅ Loaded {f}: {df.shape}")
        except Exception as e:
            print(f" ⚠️ Could not read {f}: {e}")

    if not data_frames:
        print("\n❌ No usable CSV data found. Check if the zips contained sub-folders or HDF5 files.")
        return None

    # Merge everything, then shuffle so rows from different files are mixed.
    full_df = pd.concat(data_frames, ignore_index=True)
    full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)

    full_df.to_csv(output_file, index=False)
    print(f"\n🎉 Success! Combined dataset saved to '{output_file}'")
    print(f"Total shape: {full_df.shape}")
    return full_df
# Build the merged dataset from the configured data folder; `df` is the
# combined DataFrame, or None when nothing could be loaded.
df = process_tokamak_data(directory=data_dir)