import os

import pandas as pd

# Path where you unzipped the files
data_dir = 'data'


def process_tokamak_data(
    directory: str,
    output_file: str = 'real_tokamak_data_merged.csv',
) -> "pd.DataFrame | None":
    """Merge every usable CSV in *directory* into one shuffled DataFrame.

    Each non-hidden entry of *directory* is inspected:

    * ``*.csv`` files whose name does not contain ``'Sample'`` are read with
      ``pd.read_csv`` and tagged with a ``machine`` column holding the part
      of the file name before its first dot.
    * ``*.zip`` files are only reported; they are not extracted.
    * Anything else is ignored.

    The loaded frames are concatenated, shuffled with a fixed seed, written
    to *output_file* (without the index) and returned.

    Args:
        directory: Folder to scan. Sub-folders are not searched.
        output_file: Destination path for the merged CSV. If this path lies
            inside *directory*, the file is skipped while scanning so that a
            rerun does not ingest its own previous output.

    Returns:
        The merged, shuffled DataFrame, or ``None`` when *directory* cannot
        be listed or no CSV could be loaded.
    """
    # Check what files we actually have.
    try:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # concatenation order (and therefore the seeded shuffle below)
        # identical from one run or machine to the next.
        files = sorted(
            f for f in os.listdir(directory) if not f.startswith('.')
        )
    except (FileNotFoundError, NotADirectoryError):
        print(f"❌ Error: Folder '{directory}' not found. Please create it and unzip files there.")
        return None
    print(f"📂 Files found in '{directory}': {files}")

    output_path = os.path.abspath(output_file)
    data_frames = []

    for f in files:
        file_path = os.path.join(directory, f)
        lowered = f.lower()

        # Case A: It's a CSV (Likely feature vectors)
        if lowered.endswith('.csv') and 'Sample' not in f:
            if os.path.abspath(file_path) == output_path:
                # Reading our own earlier output would duplicate every row.
                print(f" Skipping {f}: it is the merged output file.")
                continue
            try:
                print(f" Reading {f}...")
                df = pd.read_csv(file_path)
                # Add a column to track which machine this came from
                # (crucial for analysis).
                df['machine'] = f.split('.')[0]
                data_frames.append(df)
                print(f" ✅ Loaded {f}: {df.shape}")
            except Exception as e:
                # Deliberately broad: one unreadable file must not abort the
                # whole merge, so report it and carry on with the rest.
                print(f" ⚠️ Could not read {f}: {e}")

        # Case B: It's a ZIP inside a ZIP (common in Zindi)
        elif lowered.endswith('.zip'):
            print(f" ℹ️ Found nested zip {f}. Please unzip this one too.")

    if not data_frames:
        print("\n❌ No usable CSV data found. Check if the zips contained sub-folders or HDF5 files.")
        return None

    # Merge everything.
    full_df = pd.concat(data_frames, ignore_index=True)

    # Shuffle the data to mix HL-2A and J-TEXT.
    full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)

    full_df.to_csv(output_file, index=False)

    print(f"\n🎉 Success! Combined dataset saved to '{output_file}'")
    print(f"Total shape: {full_df.shape}")
    return full_df


# Run the function
df = process_tokamak_data(data_dir)