import os
import pandas as pd

# Directory that holds the unzipped dataset files. A relative path such as
# this one is resolved against the current working directory at run time.
data_dir = 'data' 

def process_tokamak_data(directory, output_file='real_tokamak_data_merged.csv'):
    """Merge the CSV files found in *directory* into one shuffled dataset.

    Every non-hidden ``*.csv`` file whose name does not contain ``'Sample'``
    is read with pandas and tagged with a ``machine`` column holding the part
    of the file name before the first dot. The frames are concatenated,
    shuffled with a fixed seed, written to *output_file* and returned.
    ``*.zip`` entries are only reported; they are not extracted.

    Args:
        directory: Folder to scan (not recursive).
        output_file: Path of the merged CSV to write. Defaults to
            ``'real_tokamak_data_merged.csv'`` in the current working
            directory.

    Returns:
        The merged, shuffled ``pandas.DataFrame``, or ``None`` when the
        folder cannot be listed or no CSV could be loaded.
    """
    # Check what files we actually have
    try:
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # concatenation order -- and therefore the seeded shuffle below --
        # identical across runs and platforms.
        files = sorted(f for f in os.listdir(directory) if not f.startswith('.'))
    except (FileNotFoundError, NotADirectoryError):
        print(f"❌ Error: Folder '{directory}' not found. Please create it and unzip files there.")
        return None
    print(f"📂 Files found in '{directory}': {files}")

    # Resolved once so a previous merged output stored inside `directory`
    # can be recognised and is not fed back into the merge on a re-run.
    output_path = os.path.abspath(output_file)

    data_frames = []
    for f in files:
        file_path = os.path.join(directory, f)
        lowered = f.lower()  # extensions are matched case-insensitively

        # Case B: It's a ZIP inside a ZIP (common in Zindi)
        if lowered.endswith('.zip'):
            print(f"   ℹ️ Found nested zip {f}. Please unzip this one too.")
            continue

        # Case A: It's a CSV (Likely feature vectors)
        if not lowered.endswith('.csv') or 'Sample' in f:
            continue
        if not os.path.isfile(file_path):
            # A sub-folder that merely happens to be named like a CSV.
            continue
        if os.path.abspath(file_path) == output_path:
            continue

        print(f"   Reading {f}...")
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the
            # whole merge, so report it and move on.
            print(f"   ⚠️ Could not read {f}: {e}")
            continue

        # Add a column to track which machine this came from (crucial for analysis)
        df['machine'] = f.split('.')[0]
        data_frames.append(df)
        print(f"   ✅ Loaded {f}: {df.shape}")

    if not data_frames:
        print("\n❌ No usable CSV data found. Check if the zips contained sub-folders or HDF5 files.")
        return None

    # Merge everything
    full_df = pd.concat(data_frames, ignore_index=True)
    # Shuffle the rows so data from the different source files is mixed;
    # the fixed seed keeps the result reproducible.
    full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)

    full_df.to_csv(output_file, index=False)
    print(f"\n🎉 Success! Combined dataset saved to '{output_file}'")
    print(f"Total shape: {full_df.shape}")
    return full_df

# Run the merge at module level: this scans data_dir, writes the merged CSV,
# and binds the result to df (a DataFrame, or None if nothing could be loaded).
df = process_tokamak_data(data_dir)