"""Data sources for the materials-informatics prototype.

Provides one real process dataset (UCI SECOM, downloaded from a public GitHub
mirror) and three generators of synthetic DFT-like material records (generic
compounds, ABX3 perovskites and MX2 2D materials). Running the module as a
script writes all of them to CSV files.
"""

import io
import random
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd

try:
    import requests
except ImportError:  # Only the SECOM download needs it; the generators work offline.
    requests = None

SECOM_URL = "https://raw.githubusercontent.com/Eason0227/Semiconductor-Manufacturing-Procees-Prediction/main/uci-secom.csv"

# Seconds to wait for the SECOM mirror before giving up.
DEFAULT_TIMEOUT = 30

# Column order shared by every synthetic generator. Passing it explicitly to
# the DataFrame constructor keeps the schema stable even when no rows exist.
_DFT_COLUMNS = [
    'material_id',
    'formula',
    'formation_energy_per_atom',
    'band_gap',
    'structure',
    'volume',
    'is_metal',
]

# Additive band-gap shifts (eV), keyed by the chosen site species.
_PEROVSKITE_B_SHIFT = {'Pb': 0.0, 'Sn': -0.3}
_PEROVSKITE_X_SHIFT = {'I': 0.0, 'Br': 0.4, 'Cl': 0.8}
_TMD_M_SHIFT = {'Mo': 0.0, 'W': 0.2}
_TMD_X_SHIFT = {'S': 0.0, 'Se': -0.3, 'Te': -0.6}


def _make_rngs(seed):
    """Return a ``(python_rng, numpy_rng)`` pair for one generator call.

    With ``seed=None`` the global ``random`` / ``np.random`` state is used, so
    callers that seed those modules themselves keep getting the same streams.
    With a seed, private generators are created so the call is reproducible
    and leaves the global state untouched.
    """
    if seed is None:
        return random, np.random
    return random.Random(seed), np.random.RandomState(seed)


def _material_entry(material_id, formula, formation_energy, band_gap,
                    structure, volume, metal_threshold):
    """Build one record; ``is_metal`` is decided on the unrounded band gap."""
    return {
        'material_id': material_id,
        'formula': formula,
        'formation_energy_per_atom': round(formation_energy, 3),
        'band_gap': round(band_gap, 3),
        'structure': structure,
        'volume': round(volume, 2),
        'is_metal': bool(band_gap < metal_threshold),
    }


def _to_frame(rows):
    """Wrap the records in a DataFrame that always carries the full schema."""
    return pd.DataFrame(rows, columns=_DFT_COLUMNS)


def fetch_process_data(url: str = SECOM_URL,
                       timeout: float = DEFAULT_TIMEOUT) -> pd.DataFrame:
    """Download the SECOM CSV and return it with synthetic timestamps.

    Args:
        url: Location of the CSV file; defaults to ``SECOM_URL``.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block forever on a stalled connection.

    Returns:
        The parsed CSV with an added ``Timestamp`` column (one row every
        5 minutes, starting 365 days before the call) and every NaN
        replaced by 0.

    Raises:
        ImportError: If the ``requests`` package is not installed.
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    if requests is None:
        raise ImportError(
            "The 'requests' package is required to download the SECOM data."
        )

    print(f"Downloading SECOM data from {url}...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # The mirrored CSV is read as-is; its first row is treated as the header.
    df = pd.read_csv(io.StringIO(response.text))

    # Timestamps are synthesized rather than taken from the file: one run
    # every 5 minutes, starting one year before now.
    start_date = datetime.now() - timedelta(days=365)
    df['Timestamp'] = [
        start_date + timedelta(minutes=5 * i) for i in range(len(df))
    ]

    # Prototype-level imputation: missing readings become 0.
    df = df.fillna(0)

    print(f"Data fetched: {df.shape}")
    return df


def generate_mock_dft_data(n_samples: int = 100,
                           seed: Optional[int] = None) -> pd.DataFrame:
    """Generate synthetic DFT records for random binary compounds.

    Each record has a formula built from two distinct elements, a formation
    energy, a band gap loosely correlated with it, a crystal structure and a
    volume.

    Args:
        n_samples: Number of records; 0 or less yields an empty frame that
            still has all columns.
        seed: Optional seed for reproducible output. ``None`` draws from the
            global ``random`` / ``np.random`` state.

    Returns:
        DataFrame with columns ``material_id``, ``formula``,
        ``formation_energy_per_atom``, ``band_gap``, ``structure``,
        ``volume`` and ``is_metal`` (band gap below 0.1).
    """
    elements = ['Si', 'Ga', 'N', 'O', 'Al', 'Ti', 'C', 'Fe']
    structures = ['Cubic', 'Hexagonal', 'Tetragonal', 'Orthorhombic']
    py_rng, np_rng = _make_rngs(seed)

    rows = []
    for i in range(n_samples):
        elem1 = py_rng.choice(elements)
        elem2 = py_rng.choice([e for e in elements if e != elem1])
        formula = f"{elem1}{py_rng.randint(1, 2)}{elem2}{py_rng.randint(1, 3)}"

        # The band-gap mean tracks the formation energy for loose "realism".
        formation_energy = float(np_rng.normal(loc=-1.5, scale=0.5))  # eV/atom
        band_gap = max(
            0.0, float(np_rng.normal(loc=2.0 + formation_energy, scale=0.8))
        )  # eV
        structure = py_rng.choice(structures)
        volume = float(np_rng.normal(40, 5))

        rows.append(_material_entry(
            f"mp-mock-{1000 + i}", formula, formation_energy, band_gap,
            structure, volume, metal_threshold=0.1,
        ))

    return _to_frame(rows)


def generate_perovskite_data(n_samples: int = 100,
                             seed: Optional[int] = None) -> pd.DataFrame:
    """Generate synthetic perovskite (ABX3) records.

    The band gap starts at 1.5 and is shifted by the B site (Sn: -0.3) and
    the X site (Br: +0.4, Cl: +0.8) before Gaussian noise is added.

    Args:
        n_samples: Number of records; 0 or less yields an empty frame that
            still has all columns.
        seed: Optional seed for reproducible output. ``None`` draws from the
            global ``random`` / ``np.random`` state.

    Returns:
        DataFrame with the same columns as ``generate_mock_dft_data``;
        ``structure`` is always ``'Perovskite'`` and ``is_metal`` means a
        band gap below 0.1.
    """
    a_sites = ['Cs', 'MA', 'FA']  # Cesium, Methylammonium, Formamidinium
    b_sites = ['Pb', 'Sn']
    x_sites = ['I', 'Br', 'Cl']
    py_rng, np_rng = _make_rngs(seed)

    rows = []
    for i in range(n_samples):
        a = py_rng.choice(a_sites)
        b = py_rng.choice(b_sites)
        x = py_rng.choice(x_sites)

        # Approximate band-gap engineering rules: Pb > Sn, Cl > Br > I.
        base_gap = 1.5 + _PEROVSKITE_B_SHIFT[b] + _PEROVSKITE_X_SHIFT[x]

        band_gap = max(0.0, float(np_rng.normal(base_gap, 0.1)))
        formation_energy = float(np_rng.normal(-2.0, 0.2))
        volume = float(np_rng.normal(180, 10))

        rows.append(_material_entry(
            f"mp-perov-{1000 + i}", f"{a}{b}{x}3", formation_energy, band_gap,
            'Perovskite', volume, metal_threshold=0.1,
        ))

    return _to_frame(rows)


def generate_2d_materials_data(n_samples: int = 100,
                               seed: Optional[int] = None) -> pd.DataFrame:
    """Generate synthetic 2D material (MX2, e.g. TMD) records.

    The band gap starts at 1.8 (approximate MoS2 value) and is shifted by the
    metal (W: +0.2) and the chalcogen (Se: -0.3, Te: -0.6) before Gaussian
    noise is added.

    Args:
        n_samples: Number of records; 0 or less yields an empty frame that
            still has all columns.
        seed: Optional seed for reproducible output. ``None`` draws from the
            global ``random`` / ``np.random`` state.

    Returns:
        DataFrame with the same columns as ``generate_mock_dft_data``;
        ``structure`` is always ``'2D-Hexagonal'`` and ``is_metal`` means a
        band gap below 0.05.
    """
    m_sites = ['Mo', 'W']
    x_sites = ['S', 'Se', 'Te']
    py_rng, np_rng = _make_rngs(seed)

    rows = []
    for i in range(n_samples):
        m = py_rng.choice(m_sites)
        x = py_rng.choice(x_sites)

        base_gap = 1.8 + _TMD_M_SHIFT[m] + _TMD_X_SHIFT[x]

        band_gap = max(0.0, float(np_rng.normal(base_gap, 0.1)))
        formation_energy = float(np_rng.normal(-0.8, 0.1))  # Less stable than bulk
        volume = float(np_rng.normal(35, 2))  # Per formula unit

        rows.append(_material_entry(
            f"mp-2d-{1000 + i}", f"{m}{x}2", formation_energy, band_gap,
            '2D-Hexagonal', volume, metal_threshold=0.05,
        ))

    return _to_frame(rows)


def main(output_dir: str = "mi_platform/data") -> None:
    """Fetch/generate every dataset and write it as CSV under ``output_dir``.

    The directory is created if it is missing, so a fresh checkout does not
    fail on the first ``to_csv`` call.
    """
    print("Generating Mock Data...")
    df_proc = fetch_process_data()

    # Generate all variations.
    df_generic = generate_mock_dft_data()
    df_perov = generate_perovskite_data()
    df_2d = generate_2d_materials_data()

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    df_proc.to_csv(out / "process_data.csv", index=False)

    # Each variation gets its own file so a dashboard can request it by name.
    df_generic.to_csv(out / "dft_data_generic.csv", index=False)
    df_perov.to_csv(out / "dft_data_perovskite.csv", index=False)
    df_2d.to_csv(out / "dft_data_2d.csv", index=False)

    # The generic set is also written under the legacy name so existing
    # consumers of dft_data.csv keep working.
    df_generic.to_csv(out / "dft_data.csv", index=False)

    print("Done.")


if __name__ == "__main__":
    main()