|
|
import io
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import requests
|
|
|
|
|
# Raw CSV of the UCI SECOM dataset, served from a public GitHub mirror.
# NOTE(review): the "Procees" spelling is part of the URL as written here;
# verify against the upstream repository before "correcting" it.
SECOM_URL = "https://raw.githubusercontent.com/Eason0227/Semiconductor-Manufacturing-Procees-Prediction/main/uci-secom.csv"
|
|
|
|
|
def fetch_process_data(url=None, timeout=30):
    """
    Fetches the UCI SECOM dataset from a public GitHub mirror.

    Args:
        url: CSV location to download. Defaults to the module-level
            ``SECOM_URL`` when omitted.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled server would block the call indefinitely.

    Returns:
        A DataFrame of the parsed CSV with an added ``Timestamp`` column
        (one synthetic reading every 5 minutes, starting one year before
        "now") and every missing value replaced by 0.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    if url is None:
        url = SECOM_URL

    print(f"Downloading SECOM data from {url}...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    df = pd.read_csv(io.StringIO(response.text))

    # Synthetic timestamps are attached to the rows: evenly spaced 5-minute
    # steps beginning 365 days ago.
    start_date = datetime.now() - timedelta(days=365)
    step = timedelta(minutes=5)
    df['Timestamp'] = [start_date + step * i for i in range(len(df))]

    # Missing values are replaced with 0 so the returned frame has no NaNs.
    df = df.fillna(0)

    print(f"Data fetched: {df.shape}")
    return df
|
|
|
|
|
def generate_mock_dft_data(n_samples=100, seed=None):
    """
    Generates synthetic DFT simulation data.

    Simulates: Chemical Formula, Formation Energy, Bandgap, Lattice Constants.

    Args:
        n_samples: Number of rows to generate.
        seed: Optional integer seed. When given, private generators are
            used so the output is reproducible and the global ``random`` /
            ``np.random`` state is left untouched. When omitted, values are
            drawn from the global generators.

    Returns:
        A DataFrame with columns ``material_id``, ``formula``,
        ``formation_energy_per_atom``, ``band_gap``, ``structure``,
        ``volume`` and ``is_metal``. The columns are present even when
        ``n_samples`` is 0.
    """
    elements = ['Si', 'Ga', 'N', 'O', 'Al', 'Ti', 'C', 'Fe']
    structures = ['Cubic', 'Hexagonal', 'Tetragonal', 'Orthorhombic']
    columns = [
        'material_id',
        'formula',
        'formation_energy_per_atom',
        'band_gap',
        'structure',
        'volume',
        'is_metal',
    ]

    # The module objects expose the same choice/randint/normal API as the
    # generator instances, so the unseeded path keeps using global state.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    rows = []
    for i in range(n_samples):
        elem1 = py_rng.choice(elements)
        # The second element is always different from the first.
        elem2 = py_rng.choice([e for e in elements if e != elem1])
        formula = f"{elem1}{py_rng.randint(1, 2)}{elem2}{py_rng.randint(1, 3)}"

        formation_energy = float(np_rng.normal(loc=-1.5, scale=0.5))
        # The band gap is centred on (2.0 + formation energy) and clamped
        # at zero so it can never be negative.
        raw_gap = float(np_rng.normal(loc=2.0 + formation_energy, scale=0.8))
        band_gap = round(max(0.0, raw_gap), 3)

        structure = py_rng.choice(structures)

        rows.append({
            'material_id': f"mp-mock-{1000+i}",
            'formula': formula,
            'formation_energy_per_atom': round(formation_energy, 3),
            'band_gap': band_gap,
            'structure': structure,
            'volume': round(float(np_rng.normal(40, 5)), 2),
            # Derived from the stored (rounded) gap so each row is
            # self-consistent when re-read from CSV.
            'is_metal': bool(band_gap < 0.1),
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)
|
|
|
|
|
def generate_perovskite_data(n_samples=100, seed=None):
    """
    Generates synthetic Perovskite (ABX3) data.

    Args:
        n_samples: Number of rows to generate.
        seed: Optional integer seed. When given, private generators are
            used so the output is reproducible and the global ``random`` /
            ``np.random`` state is left untouched. When omitted, values are
            drawn from the global generators.

    Returns:
        A DataFrame with columns ``material_id``, ``formula``,
        ``formation_energy_per_atom``, ``band_gap``, ``structure``,
        ``volume`` and ``is_metal``. The columns are present even when
        ``n_samples`` is 0.
    """
    A_sites = ['Cs', 'MA', 'FA']
    B_sites = ['Pb', 'Sn']
    X_sites = ['I', 'Br', 'Cl']
    columns = [
        'material_id',
        'formula',
        'formation_energy_per_atom',
        'band_gap',
        'structure',
        'volume',
        'is_metal',
    ]

    # Band-gap shifts keyed on the chosen site species. Looking the species
    # up directly avoids substring matches against the concatenated formula.
    b_site_shift = {'Sn': -0.3}
    x_site_shift = {'Br': 0.4, 'Cl': 0.8}

    # The module objects expose the same choice/normal API as the generator
    # instances, so the unseeded path keeps using global state.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    rows = []
    for i in range(n_samples):
        a = py_rng.choice(A_sites)
        b = py_rng.choice(B_sites)
        x = py_rng.choice(X_sites)
        formula = f"{a}{b}{x}3"

        base_gap = 1.5 + b_site_shift.get(b, 0.0) + x_site_shift.get(x, 0.0)

        # Clamp at zero so the sampled gap can never be negative.
        band_gap = round(max(0.0, float(np_rng.normal(base_gap, 0.1))), 3)
        formation_energy = float(np_rng.normal(-2.0, 0.2))

        rows.append({
            'material_id': f"mp-perov-{1000+i}",
            'formula': formula,
            'formation_energy_per_atom': round(formation_energy, 3),
            'band_gap': band_gap,
            'structure': 'Perovskite',
            'volume': round(float(np_rng.normal(180, 10)), 2),
            # Derived from the stored (rounded) gap so each row is
            # self-consistent when re-read from CSV.
            'is_metal': bool(band_gap < 0.1),
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)
|
|
|
|
|
def generate_2d_materials_data(n_samples=100, seed=None):
    """
    Generates synthetic 2D Materials data (e.g., TMDs).

    Args:
        n_samples: Number of rows to generate.
        seed: Optional integer seed. When given, private generators are
            used so the output is reproducible and the global ``random`` /
            ``np.random`` state is left untouched. When omitted, values are
            drawn from the global generators.

    Returns:
        A DataFrame with columns ``material_id``, ``formula``,
        ``formation_energy_per_atom``, ``band_gap``, ``structure``,
        ``volume`` and ``is_metal``. The columns are present even when
        ``n_samples`` is 0.
    """
    M_sites = ['Mo', 'W']
    X_sites = ['S', 'Se', 'Te']
    columns = [
        'material_id',
        'formula',
        'formation_energy_per_atom',
        'band_gap',
        'structure',
        'volume',
        'is_metal',
    ]

    # Band-gap shifts keyed on the chosen site species. Looking the species
    # up directly avoids substring matches against the concatenated formula.
    m_site_shift = {'W': 0.2}
    x_site_shift = {'Se': -0.3, 'Te': -0.6}

    # The module objects expose the same choice/normal API as the generator
    # instances, so the unseeded path keeps using global state.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    rows = []
    for i in range(n_samples):
        m = py_rng.choice(M_sites)
        x = py_rng.choice(X_sites)
        formula = f"{m}{x}2"

        base_gap = 1.8 + m_site_shift.get(m, 0.0) + x_site_shift.get(x, 0.0)

        # Clamp at zero so the sampled gap can never be negative.
        band_gap = round(max(0.0, float(np_rng.normal(base_gap, 0.1))), 3)
        formation_energy = float(np_rng.normal(-0.8, 0.1))

        rows.append({
            'material_id': f"mp-2d-{1000+i}",
            'formula': formula,
            'formation_energy_per_atom': round(formation_energy, 3),
            'band_gap': band_gap,
            'structure': '2D-Hexagonal',
            'volume': round(float(np_rng.normal(35, 2)), 2),
            # Derived from the stored (rounded) gap so each row is
            # self-consistent when re-read from CSV. This generator uses a
            # 0.05 cut-off for the metal flag.
            'is_metal': bool(band_gap < 0.05),
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: download the process data, generate the three
    # synthetic DFT tables, and write everything out as CSV files.
    print("Generating Mock Data...")
    df_proc = fetch_process_data()

    df_generic = generate_mock_dft_data()
    df_perov = generate_perovskite_data()
    df_2d = generate_2d_materials_data()

    # to_csv does not create missing directories, so make sure the target
    # exists before writing. The path is relative to the working directory.
    output_dir = Path("mi_platform/data")
    output_dir.mkdir(parents=True, exist_ok=True)

    df_proc.to_csv(output_dir / "process_data.csv", index=False)

    df_generic.to_csv(output_dir / "dft_data_generic.csv", index=False)
    df_perov.to_csv(output_dir / "dft_data_perovskite.csv", index=False)
    df_2d.to_csv(output_dir / "dft_data_2d.csv", index=False)

    # The generic table is also written under the plain "dft_data.csv" name.
    # NOTE(review): presumably kept for consumers of the older file name; confirm.
    df_generic.to_csv(output_dir / "dft_data.csv", index=False)
    print("Done.")
|
|
|