Dheeraj-13's picture
Fresh deployment
bbeec91
import pandas as pd
import numpy as np
import requests
import io
import random
from datetime import datetime, timedelta
SECOM_URL = "https://raw.githubusercontent.com/Eason0227/Semiconductor-Manufacturing-Procees-Prediction/main/uci-secom.csv"
def fetch_process_data():
    """
    Fetch the UCI SECOM dataset from a public GitHub mirror.

    Downloads the CSV at ``SECOM_URL``, parses it with pandas, appends a
    synthetic ``Timestamp`` column and replaces every NaN with 0.

    The timestamps are synthetic: one row every 5 minutes, starting 365 days
    before the current local time. They are not taken from the downloaded file.

    Returns:
        pandas.DataFrame: the parsed CSV plus the ``Timestamp`` column, with
        NaN values filled with 0.

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with an HTTP error status.
    """
    print(f"Downloading SECOM data from {SECOM_URL}...")
    # requests applies no timeout by default; without one a stalled server
    # would block this call indefinitely.
    response = requests.get(SECOM_URL, timeout=30)
    response.raise_for_status()

    # The referenced CSV seems to be a merged version; the first row is
    # treated as the header (pandas default).
    df = pd.read_csv(io.StringIO(response.text))

    # Synthetic timestamps: assume one run every 5 minutes starting one year ago.
    start_date = datetime.now() - timedelta(days=365)
    df['Timestamp'] = pd.date_range(start=start_date, periods=len(df), freq='5min')

    # Fill NaN with 0 for simplicity in this prototype (no real imputation).
    df.fillna(0, inplace=True)

    print(f"Data fetched: {df.shape}")
    return df
def generate_mock_dft_data(n_samples=100):
    """
    Generate synthetic DFT simulation data for random binary compounds.

    Each row combines two distinct elements with random stoichiometry and
    draws formation energy, band gap and volume from normal distributions.
    The band-gap mean is shifted by the formation energy so the two are
    loosely correlated. Randomness comes from the global ``random`` and
    ``numpy.random`` state, so seed both for reproducible output.

    Args:
        n_samples: number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries the full column set.

    Returns:
        pandas.DataFrame with columns ``material_id``, ``formula``,
        ``formation_energy_per_atom``, ``band_gap``, ``structure``,
        ``volume`` and ``is_metal``.
    """
    columns = [
        'material_id',
        'formula',
        'formation_energy_per_atom',
        'band_gap',
        'structure',
        'volume',
        'is_metal',
    ]
    elements = ['Si', 'Ga', 'N', 'O', 'Al', 'Ti', 'C', 'Fe']
    structures = ['Cubic', 'Hexagonal', 'Tetragonal', 'Orthorhombic']

    data = []
    for i in range(n_samples):
        elem1 = random.choice(elements)
        # The second element must differ from the first.
        elem2 = random.choice([e for e in elements if e != elem1])
        formula = f"{elem1}{random.randint(1, 2)}{elem2}{random.randint(1, 3)}"

        # Correlate bandgap loosely with formation energy for "realism".
        formation_energy = np.random.normal(loc=-1.5, scale=0.5)  # eV/atom
        # Clamp at zero: a negative band gap is not meaningful.
        band_gap = max(0, np.random.normal(loc=2.0 + formation_energy, scale=0.8))  # eV

        structure = random.choice(structures)

        data.append({
            'material_id': f"mp-mock-{1000 + i}",
            'formula': formula,
            'formation_energy_per_atom': round(formation_energy, 3),
            'band_gap': round(band_gap, 3),
            'structure': structure,
            'volume': round(np.random.normal(40, 5), 2),
            # Decided on the unrounded band gap.
            'is_metal': band_gap < 0.1,
        })

    # Passing the columns explicitly keeps the schema stable even when no
    # rows were generated (pd.DataFrame([]) alone would have no columns).
    return pd.DataFrame(data, columns=columns)
def generate_perovskite_data(n_samples=100):
    """
    Generate synthetic Perovskite (ABX3) data.

    Each row picks one A-site, one B-site and one X-site species at random
    and derives a band gap from simple approximate rules (Pb > Sn,
    Cl > Br > I) plus Gaussian noise. Randomness comes from the global
    ``random`` and ``numpy.random`` state.

    Args:
        n_samples: number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries the full column set.

    Returns:
        pandas.DataFrame with columns ``material_id``, ``formula``,
        ``formation_energy_per_atom``, ``band_gap``, ``structure``,
        ``volume`` and ``is_metal``.
    """
    columns = [
        'material_id',
        'formula',
        'formation_energy_per_atom',
        'band_gap',
        'structure',
        'volume',
        'is_metal',
    ]
    A_sites = ['Cs', 'MA', 'FA']  # Cesium, Methylammonium, Formamidinium
    B_sites = ['Pb', 'Sn']
    X_sites = ['I', 'Br', 'Cl']

    data = []
    for i in range(n_samples):
        a = random.choice(A_sites)
        b = random.choice(B_sites)
        x = random.choice(X_sites)
        formula = f"{a}{b}{x}3"

        # Bandgap engineering rules (approximate): Pb > Sn, Cl > Br > I.
        # Test the chosen species directly rather than searching the
        # concatenated formula, so adding new site symbols cannot produce
        # accidental substring matches.
        base_gap = 1.5
        if b == 'Sn':
            base_gap -= 0.3
        if x == 'Br':
            base_gap += 0.4
        elif x == 'Cl':
            base_gap += 0.8

        # Add noise; clamp at zero since a negative gap is not meaningful.
        band_gap = max(0, np.random.normal(base_gap, 0.1))
        formation_energy = np.random.normal(-2.0, 0.2)

        data.append({
            'material_id': f"mp-perov-{1000 + i}",
            'formula': formula,
            'formation_energy_per_atom': round(formation_energy, 3),
            'band_gap': round(band_gap, 3),
            'structure': 'Perovskite',
            'volume': round(np.random.normal(180, 10), 2),
            # Decided on the unrounded band gap.
            'is_metal': band_gap < 0.1,
        })

    # Explicit columns keep the schema stable even when no rows were generated.
    return pd.DataFrame(data, columns=columns)
def generate_2d_materials_data(n_samples=100):
    """
    Generate synthetic 2D Materials data (e.g., TMDs of the form MX2).

    Each row picks a metal (Mo or W) and a chalcogen (S, Se or Te) at random
    and derives a band gap from a base value adjusted per species plus
    Gaussian noise. Randomness comes from the global ``random`` and
    ``numpy.random`` state.

    Args:
        n_samples: number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries the full column set.

    Returns:
        pandas.DataFrame with columns ``material_id``, ``formula``,
        ``formation_energy_per_atom``, ``band_gap``, ``structure``,
        ``volume`` and ``is_metal``.
    """
    columns = [
        'material_id',
        'formula',
        'formation_energy_per_atom',
        'band_gap',
        'structure',
        'volume',
        'is_metal',
    ]
    M_sites = ['Mo', 'W']
    X_sites = ['S', 'Se', 'Te']

    data = []
    for i in range(n_samples):
        m = random.choice(M_sites)
        x = random.choice(X_sites)
        formula = f"{m}{x}2"

        # Test the chosen species directly rather than searching the
        # concatenated formula, so adding new site symbols cannot produce
        # accidental substring matches.
        base_gap = 1.8  # MoS2 approx
        if m == 'W':
            base_gap += 0.2
        if x == 'Se':
            base_gap -= 0.3
        elif x == 'Te':
            base_gap -= 0.6

        # Clamp at zero since a negative gap is not meaningful.
        band_gap = max(0, np.random.normal(base_gap, 0.1))
        formation_energy = np.random.normal(-0.8, 0.1)  # Less stable than bulk

        data.append({
            'material_id': f"mp-2d-{1000 + i}",
            'formula': formula,
            'formation_energy_per_atom': round(formation_energy, 3),
            'band_gap': round(band_gap, 3),
            'structure': '2D-Hexagonal',
            'volume': round(np.random.normal(35, 2), 2),  # Per formula unit
            # Decided on the unrounded band gap; note the 0.05 threshold here.
            'is_metal': band_gap < 0.05,
        })

    # Explicit columns keep the schema stable even when no rows were generated.
    return pd.DataFrame(data, columns=columns)
if __name__ == "__main__":
    # Test execution: download the process data, generate every mock DFT
    # variant, and write all of them as CSV files for local use.
    from pathlib import Path

    print("Generating Mock Data...")
    df_proc = fetch_process_data()

    # Generate all variations
    df_generic = generate_mock_dft_data()
    df_perov = generate_perovskite_data()
    df_2d = generate_2d_materials_data()

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output directory exists before writing anything.
    output_dir = Path("mi_platform/data")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save for local use
    df_proc.to_csv(output_dir / "process_data.csv", index=False)

    # Each material family gets its own file so the dashboard can request
    # a specific one.
    df_generic.to_csv(output_dir / "dft_data_generic.csv", index=False)
    df_perov.to_csv(output_dir / "dft_data_perovskite.csv", index=False)
    df_2d.to_csv(output_dir / "dft_data_2d.csv", index=False)

    # Default is generic for now to not break existing consumers of dft_data.csv
    df_generic.to_csv(output_dir / "dft_data.csv", index=False)
    print("Done.")