Spaces:

dkhangn
/

CS5130_finalProject

Sleeping

App Files Files Community

Khang Nguyen committed on Dec 7, 2025

Commit

2f63664

1 Parent(s): aa893a9

Remove prepare_tesla_data.py

Browse files

Files changed (1) hide show

prepare_tesla_data.py +0 -194

prepare_tesla_data.py DELETED Viewed

@@ -1,194 +0,0 @@
-"""
-prepare_tesla_data.py
-This script takes the cleaned Tesla dataset in:
-    data/tesla_clean_full.csv
-and generates two synthetic datasets:
-    data/tesla_deliveries_1k.csv   ~ 1,000 rows
-    data/tesla_deliveries_50k.csv  ~ 50,000 rows
-It also makes sure there is a proper Date column built
-from Year and Month so the dashboard can use time series
-charts and trend insights.
-"""
-import os
-from pathlib import Path
-import numpy as np
-import pandas as pd
-# -------------------------------------------------------------------
-# CONFIG
-# -------------------------------------------------------------------
# Every input and output of this script lives in one data directory.
OUT_DIR = Path("data")

# Input: cleaned base dataset (already downloaded + cleaned from Kaggle).
CLEAN_FILE = OUT_DIR / "tesla_clean_full.csv"

# Outputs: synthetic samples written next to the cleaned file.
OUT_1K = OUT_DIR / "tesla_deliveries_1k.csv"
OUT_50K = OUT_DIR / "tesla_deliveries_50k.csv"

# Base seed for reproducible sampling and noise.
RANDOM_SEED = 42
-# -------------------------------------------------------------------
-# Helper functions
-# -------------------------------------------------------------------
def load_clean_data(path: Path) -> pd.DataFrame:
    """
    Load the cleaned Tesla dataset and normalise its date columns.

    The returned frame always has a datetime ``Date`` column, plus
    ``Year`` and ``Month`` columns recomputed from ``Date``.

    - If the CSV already has ``Date``, it is parsed with ``pd.to_datetime``.
    - Otherwise, if it has both ``Year`` and ``Month``, ``Date`` is built
      as the first day of that month.

    Rows are sorted by ``Date`` and then by ``Region`` and ``Model`` when
    those columns are present; a missing ``Region`` / ``Model`` column is
    skipped instead of raising ``KeyError``.

    Args:
        path: Location of the cleaned CSV file.

    Returns:
        The loaded DataFrame, sorted, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has neither ``Date`` nor both
            ``Year`` and ``Month``.
    """
    if not path.exists():
        raise FileNotFoundError(
            f"Could not find cleaned file at: {path}\n"
            "Make sure data/tesla_clean_full.csv exists."
        )

    df = pd.read_csv(path)

    if "Date" in df.columns:
        # Parse Date if it exists
        df["Date"] = pd.to_datetime(df["Date"])
    elif "Year" in df.columns and "Month" in df.columns:
        # pd.to_datetime assembles dates from year / month / day columns;
        # use the documented lowercase names and pin the day to the 1st.
        parts = (
            df[["Year", "Month"]]
            .astype(int)
            .rename(columns=str.lower)
            .assign(day=1)
        )
        df["Date"] = pd.to_datetime(parts)
    else:
        raise ValueError(
            "Data does not have a Date column or Year/Month columns. "
            "Cannot construct a proper Date."
        )

    # Make Year / Month match Date (in case they were inconsistent)
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month

    # Sort only by the keys that exist. Date is guaranteed at this point;
    # Region and Model are optional. `kind` only matters when Date is the
    # sole key, where a stable sort keeps equal dates in file order.
    sort_keys = [c for c in ("Date", "Region", "Model") if c in df.columns]
    df = df.sort_values(sort_keys, kind="mergesort").reset_index(drop=True)
    return df
def make_synthetic_from_clean(
    df: pd.DataFrame,
    target_rows: int,
    seed: int = RANDOM_SEED,
) -> pd.DataFrame:
    """
    Create a synthetic dataset with exactly target_rows rows.

    Steps:
      1. Repeat the base dataset until it has at least target_rows rows.
      2. Draw target_rows rows from the repeated frame. The draw is
         without replacement, so a base row can appear at most as many
         times as the frame was repeated.
      3. Shift each Date forward by a random 0-27 days and recompute
         Year / Month from the shifted Date.
      4. Multiply each known numeric column by 1 + N(0, sigma) noise and
         round it (whole numbers for counts / capacity / range, two
         decimals for price and CO2).

    Args:
        df: Base frame; must contain a datetime ``Date`` column. Known
            numeric columns that are absent are simply skipped.
        target_rows: Number of rows in the result.
        seed: Seed used for both the row sampling and the random noise.

    Returns:
        A new DataFrame with the same columns, in the same order, as df.

    Raises:
        ValueError: If df has no rows (there is nothing to sample from).
    """
    base_n = len(df)
    if base_n == 0:
        # Without this guard the repeat count below divides by zero.
        raise ValueError(
            "Cannot build a synthetic dataset from an empty DataFrame."
        )

    rng = np.random.default_rng(seed)

    # At least one copy so that target_rows == 0 yields an empty frame
    # instead of failing inside pd.concat([]).
    repeats = max(1, int(np.ceil(target_rows / base_n)))

    # Repeat the dataset and then sample rows
    df_rep = pd.concat([df] * repeats, ignore_index=True)
    df_rep = df_rep.sample(n=target_rows, random_state=seed).reset_index(drop=True)

    # ---- Jitter Date slightly (0-27 days) ----
    # This keeps the general time pattern but avoids exact duplicates.
    date_jitter_days = rng.integers(0, 28, size=len(df_rep))
    jitter = pd.to_timedelta(date_jitter_days, unit="D")
    df_rep["Date"] = df_rep["Date"] + jitter

    # Recompute Year / Month so they match the new Date
    df_rep["Year"] = df_rep["Date"].dt.year
    df_rep["Month"] = df_rep["Date"].dt.month

    # ---- Add noise to numeric columns ----
    # Value = standard deviation of the multiplicative noise factor
    # (a spread, not a hard +/- bound). Missing columns are skipped.
    noise_specs = {
        "Estimated_Deliveries": 0.05,
        "Production_Units": 0.05,
        "Avg_Price_USD": 0.03,
        "Battery_Capacity_kWh": 0.02,
        "Range_km": 0.03,
        "CO2_Saved_tons": 0.08,
        "Charging_Stations": 0.05,
    }
    # Columns stored as non-negative whole numbers after the noise.
    whole_number_cols = {
        "Estimated_Deliveries",
        "Production_Units",
        "Charging_Stations",
        "Battery_Capacity_kWh",
        "Range_km",
    }
    # Columns kept as floats, rounded to two decimals.
    two_decimal_cols = {"Avg_Price_USD", "CO2_Saved_tons"}

    for col, pct in noise_specs.items():
        if col not in df_rep.columns:
            continue  # skip if this column doesn't exist

        # 1 + N(0, pct) multiplier
        factors = 1.0 + rng.normal(loc=0.0, scale=pct, size=len(df_rep))
        noisy = df_rep[col].astype(float) * factors

        if col in whole_number_cols:
            noisy = noisy.round().astype(int).clip(lower=0)
        elif col in two_decimal_cols:
            noisy = noisy.round(2)

        df_rep[col] = noisy

    # Make sure columns are in the same order as the original df
    df_rep = df_rep[df.columns]
    return df_rep
-# -------------------------------------------------------------------
-# Main script
-# -------------------------------------------------------------------
def main():
    """
    Run the full preparation pipeline.

    Loads the cleaned Tesla CSV, re-saves it with the normalised
    Date / Year / Month columns, then writes the 1K and 50K synthetic
    samples and prints a short preview of the 50K sample.
    """
    # Ensure output directory exists
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Loading cleaned Tesla data from: {CLEAN_FILE}")
    base = load_clean_data(CLEAN_FILE)
    print(f"Base cleaned data shape: {base.shape}")

    # (Optional) re-save the cleaned full dataset so we are sure it
    # includes the Date column and synced Year/Month.
    resave_path = OUT_DIR / "tesla_clean_full.csv"
    base.to_csv(resave_path, index=False)
    print(f"Re-saved cleaned full dataset to: {resave_path}")

    # One entry per synthetic sample: (label, row count, seed, output file).
    # Each sample gets its own seed so the two files are not correlated.
    jobs = (
        ("1K", 1000, RANDOM_SEED, OUT_1K),
        ("50K", 50000, RANDOM_SEED + 1, OUT_50K),
    )

    synthetic = None
    for label, n_rows, job_seed, out_path in jobs:
        synthetic = make_synthetic_from_clean(base, target_rows=n_rows, seed=job_seed)
        synthetic.to_csv(out_path, index=False)
        print(f"Saved synthetic {label} dataset to: {out_path} (rows={len(synthetic)})")

    # The last job is the 50K sample; preview it so we can eyeball it.
    print("\nSample of 50K synthetic dataset (first 5 rows):")
    print(synthetic.head())


if __name__ == "__main__":
    main()