Khang Nguyen committed on
Commit
2f63664
·
1 Parent(s): aa893a9

Remove prepare_tesla_data.py

Browse files
Files changed (1) hide show
  1. prepare_tesla_data.py +0 -194
prepare_tesla_data.py DELETED
@@ -1,194 +0,0 @@
1
- """
2
- prepare_tesla_data.py
3
-
4
- This script takes the cleaned Tesla dataset in:
5
- data/tesla_clean_full.csv
6
-
7
- and generates two synthetic datasets:
8
-
9
- data/tesla_deliveries_1k.csv ~ 1,000 rows
10
- data/tesla_deliveries_50k.csv ~ 50,000 rows
11
-
12
- It also makes sure there is a proper Date column built
13
- from Year and Month so the dashboard can use time series
14
- charts and trend insights.
15
- """
16
-
17
- import os
18
- from pathlib import Path
19
-
20
- import numpy as np
21
- import pandas as pd
22
-
23
# -------------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------------

# Folder that receives the generated synthetic samples.
OUT_DIR = Path("data")

# Input: the cleaned base dataset (already downloaded + cleaned from Kaggle).
CLEAN_FILE = Path("data", "tesla_clean_full.csv")

# Outputs: one small and one large synthetic sample.
OUT_1K = OUT_DIR.joinpath("tesla_deliveries_1k.csv")
OUT_50K = OUT_DIR.joinpath("tesla_deliveries_50k.csv")

# Base seed for the random generators, so repeated runs are reproducible.
RANDOM_SEED = 42
36
-
37
-
38
- # -------------------------------------------------------------------
39
- # Helper functions
40
- # -------------------------------------------------------------------
41
-
42
def load_clean_data(path: Path) -> pd.DataFrame:
    """
    Load the cleaned Tesla dataset and normalise its date columns.

    After loading, the frame is guaranteed to have:
      - a datetime ``Date`` column
      - ``Year`` and ``Month`` columns derived from ``Date``

    If the file has no ``Date`` column but does have ``Year`` and ``Month``,
    ``Date`` is built as the first day of that month. If ``Date`` exists it
    is parsed, and any existing ``Year`` / ``Month`` values are overwritten
    so they cannot disagree with it.

    Args:
        path: Location of the cleaned CSV file.

    Returns:
        The loaded frame, sorted by ``Date`` and then by ``Region`` and
        ``Model`` when those columns are present, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has neither ``Date`` nor both
            ``Year`` and ``Month``.
    """
    if not path.exists():
        raise FileNotFoundError(
            f"Could not find cleaned file at: {path}\n"
            "Make sure data/tesla_clean_full.csv exists."
        )

    df = pd.read_csv(path)

    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"])
    elif {"Year", "Month"}.issubset(df.columns):
        # Cast first so values such as 2023.0 become clean integers.
        df["Year"] = df["Year"].astype(int)
        df["Month"] = df["Month"].astype(int)

        # pd.to_datetime assembles a date from year/month/day columns.
        df["Date"] = pd.to_datetime(
            pd.DataFrame(
                {"year": df["Year"], "month": df["Month"], "day": 1}
            )
        )
    else:
        raise ValueError(
            "Data does not have a Date column or Year/Month columns. "
            "Cannot construct a proper Date."
        )

    # Date is the source of truth; re-derive Year / Month from it.
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month

    # The sort is only cosmetic, so it must not fail when the optional
    # Region / Model columns are absent: use whichever keys exist.
    sort_keys = [col for col in ("Date", "Region", "Model") if col in df.columns]
    return df.sort_values(sort_keys).reset_index(drop=True)
85
-
86
-
87
def make_synthetic_from_clean(
    df: pd.DataFrame,
    target_rows: int,
    seed: int = RANDOM_SEED,
) -> pd.DataFrame:
    """
    Create a synthetic dataset with exactly ``target_rows`` rows.

    Steps:
      1. Tile the base dataset until it holds at least ``target_rows`` rows.
      2. Draw ``target_rows`` rows from the tiled frame. The draw is without
         replacement from the tiled copy, so a base row can appear at most
         as many times as the frame was tiled.
      3. Shift each Date forward by 0-27 days and re-sync Year / Month.
      4. Multiply known numeric columns by Gaussian noise ``1 + N(0, sigma)``.

    Args:
        df: Base frame. Must contain a datetime ``Date`` column.
        target_rows: Number of rows to generate; must be positive.
        seed: Seed for both the row sampling and the noise generator.

    Returns:
        A new frame with ``target_rows`` rows and the same column order as
        ``df``. The input frame is not modified.

    Raises:
        ValueError: If ``df`` has no rows or ``target_rows`` is not positive.
    """
    base_n = len(df)
    if base_n == 0:
        raise ValueError(
            "Cannot build a synthetic dataset from an empty DataFrame."
        )
    if target_rows < 1:
        raise ValueError(
            f"target_rows must be a positive integer, got {target_rows}."
        )

    rng = np.random.default_rng(seed)

    # Integer ceiling division: how many copies are needed to reach the target.
    repeats = -(-target_rows // base_n)

    # Tile the dataset, then draw the requested number of rows.
    df_rep = pd.concat([df] * repeats, ignore_index=True)
    df_rep = df_rep.sample(n=target_rows, random_state=seed).reset_index(drop=True)

    # ---- Jitter Date slightly (0-27 days) ----
    # Keeps the general time pattern but avoids exact duplicate dates.
    date_jitter_days = rng.integers(0, 28, size=len(df_rep))
    df_rep["Date"] = df_rep["Date"] + pd.to_timedelta(date_jitter_days, unit="D")

    # Recompute Year / Month so they match the shifted Date.
    df_rep["Year"] = df_rep["Date"].dt.year
    df_rep["Month"] = df_rep["Date"].dt.month

    # ---- Add noise to numeric columns ----
    # Value = standard deviation of the relative noise (not a hard bound).
    # Columns missing from the frame are skipped.
    noise_specs = {
        "Estimated_Deliveries": 0.05,
        "Production_Units": 0.05,
        "Avg_Price_USD": 0.03,
        "Battery_Capacity_kWh": 0.02,
        "Range_km": 0.03,
        "CO2_Saved_tons": 0.08,
        "Charging_Stations": 0.05,
    }
    # Columns stored as non-negative whole numbers after noise is applied.
    integer_columns = {
        "Estimated_Deliveries",
        "Production_Units",
        "Charging_Stations",
        "Battery_Capacity_kWh",
        "Range_km",
    }

    for col, sigma in noise_specs.items():
        if col not in df_rep.columns:
            continue

        factors = 1.0 + rng.normal(loc=0.0, scale=sigma, size=len(df_rep))
        noisy = df_rep[col].astype(float) * factors

        if col in integer_columns:
            noisy = noisy.round().astype(int).clip(lower=0)
        else:
            # Price and CO2 stay as floats, rounded to two decimals.
            noisy = noisy.round(2)

        df_rep[col] = noisy

    # Return the columns in the same order as the original frame.
    return df_rep[df.columns]
158
-
159
-
160
- # -------------------------------------------------------------------
161
- # Main script
162
- # -------------------------------------------------------------------
163
-
164
def main():
    """
    Rebuild the cleaned dataset and both synthetic samples on disk.

    Loads the cleaned CSV, writes it back with the normalised Date / Year /
    Month columns, then generates and saves the 1K and 50K synthetic
    samples, printing progress along the way.
    """
    # The output folder must exist before any CSV is written.
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Loading cleaned Tesla data from: {CLEAN_FILE}")
    base_frame = load_clean_data(CLEAN_FILE)
    print(f"Base cleaned data shape: {base_frame.shape}")

    # Write the cleaned data back so the file on disk carries the Date
    # column and synced Year / Month. This resolves to the same path as
    # CLEAN_FILE, so the input file is overwritten in place.
    resaved_path = OUT_DIR / "tesla_clean_full.csv"
    base_frame.to_csv(resaved_path, index=False)
    print(f"Re-saved cleaned full dataset to: {resaved_path}")

    # ---- Small sample: 1,000 rows ----
    small_sample = make_synthetic_from_clean(
        base_frame, target_rows=1000, seed=RANDOM_SEED
    )
    small_sample.to_csv(OUT_1K, index=False)
    print(f"Saved synthetic 1K dataset to: {OUT_1K} (rows={len(small_sample)})")

    # ---- Large sample: 50,000 rows, with a different seed ----
    large_sample = make_synthetic_from_clean(
        base_frame, target_rows=50000, seed=RANDOM_SEED + 1
    )
    large_sample.to_csv(OUT_50K, index=False)
    print(f"Saved synthetic 50K dataset to: {OUT_50K} (rows={len(large_sample)})")

    # Print the first rows of the large sample for a quick visual check.
    print("\nSample of 50K synthetic dataset (first 5 rows):")
    print(large_sample.head())


if __name__ == "__main__":
    main()