bgamazay committed on
Commit
11eee3c
·
verified ·
1 Parent(s): 2b42220

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -79
app.py CHANGED
@@ -36,107 +36,89 @@ st.markdown("""
36
  # --- 1. DATA LOADING & CLEANING ---
37
  @st.cache_data
38
  def load_data():
39
- # Load the uploaded dataset
40
- # Note: In a real HF Space, ensure the filename matches exactly or use a relative path
41
- df = pd.read_csv("Frontier AI DC Emissions - Frontier Timeline.csv")
42
-
43
- # Clean numeric columns (remove commas, handle non-numeric)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def clean_numeric(val):
45
  if isinstance(val, str):
46
- val = val.replace(',', '').strip()
47
  return pd.to_numeric(val, errors='coerce')
48
 
49
  df['Power (MW)'] = df['Power (MW)'].apply(clean_numeric)
50
  df['Carbon Intensity'] = df['Carbon Intensity'].apply(clean_numeric)
51
  df['Annual Million tCO2'] = df['Annual Million tCO2'].apply(clean_numeric)
52
 
53
- # --- UNIT CORRECTION LOGIC ---
54
- # The CSV likely has emissions in Kilotonnes (e.g., 13093) or is just raw.
55
- # Logic: If value > 100 (which is physically impossible for MtCO2/yr for one plant),
56
- # assume it is Kilotonnes and divide by 1000 to get Million Tonnes (Mt).
57
- # Recalculate to verify: MW * 8760 * (Intensity/1000) / 1,000,000 = Mt
58
 
59
- # We will create a 'Calculated_MtCO2' for verification, but prefer the user's manual column if it exists
60
- # normalizing it to Million Tonnes.
61
  df['Emissions_Mt'] = df['Annual Million tCO2'].apply(lambda x: x / 1000 if x > 100 else x)
62
 
63
- # Handle missing coordinates manually based on research
64
- # Locations:
65
- # Fermi America "HyperGrid" -> Amarillo/Panhandle, TX
66
- df.loc[df['Project'].str.contains('Fermi', case=False, na=False), ['Latitude', 'Longitude']] = [35.344, -101.373]
67
- # Crane Clean Energy Center -> Three Mile Island, PA
68
- df.loc[df['Project'].str.contains('Crane', case=False, na=False), ['Latitude', 'Longitude']] = [40.154, -76.725]
69
- # CleanArc Hyperscale -> Caroline County, VA
70
- df.loc[df['Project'].str.contains('CleanArc', case=False, na=False), ['Latitude', 'Longitude']] = [38.005, -77.478]
71
- # Vantage Data Centers -> Fredericksburg, VA
72
- df.loc[df['Project'].str.contains('Vantage', case=False, na=False), ['Latitude', 'Longitude']] = [38.381, -77.495]
73
- # Stargate Michigan -> Saline Township, MI
74
- df.loc[df['Project'].str.contains('Stargate Michigan', case=False, na=False), ['Latitude', 'Longitude']] = [42.167, -83.850]
75
 
76
- # Clean Lat/Long to numeric
77
- def clean_coord(val):
78
- if isinstance(val, str):
79
- # Remove symbols like " ° ' N W E "
80
- val = val.replace('°', '').replace("'", '').replace('"', '').replace('N','').replace('W','').replace('E','')
81
- # Handle DMS to Decimal if necessary, but most look like decimals or simple strings
82
- # For this dataset, simple cleanup might suffice if formats are consistent
83
- pass
84
- return pd.to_numeric(val, errors='coerce')
85
-
86
- # The dataset has DMS strings (e.g., 42°40'28"N). We need a DMS parser.
87
- def dms_to_dd(dms_str):
88
- if pd.isna(dms_str) or isinstance(dms_str, (int, float)):
89
- return dms_str
90
- dms_str = str(dms_str).strip()
91
- if not dms_str: return None
92
-
93
- # Simple parser for format: 42°40'28"N
94
- try:
95
- parts = dms_str.replace('°', ' ').replace("'", ' ').replace('"', ' ').split()
96
- degrees = float(parts[0])
97
- minutes = float(parts[1]) if len(parts) > 1 else 0
98
- seconds = float(parts[2]) if len(parts) > 2 else 0
99
- direction = parts[-1] if parts[-1] in ['N','S','E','W'] else 'N' # Default N/E if missing
100
-
101
- dd = degrees + minutes/60 + seconds/3600
102
- if direction in ['S', 'W']:
103
- dd *= -1
104
- return dd
105
- except:
106
- return None # Fallback or keep original if it was already decimal
107
 
108
- # Apply DMS conversion only where it looks like a string with degrees
109
- # Note: The manual overrides above provided decimal, so we skip those rows
110
  for col in ['Latitude', 'Longitude']:
111
- df[col] = df[col].apply(lambda x: dms_to_dd(x) if isinstance(x, str) and '°' in x else x)
112
  df[col] = pd.to_numeric(df[col], errors='coerce')
113
-
114
- # Drop rows without coordinates
115
  df = df.dropna(subset=['Latitude', 'Longitude'])
116
-
117
- # --- ENRICHMENT FOR HOVER ---
118
- # 1 MtCO2 approx 217,000 passenger vehicles/year (EPA is 4.6 metric tons/car/year)
119
- # 1,000,000 tons / 4.6 = ~217,391 cars.
120
- # User stat: 13.1 Mt = 2.9M cars -> implies ~4.5 tons/car. We will use 4.6.
121
- df['Cars_Equivalent_Millions'] = (df['Emissions_Mt'] * 1000000 / 4600 / 1000000).round(2)
122
-
123
- # Coal Plant Equivalent: Average coal plant is ~3.5 to 4 MtCO2/year
124
  df['Coal_Plants_Equivalent'] = (df['Emissions_Mt'] / 4.0).round(1)
125
 
126
- # Color Categories (R, G, B, A)
127
  def get_color(status):
128
  s = str(status).lower()
129
- if 'off-grid' in s or 'gas' in s:
130
- return [255, 65, 54, 200] # Red (Danger)
131
- elif 'hybrid' in s or 'nuclear' in s: # Nuclear often grouped here as transition/special
132
- return [255, 133, 27, 200] # Orange (Transition)
133
- else:
134
- return [0, 116, 217, 200] # Blue (Grid)
135
 
136
  df['color'] = df['Grid Status'].apply(get_color)
137
-
138
- # Bubble Size (Scaled)
139
- # Scale factor for visual sizing
140
  df['radius'] = df['Emissions_Mt'].apply(lambda x: math.sqrt(x) * 15000)
141
 
142
  return df
 
36
  # --- 1. DATA LOADING & CLEANING ---
37
  @st.cache_data
38
  def load_data():
39
+ try:
40
+ # Load data
41
+ df = pd.read_csv("Frontier AI DC Emissions - Frontier Timeline.csv")
42
+
43
+ # --- FIX 1: Sanitize Headers ---
44
+ # Removes hidden spaces (e.g. "Power (MW) " -> "Power (MW)")
45
+ df.columns = df.columns.str.strip()
46
+
47
+ # Validation: Check if columns exist, if not, show what was found
48
+ required_cols = ['Power (MW)', 'Carbon Intensity', 'Annual Million tCO2']
49
+ missing = [c for c in required_cols if c not in df.columns]
50
+ if missing:
51
+ st.error(f"❌ Missing columns: {missing}. Found columns: {df.columns.tolist()}")
52
+ st.stop()
53
+
54
+ except FileNotFoundError:
55
+ st.error("❌ File not found. Please ensure 'Frontier AI DC Emissions - Frontier Timeline.csv' is uploaded.")
56
+ st.stop()
57
+
58
+ # --- Data Cleaning ---
59
  def clean_numeric(val):
60
  if isinstance(val, str):
61
+ val = val.replace(',', '').replace('"', '').strip()
62
  return pd.to_numeric(val, errors='coerce')
63
 
64
  df['Power (MW)'] = df['Power (MW)'].apply(clean_numeric)
65
  df['Carbon Intensity'] = df['Carbon Intensity'].apply(clean_numeric)
66
  df['Annual Million tCO2'] = df['Annual Million tCO2'].apply(clean_numeric)
67
 
68
+ # --- FIX 2: Math Check (Power * Intensity vs Reported) ---
69
+ # Formula: MW * 8760 hours * (Intensity kg/MWh / 1000 to get tonnes) / 1,000,000 to get Million Tonnes
70
+ df['Calculated_Mt'] = (df['Power (MW)'] * 8760 * df['Carbon Intensity']) / 1e9
 
 
71
 
72
+ # Use the Reported number, but normalize it (Handle the 13,093 vs 13.1 issue)
73
+ # If the number is > 100, it's likely in Kilotonnes, so divide by 1000
74
  df['Emissions_Mt'] = df['Annual Million tCO2'].apply(lambda x: x / 1000 if x > 100 else x)
75
 
76
+ # --- Geocoding (Manual Overrides for missing Lat/Long) ---
77
+ # Set coordinates for known projects (applied to every matching row, overwriting any existing values)
78
+ overrides = {
79
+ 'Fermi': [35.344, -101.373], # Amarillo, TX
80
+ 'Crane': [40.154, -76.725], # Three Mile Island
81
+ 'CleanArc': [38.005, -77.478], # Caroline County, VA
82
+ 'Vantage': [38.381, -77.495], # Fredericksburg, VA
83
+ 'Stargate': [42.167, -83.850] # Saline Township, MI. NOTE: this key matches every project containing 'Stargate', not only Stargate Michigan
84
+ }
 
 
 
85
 
86
+ for key, coords in overrides.items():
87
+ mask = df['Project'].astype(str).str.contains(key, case=False, na=False)
88
+ df.loc[mask, ['Latitude', 'Longitude']] = coords
89
+
90
+ # Parse DMS coordinates (e.g., 42°40'28"N) if they exist
91
+ def dms_to_dd(val):
92
+ if isinstance(val, str) and '°' in val:
93
+ try:
94
+ parts = val.replace('°', ' ').replace("'", ' ').replace('"', ' ').split()
95
+ dd = float(parts[0]) + float(parts[1])/60 + (float(parts[2]) if len(parts)>2 else 0)/3600
96
+ if 'S' in val or 'W' in val: dd *= -1
97
+ return dd
98
+ except: return None
99
+ return val
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
 
 
101
  for col in ['Latitude', 'Longitude']:
102
+ df[col] = df[col].apply(dms_to_dd)
103
  df[col] = pd.to_numeric(df[col], errors='coerce')
104
+
105
+ # Drop rows that still have no location
106
  df = df.dropna(subset=['Latitude', 'Longitude'])
107
+
108
+ # --- Enrichment for Tooltip ---
109
+ # Cars: 1 MtCO2 ≈ 217,000 cars (4.6t/car/yr)
110
+ df['Cars_Equivalent_Millions'] = (df['Emissions_Mt'] * 1_000_000 / 4.6 / 1_000_000).round(2)
111
+ # Coal Plants: 1 Coal Plant ≈ 4.0 MtCO2
 
 
 
112
  df['Coal_Plants_Equivalent'] = (df['Emissions_Mt'] / 4.0).round(1)
113
 
114
+ # Visual Attributes
115
  def get_color(status):
116
  s = str(status).lower()
117
+ if 'off-grid' in s or 'gas' in s: return [255, 65, 54, 200] # Red
118
+ elif 'hybrid' in s or 'nuclear' in s: return [255, 133, 27, 200] # Orange
119
+ else: return [0, 116, 217, 200] # Blue
 
 
 
120
 
121
  df['color'] = df['Grid Status'].apply(get_color)
 
 
 
122
  df['radius'] = df['Emissions_Mt'].apply(lambda x: math.sqrt(x) * 15000)
123
 
124
  return df