AIDCEmissionsMap

Sleeping

App Files Community

bgamazay committed on Jan 10

Commit

11eee3c

verified ·

1 Parent(s): 2b42220

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -79

app.py CHANGED Viewed

@@ -36,107 +36,89 @@ st.markdown("""
 # --- 1. DATA LOADING & CLEANING ---
 @st.cache_data
 def load_data():
-    # Load the uploaded dataset
-    # Note: In a real HF Space, ensure the filename matches exactly or use a relative path
-    df = pd.read_csv("Frontier AI DC Emissions - Frontier Timeline.csv")
-    # Clean numeric columns (remove commas, handle non-numeric)
     def clean_numeric(val):
         if isinstance(val, str):
-            val = val.replace(',', '').strip()
         return pd.to_numeric(val, errors='coerce')
     df['Power (MW)'] = df['Power (MW)'].apply(clean_numeric)
     df['Carbon Intensity'] = df['Carbon Intensity'].apply(clean_numeric)
     df['Annual Million tCO2'] = df['Annual Million tCO2'].apply(clean_numeric)
-    # --- UNIT CORRECTION LOGIC ---
-    # The CSV likely has emissions in Kilotonnes (e.g., 13093) or is just raw.
-    # Logic: If value > 100 (which is physically impossible for MtCO2/yr for one plant),
-    # assume it is Kilotonnes and divide by 1000 to get Million Tonnes (Mt).
-    # Recalculate to verify: MW * 8760 * (Intensity/1000) / 1,000,000 = Mt
-    # We will create a 'Calculated_MtCO2' for verification, but prefer the user's manual column if it exists
-    # normalizing it to Million Tonnes.
     df['Emissions_Mt'] = df['Annual Million tCO2'].apply(lambda x: x / 1000 if x > 100 else x)
-    # Handle missing coordinates manually based on research
-    # Locations:
-    # Fermi America "HyperGrid" -> Amarillo/Panhandle, TX
-    df.loc[df['Project'].str.contains('Fermi', case=False, na=False), ['Latitude', 'Longitude']] = [35.344, -101.373]
-    # Crane Clean Energy Center -> Three Mile Island, PA
-    df.loc[df['Project'].str.contains('Crane', case=False, na=False), ['Latitude', 'Longitude']] = [40.154, -76.725]
-    # CleanArc Hyperscale -> Caroline County, VA
-    df.loc[df['Project'].str.contains('CleanArc', case=False, na=False), ['Latitude', 'Longitude']] = [38.005, -77.478]
-    # Vantage Data Centers -> Fredericksburg, VA
-    df.loc[df['Project'].str.contains('Vantage', case=False, na=False), ['Latitude', 'Longitude']] = [38.381, -77.495]
-    # Stargate Michigan -> Saline Township, MI
-    df.loc[df['Project'].str.contains('Stargate Michigan', case=False, na=False), ['Latitude', 'Longitude']] = [42.167, -83.850]
-    # Clean Lat/Long to numeric
-    def clean_coord(val):
-        if isinstance(val, str):
-            # Remove symbols like " ° ' N W E "
-            val = val.replace('°', '').replace("'", '').replace('"', '').replace('N','').replace('W','').replace('E','')
-            # Handle DMS to Decimal if necessary, but most look like decimals or simple strings
-            # For this dataset, simple cleanup might suffice if formats are consistent
-            pass
-        return pd.to_numeric(val, errors='coerce')
-    # The dataset has DMS strings (e.g., 42°40'28"N). We need a DMS parser.
-    def dms_to_dd(dms_str):
-        if pd.isna(dms_str) or isinstance(dms_str, (int, float)):
-            return dms_str
-        dms_str = str(dms_str).strip()
-        if not dms_str: return None
-        # Simple parser for format: 42°40'28"N
-        try:
-            parts = dms_str.replace('°', ' ').replace("'", ' ').replace('"', ' ').split()
-            degrees = float(parts[0])
-            minutes = float(parts[1]) if len(parts) > 1 else 0
-            seconds = float(parts[2]) if len(parts) > 2 else 0
-            direction = parts[-1] if parts[-1] in ['N','S','E','W'] else 'N' # Default N/E if missing
-            dd = degrees + minutes/60 + seconds/3600
-            if direction in ['S', 'W']:
-                dd *= -1
-            return dd
-        except:
-            return None # Fallback or keep original if it was already decimal
-    # Apply DMS conversion only where it looks like a string with degrees
-    # Note: The manual overrides above provided decimal, so we skip those rows
     for col in ['Latitude', 'Longitude']:
-        df[col] = df[col].apply(lambda x: dms_to_dd(x) if isinstance(x, str) and '°' in x else x)
         df[col] = pd.to_numeric(df[col], errors='coerce')
-    # Drop rows without coordinates
     df = df.dropna(subset=['Latitude', 'Longitude'])
-    # --- ENRICHMENT FOR HOVER ---
-    # 1 MtCO2 approx 217,000 passenger vehicles/year (EPA is 4.6 metric tons/car/year)
-    # 1,000,000 tons / 4.6 = ~217,391 cars.
-    # User stat: 13.1 Mt = 2.9M cars -> implies ~4.5 tons/car. We will use 4.6.
-    df['Cars_Equivalent_Millions'] = (df['Emissions_Mt'] * 1000000 / 4600 / 1000000).round(2)
-    # Coal Plant Equivalent: Average coal plant is ~3.5 to 4 MtCO2/year
     df['Coal_Plants_Equivalent'] = (df['Emissions_Mt'] / 4.0).round(1)
-    # Color Categories (R, G, B, A)
     def get_color(status):
         s = str(status).lower()
-        if 'off-grid' in s or 'gas' in s:
-            return [255, 65, 54, 200]  # Red (Danger)
-        elif 'hybrid' in s or 'nuclear' in s: # Nuclear often grouped here as transition/special
-            return [255, 133, 27, 200] # Orange (Transition)
-        else:
-            return [0, 116, 217, 200]  # Blue (Grid)
     df['color'] = df['Grid Status'].apply(get_color)
-    # Bubble Size (Scaled)
-    # Scale factor for visual sizing
     df['radius'] = df['Emissions_Mt'].apply(lambda x: math.sqrt(x) * 15000)
     return df

 # --- 1. DATA LOADING & CLEANING ---
 @st.cache_data
 def load_data():
+    try:
+        # Load data
+        df = pd.read_csv("Frontier AI DC Emissions - Frontier Timeline.csv")
+        # --- FIX 1: Sanitize Headers ---
+        # Removes hidden spaces (e.g. "Power (MW) " -> "Power (MW)")
+        df.columns = df.columns.str.strip()
+        # Validation: Check if columns exist, if not, show what was found
+        required_cols = ['Power (MW)', 'Carbon Intensity', 'Annual Million tCO2']
+        missing = [c for c in required_cols if c not in df.columns]
+        if missing:
+            st.error(f"❌ Missing columns: {missing}. Found columns: {df.columns.tolist()}")
+            st.stop()
+    except FileNotFoundError:
+        st.error("❌ File not found. Please ensure 'Frontier AI DC Emissions - Frontier Timeline.csv' is uploaded.")
+        st.stop()
+    # --- Data Cleaning ---
     def clean_numeric(val):
         if isinstance(val, str):
+            val = val.replace(',', '').replace('"', '').strip()
         return pd.to_numeric(val, errors='coerce')
     df['Power (MW)'] = df['Power (MW)'].apply(clean_numeric)
     df['Carbon Intensity'] = df['Carbon Intensity'].apply(clean_numeric)
     df['Annual Million tCO2'] = df['Annual Million tCO2'].apply(clean_numeric)
+    # --- FIX 2: Math Check (Power * Intensity vs Reported) ---
+    # Formula: MW * 8760 hours * (Intensity kg/MWh / 1000 to get tonnes) / 1,000,000 to get Million Tonnes
+    df['Calculated_Mt'] = (df['Power (MW)'] * 8760 * df['Carbon Intensity']) / 1e9
+    # Use the Reported number, but normalize it (Handle the 13,093 vs 13.1 issue)
+    # If the number is > 100, it's likely in Kilotonnes, so divide by 1000
     df['Emissions_Mt'] = df['Annual Million tCO2'].apply(lambda x: x / 1000 if x > 100 else x)
+    # --- Geocoding (Manual Overrides for missing Lat/Long) ---
+    # Add coordinates for known projects if missing
+    overrides = {
+        'Fermi': [35.344, -101.373],       # Amarillo, TX
+        'Crane': [40.154, -76.725],        # Three Mile Island
+        'CleanArc': [38.005, -77.478],     # Caroline County, VA
+        'Vantage': [38.381, -77.495],      # Fredericksburg, VA
+        'Stargate': [42.167, -83.850]      # Michigan
+    }
+    for key, coords in overrides.items():
+        mask = df['Project'].astype(str).str.contains(key, case=False, na=False)
+        df.loc[mask, ['Latitude', 'Longitude']] = coords
+    # Parse DMS coordinates (e.g., 42°40'28"N) if they exist
+    def dms_to_dd(val):
+        if isinstance(val, str) and '°' in val:
+            try:
+                parts = val.replace('°', ' ').replace("'", ' ').replace('"', ' ').split()
+                dd = float(parts[0]) + float(parts[1])/60 + (float(parts[2]) if len(parts)>2 else 0)/3600
+                if 'S' in val or 'W' in val: dd *= -1
+                return dd
+            except: return None
+        return val
     for col in ['Latitude', 'Longitude']:
+        df[col] = df[col].apply(dms_to_dd)
         df[col] = pd.to_numeric(df[col], errors='coerce')
+    # Drop rows that still have no location
     df = df.dropna(subset=['Latitude', 'Longitude'])
+    # --- Enrichment for Tooltip ---
+    # Cars: 1 MtCO2 ≈ 217,000 cars (4.6t/car/yr)
+    df['Cars_Equivalent_Millions'] = (df['Emissions_Mt'] * 1_000_000 / 4.6 / 1_000_000).round(2)
+    # Coal Plants: 1 Coal Plant ≈ 4.0 MtCO2
     df['Coal_Plants_Equivalent'] = (df['Emissions_Mt'] / 4.0).round(1)
+    # Visual Attributes
     def get_color(status):
         s = str(status).lower()
+        if 'off-grid' in s or 'gas' in s: return [255, 65, 54, 200]  # Red
+        elif 'hybrid' in s or 'nuclear' in s: return [255, 133, 27, 200] # Orange
+        else: return [0, 116, 217, 200]  # Blue
     df['color'] = df['Grid Status'].apply(get_color)
     df['radius'] = df['Emissions_Mt'].apply(lambda x: math.sqrt(x) * 15000)
     return df