# File size: 8,951 Bytes
# 4495c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import os
import re
import subprocess
import sys

import pandas as pd
import streamlit as st
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
from geopy.geocoders import Nominatim

def get_coordinates(city, state, timeout=10):
    """Look up coordinates for a US city through the Nominatim geocoder.

    The lookup is best-effort: it only supplies a default value that the
    user can correct by hand, so service failures never propagate.

    Args:
        city: City name.
        state: State name or abbreviation; the query sent to the geocoder
            is "<city>, <state>, USA".
        timeout: Timeout in seconds passed to the geocode call.

    Returns:
        A "<latitude>, <longitude>" string, or "" when the geocoder finds
        nothing or the geocoding service reports an error.
    """
    # GeocoderServiceError is the common base of GeocoderTimedOut and
    # GeocoderUnavailable; catching it also covers rate-limit, quota, query
    # and parse errors, which previously escaped and aborted the page.
    # Imported locally so this function is self-contained.
    from geopy.exc import GeocoderServiceError

    geolocator = Nominatim(user_agent="geo_locator")
    try:
        location = geolocator.geocode(f"{city}, {state}, USA", timeout=timeout)
    except GeocoderServiceError as e:
        print(f"Geocoding error: {e}")
        return ""
    if location:
        return f"{location.latitude}, {location.longitude}"
    return ""

@st.cache_data
def load_county_data():
    """Read us_counties.csv and return it with trimmed name columns.

    Leading and trailing whitespace is stripped from the "stateName" and
    "countyName" columns. The function is wrapped in st.cache_data.
    """
    counties = pd.read_csv("us_counties.csv")
    for column in ("stateName", "countyName"):
        counties[column] = counties[column].str.strip()
    return counties


county_data = load_county_data()

# Full state name -> two-letter abbreviation (District of Columbia included).
state_abbr_map = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
}
# Reverse lookup: two-letter abbreviation -> full state name.
abbr_to_full = dict(zip(state_abbr_map.values(), state_abbr_map.keys()))

# Page header followed by the filename convention the uploads must follow.
st.title("Batch Data Ingestion Portal")
for intro_line in (
    "Upload multiple PDF files of climate action plans. Files should be named as follows:",
    "**City, State Plan Type Year.pdf** (e.g., *Carson, CA Mitigation Only CAP 2017.pdf* or *Washington, District of Columbia Green Plan 2019.pdf*)",
):
    st.write(intro_line)

# Inputs that live outside the metadata form: the PDFs and the API key.
uploaded_files = st.file_uploader(
    "Upload PDF files",
    type="pdf",
    accept_multiple_files=True,
)
api_key = st.text_input(
    "OpenAI API Key",
    type="password",
)

# Per-file metadata gathered by the form below, keyed by uploaded file name.
file_info = {}

_FILENAME_FORMAT_ERROR = "Filename format is incorrect. Please ensure it follows 'City, State Plan Type Year.pdf'"

# Accepted spellings of a full state name in a filename (lowercase) mapped to
# the canonical name. "district" and "d.c." are informal aliases for the
# District of Columbia.
_STATE_NAME_LOOKUP = {name.lower(): name for name in state_abbr_map}
_STATE_NAME_LOOKUP.update({"district": "District of Columbia", "d.c.": "District of Columbia"})

# Known names are listed longest-first so "District of Columbia" wins over the
# bare "District" alias. A lazy "one or more words" group cannot be used for
# the state: it stops after the first word, which turned "New York CAP" into
# state "New" and "District of Columbia Green Plan" into plan type
# "of Columbia Green Plan".
_KNOWN_STATE_PATTERN = "|".join(
    r"\s+".join(re.escape(word) for word in name.split())
    for name in sorted(_STATE_NAME_LOOKUP, key=len, reverse=True)
)

# State alternatives, in order: a known full name, a two-letter abbreviation,
# or any single word (captured only so it can be reported as unrecognized).
_FILENAME_PATTERN = re.compile(
    r"^(?P<city>.+?),\s*"
    r"(?:(?P<state_full>" + _KNOWN_STATE_PATTERN + r")"
    r"|(?P<state_abbr>[A-Za-z]{2})"
    r"|(?P<state_other>[A-Za-z\.]+))"
    r"\s+(?P<plan_type>.+?)\s+(?P<year>\d{4})$",
    re.IGNORECASE,
)


def _parse_plan_filename(base_name):
    """Split an uploaded file's base name into its plan metadata.

    Args:
        base_name: File name without extension, expected to look like
            "City, State Plan Type Year" where State is a full state name
            (any letter case) or a two-letter abbreviation.

    Returns:
        A dict with the string values "city", "full_state", "state_abbrev",
        "plan_type" and "year".

    Raises:
        ValueError: The name does not follow the format or the state is not
            recognized. The message is meant to be shown to the user.
    """
    # The parsed pieces are later joined into a path under CAPS/, so a name
    # carrying path separators must never get that far.
    if "/" in base_name or "\\" in base_name:
        raise ValueError(_FILENAME_FORMAT_ERROR)

    match = _FILENAME_PATTERN.match(base_name)
    if not match:
        raise ValueError(_FILENAME_FORMAT_ERROR)

    if match.group("state_full"):
        # Collapse case and inner whitespace before the dictionary lookup.
        lookup_key = " ".join(match.group("state_full").lower().split())
        full_state = _STATE_NAME_LOOKUP.get(lookup_key)
        if not full_state:
            raise ValueError(f"State name {match.group('state_full')} not recognized.")
        state_abbrev = state_abbr_map[full_state]
    elif match.group("state_abbr"):
        state_abbrev = match.group("state_abbr").upper()
        full_state = abbr_to_full.get(state_abbrev)
        if not full_state:
            raise ValueError(f"State abbreviation {state_abbrev} not recognized.")
    else:
        raise ValueError(f"State name {match.group('state_other').strip()} not recognized.")

    return {
        "city": match.group("city").strip(),
        "full_state": full_state,
        "state_abbrev": state_abbrev,
        "plan_type": match.group("plan_type").strip(),
        "year": match.group("year").strip(),
    }


def _run_step(script_args, success_message):
    """Run a helper script and report its outcome in the page.

    Args:
        script_args: Script path followed by its command-line arguments.
        success_message: Text written to the page when the script exits
            with status 0.

    Returns:
        True when the script exited with status 0, False otherwise.
    """
    script = script_args[0]
    # Use the interpreter that runs this app rather than whatever "python"
    # happens to resolve to on PATH.
    interpreter = sys.executable or "python"
    try:
        completed = subprocess.run([interpreter, *script_args], check=False)
    except OSError as exc:
        st.error(f"Could not start {script}: {exc}")
        return False
    if completed.returncode != 0:
        # Only the script path is reported; the arguments may hold the API key.
        st.error(f"{script} failed with exit code {completed.returncode}.")
        return False
    st.write(success_message)
    return True


if uploaded_files:
    with st.form("metadata_form"):
        st.write("### File Details and County Selection")
        for uploaded_file in uploaded_files:
            with st.expander(f"File: {uploaded_file.name}", expanded=True):
                # Widget keys below are derived from the file name, so a second
                # upload with the same name cannot get its own widgets.
                if uploaded_file.name in file_info:
                    st.error(f"A file named {uploaded_file.name} was already uploaded in this batch. Skipping the duplicate.")
                    continue

                base_name = os.path.splitext(uploaded_file.name)[0]
                try:
                    parsed = _parse_plan_filename(base_name)
                except ValueError as exc:
                    st.error(str(exc))
                    continue

                city = parsed["city"]
                full_state = parsed["full_state"]
                state_abbrev = parsed["state_abbrev"]
                plan_type = parsed["plan_type"]
                year = parsed["year"]

                st.write(f"**City:** {city}")
                st.write(f"**State:** {full_state} ({state_abbrev})")
                st.write(f"**Plan Type:** {plan_type}")
                st.write(f"**Year:** {year}")

                county_options = county_data[county_data["stateName"] == full_state]["countyName"].tolist()
                selected_counties = st.multiselect("Select County(ies) for this plan", county_options, key=f"counties_{uploaded_file.name}")

                # Geocoded value is only a default; the user can overwrite it.
                default_coords = get_coordinates(city, state_abbrev)
                coords = st.text_input("City Center Coordinates (latitude, longitude)", value=default_coords, key=f"coords_{uploaded_file.name}")

                file_info[uploaded_file.name] = {
                    "uploaded_file": uploaded_file,
                    "city": city,
                    "state": state_abbrev,
                    "plan_type": plan_type,
                    "year": year,
                    "counties": selected_counties,
                    "coords": coords
                }
        form_submitted = st.form_submit_button("Process All Files")

    if form_submitted:
        if not api_key:
            st.error("Please provide the OpenAI API Key.")
        else:
            # Uploads rejected while building the form already count as problems.
            problems = len(uploaded_files) - len(file_info)
            required_fields = ("city", "state", "plan_type", "year", "counties", "coords")
            with st.spinner("Processing files..."):
                for file_name, info in file_info.items():
                    if not all(info[field] for field in required_fields):
                        st.error(f"Missing required fields for file {file_name}. Please fill in all fields.")
                        problems += 1
                        continue

                    county_str = ", ".join(info["counties"])
                    city = info["city"]
                    state_abbrev = info["state"]
                    plan_type = info["plan_type"]
                    year = info["year"]
                    coords = info["coords"]
                    uploaded_file = info["uploaded_file"]

                    out_file_name = f"{city}, {state_abbrev} {plan_type} {year}.pdf"
                    summary_file_name = f"{city}, {state_abbrev} {plan_type} {year}_Summary.md"
                    file_path = os.path.join("CAPS", out_file_name)

                    if os.path.exists(file_path):
                        st.error(f"File for {out_file_name} already exists. Skipping this file.")
                        problems += 1
                        continue

                    os.makedirs("CAPS", exist_ok=True)
                    with open(file_path, "wb") as f:
                        f.write(uploaded_file.getbuffer())
                    st.write(f"Saved {out_file_name} to CAPS folder.")

                    # NOTE(review): the API key is passed on the command line,
                    # where other local users may be able to read it from the
                    # process list. Changing that needs matching changes in
                    # the helper scripts.
                    file_steps = [
                        (
                            ["data_ingestion_helpers/city_county_mapping_addition.py", city, state_abbrev, county_str, coords],
                            f"City, State, County(s), and Coordinates added for {out_file_name}.",
                        ),
                        (
                            ["data_ingestion_helpers/summary_generation.py", api_key, file_path],
                            f"Summary generated for {out_file_name}.",
                        ),
                        (
                            ["data_ingestion_helpers/data_ingestion_vectorstores.py", api_key, out_file_name, summary_file_name],
                            f"Vector store created for {out_file_name}.",
                        ),
                        (
                            ["data_ingestion_helpers/dataset_addition.py", api_key, file_path],
                            f"Data added to dataset for {out_file_name}.",
                        ),
                    ]
                    # all() stops at the first failing step, so later steps do
                    # not run for a file whose earlier step failed.
                    if not all(_run_step(args, message) for args, message in file_steps):
                        st.error(f"Ingestion of {out_file_name} did not complete; remaining steps for this file were skipped.")
                        problems += 1

                # Run final batch scripts once after all files are processed.
                final_steps = [
                    (["batch_scripts/caps_directory_reader.py"], "CAPS directory reader executed."),
                    (["maps_helpers/maps_data.py"], "Maps data re-created."),
                    (["region_vectorstores.py", api_key], "Region vectorstores created."),
                ]
                for args, message in final_steps:
                    if not _run_step(args, message):
                        problems += 1

            if problems:
                st.warning(f"Processing finished with {problems} problem(s). See the messages above.")
            else:
                st.success("All files processed successfully!")