tyang4 committed
Commit 750a410 · verified · 1 Parent(s): 7641e44

Delete app.py

Files changed (1)
  1. app.py +0 -925
app.py DELETED
@@ -1,925 +0,0 @@
1
- import streamlit as st
2
- st.set_page_config(
3
- page_title="🔬 Explainable Multi-Agent BioData Constructor",
4
- layout="centered",
5
- initial_sidebar_state="collapsed"
6
- )
7
- from neo4j import GraphDatabase
8
- import openai
9
- import pandas as pd
10
- import os
11
- import re
12
- import hashlib
13
- import json
14
- import pydeck as pdk
15
- import faiss
16
- import numpy as np
17
- from sklearn.preprocessing import normalize
18
- from transformers import AutoTokenizer, AutoModel
19
- import torch
20
- import ast
21
- import textwrap
22
- import requests
23
-
24
- # ============================== CONFIGURATION ==============================
25
- NEO4J_URI = st.secrets["NEO4J_URI"]
26
- NEO4J_USERNAME = st.secrets["NEO4J_USERNAME"]
27
- NEO4J_PASSWORD = st.secrets["NEO4J_PASSWORD"]
28
- openai.api_key = st.secrets["openai_api_key"]
29
-
30
- # ============================== DOWNLOAD ==============================
31
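- # fetch the prebuilt node/relationship CSVs and their embedding matrices into /tmp on first run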
- def download_if_missing(url, local_path):
32
- if not os.path.exists(local_path):
33
- with open(local_path, "wb") as f:
34
- f.write(requests.get(url).content)
35
-
36
- base_url = "https://github.com/Tianyu-yang-anna/EcoData-collector/releases/download/v1.0"
37
- files = {
38
- "nodes.csv": "/tmp/nodes.csv",
39
- "nodes_embeddings.npy": "/tmp/nodes_embeddings.npy",
40
- "relationships.csv": "/tmp/relationships.csv",
41
- "relationships_embeddings.npy": "/tmp/relationships_embeddings.npy"
42
- }
43
-
44
- for fname, path in files.items():
45
- download_if_missing(f"{base_url}/{fname}", path)
46
-
47
- # ============================== NEO4J DRIVER ==============================
48
- @st.cache_resource(show_spinner=False)
49
- def create_driver():
50
- try:
51
- driver = GraphDatabase.driver(
52
- NEO4J_URI,
53
- auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
54
- )
55
- with driver.session() as session:
56
- session.run("RETURN 1")
57
- return driver
58
- except Exception as e:
59
- st.error(f"🔴 Neo4j connection failed: {e}")
60
- return None
61
-
62
- driver = create_driver()
63
- # ============================== SIMPLE GPT HELPER ==============================
64
- openai_client = openai.OpenAI(api_key=openai.api_key)
65
-
66
- def gpt_chat(sys_msg: str, user_msg: str, **kwargs):
67
- rsp = openai_client.chat.completions.create(
68
- model="gpt-4o",
69
- messages=[{"role": "system", "content": sys_msg}, {"role": "user", "content": user_msg}],
70
- **kwargs
71
- )
72
- return rsp.choices[0].message.content.strip()
73
-
74
- # ============================== EMBEDDING ENCODER ==============================
75
- class SimpleEncoder:
76
- def __init__(self):
77
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
78
- self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
79
- self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").to(self.device)
80
- self.model.eval()
81
-
82
- def encode(self, texts, batch_size: int = 16):
83
- embeddings = []
84
- for i in range(0, len(texts), batch_size):
85
- batch = texts[i : i + batch_size]
86
- with torch.no_grad():
87
- inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(self.device)
88
- outputs = self.model(**inputs)
89
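- # mean-pool the token embeddings into one fixed-size vector per input text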
- batch_emb = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
90
- embeddings.append(batch_emb)
91
- return np.vstack(embeddings)
92
-
93
-
94
- @st.cache_resource(show_spinner=False)
95
- def get_encoder():
96
- return SimpleEncoder()
97
-
98
- # ============================== FAISS INDEX LOADING ==============================
99
- csv_file_pairs = [
100
- ("/tmp/nodes.csv", "/tmp/nodes_embeddings.npy"),
101
- ("/tmp/relationships.csv", "/tmp/relationships_embeddings.npy"),
102
- ]
103
-
104
- for csv_path, npy_path in csv_file_pairs:
105
- if not os.path.exists(npy_path):
106
- st.error(f"❌ Embedding file not found: {npy_path}")
107
- st.stop()
108
-
109
- @st.cache_resource(show_spinner=False)
110
- def load_embeddings_and_faiss_indexes(file_pairs):
111
- index_list, metadatas = [], []
112
- for csv_path, npy_path in file_pairs:
113
- try:
114
- df = pd.read_csv(csv_path).fillna("")
115
- emb = np.load(npy_path).astype("float32")
116
- index = faiss.IndexFlatIP(emb.shape[1])
117
- if faiss.get_num_gpus() > 0:
118
- res = faiss.StandardGpuResources()
119
- index = faiss.index_cpu_to_gpu(res, 0, index)
120
- index.add(emb)
121
- index_list.append(index)
122
- metadatas.append(df)
123
- except Exception as e:
124
- st.warning(f"⚠️ Failed to load {csv_path} / {npy_path}: {e}")
125
- index_list.append(None)
126
- metadatas.append(pd.DataFrame())
127
- return index_list, metadatas
128
-
129
- csv_faiss_indexes, csv_metadatas = load_embeddings_and_faiss_indexes(csv_file_pairs)
130
-
131
- # ============================== DATAFRAME UTILITIES ==============================
132
-
133
- def flatten_props(df: pd.DataFrame) -> pd.DataFrame:
134
- if "props" not in df.columns:
135
- return df
136
- try:
137
- props_df = df["props"].apply(ast.literal_eval).apply(pd.Series)
138
- out = pd.concat([df.drop(columns=["props"]), props_df], axis=1)
139
- # st.write("✅ props flattened, new columns:", list(props_df.columns))
140
- return out
141
- except Exception as e:
142
- st.warning(f"⚠️ Failed to parse props column: {e}")
143
- return df
144
-
145
- def unpack_singletons(df: pd.DataFrame) -> pd.DataFrame:
146
- for col in df.columns:
147
- if df[col].apply(lambda x: isinstance(x, (list, tuple)) and len(x) == 1).any():
148
- df[col] = df[col].apply(lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) == 1 else x)
149
- return df
150
-
151
- def standardize_latlon(df: pd.DataFrame) -> pd.DataFrame:
152
- """
153
- Normalize column names to latitudes / longitudes
154
- If duplicate columns share a name, keep the first and drop the rest
155
- Keep longitudes in place and move latitudes to its immediate right
156
- """
157
- # ---------- 1) Normalize column names ----------
158
- col_map = {}
159
- for col in df.columns:
160
- low = col.lower()
161
- if "lat" in low and "lon" not in low:
162
- col_map[col] = "latitudes"
163
- elif ("lon" in low or "lng" in low):
164
- col_map[col] = "longitudes"
165
- df = df.rename(columns=col_map)
166
-
167
- # ---------- 2) Handle duplicate columns ----------
168
- # after the rename, identical column names can collide; keep only the first occurrence of each
169
- while df.columns.duplicated().any():
170
- dup_col = df.columns[df.columns.duplicated()][0]
171
- # keep the first occurrence and drop every other column with the same name
172
- first_idx = list(df.columns).index(dup_col)
173
- keep = [True] * len(df.columns)
174
- for i, c in enumerate(df.columns):
175
- if c == dup_col and i != first_idx:
176
- keep[i] = False
177
- df = df.loc[:, keep]
178
-
179
- # ---------- 3) Convert to numeric ----------
180
- for c in ("latitudes", "longitudes"):
181
- if c in df.columns and not isinstance(df[c], pd.Series):
182
- # if duplicates slipped through, df[c] may still be a DataFrame; take its first column
183
- df[c] = df[c].iloc[:, 0]
184
- if c in df.columns:
185
- df[c] = df[c].apply(
186
- lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) == 1 else x
187
- )
188
- df[c] = pd.to_numeric(df[c], errors="coerce")
189
-
190
- # ---------- 4) Reorder: put latitudes right after longitudes ----------
191
- if {"longitudes", "latitudes"}.issubset(df.columns):
192
- cols = list(df.columns)
193
- lon_idx = cols.index("longitudes")
194
- lat_idx = cols.index("latitudes")
195
- if lat_idx != lon_idx + 1:
196
- cols.pop(lat_idx)
197
- cols.insert(lon_idx + 1, "latitudes")
198
- df = df[cols]
199
-
200
- return df
201
-
202
-
203
-
204
- # ===== CSV fallback retrieval =====
205
- @st.cache_data(show_spinner=False)
206
- def rag_csv_fallback(subtask, top_k=2000):
207
- encoder = get_encoder()
208
- query_vec = encoder.encode([subtask])
209
- query_vec = normalize(query_vec, axis=1).astype("float32")
210
- if not np.any(query_vec):
211
- return pd.DataFrame()
212
- all_results = []
213
- for index, metadata in zip(csv_faiss_indexes, csv_metadatas):
214
- if index is None or metadata.empty:
215
- continue
216
- distances, indices = index.search(query_vec, top_k)
217
- retrieved = metadata.iloc[indices[0]].copy()
218
- all_results.append(retrieved)
219
- if all_results:
220
- return pd.concat(all_results).drop_duplicates().reset_index(drop=True)
221
- return pd.DataFrame()
222
-
223
-
224
-
225
- def generate_cypher_with_gpt(subtask: str) -> str:
226
- prompt = f"""
227
- You are an expert Cypher query generator for a Neo4j biodiversity database. The schema is as follows:
228
-
229
- Node Types and Properties:
230
- - Observation: animal_name, date, latitude, longitude
231
- - Species: name, species_full_name
232
- - Site: name
233
- - County: name
234
- - State: name
235
- - Hurricane: name
236
- - Policy: title, description
237
- - ClimateEvent: event_type, date
238
- - TemperatureReading: value, date, location
239
- - Precipitation: amount, date, location
240
-
241
- Relationship Types:
242
- - OBSERVED_IN: (Observation)-[:OBSERVED_IN]->(Site)
243
- - OBSERVED_ORGANISM: (Observation)-[:OBSERVED_ORGANISM]->(Species)
244
- - BELONGS_TO: (Site)-[:BELONGS_TO]->(County)
245
- - IN_COUNTY: (Observation)-[:IN_COUNTY]->(County)
246
- - IN_STATE: (County)-[:IN_STATE]->(State)
247
- - interactsWith: (Species)-[:interactsWith]->(Species)
248
- - preysOn: (Species)-[:preysOn]->(Species)
249
-
250
- Your task is to generate a **precise and efficient** Cypher query for the following subtask:
251
- "{subtask}"
252
-
253
- Guidelines:
254
- - Do NOT return all nodes of a type (e.g., all Species) unless the subtask explicitly asks for it.
255
- - If a location (county/state) is mentioned or implied, include location filtering using IN_COUNTY, IN_STATE, or BELONGS_TO.
256
- - If the subtask implies a taxonomic or common name group (e.g., frog, snake, salmon), apply CONTAINS or STARTS WITH filters on Species.name or species_full_name, using toLower(...) for case-insensitive matching.
257
- - If the subtask includes a time range, include date filtering.
258
- - Prefer using DISTINCT to avoid redundant results.
259
- - Only return fields that are clearly needed to fulfill the subtask.
260
-
261
- Return your response strictly as a **JSON object** with the following fields:
262
- - "intent": a short description of what the query does
263
- - "cypher_query": the Cypher query
264
- - "fields": a list of returned field names (e.g., ["species", "county", "date"])
265
-
266
- Do not include any explanation or commentary—only return the JSON object.
267
- """
268
- client = openai.OpenAI(api_key=st.secrets["openai_api_key"])
269
- response = client.chat.completions.create(
270
- model="gpt-4o",
271
- messages=[{"role": "user", "content": prompt}],
272
- temperature=0
273
- )
274
- content = response.choices[0].message.content.strip()
275
- content = re.sub(r"^(json|python)?", "", content, flags=re.IGNORECASE).strip()
276
- content = re.sub(r"$", "", content).strip()
277
-
278
- try:
279
- cypher_json = json.loads(content)
280
- return cypher_json["cypher_query"]
281
- except Exception as e:
282
- return ""
283
-
284
-
285
- def intelligent_retriever_agent(subtask, saved_hashes=None):
286
- if saved_hashes is None:
287
- saved_hashes = set()
288
- st.success("🔍 Attempting to retrieve data from the KN-Wildlife knowledge graph…")
289
- cypher_query = generate_cypher_with_gpt(subtask)
290
- cypher_df = pd.DataFrame()
291
- if cypher_query.strip():
292
- st.code(cypher_query, language="cypher")
293
- try:
294
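- # strip any trailing LIMIT clause so the full result set comes back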
- query = re.sub(r"(?i)LIMIT\s+\d+\s*$", "", cypher_query)
295
- with driver.session() as session:
296
- result = session.run(query)
297
- cypher_df = pd.DataFrame(result.data())
298
- except Exception as e:
299
- st.error(f"🚨 Cypher execution error: {e}")
300
- st.code(query, language="cypher")
301
- # decide fallback
302
- fallback_needed = False
303
- if cypher_df.empty:
304
- # st.warning("⚠️ Cypher query returned no data. Trying CSV fallback…")
305
- fallback_needed = True
306
- else:
307
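- # fingerprint the result; a duplicate of an earlier dataset or a tiny result triggers the CSV fallback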
- df_hash = hashlib.md5(cypher_df.to_csv(index=False).encode()).hexdigest()
308
- st.write(f"ℹ️ Cypher rows: {len(cypher_df)} | duplicate?: {df_hash in saved_hashes}")
309
- if df_hash in saved_hashes or len(cypher_df) < 10:
310
- fallback_needed = True
311
- if fallback_needed:
312
- csv_df = rag_csv_fallback(subtask)
313
- if not csv_df.empty:
314
- csv_df = flatten_props(csv_df)
315
- csv_df = unpack_singletons(csv_df)
316
- csv_df = standardize_latlon(csv_df)
317
- # st.success("✅ CSV fallback successful.")
318
- return csv_df
319
- st.warning("❌ CSV fallback also returned nothing.")
320
- return pd.DataFrame()
321
- # good cypher
322
- st.success("✅ Cypher query successful. Using Cypher result.")
323
- cypher_df = flatten_props(cypher_df)
324
- cypher_df = unpack_singletons(cypher_df)
325
- cypher_df = standardize_latlon(cypher_df)
326
- if "species" not in cypher_df.columns and "animal_name" in cypher_df.columns:
327
- cypher_df["species"] = cypher_df["animal_name"]
328
- if "date" in cypher_df.columns:
329
- cypher_df["date"] = pd.to_datetime(cypher_df["date"], errors="coerce")
330
- cypher_df.rename(columns={"latitudes": "latitude", "longitudes": "longitude", "lat": "latitude", "lon": "longitude"}, inplace=True)
331
- for col in ("latitude", "longitude"):
332
- if col in cypher_df.columns:
333
- cypher_df[col] = pd.to_numeric(cypher_df[col], errors="coerce")
334
- return cypher_df
335
-
336
-
337
- def planner_agent(question: str) -> str:
338
- prompt = f"""
339
- You are a **research‑data planning assistant**.
340
-
341
- ------------------------ 📝 TASK ------------------------
342
- Your job is to list the **separate data sets** a researcher must collect
343
- to answer the research question below.
344
-
345
- *Each data set* should be focused on one clearly defined entity or
346
- phenomenon (e.g. "Tracks of hurricanes affecting Florida since 1950",
347
- "Geo‑tagged snake observations in Florida 2000‑present").
348
-
349
- -------------------- 📋 OUTPUT FORMAT --------------------
350
- Write 1–6 blocks. For **each** block use *both* lines exactly:
351
-
352
- Dataset Need X: <Concise title, ≤ 10 words>
353
- Description: <Why this data matters—1 short sentence>
354
-
355
- ⚠️ Do NOT add extra lines or markdown.
356
- ⚠️ Keep variable names short; no code blocks; no quotes.
357
-
358
- -------------------- 🔍 RESEARCH QUESTION --------------------
359
- {question}
360
- """
361
- rsp = openai_client.chat.completions.create(
362
- model="gpt-4o",
363
- messages=[
364
- {"role": "system", "content": "You are an expert research planner."},
365
- {"role": "user", "content": prompt}
366
- ],
367
- temperature=0.2
368
- )
369
- return rsp.choices[0].message.content.strip()
370
-
371
-
372
-
373
- def evaluate_dataset_with_gpt(subtask: str, df: pd.DataFrame, client=openai_client) -> str:
374
- max_columns = 50
375
- selected_cols = df.columns[:max_columns]
376
- column_info = {col: str(df[col].dtype) for col in selected_cols}
377
- sample_rows = df.head(3)[selected_cols].to_dict(orient="records") # take 3 example rows
378
-
379
- prompt = f"""
380
- You are a data‑validation assistant. Decide whether the dataset below is useful for the research subtask.
381
-
382
- ===== TASK =====
383
- Subtask: "{subtask}"
384
-
385
- ===== DATASET PREVIEW =====
386
- Schema (first {len(selected_cols)} columns):
387
- {json.dumps(column_info, indent=2)}
388
- Sample rows (3 max):
389
- {json.dumps(sample_rows, indent=2)}
390
-
391
- ===== OUTPUT INSTRUCTIONS (follow strictly) =====
392
- Case A – Relevant:
393
- • Write exactly two sentences, each no more than 30 words.
394
- • Summarize what the dataset contains and why it helps the subtask.
395
- • Do not mention column names or list individual rows.
396
-
397
- Case B – Not relevant:
398
- • Write one or two sentences, each no more than 30 words, **describing only what the dataset contains**.
399
- • Do **not** mention the subtask, relevance, suitability, limitations, or missing information (avoid phrases like “not related,” “does not focus,” “irrelevant,” etc.).
400
- • After the sentences, output the header **Additionally, here are some external resources you might find helpful:** on a new line. Format your output in markdown as:
401
- - [Name of Source](URL)
402
- • Then list 2–3 bullet points, each on its own line, starting with “- ” followed immediately by a URL likely to contain the needed data.
403
- • No additional commentary.
404
-
405
-
406
-
407
- General rules:
408
- Plain text only — no code fences. Markdown link syntax (`[text](url)`) is allowed.
409
- """
410
-
411
- rsp = client.chat.completions.create(
412
- model="gpt-4o",
413
- messages=[{"role": "user", "content": prompt}],
414
- temperature=0.3,
415
- )
416
- return rsp.choices[0].message.content.strip()
417
-
418
- # def evaluate_dataset_with_gpt(subtask: str, df: pd.DataFrame,client=openai_client) -> str:
419
- # # select only the first N columns to avoid an overly long prompt
420
- # max_columns = 10
421
- # selected_columns = df.columns[:max_columns]
422
-
423
- # # extract column names and their dtypes
424
- # column_info = {col: str(df[col].dtype) for col in selected_columns}
425
-
426
- # # take a few sample rows
427
- # sample_data = df.head(50)[selected_columns].to_dict(orient="records")
428
-
429
- # # build the prompt
430
- # prompt = f"""
431
- # You are a data validation assistant. Your task is to summarize what this dataset represents.
432
-
433
- # Subtask: {subtask}
434
-
435
- # Here are the dataset's column names and data types:
436
- # {json.dumps(column_info, indent=2)}
437
-
438
- # Here are a few sample rows:
439
- # {json.dumps(sample_data, indent=2)}
440
-
441
- # Your response should be concise (2-3 sentences).
442
- # Focus on the dataset's content and how it might help with the subtask.
443
- # Do not list column names or describe individual rows.
444
- # Your output format is as follows:
445
- # If you judge the data relevant to the data needed, output 2-3 sentences introducing the dataset.
446
- # If you judge the data not relevant to the data needed, output 2-4 links to external resources.
447
- # """
448
- # # call GPT-4o
449
- # rsp = client.chat.completions.create(
450
- # model="gpt-4o",
451
- # messages=[{"role": "user", "content": prompt}],
452
- # temperature=0.3
453
- # )
454
- # return rsp.choices[0].message.content.strip()
455
-
456
-
457
-
458
-
459
-
460
- def external_resource_recommender(subtask: str, client=openai_client) -> str:
461
- prompt = f"""
462
- You are a helpful assistant for researchers. Please recommend 3 reliable and relevant online datasets or websites that can help with the following subtask:
463
-
464
- "{subtask}"
465
-
466
- Format your output in markdown as:
467
- - [Name of Source](URL)
468
- - [Name of Source](URL)
469
- - [Name of Source](URL)
470
- """
471
- rsp = client.chat.completions.create(
472
- model="gpt-4o",
473
- messages=[{"role": "user", "content": prompt}],
474
- temperature=0.3
475
- )
476
- return rsp.choices[0].message.content.strip()
477
-
478
-
479
-
480
- def fallback_query_router(subtask: str, driver) -> pd.DataFrame:
481
- text = subtask.lower()
482
-
483
- with driver.session() as session:
484
-
485
- # --- 1. Species "where … observed/found …" ---
486
- if "where" in text and ("observed" in text or "found" in text):
487
- query = """
488
- MATCH (o:Observation)-[:OBSERVED_ORGANISM]->(s:Species)
489
- RETURN s.name AS species, o.site_name AS location, o.date AS date
490
- ORDER BY o.date DESC
491
- """
492
-
493
- # --- 2. before / after a given year ---
494
- elif "before" in text or "after" in text:
495
- years = re.findall(r'\b(?:19|20)\d{2}\b', text)  # non-capturing group so findall returns full years
496
- if years:
497
- op = "<" if "before" in text else ">"
498
- query = f"""
499
- MATCH (o:Observation)-[:OBSERVED_ORGANISM]->(s:Species)
500
- WHERE o.date {op} date('{years[0]}-01-01')
501
- RETURN s.name AS species, o.site_name AS location, o.date AS date
502
- ORDER BY o.date DESC
503
- """
504
- else:
505
- query = "RETURN 1"
506
-
507
- # --- 3. Hurricane-related ---
508
- elif "hurricane" in text:
509
- query = """
510
- MATCH (o:Observation)-[:OBSERVED_AT]->(h:Hurricane),
511
- (o)-[:OBSERVED_ORGANISM]->(s:Species),
512
- (o)-[:OBSERVED_IN]->(site)-[:BELONGS_TO]->(c:County)-[:IN_STATE]->(st:State)
513
- WHERE st.name = 'Florida'
514
- RETURN h.name AS hurricane,
515
- s.name AS species,
516
- site.name AS site,
517
- c.name AS county,
518
- o.date AS date
519
- ORDER BY o.date DESC
520
- """
521
-
522
- # --- 4. Predation / predator ---
523
- elif "preys on" in text or "predator" in text:
524
- query = """
525
- MATCH (s1:Species)-[:preysOn]->(s2:Species)
526
- RETURN s1.name AS predator, s2.name AS prey
527
- """
528
-
529
- # --- 5. Default fallback ---
530
- else:
531
- query = """
532
- MATCH (o:Observation)
533
- RETURN o.animal_name AS species, o.site_name AS location, o.date AS date
534
- """
535
-
536
- # --- Execute the query ---
537
- result = session.run(query)
538
- df = pd.DataFrame(result.data())
539
-
540
- if df.empty:
541
- st.info("🌐 I couldn't find relevant data in KN‑Wildlife. Let me check external sources for you...")
542
- suggestions = external_resource_recommender(subtask)
543
- st.markdown(suggestions)
544
-
545
- return df
546
-
547
-
548
- def save_dataset(df: pd.DataFrame, filename: str) -> str:
549
- if len(df) < 10:
550
- st.warning(f"❌ Dataset too small to save: only {len(df)} rows.")
551
- return ""
552
- os.makedirs("saved_datasets", exist_ok=True)
553
- path = f"saved_datasets/{filename}.csv"
554
- if os.path.exists(path):
555
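- # compare content hashes so an unchanged dataset is not rewritten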
- old_hash = hashlib.md5(open(path, 'rb').read()).hexdigest()
556
- new_hash = hashlib.md5(df.to_csv(index=False).encode()).hexdigest()
557
- if old_hash == new_hash:
558
- st.info(f"ℹ️ Dataset saved: {filename}.csv")
559
- return path
560
- df.to_csv(path, index=False)
561
- st.info(f"✅ Dataset saved: {filename}.csv")
562
- return path
563
- # ===================== CHART SUGGESTION (MODIFIED MAP SECTION) =====================
564
-
565
- def suggest_charts_with_gpt(df: pd.DataFrame) -> str:
566
- """Generate Streamlit chart code for automatic visualisation."""
567
- try:
568
- # st.write("🟢 COLS‑DEBUG:", list(df.columns))
569
-
570
- # Ensure dates are parsed
571
- if "date" in df.columns:
572
- df["date"] = df["date"].apply(lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) == 1 else x)
573
- df["date"] = pd.to_datetime(df["date"], errors="coerce")
574
-
575
- if "animal_name" in df.columns and "species" not in df.columns:
576
- df["species"] = df["animal_name"]
577
-
578
- df.rename(columns={"latitudes": "latitude", "longitudes": "longitude"}, inplace=True)
579
-
580
- chart_code = """
581
- # --- Species Bar Chart ---
582
- if 'species' in df.columns:
583
- st.markdown('📊 Count of Observations by Species')
584
- try:
585
- species_counts = df['species'].astype(str).value_counts()
586
- st.bar_chart(species_counts)
587
- except Exception as e:
588
- st.warning(f'⚠️ Could not render species chart: {e}')
589
-
590
- # --- Timeline Line Chart ---
591
- if 'date' in df.columns:
592
- st.markdown('📈 Observations Over Time')
593
- try:
594
- timeline = df['date'].dropna().value_counts().sort_index()
595
- st.line_chart(timeline)
596
- except Exception as e:
597
- st.warning(f'⚠️ Could not render date chart: {e}')
598
-
599
- # --- Map Visualisation (highlight all points) ---
600
- if 'latitude' in df.columns and 'longitude' in df.columns:
601
- st.markdown('🗺️ Observation Locations on Map')
602
- try:
603
- coords = (
604
- df[['latitude', 'longitude']]
605
- .apply(pd.to_numeric, errors='coerce')
606
- .dropna()
607
- .rename(columns={'latitude': 'lat', 'longitude': 'lon'})
608
- )
609
- coords = coords[
610
- (coords['lat'].between(-90, 90)) &
611
- (coords['lon'].between(-180, 180))
612
- ]
613
- if len(coords) == 0:
614
- st.warning('⚠️ No valid coordinates to plot on the map.')
615
- else:
616
- # ---------- 1) View state ----------
617
- try:
618
- vs_tmp = pdk.data_utils.compute_view(coords[['lon', 'lat']])
619
- view_state = (
620
- pdk.ViewState(**vs_tmp, pitch=0, bearing=0)
621
- if isinstance(vs_tmp, dict) else vs_tmp
622
- )
623
- view_state.pitch = 0
624
- view_state.bearing = 0
625
- except Exception:
626
- view_state = pdk.ViewState(
627
- latitude=coords['lat'].mean(),
628
- longitude=coords['lon'].mean(),
629
- zoom=5,
630
- pitch=0,
631
- bearing=0,
632
- )
633
-
634
- # ---------- 2) Highlight layer ----------
635
- layer = pdk.Layer(
636
- 'ScatterplotLayer',
637
- data=coords,
638
- get_position='[lon, lat]',
639
- get_radius=50000,
640
- get_fill_color=[0, 255, 0, 200],
641
- get_line_color=[255, 255, 255],
642
- line_width_units='pixels',
643
- get_line_width=2,
644
- pickable=True,
645
- auto_highlight=True,
646
- )
647
-
648
- # ---------- 3) Assemble the Deck ----------
649
- deck = pdk.Deck(
650
- layers=[layer],
651
- initial_view_state=view_state,
652
- map_style='mapbox://styles/mapbox/light-v11',
653
- tooltip={'html': '<b>Lat:</b> {lat}<br/><b>Lon:</b> {lon}'},
654
- )
655
- st.pydeck_chart(deck)
656
- except Exception as e:
657
- st.warning(f'⚠️ Could not render map: {e}')
658
- """
659
- return textwrap.dedent(chart_code)
660
- except Exception as outer_error:
661
- return f"st.warning('❌ Chart generation failed: {outer_error}')"
662
-
663
-
664
-
665
-
666
- # ========= UI layout and connection ==========
667
- if "chat_history" not in st.session_state:
668
- st.session_state.chat_history = []
669
-
670
- # st.set_page_config(
671
- # page_title="🔬 Explainable Multi-Agent BioData Constructor",
672
- # layout="centered",
673
- # initial_sidebar_state="collapsed"
674
- # )
675
-
676
- # ——— Customize the main container width and font size ———
677
- st.markdown(
678
- """
679
- <style>
680
- /* body text */
681
- html, body, .block-container, .markdown-text-container {
682
- font-size: 19px !important; /* ← adjust this number */
683
- line-height: 1.6 !important;
684
- }
685
- /* widen the default max-width (about 700px) to 1600px; adjust as needed */
686
- .block-container {
687
- max-width: 1600px;
688
- }
689
- </style>
690
- """,
691
- unsafe_allow_html=True
692
- )
693
-
694
- st.title("🔬 EcoData collector")
695
-
696
-
697
- st.success("""
698
- 👋 Hi there! I’m **Lily**, your research assistant bot 🤖. I’m here to help you explore data sources related to your **complex research question**. Let’s work together to find the information you need!
699
-
700
- 💡 You can start by entering a research question like:
701
-
702
- - *In Florida, how do hurricanes affect the distribution of snakes?*
703
- - *How does precipitation impact salmon abundance in freshwater ecosystems?*
704
- - *How do climate change and urbanization jointly affect bird migration and diversity in Florida?*
705
- """)
706
-
707
- if driver:
708
- st.success("🟢 Connected to **KN-Wildlife** — a Neo4j-powered biodiversity graph focused on Florida’s species and ecosystems. I’ll start by checking what relevant data we already have in KN-Wildlife to support your research.")
709
-
710
- else:
711
- st.error("🔴 Failed to connect to KN-Wildlife! Please fix connection first.")
712
- st.stop()
713
-
714
- question = st.text_area("Enter your research question:", "")
715
-
716
- # Initialize session-state variables
717
- if "start_clicked" not in st.session_state:
718
- st.session_state.start_clicked = False
719
- if "subtask_plan" not in st.session_state:
720
- st.session_state.subtask_plan = ""
721
- if "ready_to_continue" not in st.session_state:
722
- st.session_state.ready_to_continue = False
723
- if "stop_requested" not in st.session_state:
724
- st.session_state.stop_requested = False
725
- if "visualization_ready" not in st.session_state:
726
- st.session_state.visualization_ready = False
727
- if "do_visualize" not in st.session_state:
728
- st.session_state.do_visualize = False
729
- if "all_dataframes" not in st.session_state:
730
- st.session_state.all_dataframes = []
731
- if "retrieval_done" not in st.session_state:
732
- st.session_state.retrieval_done = False
733
-
734
- # Button click triggers subtask decomposition
735
- if st.button("Let’s start") and question.strip():
736
- st.session_state.start_clicked = True
737
- st.session_state.subtask_plan = planner_agent(question)
738
- st.session_state.ready_to_continue = False
739
- st.session_state.stop_requested = False
740
- st.session_state.visualization_ready = False
741
- st.session_state.do_visualize = False
742
- st.session_state.all_dataframes = []
743
- st.session_state.retrieval_done = False
744
-
745
- # Stage 1: show the subtasks
746
- if st.session_state.start_clicked:
747
- # st.success("🧠 Now, I’ll break down your research question into several focused subtasks.")
748
- st.success("🧠 I’ve identified the distinct datasets you’ll need for this research question.")
749
- with st.expander("🔹 Curious how I split your question? Click to see!", expanded=True):
750
- st.write(st.session_state.subtask_plan)
751
-
752
- st.success("📌 I’m ready to roll up my sleeves — shall I start finding datasets for each subtask? 🕒 This step might take a little while, so thanks for your patience!")
753
-
754
- col1, col2 = st.columns([1, 1])
755
- with col1:
756
- if st.button("✅ Yes, go ahead", key="confirm_button"):
757
- st.session_state.ready_to_continue = True
758
- st.session_state.stop_requested = False
759
- with col2:
760
- if st.button("⛔ No, stop here", key="stop_button"):
761
- st.session_state.ready_to_continue = False
762
- st.session_state.stop_requested = True
763
-
764
-
765
- # ---------- Stage 2: data retrieval & rendering ----------
766
- if st.session_state.ready_to_continue:
767
-
768
- # 1) Determine which prefix the planner output uses
770
- # assume only two possibilities here: Subtask / Dataset Need
770
- if "Dataset Need" in st.session_state.subtask_plan:
771
- prefix = "Dataset Need"
772
- else:
773
- prefix = "Subtask"
774
-
775
- # 2) Build the regex with an f-string (rf = raw-formatted)
776
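- # non-greedy match from one "<prefix> N:" header up to the next header or the end of the text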
- pattern = rf"{prefix} \d+:.*?(?={prefix} \d+:|$)"
777
- subtasks = re.findall(pattern,
778
- st.session_state.subtask_plan,
779
- flags=re.DOTALL)
780
-
781
- # Warn if the planner produced no blocks
782
- if not subtasks:
783
- st.warning("⚠️ No dataset blocks detected in planner output.")
784
- st.stop()
785
-
786
- # Run retrieval only once
787
- if not st.session_state.retrieval_done: # ★
788
- progress_bar = st.progress(0)
789
- total = len(subtasks)
790
- saved_hashes = set()
791
- st.session_state.all_dataframes = []
792
-
793
-
794
- for idx, subtask in enumerate(subtasks):
795
- # with st.expander(f"🔹 Retrieving data for subtask {idx+1}:", expanded=True):
796
- with st.expander(f"🔹 Retrieving data for dataset need {idx+1}:", expanded=True):
797
- cleaned_subtask = "\n".join(subtask.strip().split("\n")[1:])
798
- st.markdown(cleaned_subtask)
799
-
800
- # ---------- First run: actually retrieve ----------
801
- if not st.session_state.retrieval_done: # ★
802
- df = intelligent_retriever_agent(subtask, saved_hashes)
803
-
804
- if not df.empty:
805
- df_hash = hashlib.md5(df.to_csv(index=False).encode()).hexdigest()
806
- if df_hash in saved_hashes:
807
- st.warning("⚠️ This dataset has already been saved — skipping duplicate.")
808
- elif len(df) < 10:
809
- st.warning(f"❌ This dataset is too small — just {len(df)} rows. Skipping save.")
810
- else:
811
- saved_hashes.add(df_hash)
812
- df = flatten_props(df)
813
- df = standardize_latlon(df)
814
- summary = evaluate_dataset_with_gpt(subtask, df)
815
- st.session_state.all_dataframes.append({
816
- "hash": df_hash,
817
- "df": df,
818
- "summary": summary
819
- })
820
- st.dataframe(df.head(50))
821
- save_path = save_dataset(df, f"subtask_{idx+1}")
822
- if save_path:
823
- # summary = evaluate_dataset_with_gpt(subtask, df)
824
- st.markdown("**📝 Dataset Introduction:**")
825
- st.write(summary)
826
- if 'progress_bar' in locals():
827
- progress_bar.progress((idx + 1) / total)
828
-
829
- # ---------- Later reruns: display only ----------
830
- else: # ★
831
- if idx < len(st.session_state.all_dataframes):
832
- # _hash, df = st.session_state.all_dataframes[idx]
833
- # df = standardize_latlon(df)
834
- # st.dataframe(df.head(50))
835
- entry = st.session_state.all_dataframes[idx]  # ➕ new entry
836
- df = standardize_latlon(entry["df"])
837
- st.dataframe(df.head(50))
838
- st.write(entry.get("summary", ""))
839
-
840
- # Mark retrieval as done once finished
841
- if not st.session_state.retrieval_done: # ★
842
- st.session_state.retrieval_done = True
843
- st.session_state.visualization_ready = bool(st.session_state.all_dataframes)
844
-
845
-
846
-
847
- if st.session_state.all_dataframes:
848
- st.session_state.visualization_ready = True
849
- else:
850
- st.success("🎉 All subtasks completed and datasets generated!💡 Feel free to ask me more questions anytime!")
851
- # st.success("🎉 All subtasks completed and datasets generated!")
852
- # st.success("💡 Feel free to ask Lily more questions anytime!")
853
-
854
- # Stage 3: ask whether to visualize
855
- if st.session_state.visualization_ready and not st.session_state.do_visualize:
856
- st.success("📊 All set! I’ve gathered the datasets. Ready to visualize them?")
857
-
858
- col1, col2 = st.columns([1, 1])
859
- with col1:
860
- if st.button("✅ Yes, go ahead", key="viz_confirm"):
861
- st.session_state.do_visualize = True
862
- with col2:
863
- if st.button("⛔ No, stop here", key="viz_stop"):
864
- st.session_state.visualization_ready = False
865
- st.success("🎉 All subtasks completed and datasets generated!💡 Feel free to ask me more questions anytime!")
866
- # st.success("🎉 All subtasks completed and datasets generated!")
867
- # st.success("💡 Feel free to ask Lily more questions anytime!")
868
-
869
- # Stage 3: data visualization
870
- if st.session_state.do_visualize:
871
- for i, entry in enumerate(st.session_state.all_dataframes):
872
- df = entry["df"]
873
- summary = entry.get("summary", "")
874
- if len(df) < 10:
875
- continue
876
- with st.expander(f"**🔹 Dataset {i + 1} Visualization**", expanded=True):
877
- st.markdown(f"Dataset {i + 1} Preview")
878
- st.dataframe(df.head(10))
879
- chart_code = suggest_charts_with_gpt(df)
880
- if chart_code:
881
- # st.markdown("🧠 The visualization code:")
882
- # st.code(chart_code, language="python")
883
- try:
884
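- # run the generated chart code with only st, pd, df, and pdk in scope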
- exec(chart_code, {"st": st, "pd": pd, "df": df, "pdk": pdk})
885
- except Exception as e:
886
- st.error(f"❌ Error running chart code: {e}")
887
-
888
- st.success("🎉 All subtasks completed and datasets generated!💡 Feel free to ask me more questions anytime!")
889
- # st.success("💡 Feel free to ask me more questions anytime!")
890
-
891
- if st.session_state.stop_requested:
892
- st.info("👍 No problem! You can review the subtasks above or revise your question.")
893
-
894
-
895
-
896
- # —— Sidebar: ChatGPT-style chat panel ——
897
- with st.sidebar.expander("💬 Chat with Lily", expanded=True):
898
- # Chat input box
899
- user_msg = st.chat_input("Type your question here…", key="sidebar_chat_input")
900
- if user_msg:
901
- # Assemble the current on-screen context
902
- context_parts = []
903
- if st.session_state.subtask_plan:
904
- context_parts.append("Subtasks:\n" + st.session_state.subtask_plan)
905
- for entry in st.session_state.all_dataframes:
906
- context_parts.append("Data summary:\n" + entry["summary"])
907
- page_context = "\n\n".join(context_parts)
908
-
909
- # Call the GPT helper
910
- with st.spinner("Lily is thinking…"):
911
- assistant_msg = gpt_chat(
912
- sys_msg=f"You are Lily, a research assistant. Here’s what’s on screen:\n\n{page_context}",
913
- user_msg=user_msg
914
- )
915
-
916
- # Save the exchange to chat history
917
- st.session_state.chat_history.append({"role": "user", "content": user_msg})
918
- st.session_state.chat_history.append({"role": "assistant", "content": assistant_msg})
919
-
920
- # Render the chat history
921
- for msg in st.session_state.chat_history:
922
- if msg["role"] == "user":
923
- st.chat_message("user").write(msg["content"])
924
- else:
925
- st.chat_message("assistant").write(msg["content"])