Spaces:

AI-Agent-Exercise-2025
/

Cause_estimation_tool

Running

App Files Community

MTeguri committed on Sep 18, 2025

Commit

b8b8a5d

1 Parent(s): 4c0852e

Enhance app.py with type annotations, improved error handling, and detailed docstrings for functions. The Supabase client and data fetching methods have been updated for better clarity and functionality, ensuring compatibility with both v1 and v2 APIs.

Browse files

Files changed (1) hide show

app.py +120 -37

app.py CHANGED Viewed

@@ -4,6 +4,8 @@
 import io
 import os
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
@@ -22,7 +24,7 @@ try:
 except Exception:
     # 旧API互換（v1 をお使いの場合は import supabase; supabase.create_client を利用）
     create_client = None
-    import supabase as supabase_v1
 plt.switch_backend("Agg")  # サーバー実行向け
@@ -33,12 +35,25 @@ matplotlib.rcParams['font.family'] = ['DejaVu Sans', 'Hiragino Sans', 'Yu Gothic
 # .env 読み込み
 load_dotenv()
-SUPABASE_URL = os.environ.get("SUPABASE_URL")
-SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
-TABLE_NAME = "estimated_cause_mocdata"  # ご指定のテーブル名
-# Supabase クライアント作成（v2 優先、なければ v1）
-def _get_supabase_client():
     if not SUPABASE_URL or not SUPABASE_KEY:
         raise RuntimeError("環境変数 SUPABASE_URL または SUPABASE_KEY が設定されていません。")
     if create_client is not None:
@@ -46,16 +61,29 @@ def _get_supabase_client():
     # v1 fallback
     return supabase_v1.create_client(SUPABASE_URL, SUPABASE_KEY)
-def _fetch_supabase_df():
     client = _get_supabase_client()
     # v2 と v1 で返り値が異なるため分岐
     try:
-        resp = client.table(TABLE_NAME).select("*").execute()
-        data = getattr(resp, "data", None) if hasattr(resp, "data") else None
         if data is None:
-            # v1 の場合、resp が dict のことも
             if isinstance(resp, dict) and "data" in resp:
-                data = resp["data"]
         if not data:
             raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' からデータを取得できませんでした。")
         df = pd.DataFrame(data)
@@ -63,9 +91,25 @@ def _fetch_supabase_df():
             raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' にレコードがありません。")
         return df
     except Exception as e:
-        raise RuntimeError(f"Supabase 取得エラー: {e}")
-def _boxplot_image(a, b, feature_name):
     fig = plt.figure()
     plt.boxplot([a, b], labels=["正常(0)", "悪化(1)"])
     plt.title(f"Boxplot: {feature_name}")
@@ -77,28 +121,66 @@ def _boxplot_image(a, b, feature_name):
     img = Image.open(buf)  # PIL.Image.Image
     return np.array(img)   # Gallery は numpy 配列でもOK
-def analyze_from_supabase(threshold, top_k):
     # ---- データ取得 ----
     try:
         df = _fetch_supabase_df()
     except Exception as e:
-        msg = f"❌ データ取得に失敗：{e}\n- .env に SUPABASE_URL / SUPABASE_KEY を設定してください\n- テーブル名: {TABLE_NAME}"
         return (msg, None, None, None, None, [], None)
-    status_md = f"**テーブル:** `{TABLE_NAME}`\n\n**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
-    head_df = df.head()
     # ---- 目的変数の作成（悪化=1, 正常=0）----
     target_col = "CODcr(S)sin"
     if target_col not in df.columns:
-        return (f"❌ 必須列 '{target_col}' が見つかりません。現在の列: {list(df.columns)}", None, None, None, None, [], None)
     df = df.copy()
     # 数値化（もし文字列が混ざっていても NaN に落とす）
     df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
-    df["label"] = (df[target_col] > threshold).astype(int)
-    label_counts = df["label"].value_counts(dropna=False).rename_axis("label").to_frame("count")
     status_md += (
         f"**閾値:** {threshold}\n\n"
         f"**目的変数の分布:**\n"
@@ -107,8 +189,8 @@ def analyze_from_supabase(threshold, top_k):
     )
     # ---- 説明変数の準備 ----
-    X = df.drop(columns=[target_col, "label"])
-    y = df["label"]
     # 既知の小数表記ゆれ対策（あれば）
     if "分散菌槽DO" in X.columns:
@@ -116,22 +198,22 @@ def analyze_from_supabase(threshold, top_k):
         X["分散菌槽DO"] = pd.to_numeric(X["分散菌槽DO"], errors="coerce")
     # ---- 相関 (point-biserial) ----
-    rows = []
     for col in X.columns:
         try:
             col_num = pd.to_numeric(X[col], errors="coerce")
             r, p = pointbiserialr(y, col_num)
-            rows.append((col, r, p))
         except Exception:
-            rows.append((col, np.nan, np.nan))
-    corr_df = (
         pd.DataFrame(rows, columns=["feature", "r_pb", "pval"])
         .set_index("feature")
         .sort_values(by="r_pb", key=lambda s: s.abs(), ascending=False)
     )
     # ---- t検定 ----
-    ttest_rows = []
     for col in X.columns:
         col_num = pd.to_numeric(X[col], errors="coerce")
         a = col_num[y == 0].dropna()
@@ -142,22 +224,22 @@ def analyze_from_supabase(threshold, top_k):
                 ttest_rows.append(
                     {
                         "feature": col,
-                        "mean_normal": a.mean(),
-                        "mean_bad": b.mean(),
-                        "pval": p,
-                        "n_normal": len(a),
-                        "n_bad": len(b),
                     }
                 )
             except Exception:
                 pass
-    ttest_df = (
         pd.DataFrame(ttest_rows).set_index("feature").sort_values(by="pval", ascending=True)
         if ttest_rows else pd.DataFrame()
     )
     # ---- 箱ひげ図 (ギャラリー) ----
-    gallery_imgs = []
     for col in X.columns:
         col_num = pd.to_numeric(X[col], errors="coerce")
         a_plot = col_num[y == 0].dropna()
@@ -171,10 +253,10 @@ def analyze_from_supabase(threshold, top_k):
     X_num = X_num.loc[:, X_num.notna().sum() > 0]  # すべてNaN列を落とす
     if X_num.shape[1] == 0:
-        coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
         status_md += "\n⚠️ 数値説明変数がありませんでした。係数は空です。"
     else:
-        pipe = Pipeline(
             steps=[
                 ("imputer", SimpleImputer(strategy="median")),
                 ("scaler", StandardScaler()),
@@ -199,7 +281,7 @@ def analyze_from_supabase(threshold, top_k):
                 .drop(columns=["abs_coef"])
             )
             coef_df["rank"] = np.arange(1, len(coef_df) + 1)
-            status_md += "\n\n**悪化原因の候補（上位{}項目）**:\n- ".format(top_k) + "\n- ".join(
                 [f"{f}: 係数={coef[f]:.3f} {('↑' if coef[f]>0 else '↓')}" for f in top_features]
             )
         except Exception as e:
@@ -210,6 +292,7 @@ def analyze_from_supabase(threshold, top_k):
     return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
 # === Gradio UI ===
 with gr.Blocks(title="水質データ 解析アプリ（Supabase版）") as demo:
     gr.Markdown(

 import io
 import os
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 except Exception:
     # 旧API互換（v1 をお使いの場合は import supabase; supabase.create_client を利用）
     create_client = None
+    import supabase as supabase_v1  # type: ignore
 plt.switch_backend("Agg")  # サーバー実行向け
 # .env 読み込み
 load_dotenv()
+SUPABASE_URL: Optional[str] = os.environ.get("SUPABASE_URL")
+SUPABASE_KEY: Optional[str] = os.environ.get("SUPABASE_KEY")
+TABLE_NAME: str = "estimated_cause_mocdata"  # ご指定のテーブル名
+def _get_supabase_client() -> Any:
+    """
+    Supabase クライアントを生成して返す（v2 を優先、なければ v1 にフォールバック）。
+    環境変数:
+        SUPABASE_URL (str): Supabase の URL
+        SUPABASE_KEY (str): Supabase の API キー
+    Raises:
+        RuntimeError: 必要な環境変数が未設定、またはクライアント生成に失敗した場合。
+    Returns:
+        Any: Supabase クライアントオブジェクト（v2 または v1）。
+    """
     if not SUPABASE_URL or not SUPABASE_KEY:
         raise RuntimeError("環境変数 SUPABASE_URL または SUPABASE_KEY が設定されていません。")
     if create_client is not None:
     # v1 fallback
     return supabase_v1.create_client(SUPABASE_URL, SUPABASE_KEY)
+def _fetch_supabase_df() -> pd.DataFrame:
+    """
+    Supabase の指定テーブルから全件取得し、pandas DataFrame に変換して返す。
+    テーブル:
+        TABLE_NAME: 既定では 'estimated_cause_mocdata'
+    Raises:
+        RuntimeError: 通信エラー、期待した形でデータが得られない、またはレコードが空の場合。
+    Returns:
+        pd.DataFrame: 取得したレコードの DataFrame。
+    """
     client = _get_supabase_client()
     # v2 と v1 で返り値が異なるため分岐
     try:
+        resp: Any = client.table(TABLE_NAME).select("*").execute()
+        data: Optional[List[Dict[str, Any]]] = getattr(resp, "data", None) if hasattr(resp, "data") else None
         if data is None:
+            # v1 の場合、resp が dict のこともある
             if isinstance(resp, dict) and "data" in resp:
+                data = resp["data"]  # type: ignore[index]
         if not data:
             raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' からデータを取得できませんでした。")
         df = pd.DataFrame(data)
             raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' にレコードがありません。")
         return df
     except Exception as e:
+        raise RuntimeError(f"Supabase 取得エラー: {e}") from e
+def _boxplot_image(
+    a: Union[pd.Series, Sequence[float], np.ndarray],
+    b: Union[pd.Series, Sequence[float], np.ndarray],
+    feature_name: str
+) -> np.ndarray:
+    """
+    2群（正常/悪化）の値から箱ひげ図を描画し、画像(ndarray)を返す。
+    Args:
+        a: 正常(0) 群の値（Series, list, ndarray など数値配列）
+        b: 悪化(1) 群の値（Series, list, ndarray など数値配列）
+        feature_name (str): グラフタイトルや y 軸ラベルに用いる特徴量名
+    Returns:
+        np.ndarray: 生成した箱ひげ図の画像配列（RGB）。
+    """
     fig = plt.figure()
     plt.boxplot([a, b], labels=["正常(0)", "悪化(1)"])
     plt.title(f"Boxplot: {feature_name}")
     img = Image.open(buf)  # PIL.Image.Image
     return np.array(img)   # Gallery は numpy 配列でもOK
+# 解析結果の返却タプル型（status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df）
+AnalysisResult = Tuple[
+    str,
+    Optional[pd.DataFrame],
+    Optional[pd.DataFrame],
+    Optional[pd.DataFrame],
+    Optional[pd.DataFrame],
+    List[Tuple[np.ndarray, str]],
+    Optional[pd.DataFrame],
+]
+def analyze_from_supabase(threshold: Union[int, float], top_k: Union[int, float]) -> AnalysisResult:
+    """
+    Supabase から水質データを取得し、閾値によるラベリングを行った上で
+    相関（point-biserial）、t検定、箱ひげ図、ロジスティック回帰による特徴量重要度を算出する。
+    Args:
+        threshold (int | float): 目的変数列 'CODcr(S)sin' を悪化(1) と判定する閾値。
+        top_k (int | float): ロジスティック回帰の係数上位として表示する特徴量数。
+    Returns:
+        AnalysisResult: 以下の7要素タプル
+            - status_md (str): 解析の要約（マークダウン）
+            - head_df (pd.DataFrame | None): 先頭行のプレビュー
+            - label_counts (pd.DataFrame | None): 目的変数の分布
+            - corr_df (pd.DataFrame | None): point-biserial 相関表
+            - ttest_df (pd.DataFrame | None): t検定の結果表
+            - gallery_imgs (List[Tuple[np.ndarray, str]]): 箱ひげ図（画像配列, タイトル）のリスト
+            - coef_df (pd.DataFrame | None): ロジスティック回帰の係数ランキング
+    """
     # ---- データ取得 ----
     try:
         df = _fetch_supabase_df()
     except Exception as e:
+        msg = (
+            f"❌ データ取得に失敗：{e}\n"
+            f"- .env に SUPABASE_URL / SUPABASE_KEY を設定してください\n"
+            f"- テーブル名: {TABLE_NAME}"
+        )
         return (msg, None, None, None, None, [], None)
+    status_md: str = f"**テーブル:** `{TABLE_NAME}`\n\n**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
+    head_df: pd.DataFrame = df.head()
     # ---- 目的変数の作成（悪化=1, 正常=0）----
     target_col = "CODcr(S)sin"
     if target_col not in df.columns:
+        return (
+            f"❌ 必須列 '{target_col}' が見つかりません。現在の列: {list(df.columns)}",
+            None, None, None, None, [], None
+        )
     df = df.copy()
     # 数値化（もし文字列が混ざっていても NaN に落とす）
     df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
+    df["label"] = (df[target_col] > float(threshold)).astype(int)
+    label_counts: pd.DataFrame = df["label"].value_counts(dropna=False).rename_axis("label").to_frame("count")
     status_md += (
         f"**閾値:** {threshold}\n\n"
         f"**目的変数の分布:**\n"
     )
     # ---- 説明変数の準備 ----
+    X: pd.DataFrame = df.drop(columns=[target_col, "label"])
+    y: pd.Series = df["label"]
     # 既知の小数表記ゆれ対策（あれば）
     if "分散菌槽DO" in X.columns:
         X["分散菌槽DO"] = pd.to_numeric(X["分散菌槽DO"], errors="coerce")
     # ---- 相関 (point-biserial) ----
+    rows: List[Tuple[str, float, float]] = []
     for col in X.columns:
         try:
             col_num = pd.to_numeric(X[col], errors="coerce")
             r, p = pointbiserialr(y, col_num)
+            rows.append((col, float(r), float(p)))
         except Exception:
+            rows.append((col, float("nan"), float("nan")))
+    corr_df: pd.DataFrame = (
         pd.DataFrame(rows, columns=["feature", "r_pb", "pval"])
         .set_index("feature")
         .sort_values(by="r_pb", key=lambda s: s.abs(), ascending=False)
     )
     # ---- t検定 ----
+    ttest_rows: List[Dict[str, Union[str, float, int]]] = []
     for col in X.columns:
         col_num = pd.to_numeric(X[col], errors="coerce")
         a = col_num[y == 0].dropna()
                 ttest_rows.append(
                     {
                         "feature": col,
+                        "mean_normal": float(a.mean()),
+                        "mean_bad": float(b.mean()),
+                        "pval": float(p),
+                        "n_normal": int(len(a)),
+                        "n_bad": int(len(b)),
                     }
                 )
             except Exception:
                 pass
+    ttest_df: pd.DataFrame = (
         pd.DataFrame(ttest_rows).set_index("feature").sort_values(by="pval", ascending=True)
         if ttest_rows else pd.DataFrame()
     )
     # ---- 箱ひげ図 (ギャラリー) ----
+    gallery_imgs: List[Tuple[np.ndarray, str]] = []
     for col in X.columns:
         col_num = pd.to_numeric(X[col], errors="coerce")
         a_plot = col_num[y == 0].dropna()
     X_num = X_num.loc[:, X_num.notna().sum() > 0]  # すべてNaN列を落とす
     if X_num.shape[1] == 0:
+        coef_df: pd.DataFrame = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
         status_md += "\n⚠️ 数値説明変数がありませんでした。係数は空です。"
     else:
+        pipe: Pipeline = Pipeline(
             steps=[
                 ("imputer", SimpleImputer(strategy="median")),
                 ("scaler", StandardScaler()),
                 .drop(columns=["abs_coef"])
             )
             coef_df["rank"] = np.arange(1, len(coef_df) + 1)
+            status_md += "\n\n**悪化原因の候補（上位{}項目）**:\n- ".format(int(top_k)) + "\n- ".join(
                 [f"{f}: 係数={coef[f]:.3f} {('↑' if coef[f]>0 else '↓')}" for f in top_features]
             )
         except Exception as e:
     return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
 # === Gradio UI ===
 with gr.Blocks(title="水質データ 解析アプリ（Supabase版）") as demo:
     gr.Markdown(