Spaces:

AI-Agent-Exercise-2025
/

Cause_estimation_tool

Running

App Files Files Community

MTeguri committed on Sep 12, 2025

Commit

cf1066a

1 Parent(s): f5327a7

Refactor app.py to integrate Supabase data retrieval, update dependencies, and enhance analysis functions. The application now fetches data from Supabase instead of Excel files, with improved error handling and user interface adjustments for clarity.

Browse files

Files changed (2) hide show

app.py +90 -40
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 # app.py
 # ---- 必要ライブラリ ----
-# pip install gradio pandas numpy matplotlib scipy scikit-learn openpyxl
 import io
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
@@ -13,12 +14,56 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 import gradio as gr
 from PIL import Image
 plt.switch_backend("Agg")  # サーバー実行向け
-# 日本語フォントの設定
 import matplotlib
-matplotlib.rcParams['font.family'] = ['DejaVu Sans', 'Hiragino Sans', 'Yu Gothic', 'Meiryo', 'Takao', 'IPAexGothic', 'IPAPGothic', 'VL PGothic', 'Noto Sans CJK JP']
 def _boxplot_image(a, b, feature_name):
     fig = plt.figure()
@@ -29,41 +74,43 @@ def _boxplot_image(a, b, feature_name):
     fig.savefig(buf, format="png", bbox_inches="tight")
     plt.close(fig)
     buf.seek(0)
-    # GradioのGallery用にnumpy配列として返す
-    img = Image.open(buf)
-    img_array = np.array(img)
-    return img_array
-def analyze_excel(file, threshold, top_k):
-    if file is None:
-        return (
-            "⚠️ 先にExcelファイル（.xlsx）をアップロードしてください。",
-            None, None, None, None, [], None
-        )
     try:
-        df = pd.read_excel(file.name)
     except Exception as e:
-        return (f"❌ 読み込みエラー: {e}", None, None, None, None, [], None)
-    status_md = f"**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
     head_df = df.head()
     # ---- 目的変数の作成（悪化=1, 正常=0）----
-    if "CODcr(S)sin" not in df.columns:
-        return ("❌ 必須列 'CODcr(S)sin' が見つかりません。列名を確認してください。", None, None, None, None, [], None)
     df = df.copy()
-    df["label"] = (df["CODcr(S)sin"] > threshold).astype(int)
     label_counts = df["label"].value_counts(dropna=False).rename_axis("label").to_frame("count")
-    status_md += f"**閾値:** {threshold}\n\n**目的変数の分布:**\n- 正常(0): {int(label_counts.loc[0,'count']) if 0 in label_counts.index else 0}\n- 悪化(1): {int(label_counts.loc[1,'count']) if 1 in label_counts.index else 0}\n"
     # ---- 説明変数の準備 ----
-    X = df.drop(columns=["CODcr(S)sin", "label"])
     y = df["label"]
-    # 文字列の小数点を ',' → '.' に調整（カラムがあれば）
     if "分散菌槽DO" in X.columns:
         X["分散菌槽DO"] = X["分散菌槽DO"].astype(str).str.replace(",", ".", regex=False)
         X["分散菌槽DO"] = pd.to_numeric(X["分散菌槽DO"], errors="coerce")
@@ -72,7 +119,8 @@ def analyze_excel(file, threshold, top_k):
     rows = []
     for col in X.columns:
         try:
-            r, p = pointbiserialr(y, pd.to_numeric(X[col], errors="coerce"))
             rows.append((col, r, p))
         except Exception:
             rows.append((col, np.nan, np.nan))
@@ -104,9 +152,8 @@ def analyze_excel(file, threshold, top_k):
             except Exception:
                 pass
     ttest_df = (
-        pd.DataFrame(ttest_rows)
-        .set_index("feature")
-        .sort_values(by="pval", ascending=True) if ttest_rows else pd.DataFrame()
     )
     # ---- 箱ひげ図 (ギャラリー) ----
@@ -121,8 +168,7 @@ def analyze_excel(file, threshold, top_k):
     # ---- ロジスティック回帰 ----
     X_num = X.apply(pd.to_numeric, errors="coerce").select_dtypes(include=np.number)
-    # すべてNaN列を落とす
-    X_num = X_num.loc[:, X_num.notna().sum() > 0]
     if X_num.shape[1] == 0:
         coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
@@ -152,7 +198,6 @@ def analyze_excel(file, threshold, top_k):
                 .sort_values(by="abs_coef", ascending=False)
                 .drop(columns=["abs_coef"])
             )
-            # rank列付与
             coef_df["rank"] = np.arange(1, len(coef_df) + 1)
             status_md += "\n\n**悪化原因の候補（上位{}項目）**:\n- ".format(top_k) + "\n- ".join(
                 [f"{f}: 係数={coef[f]:.3f} {('↑' if coef[f]>0 else '↓')}" for f in top_features]
@@ -161,33 +206,38 @@ def analyze_excel(file, threshold, top_k):
             status_md += f"\n❗ ロジスティック回帰の学習に失敗しました: {e}"
             coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
-    status_md += "\n\n✅ 解析完了：相関・t検定・箱ひげ図・ロジスティック回帰を実行しました。"
     return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
-with gr.Blocks(title="水質データ 解析アプリ（相関 / t検定 / 箱ひげ / ロジ回帰）") as demo:
-    gr.Markdown("# 水質データ 解析アプリ\nExcelをアップロードし、閾値と上位特徴量数を指定して［解析実行］してください。")
-    with gr.Row():
-        file_in = gr.File(label="Excelファイル（.xlsx）をアップロード", file_types=[".xlsx"])
     with gr.Row():
         threshold_in = gr.Number(value=100, precision=0, label="CODcr(S)sin の閾値（悪化=1）")
         topk_in = gr.Slider(1, 10, value=4, step=1, label="ロジスティック回帰の上位特徴量 数")
-    run_btn = gr.Button("解析実行", variant="primary")
     status_out = gr.Markdown()
     head_out = gr.Dataframe(label="データ先頭", interactive=False)
     label_out = gr.Dataframe(label="目的変数の分布", interactive=False)
     corr_out = gr.Dataframe(label="相関 (point-biserial)", interactive=False)
     ttest_out = gr.Dataframe(label="t検定結果（p値の小さい順）", interactive=False)
-    gallery_out = gr.Gallery(label="箱ひげ図（正常 vs 悪化）")
     coef_out = gr.Dataframe(label="ロジスティック回帰 係数ランキング", interactive=False)
     run_btn.click(
-        analyze_excel,
-        inputs=[file_in, threshold_in, topk_in],
         outputs=[status_out, head_out, label_out, corr_out, ttest_out, gallery_out, coef_out],
     )
 if __name__ == "__main__":
     # demo.launch(share=True)  # 外部共有したい場合は share=True
-    demo.launch()

 # app.py
 # ---- 必要ライブラリ ----
+# pip install gradio pandas numpy matplotlib scipy scikit-learn pillow python-dotenv supabase
 import io
+import os
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.pipeline import Pipeline
 import gradio as gr
 from PIL import Image
+from dotenv import load_dotenv
+# Supabase
+try:
+    from supabase import create_client  # supabase-py v2
+except Exception:
+    # 旧API互換（v1 をお使いの場合は import supabase; supabase.create_client を利用）
+    create_client = None
+    import supabase as supabase_v1
 plt.switch_backend("Agg")  # サーバー実行向け
+# 日本語フォントの設定（環境に応じて使えるものを優先）
 import matplotlib
+matplotlib.rcParams['font.family'] = ['DejaVu Sans', 'Hiragino Sans', 'Yu Gothic', 'Meiryo',
+                                      'Takao', 'IPAexGothic', 'IPAPGothic', 'VL PGothic', 'Noto Sans CJK JP']
+# .env 読み込み
+load_dotenv()
+SUPABASE_URL = os.environ.get("SUPABASE_URL")
+SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
+TABLE_NAME = "estimated_cause_mocdata"  # ご指定のテーブル名
+# Build the Supabase client (prefer the v2 API, fall back to v1).
+def _get_supabase_client():
+    """Return a Supabase client built from SUPABASE_URL / SUPABASE_KEY.
+
+    Uses the module-level ``create_client`` when it is not None; otherwise
+    falls back to ``supabase_v1.create_client``.
+
+    Raises:
+        RuntimeError: if SUPABASE_URL or SUPABASE_KEY is unset or empty.
+    """
+    credentials_present = bool(SUPABASE_URL) and bool(SUPABASE_KEY)
+    if not credentials_present:
+        raise RuntimeError("環境変数 SUPABASE_URL または SUPABASE_KEY が設定されていません。")
+    # create_client is None only when the top-level v2-style import failed;
+    # the conditional is lazy, so supabase_v1 is touched only in that case.
+    factory = create_client if create_client is not None else supabase_v1.create_client
+    return factory(SUPABASE_URL, SUPABASE_KEY)
+def _fetch_supabase_df():
+    """Fetch every row of TABLE_NAME from Supabase as a pandas DataFrame.
+
+    Returns:
+        pandas.DataFrame built from the response's ``data`` payload.
+
+    Raises:
+        RuntimeError: if the client cannot be created (raised by
+            ``_get_supabase_client``), if the query itself fails, or if
+            the table yields no rows.
+    """
+    client = _get_supabase_client()
+    # Only the remote call is wrapped, so the validation errors raised
+    # below are not re-wrapped with a second "Supabase 取得エラー" prefix.
+    try:
+        resp = client.table(TABLE_NAME).select("*").execute()
+    except Exception as e:
+        # Chain the cause so the underlying traceback is preserved.
+        raise RuntimeError(f"Supabase 取得エラー: {e}") from e
+    # The response may expose the rows as a ``.data`` attribute or, per the
+    # original author's note about the v1 client, as a plain dict carrying
+    # a "data" key.
+    data = getattr(resp, "data", None)
+    if data is None and isinstance(resp, dict):
+        data = resp.get("data")
+    if not data:
+        raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' からデータを取得できませんでした。")
+    df = pd.DataFrame(data)
+    # Non-empty payloads can still produce an empty frame (e.g. [{}]).
+    if df.empty:
+        raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' にレコードがありません。")
+    return df
 def _boxplot_image(a, b, feature_name):
     fig = plt.figure()
     fig.savefig(buf, format="png", bbox_inches="tight")
     plt.close(fig)
     buf.seek(0)
+    img = Image.open(buf)  # PIL.Image.Image
+    return np.array(img)   # Gallery は numpy 配列でもOK
+def analyze_from_supabase(threshold, top_k):
+    # ---- データ取得 ----
     try:
+        df = _fetch_supabase_df()
     except Exception as e:
+        msg = f"❌ データ取得に失敗：{e}\n- .env に SUPABASE_URL / SUPABASE_KEY を設定してください\n- テーブル名: {TABLE_NAME}"
+        return (msg, None, None, None, None, [], None)
+    status_md = f"**テーブル:** `{TABLE_NAME}`\n\n**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
     head_df = df.head()
     # ---- 目的変数の作成（悪化=1, 正常=0）----
+    target_col = "CODcr(S)sin"
+    if target_col not in df.columns:
+        return (f"❌ 必須列 '{target_col}' が見つかりません。現在の列: {list(df.columns)}", None, None, None, None, [], None)
     df = df.copy()
+    # 数値化（もし文字列が混ざっていても NaN に落とす）
+    df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
+    df["label"] = (df[target_col] > threshold).astype(int)
     label_counts = df["label"].value_counts(dropna=False).rename_axis("label").to_frame("count")
+    status_md += (
+        f"**閾値:** {threshold}\n\n"
+        f"**目的変数の分布:**\n"
+        f"- 正常(0): {int(label_counts.loc[0,'count']) if 0 in label_counts.index else 0}\n"
+        f"- 悪化(1): {int(label_counts.loc[1,'count']) if 1 in label_counts.index else 0}\n"
+    )
     # ---- 説明変数の準備 ----
+    X = df.drop(columns=[target_col, "label"])
     y = df["label"]
+    # 既知の小数表記ゆれ対策（あれば）
     if "分散菌槽DO" in X.columns:
         X["分散菌槽DO"] = X["分散菌槽DO"].astype(str).str.replace(",", ".", regex=False)
         X["分散菌槽DO"] = pd.to_numeric(X["分散菌槽DO"], errors="coerce")
     rows = []
     for col in X.columns:
         try:
+            col_num = pd.to_numeric(X[col], errors="coerce")
+            r, p = pointbiserialr(y, col_num)
             rows.append((col, r, p))
         except Exception:
             rows.append((col, np.nan, np.nan))
             except Exception:
                 pass
     ttest_df = (
+        pd.DataFrame(ttest_rows).set_index("feature").sort_values(by="pval", ascending=True)
+        if ttest_rows else pd.DataFrame()
     )
     # ---- 箱ひげ図 (ギャラリー) ----
     # ---- ロジスティック回帰 ----
     X_num = X.apply(pd.to_numeric, errors="coerce").select_dtypes(include=np.number)
+    X_num = X_num.loc[:, X_num.notna().sum() > 0]  # すべてNaN列を落とす
     if X_num.shape[1] == 0:
         coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
                 .sort_values(by="abs_coef", ascending=False)
                 .drop(columns=["abs_coef"])
             )
             coef_df["rank"] = np.arange(1, len(coef_df) + 1)
             status_md += "\n\n**悪化原因の候補（上位{}項目）**:\n- ".format(top_k) + "\n- ".join(
                 [f"{f}: 係数={coef[f]:.3f} {('↑' if coef[f]>0 else '↓')}" for f in top_features]
             status_md += f"\n❗ ロジスティック回帰の学習に失敗しました: {e}"
             coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
+    status_md += "\n\n✅ 解析完了：Supabase データに対して 相関・t検定・箱ひげ図・ロジスティック回帰 を実行しました。"
     return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
+# === Gradio UI ===
+# Builds the Blocks app bound to `demo`: two inputs, one run button, and
+# seven output components wired to analyze_from_supabase.
+with gr.Blocks(title="水質データ 解析アプリ（Supabase版）") as demo:
+    gr.Markdown(
+        """
+        # 水質データ 解析アプリ（Supabase版）
+        `.env` の **SUPABASE_URL** / **SUPABASE_KEY** を用意し、テーブル **estimated_cause_mocdata** からデータを取得して解析します。
+        解析対象列は **CODcr(S)sin**（悪化=1 判定用）を想定しています。
+        """
+    )
+    # Inputs: the threshold and top-k values passed to analyze_from_supabase.
     with gr.Row():
         threshold_in = gr.Number(value=100, precision=0, label="CODcr(S)sin の閾値（悪化=1）")
         topk_in = gr.Slider(1, 10, value=4, step=1, label="ロジスティック回帰の上位特徴量 数")
+    run_btn = gr.Button("Supabase から取得して解析", variant="primary")
+    # Output components, one per element of the tuple returned by
+    # analyze_from_supabase.
     status_out = gr.Markdown()
     head_out = gr.Dataframe(label="データ先頭", interactive=False)
     label_out = gr.Dataframe(label="目的変数の分布", interactive=False)
     corr_out = gr.Dataframe(label="相関 (point-biserial)", interactive=False)
     ttest_out = gr.Dataframe(label="t検定結果（p値の小さい順）", interactive=False)
+    gallery_out = gr.Gallery(label="箱ひげ図（正常 vs 悪化）", columns=2, height="auto")
     coef_out = gr.Dataframe(label="ロジスティック回帰 係数ランキング", interactive=False)
+    # The order of `outputs` must match the 7-element tuple that
+    # analyze_from_supabase returns (status, head, label counts, correlation,
+    # t-test, gallery images, coefficients).
     run_btn.click(
+        analyze_from_supabase,
+        inputs=[threshold_in, topk_in],
         outputs=[status_out, head_out, label_out, corr_out, ttest_out, gallery_out, coef_out],
     )
 if __name__ == "__main__":
     # demo.launch(share=True)  # 外部共有したい場合は share=True
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -7,4 +7,4 @@ scipy
 scikit-learn
 openpyxl
 pandas
-Pillow

 scikit-learn
 openpyxl
 pandas
+Pillow