Refactor app.py to integrate Supabase data retrieval, update dependencies, and enhance analysis functions. The application now fetches data from Supabase instead of Excel files, with improved error handling and user interface adjustments for clarity.
Files changed:
- app.py: +90 −40
- requirements.txt: +1 −1
app.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
# app.py
|
| 2 |
# ---- 必要ライブラリ ----
|
| 3 |
-
# pip install gradio pandas numpy matplotlib scipy scikit-learn
|
| 4 |
|
| 5 |
import io
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
import numpy as np
|
| 8 |
import matplotlib.pyplot as plt
|
|
@@ -13,12 +14,56 @@ from sklearn.preprocessing import StandardScaler
|
|
| 13 |
from sklearn.pipeline import Pipeline
|
| 14 |
import gradio as gr
|
| 15 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
plt.switch_backend("Agg") # サーバー実行向け
|
| 18 |
|
| 19 |
-
#
|
| 20 |
import matplotlib
|
| 21 |
-
matplotlib.rcParams['font.family'] = ['DejaVu Sans', 'Hiragino Sans', 'Yu Gothic', 'Meiryo',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def _boxplot_image(a, b, feature_name):
|
| 24 |
fig = plt.figure()
|
|
@@ -29,41 +74,43 @@ def _boxplot_image(a, b, feature_name):
|
|
| 29 |
fig.savefig(buf, format="png", bbox_inches="tight")
|
| 30 |
plt.close(fig)
|
| 31 |
buf.seek(0)
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
img_array = np.array(img)
|
| 35 |
-
return img_array
|
| 36 |
-
|
| 37 |
-
def analyze_excel(file, threshold, top_k):
|
| 38 |
-
if file is None:
|
| 39 |
-
return (
|
| 40 |
-
"⚠️ 先にExcelファイル(.xlsx)をアップロードしてください。",
|
| 41 |
-
None, None, None, None, [], None
|
| 42 |
-
)
|
| 43 |
|
|
|
|
|
|
|
| 44 |
try:
|
| 45 |
-
df =
|
| 46 |
except Exception as e:
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
-
status_md = f"**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
|
| 50 |
head_df = df.head()
|
| 51 |
|
| 52 |
# ---- 目的変数の作成(悪化=1, 正常=0)----
|
| 53 |
-
|
| 54 |
-
|
|
|
|
| 55 |
|
| 56 |
df = df.copy()
|
| 57 |
-
|
|
|
|
|
|
|
| 58 |
|
| 59 |
label_counts = df["label"].value_counts(dropna=False).rename_axis("label").to_frame("count")
|
| 60 |
-
status_md +=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
# ---- 説明変数の準備 ----
|
| 63 |
-
X = df.drop(columns=[
|
| 64 |
y = df["label"]
|
| 65 |
|
| 66 |
-
#
|
| 67 |
if "分散菌槽DO" in X.columns:
|
| 68 |
X["分散菌槽DO"] = X["分散菌槽DO"].astype(str).str.replace(",", ".", regex=False)
|
| 69 |
X["分散菌槽DO"] = pd.to_numeric(X["分散菌槽DO"], errors="coerce")
|
|
@@ -72,7 +119,8 @@ def analyze_excel(file, threshold, top_k):
|
|
| 72 |
rows = []
|
| 73 |
for col in X.columns:
|
| 74 |
try:
|
| 75 |
-
|
|
|
|
| 76 |
rows.append((col, r, p))
|
| 77 |
except Exception:
|
| 78 |
rows.append((col, np.nan, np.nan))
|
|
@@ -104,9 +152,8 @@ def analyze_excel(file, threshold, top_k):
|
|
| 104 |
except Exception:
|
| 105 |
pass
|
| 106 |
ttest_df = (
|
| 107 |
-
pd.DataFrame(ttest_rows)
|
| 108 |
-
.
|
| 109 |
-
.sort_values(by="pval", ascending=True) if ttest_rows else pd.DataFrame()
|
| 110 |
)
|
| 111 |
|
| 112 |
# ---- 箱ひげ図 (ギャラリー) ----
|
|
@@ -121,8 +168,7 @@ def analyze_excel(file, threshold, top_k):
|
|
| 121 |
|
| 122 |
# ---- ロジスティック回帰 ----
|
| 123 |
X_num = X.apply(pd.to_numeric, errors="coerce").select_dtypes(include=np.number)
|
| 124 |
-
# すべてNaN列を落とす
|
| 125 |
-
X_num = X_num.loc[:, X_num.notna().sum() > 0]
|
| 126 |
|
| 127 |
if X_num.shape[1] == 0:
|
| 128 |
coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
|
|
@@ -152,7 +198,6 @@ def analyze_excel(file, threshold, top_k):
|
|
| 152 |
.sort_values(by="abs_coef", ascending=False)
|
| 153 |
.drop(columns=["abs_coef"])
|
| 154 |
)
|
| 155 |
-
# rank列付与
|
| 156 |
coef_df["rank"] = np.arange(1, len(coef_df) + 1)
|
| 157 |
status_md += "\n\n**悪化原因の候補(上位{}項目)**:\n- ".format(top_k) + "\n- ".join(
|
| 158 |
[f"{f}: 係数={coef[f]:.3f} {('↑' if coef[f]>0 else '↓')}" for f in top_features]
|
|
@@ -161,33 +206,38 @@ def analyze_excel(file, threshold, top_k):
|
|
| 161 |
status_md += f"\n❗ ロジスティック回帰の学習に失敗しました: {e}"
|
| 162 |
coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
|
| 163 |
|
| 164 |
-
status_md += "\n\n✅
|
| 165 |
|
| 166 |
return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
|
| 167 |
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
with gr.Row():
|
| 173 |
threshold_in = gr.Number(value=100, precision=0, label="CODcr(S)sin の閾値(悪化=1)")
|
| 174 |
topk_in = gr.Slider(1, 10, value=4, step=1, label="ロジスティック回帰の上位特徴量 数")
|
| 175 |
-
run_btn = gr.Button("
|
| 176 |
|
| 177 |
status_out = gr.Markdown()
|
| 178 |
head_out = gr.Dataframe(label="データ先頭", interactive=False)
|
| 179 |
label_out = gr.Dataframe(label="目的変数の分布", interactive=False)
|
| 180 |
corr_out = gr.Dataframe(label="相関 (point-biserial)", interactive=False)
|
| 181 |
ttest_out = gr.Dataframe(label="t検定結果(p値の小さい順)", interactive=False)
|
| 182 |
-
gallery_out = gr.Gallery(label="箱ひげ図(正常 vs 悪化)")
|
| 183 |
coef_out = gr.Dataframe(label="ロジスティック回帰 係数ランキング", interactive=False)
|
| 184 |
|
| 185 |
run_btn.click(
|
| 186 |
-
|
| 187 |
-
inputs=[
|
| 188 |
outputs=[status_out, head_out, label_out, corr_out, ttest_out, gallery_out, coef_out],
|
| 189 |
)
|
| 190 |
|
| 191 |
if __name__ == "__main__":
|
| 192 |
# demo.launch(share=True) # 外部共有したい場合は share=True
|
| 193 |
-
demo.launch()
|
|
|
|
| 1 |
# app.py
|
| 2 |
# ---- 必要ライブラリ ----
|
| 3 |
+
# pip install gradio pandas numpy matplotlib scipy scikit-learn pillow python-dotenv supabase
|
| 4 |
|
| 5 |
import io
|
| 6 |
+
import os
|
| 7 |
import pandas as pd
|
| 8 |
import numpy as np
|
| 9 |
import matplotlib.pyplot as plt
|
|
|
|
| 14 |
from sklearn.pipeline import Pipeline
|
| 15 |
import gradio as gr
|
| 16 |
from PIL import Image
|
| 17 |
+
from dotenv import load_dotenv
|
| 18 |
+
|
| 19 |
+
# Supabase
|
| 20 |
+
try:
|
| 21 |
+
from supabase import create_client # supabase-py v2
|
| 22 |
+
except Exception:
|
| 23 |
+
# 旧API互換(v1 をお使いの場合は import supabase; supabase.create_client を利用)
|
| 24 |
+
create_client = None
|
| 25 |
+
import supabase as supabase_v1
|
| 26 |
|
| 27 |
plt.switch_backend("Agg") # サーバー実行向け
|
| 28 |
|
| 29 |
+
# 日本語フォントの設定(環境に応じて使えるものを優先)
|
| 30 |
import matplotlib
|
| 31 |
+
matplotlib.rcParams['font.family'] = ['DejaVu Sans', 'Hiragino Sans', 'Yu Gothic', 'Meiryo',
|
| 32 |
+
'Takao', 'IPAexGothic', 'IPAPGothic', 'VL PGothic', 'Noto Sans CJK JP']
|
| 33 |
+
|
| 34 |
+
# .env 読み込み
|
| 35 |
+
load_dotenv()
|
| 36 |
+
SUPABASE_URL = os.environ.get("SUPABASE_URL")
|
| 37 |
+
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
|
| 38 |
+
TABLE_NAME = "estimated_cause_mocdata" # ご指定のテーブル名
|
| 39 |
+
|
| 40 |
+
# Supabase クライアント作成(v2 優先、なければ v1)
|
| 41 |
+
def _get_supabase_client():
|
| 42 |
+
if not SUPABASE_URL or not SUPABASE_KEY:
|
| 43 |
+
raise RuntimeError("環境変数 SUPABASE_URL または SUPABASE_KEY が設定されていません。")
|
| 44 |
+
if create_client is not None:
|
| 45 |
+
return create_client(SUPABASE_URL, SUPABASE_KEY)
|
| 46 |
+
# v1 fallback
|
| 47 |
+
return supabase_v1.create_client(SUPABASE_URL, SUPABASE_KEY)
|
| 48 |
+
|
| 49 |
+
def _fetch_supabase_df():
    """Fetch every row of TABLE_NAME from Supabase as a pandas DataFrame.

    Handles both supabase-py v2 (response object with a ``.data`` attribute)
    and v1 (response may be a plain dict with a ``"data"`` key).

    Returns:
        pd.DataFrame: one row per record in the table.

    Raises:
        RuntimeError: if the client cannot be built, the request fails,
            or the table yields no usable records.
    """
    client = _get_supabase_client()
    # Keep the try body minimal: only the network call can legitimately fail
    # here. Previously the empty-data RuntimeErrors were raised inside this
    # try and got re-wrapped as "Supabase 取得エラー: Supabase テーブル …".
    try:
        resp = client.table(TABLE_NAME).select("*").execute()
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"Supabase 取得エラー: {e}") from e

    # v2: attribute access; v1: dict access. getattr's default already covers
    # the "missing attribute" case, so no separate hasattr check is needed.
    data = getattr(resp, "data", None)
    if data is None and isinstance(resp, dict):
        data = resp.get("data")
    if not data:
        raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' からデータを取得できませんでした。")
    df = pd.DataFrame(data)
    # A list of empty dicts produces a zero-column (empty) frame — reject it.
    if df.empty:
        raise RuntimeError(f"Supabase テーブル '{TABLE_NAME}' にレコードがありません。")
    return df
|
| 67 |
|
| 68 |
def _boxplot_image(a, b, feature_name):
|
| 69 |
fig = plt.figure()
|
|
|
|
| 74 |
fig.savefig(buf, format="png", bbox_inches="tight")
|
| 75 |
plt.close(fig)
|
| 76 |
buf.seek(0)
|
| 77 |
+
img = Image.open(buf) # PIL.Image.Image
|
| 78 |
+
return np.array(img) # Gallery は numpy 配列でもOK
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
def analyze_from_supabase(threshold, top_k):
|
| 81 |
+
# ---- データ取得 ----
|
| 82 |
try:
|
| 83 |
+
df = _fetch_supabase_df()
|
| 84 |
except Exception as e:
|
| 85 |
+
msg = f"❌ データ取得に失敗:{e}\n- .env に SUPABASE_URL / SUPABASE_KEY を設定してください\n- テーブル名: {TABLE_NAME}"
|
| 86 |
+
return (msg, None, None, None, None, [], None)
|
| 87 |
|
| 88 |
+
status_md = f"**テーブル:** `{TABLE_NAME}`\n\n**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
|
| 89 |
head_df = df.head()
|
| 90 |
|
| 91 |
# ---- 目的変数の作成(悪化=1, 正常=0)----
|
| 92 |
+
target_col = "CODcr(S)sin"
|
| 93 |
+
if target_col not in df.columns:
|
| 94 |
+
return (f"❌ 必須列 '{target_col}' が見つかりません。現在の列: {list(df.columns)}", None, None, None, None, [], None)
|
| 95 |
|
| 96 |
df = df.copy()
|
| 97 |
+
# 数値化(もし文字列が混ざっていても NaN に落とす)
|
| 98 |
+
df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
|
| 99 |
+
df["label"] = (df[target_col] > threshold).astype(int)
|
| 100 |
|
| 101 |
label_counts = df["label"].value_counts(dropna=False).rename_axis("label").to_frame("count")
|
| 102 |
+
status_md += (
|
| 103 |
+
f"**閾値:** {threshold}\n\n"
|
| 104 |
+
f"**目的変数の分布:**\n"
|
| 105 |
+
f"- 正常(0): {int(label_counts.loc[0,'count']) if 0 in label_counts.index else 0}\n"
|
| 106 |
+
f"- 悪化(1): {int(label_counts.loc[1,'count']) if 1 in label_counts.index else 0}\n"
|
| 107 |
+
)
|
| 108 |
|
| 109 |
# ---- 説明変数の準備 ----
|
| 110 |
+
X = df.drop(columns=[target_col, "label"])
|
| 111 |
y = df["label"]
|
| 112 |
|
| 113 |
+
# 既知の小数表記ゆれ対策(あれば)
|
| 114 |
if "分散菌槽DO" in X.columns:
|
| 115 |
X["分散菌槽DO"] = X["分散菌槽DO"].astype(str).str.replace(",", ".", regex=False)
|
| 116 |
X["分散菌槽DO"] = pd.to_numeric(X["分散菌槽DO"], errors="coerce")
|
|
|
|
| 119 |
rows = []
|
| 120 |
for col in X.columns:
|
| 121 |
try:
|
| 122 |
+
col_num = pd.to_numeric(X[col], errors="coerce")
|
| 123 |
+
r, p = pointbiserialr(y, col_num)
|
| 124 |
rows.append((col, r, p))
|
| 125 |
except Exception:
|
| 126 |
rows.append((col, np.nan, np.nan))
|
|
|
|
| 152 |
except Exception:
|
| 153 |
pass
|
| 154 |
ttest_df = (
|
| 155 |
+
pd.DataFrame(ttest_rows).set_index("feature").sort_values(by="pval", ascending=True)
|
| 156 |
+
if ttest_rows else pd.DataFrame()
|
|
|
|
| 157 |
)
|
| 158 |
|
| 159 |
# ---- 箱ひげ図 (ギャラリー) ----
|
|
|
|
| 168 |
|
| 169 |
# ---- ロジスティック回帰 ----
|
| 170 |
X_num = X.apply(pd.to_numeric, errors="coerce").select_dtypes(include=np.number)
|
| 171 |
+
X_num = X_num.loc[:, X_num.notna().sum() > 0] # すべてNaN列を落とす
|
|
|
|
| 172 |
|
| 173 |
if X_num.shape[1] == 0:
|
| 174 |
coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
|
|
|
|
| 198 |
.sort_values(by="abs_coef", ascending=False)
|
| 199 |
.drop(columns=["abs_coef"])
|
| 200 |
)
|
|
|
|
| 201 |
coef_df["rank"] = np.arange(1, len(coef_df) + 1)
|
| 202 |
status_md += "\n\n**悪化原因の候補(上位{}項目)**:\n- ".format(top_k) + "\n- ".join(
|
| 203 |
[f"{f}: 係数={coef[f]:.3f} {('↑' if coef[f]>0 else '↓')}" for f in top_features]
|
|
|
|
| 206 |
status_md += f"\n❗ ロジスティック回帰の学習に失敗しました: {e}"
|
| 207 |
coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
|
| 208 |
|
| 209 |
+
status_md += "\n\n✅ 解析完了:Supabase データに対して 相関・t検定・箱ひげ図・ロジスティック回帰 を実行しました。"
|
| 210 |
|
| 211 |
return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
|
| 212 |
|
| 213 |
+
# === Gradio UI ===
|
| 214 |
+
with gr.Blocks(title="水質データ 解析アプリ(Supabase版)") as demo:
|
| 215 |
+
gr.Markdown(
|
| 216 |
+
"""
|
| 217 |
+
# 水質データ 解析アプリ(Supabase版)
|
| 218 |
+
`.env` の **SUPABASE_URL** / **SUPABASE_KEY** を用意し、テーブル **estimated_cause_mocdata** からデータを取得して解析します。
|
| 219 |
+
解析対象列は **CODcr(S)sin**(悪化=1 判定用)を想定しています。
|
| 220 |
+
"""
|
| 221 |
+
)
|
| 222 |
with gr.Row():
|
| 223 |
threshold_in = gr.Number(value=100, precision=0, label="CODcr(S)sin の閾値(悪化=1)")
|
| 224 |
topk_in = gr.Slider(1, 10, value=4, step=1, label="ロジスティック回帰の上位特徴量 数")
|
| 225 |
+
run_btn = gr.Button("Supabase から取得して解析", variant="primary")
|
| 226 |
|
| 227 |
status_out = gr.Markdown()
|
| 228 |
head_out = gr.Dataframe(label="データ先頭", interactive=False)
|
| 229 |
label_out = gr.Dataframe(label="目的変数の分布", interactive=False)
|
| 230 |
corr_out = gr.Dataframe(label="相関 (point-biserial)", interactive=False)
|
| 231 |
ttest_out = gr.Dataframe(label="t検定結果(p値の小さい順)", interactive=False)
|
| 232 |
+
gallery_out = gr.Gallery(label="箱ひげ図(正常 vs 悪化)", columns=2, height="auto")
|
| 233 |
coef_out = gr.Dataframe(label="ロジスティック回帰 係数ランキング", interactive=False)
|
| 234 |
|
| 235 |
run_btn.click(
|
| 236 |
+
analyze_from_supabase,
|
| 237 |
+
inputs=[threshold_in, topk_in],
|
| 238 |
outputs=[status_out, head_out, label_out, corr_out, ttest_out, gallery_out, coef_out],
|
| 239 |
)
|
| 240 |
|
| 241 |
if __name__ == "__main__":
|
| 242 |
# demo.launch(share=True) # 外部共有したい場合は share=True
|
| 243 |
+
demo.launch()
|
requirements.txt
CHANGED
|
@@ -7,4 +7,4 @@ scipy
|
|
| 7 |
scikit-learn
|
| 8 |
openpyxl
|
| 9 |
pandas
|
| 10 |
-
Pillow
|
|
|
|
| 7 |
scikit-learn
|
| 8 |
openpyxl
|
| 9 |
pandas
|
| 10 |
+
Pillow
|