Spaces:

AI-Agent-Exercise-2025
/

Cause_estimation_tool

Running

+# app.py
+# ---- Required libraries ----
+# pip install gradio pandas numpy matplotlib scipy scikit-learn openpyxl pillow
+import io
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import ttest_ind, pointbiserialr
+from sklearn.linear_model import LogisticRegression
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+import gradio as gr
+from PIL import Image
+plt.switch_backend("Agg")  # サーバー実行向け
+# 日本語フォントの設定
+import matplotlib
+matplotlib.rcParams['font.family'] = ['DejaVu Sans', 'Hiragino Sans', 'Yu Gothic', 'Meiryo', 'Takao', 'IPAexGothic', 'IPAPGothic', 'VL PGothic', 'Noto Sans CJK JP']
+def _boxplot_image(a, b, feature_name):
+    fig = plt.figure()
+    plt.boxplot([a, b], labels=["正常(0)", "悪化(1)"])
+    plt.title(f"Boxplot: {feature_name}")
+    plt.ylabel(feature_name)
+    buf = io.BytesIO()
+    fig.savefig(buf, format="png", bbox_inches="tight")
+    plt.close(fig)
+    buf.seek(0)
+    # GradioのGallery用にnumpy配列として返す
+    img = Image.open(buf)
+    img_array = np.array(img)
+    return img_array
+def analyze_excel(file, threshold, top_k):
+    if file is None:
+        return (
+            "⚠️ 先にExcelファイル（.xlsx）をアップロードしてください。",
+            None, None, None, None, [], None
+        )
+    try:
+        df = pd.read_excel(file.name)
+    except Exception as e:
+        return (f"❌ 読み込みエラー: {e}", None, None, None, None, [], None)
+    status_md = f"**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
+    head_df = df.head()
+    # ---- 目的変数の作成（悪化=1, 正常=0）----
+    if "CODcr(S)sin" not in df.columns:
+        return ("❌ 必須列 'CODcr(S)sin' が見つかりません。列名を確認してください。", None, None, None, None, [], None)
+    df = df.copy()
+    df["label"] = (df["CODcr(S)sin"] > threshold).astype(int)
+    label_counts = df["label"].value_counts(dropna=False).rename_axis("label").to_frame("count")
+    status_md += f"**閾値:** {threshold}\n\n**目的変数の分布:**\n- 正常(0): {int(label_counts.loc[0,'count']) if 0 in label_counts.index else 0}\n- 悪化(1): {int(label_counts.loc[1,'count']) if 1 in label_counts.index else 0}\n"
+    # ---- 説明変数の準備 ----
+    X = df.drop(columns=["CODcr(S)sin", "label"])
+    y = df["label"]
+    # 文字列の小数点を ',' → '.' に調整（カラムがあれば）
+    if "分散菌槽DO" in X.columns:
+        X["分散菌槽DO"] = X["分散菌槽DO"].astype(str).str.replace(",", ".", regex=False)
+        X["分散菌槽DO"] = pd.to_numeric(X["分散菌槽DO"], errors="coerce")
+    # ---- 相関 (point-biserial) ----
+    rows = []
+    for col in X.columns:
+        try:
+            r, p = pointbiserialr(y, pd.to_numeric(X[col], errors="coerce"))
+            rows.append((col, r, p))
+        except Exception:
+            rows.append((col, np.nan, np.nan))
+    corr_df = (
+        pd.DataFrame(rows, columns=["feature", "r_pb", "pval"])
+        .set_index("feature")
+        .sort_values(by="r_pb", key=lambda s: s.abs(), ascending=False)
+    )
+    # ---- t検定 ----
+    ttest_rows = []
+    for col in X.columns:
+        col_num = pd.to_numeric(X[col], errors="coerce")
+        a = col_num[y == 0].dropna()
+        b = col_num[y == 1].dropna()
+        if len(a) > 1 and len(b) > 1:
+            try:
+                t, p = ttest_ind(a, b, equal_var=False)
+                ttest_rows.append(
+                    {
+                        "feature": col,
+                        "mean_normal": a.mean(),
+                        "mean_bad": b.mean(),
+                        "pval": p,
+                        "n_normal": len(a),
+                        "n_bad": len(b),
+                    }
+                )
+            except Exception:
+                pass
+    ttest_df = (
+        pd.DataFrame(ttest_rows)
+        .set_index("feature")
+        .sort_values(by="pval", ascending=True) if ttest_rows else pd.DataFrame()
+    )
+    # ---- 箱ひげ図 (ギャラリー) ----
+    gallery_imgs = []
+    for col in X.columns:
+        col_num = pd.to_numeric(X[col], errors="coerce")
+        a_plot = col_num[y == 0].dropna()
+        b_plot = col_num[y == 1].dropna()
+        if len(a_plot) > 0 and len(b_plot) > 0:
+            img_array = _boxplot_image(a_plot, b_plot, col)
+            gallery_imgs.append((img_array, f"Boxplot: {col}"))
+    # ---- ロジスティック回帰 ----
+    X_num = X.apply(pd.to_numeric, errors="coerce").select_dtypes(include=np.number)
+    # すべてNaN列を落とす
+    X_num = X_num.loc[:, X_num.notna().sum() > 0]
+    if X_num.shape[1] == 0:
+        coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
+        status_md += "\n⚠️ 数値説明変数がありませんでした。係数は空です。"
+    else:
+        pipe = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="median")),
+                ("scaler", StandardScaler()),
+                ("clf", LogisticRegression(max_iter=500, class_weight="balanced")),
+            ]
+        )
+        try:
+            pipe.fit(X_num, y)
+            coef = pd.Series(pipe.named_steps["clf"].coef_[0], index=X_num.columns)
+            coef_abs_sorted = coef.abs().sort_values(ascending=False)
+            top_features = coef_abs_sorted.head(int(top_k)).index.tolist()
+            coef_df = (
+                pd.DataFrame(
+                    {
+                        "coef": coef,
+                        "abs_coef": coef.abs(),
+                        "sign": np.where(coef > 0, "↑ (増加で悪化リスク上昇)", "↓ (増加で悪化リスク低下)"),
+                    }
+                )
+                .sort_values(by="abs_coef", ascending=False)
+                .drop(columns=["abs_coef"])
+            )
+            # rank列付与
+            coef_df["rank"] = np.arange(1, len(coef_df) + 1)
+            status_md += "\n\n**悪化原因の候補（上位{}項目）**:\n- ".format(top_k) + "\n- ".join(
+                [f"{f}: 係数={coef[f]:.3f} {('↑' if coef[f]>0 else '↓')}" for f in top_features]
+            )
+        except Exception as e:
+            status_md += f"\n❗ ロジスティック回帰の学習に失敗しました: {e}"
+            coef_df = pd.DataFrame(columns=["feature", "coef", "sign", "rank"]).set_index("feature")
+    status_md += "\n\n✅ 解析完了：相関・t検定・箱ひげ図・ロジスティック回帰を実行しました。"
+    return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
+with gr.Blocks(title="水質データ 解析アプリ（相関 / t検定 / 箱ひげ / ロジ回帰）") as demo:
+    gr.Markdown("# 水質データ 解析アプリ\nExcelをアップロードし、閾値と上位特徴量数を指定して［解析実行］してください。")
+    with gr.Row():
+        file_in = gr.File(label="Excelファイル（.xlsx）をアップロード", file_types=[".xlsx"])
+    with gr.Row():
+        threshold_in = gr.Number(value=100, precision=0, label="CODcr(S)sin の閾値（悪化=1）")
+        topk_in = gr.Slider(1, 10, value=4, step=1, label="ロジスティック回帰の上位特徴量 数")
+    run_btn = gr.Button("解析実行", variant="primary")
+    status_out = gr.Markdown()
+    head_out = gr.Dataframe(label="データ先頭", interactive=False)
+    label_out = gr.Dataframe(label="目的変数の分布", interactive=False)
+    corr_out = gr.Dataframe(label="相関 (point-biserial)", interactive=False)
+    ttest_out = gr.Dataframe(label="t検定結果（p値の小さい順）", interactive=False)
+    gallery_out = gr.Gallery(label="箱ひげ図（正常 vs 悪化）")
+    coef_out = gr.Dataframe(label="ロジスティック回帰 係数ランキング", interactive=False)
+    run_btn.click(
+        analyze_excel,
+        inputs=[file_in, threshold_in, topk_in],
+        outputs=[status_out, head_out, label_out, corr_out, ttest_out, gallery_out, coef_out],
+    )
+if __name__ == "__main__":
+    # demo.launch(share=True)  # 外部共有したい場合は share=True
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio[mcp]
+supabase
+python-dotenv
+numpy
+matplotlib
+scipy
+scikit-learn
+openpyxl
+pandas
+Pillow