MTeguri committed on
Commit
f5327a7
·
1 Parent(s): 6d8104c

Update README.md to reflect new project title and theme, changing from 'Cause Estimation Tool' to 'Operation Data Analysis' with updated emoji and color scheme.

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. README.md +4 -4
  3. app.py +193 -0
  4. requirements.txt +10 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .venv/
2
+ *.un~
3
+ .env
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Cause Estimation Tool
3
- emoji: 🚀
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.44.1
8
  app_file: app.py
 
1
  ---
2
+ title: Operation Data Analysis
3
+ emoji: 🦀
4
+ colorFrom: gray
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.44.1
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # ---- 必要ライブラリ ----
3
+ # pip install gradio pandas numpy matplotlib scipy scikit-learn openpyxl pillow
4
+
5
+ import io
6
+ import pandas as pd
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ from scipy.stats import ttest_ind, pointbiserialr
10
+ from sklearn.linear_model import LogisticRegression
11
+ from sklearn.impute import SimpleImputer
12
+ from sklearn.preprocessing import StandardScaler
13
+ from sklearn.pipeline import Pipeline
14
+ import gradio as gr
15
+ from PIL import Image
16
+
17
# Select the non-interactive "Agg" backend (intended for server-side execution).
plt.switch_backend("Agg")

# Japanese font configuration: candidate font families for figure text.
import matplotlib

matplotlib.rcParams.update(
    {
        "font.family": [
            "DejaVu Sans",
            "Hiragino Sans",
            "Yu Gothic",
            "Meiryo",
            "Takao",
            "IPAexGothic",
            "IPAPGothic",
            "VL PGothic",
            "Noto Sans CJK JP",
        ],
    }
)
22
+
23
def _boxplot_image(a, b, feature_name):
    """Render a two-group box plot and return it as an image array.

    Args:
        a: Values drawn as the first box (tick label "正常(0)").
        b: Values drawn as the second box (tick label "悪化(1)").
        feature_name: Used in the plot title and as the y-axis label.

    Returns:
        A numpy array decoded from the PNG rendering of the figure, which is
        the form this app passes to the Gradio gallery.
    """
    # Work on an explicit figure/axes pair rather than pyplot's implicit
    # "current figure" so the drawing calls cannot land on another figure.
    fig, ax = plt.subplots()
    try:
        ax.boxplot([a, b])
        # Tick labels are set on the axis instead of through boxplot()'s
        # `labels` keyword, which Matplotlib 3.9 renamed to `tick_labels`;
        # this spelling works before and after that rename.
        ax.set_xticklabels(["正常(0)", "悪化(1)"])
        ax.set_title(f"Boxplot: {feature_name}")
        ax.set_ylabel(feature_name)
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
    finally:
        # Always release the figure, even when rendering fails.
        plt.close(fig)
    buf.seek(0)
    with Image.open(buf) as img:
        return np.array(img)
36
+
37
_TARGET_COLUMN = "CODcr(S)sin"
# Column whose values may be text with a decimal comma (e.g. "1,5").
_DECIMAL_COMMA_COLUMN = "分散菌槽DO"


def _analysis_failure(message):
    """Build the 7-item result for a run that stops early with *message*."""
    return (message, None, None, None, None, [], None)


def _empty_coef_table():
    """Return an empty coefficient table with the usual columns."""
    return pd.DataFrame(columns=["feature", "coef", "sign", "rank"])


def _split_by_label(values, y):
    """Return one feature's non-missing values for label 0 and for label 1."""
    return values[y == 0].dropna(), values[y == 1].dropna()


def _point_biserial_table(numeric, y):
    """Correlate every numeric feature with the binary label.

    Missing feature values are excluded pairwise. A feature with fewer than
    two usable rows, or where the feature or the label is constant on the
    usable rows, gets NaN for both the coefficient and the p-value.
    """
    columns = ["feature", "r_pb", "pval"]
    rows = []
    for name, values in numeric.items():
        valid = values.notna()
        x_valid = values[valid]
        y_valid = y[valid]
        r, p = np.nan, np.nan
        if len(x_valid) >= 2 and x_valid.nunique() >= 2 and y_valid.nunique() >= 2:
            try:
                r, p = pointbiserialr(y_valid, x_valid.astype(float))
            except (TypeError, ValueError):
                r, p = np.nan, np.nan
        rows.append((name, r, p))
    if not rows:
        return pd.DataFrame(columns=columns)
    return (
        pd.DataFrame(rows, columns=columns)
        .sort_values(by="r_pb", key=lambda s: s.abs(), ascending=False)
        .reset_index(drop=True)
    )


def _welch_ttest_table(numeric, y):
    """Run Welch's t-test (label 0 vs label 1) for every numeric feature.

    Features with fewer than two values in either group are skipped.
    Returns an empty DataFrame when no feature could be tested.
    """
    rows = []
    for name, values in numeric.items():
        normal, bad = _split_by_label(values, y)
        if len(normal) < 2 or len(bad) < 2:
            continue
        try:
            _, p = ttest_ind(normal, bad, equal_var=False)
        except (TypeError, ValueError):
            # Best effort: a feature that cannot be tested is left out.
            continue
        rows.append(
            {
                "feature": name,
                "mean_normal": normal.mean(),
                "mean_bad": bad.mean(),
                "pval": p,
                "n_normal": len(normal),
                "n_bad": len(bad),
            }
        )
    if not rows:
        return pd.DataFrame()
    return (
        pd.DataFrame(rows)
        .sort_values(by="pval", ascending=True)
        .reset_index(drop=True)
    )


def analyze_excel(file, threshold, top_k):
    """Analyze an Excel sheet for features related to a high ``CODcr(S)sin``.

    Rows are labelled 1 when ``CODcr(S)sin`` is greater than *threshold* and
    0 otherwise; rows whose target value is missing or non-numeric are
    excluded. Each remaining column is coerced to numeric and run through a
    point-biserial correlation, Welch's t-test, a box plot, and a
    standardized logistic regression.

    Args:
        file: Path to the workbook, or an object whose ``name`` attribute is
            that path. ``None`` produces a prompt to upload a file.
        threshold: Cut-off applied to ``CODcr(S)sin``.
        top_k: Number of largest-magnitude logistic-regression coefficients
            listed in the status text.

    Returns:
        A 7-tuple ``(status_md, head_df, label_counts, corr_df, ttest_df,
        gallery_imgs, coef_df)``. On an early failure the first item is the
        message, the gallery is ``[]`` and the remaining items are ``None``.
        The result tables keep the label / feature name as a regular column
        so that it is part of the table data rather than the index.
    """
    if file is None:
        return _analysis_failure("⚠️ 先にExcelファイル(.xlsx)をアップロードしてください。")
    if threshold is None:
        # A missing threshold cannot be compared against the target column.
        return _analysis_failure("❌ 閾値を入力してください。")

    # Accept both a plain path and an upload object exposing `.name`.
    path = getattr(file, "name", file)
    try:
        df = pd.read_excel(path)
    except Exception as e:
        # UI boundary: any read failure is reported to the user, not raised.
        return _analysis_failure(f"❌ 読み込みエラー: {e}")

    status_md = f"**データ形状:** {df.shape[0]} 行 × {df.shape[1]} 列\n\n"
    head_df = df.head()

    # ---- Target variable (bad = 1, normal = 0) ----
    if _TARGET_COLUMN not in df.columns:
        return _analysis_failure("❌ 必須列 'CODcr(S)sin' が見つかりません。列名を確認してください。")

    target = pd.to_numeric(df[_TARGET_COLUMN], errors="coerce")
    measured = target.notna()
    n_missing = int((~measured).sum())
    # Rows without a target value carry no label information; comparing NaN
    # with the threshold would otherwise count them as "normal".
    df = df.loc[measured].copy()
    y = (target[measured] > threshold).astype(int)
    if n_missing:
        status_md += f"**目的変数が欠損のため除外した行:** {n_missing}\n\n"

    counts = y.value_counts().reindex([0, 1], fill_value=0)
    label_counts = counts.rename_axis("label").reset_index(name="count")
    status_md += (
        f"**閾値:** {threshold}\n\n**目的変数の分布:**\n"
        f"- 正常(0): {int(counts.loc[0])}\n- 悪化(1): {int(counts.loc[1])}\n"
    )

    # ---- Explanatory variables ----
    X = df.drop(columns=[_TARGET_COLUMN, "label"], errors="ignore")
    if _DECIMAL_COMMA_COLUMN in X.columns:
        # Turn a decimal comma into a decimal point before numeric parsing.
        X[_DECIMAL_COMMA_COLUMN] = (
            X[_DECIMAL_COMMA_COLUMN].astype(str).str.replace(",", ".", regex=False)
        )
    # Coerce once and reuse for every analysis below.
    numeric = X.apply(pd.to_numeric, errors="coerce")

    # ---- Correlation (point-biserial) and t-test ----
    corr_df = _point_biserial_table(numeric, y)
    ttest_df = _welch_ttest_table(numeric, y)

    # ---- Box plots (gallery) ----
    gallery_imgs = []
    for name, values in numeric.items():
        normal, bad = _split_by_label(values, y)
        if len(normal) > 0 and len(bad) > 0:
            gallery_imgs.append((_boxplot_image(normal, bad, name), f"Boxplot: {name}"))

    # ---- Logistic regression ----
    X_num = numeric.select_dtypes(include=np.number)
    # Drop columns that contain no value at all (nothing to impute from).
    X_num = X_num.loc[:, X_num.notna().sum() > 0]

    if X_num.shape[1] == 0:
        coef_df = _empty_coef_table()
        status_md += "\n⚠️ 数値説明変数がありませんでした。係数は空です。"
    else:
        pipe = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
                ("clf", LogisticRegression(max_iter=500, class_weight="balanced")),
            ]
        )
        try:
            pipe.fit(X_num, y)
            coef = pd.Series(pipe.named_steps["clf"].coef_[0], index=X_num.columns)
            coef_df = (
                pd.DataFrame(
                    {
                        "coef": coef,
                        "abs_coef": coef.abs(),
                        "sign": np.where(
                            coef > 0,
                            "↑ (増加で悪化リスク上昇)",
                            "↓ (増加で悪化リスク低下)",
                        ),
                    }
                )
                .sort_values(by="abs_coef", ascending=False)
                .drop(columns=["abs_coef"])
                .rename_axis("feature")
                .reset_index()
            )
            coef_df["rank"] = np.arange(1, len(coef_df) + 1)

            # List the leading rows of the ranked table so the status text
            # and the table always agree; report how many are really listed.
            top_rows = coef_df.head(int(top_k))
            bullet_lines = [
                f"{row.feature}: 係数={row.coef:.3f} {('↑' if row.coef > 0 else '↓')}"
                for row in top_rows.itertuples(index=False)
            ]
            status_md += "\n\n**悪化原因の候補(上位{}項目)**:\n- ".format(
                len(bullet_lines)
            ) + "\n- ".join(bullet_lines)
        except Exception as e:
            # UI boundary: e.g. a single-class label cannot be fitted; report it.
            status_md += f"\n❗ ロジスティック回帰の学習に失敗しました: {e}"
            coef_df = _empty_coef_table()

    status_md += "\n\n✅ 解析完了:相関・t検定・箱ひげ図・ロジスティック回帰を実行しました。"

    return status_md, head_df, label_counts, corr_df, ttest_df, gallery_imgs, coef_df
167
+
168
# ---- Gradio UI: layout, event wiring and entry point ----
demo = gr.Blocks(title="水質データ 解析アプリ(相関 / t検定 / 箱ひげ / ロジ回帰)")

with demo:
    # Page heading with a one-line usage hint.
    gr.Markdown(
        "# 水質データ 解析アプリ\nExcelをアップロードし、閾値と上位特徴量数を指定して[解析実行]してください。"
    )

    # Inputs: the workbook upload on its own row, then the two parameters.
    with gr.Row():
        file_in = gr.File(
            label="Excelファイル(.xlsx)をアップロード",
            file_types=[".xlsx"],
        )
    with gr.Row():
        threshold_in = gr.Number(
            value=100,
            precision=0,
            label="CODcr(S)sin の閾値(悪化=1)",
        )
        topk_in = gr.Slider(
            minimum=1,
            maximum=10,
            value=4,
            step=1,
            label="ロジスティック回帰の上位特徴量 数",
        )
    run_btn = gr.Button("解析実行", variant="primary")

    # Outputs, created in the order they are shown on the page.
    status_out = gr.Markdown()
    head_out = gr.Dataframe(label="データ先頭", interactive=False)
    label_out = gr.Dataframe(label="目的変数の分布", interactive=False)
    corr_out = gr.Dataframe(label="相関 (point-biserial)", interactive=False)
    ttest_out = gr.Dataframe(label="t検定結果(p値の小さい順)", interactive=False)
    gallery_out = gr.Gallery(label="箱ひげ図(正常 vs 悪化)")
    coef_out = gr.Dataframe(label="ロジスティック回帰 係数ランキング", interactive=False)

    # The output order must match the tuple returned by analyze_excel.
    analysis_inputs = [file_in, threshold_in, topk_in]
    analysis_outputs = [
        status_out,
        head_out,
        label_out,
        corr_out,
        ttest_out,
        gallery_out,
        coef_out,
    ]
    run_btn.click(fn=analyze_excel, inputs=analysis_inputs, outputs=analysis_outputs)

if __name__ == "__main__":
    # To share the app externally, call demo.launch(share=True) instead.
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio[mcp]
2
+ supabase
3
+ python-dotenv
4
+ numpy
5
+ matplotlib
6
+ scipy
7
+ scikit-learn
8
+ openpyxl
9
+ pandas
10
+ Pillow