Spaces:

Kung-Hsun
/

Data_Extraction_CLG_Exp

Sleeping

App Files Community

Kung-Hsun committed on Nov 12, 2025

Commit

d467c2d

verified ·

1 Parent(s): 8bfccb8

Update app.py

Browse files

Files changed (1)

app.py +129 -85

app.py CHANGED Viewed

@@ -3,8 +3,9 @@ import pandas as pd
 import numpy as np
 import io
 import os
-from datetime import datetime, time, timedelta
 from typing import Union
 EXCEL_LETTERS = ["A", "B", "K", "L", "M", "V", "W", "X", "Y"]
 TARGET_NAMES  = ["data", "time", "⊿Ptop", "⊿Pmid", "⊿Pbot", "H2%", "CO%", "CO2%", "CH4%"]
@@ -27,8 +28,6 @@ def get_lower_name(file_input: Union[str, os.PathLike, io.BytesIO, bytes, object
 def load_dataframe(file_input) -> pd.DataFrame:
     lower_name = get_lower_name(file_input)
-    # 1) 路徑（含 NamedString）
     if isinstance(file_input, (str, os.PathLike)):
         path = str(file_input)
         if lower_name.endswith((".xlsx", ".xls")):
@@ -47,45 +46,34 @@ def load_dataframe(file_input) -> pd.DataFrame:
                 except Exception:
                     return pd.read_csv(path)
-    # 2) 檔案物件（具 .read）
     if hasattr(file_input, "read"):
         raw = file_input.read()
         bio = io.BytesIO(raw)
         if lower_name.endswith((".xlsx", ".xls")):
-            bio.seek(0)
-            return pd.read_excel(bio, engine="openpyxl")
         elif lower_name.endswith(".csv"):
             try:
-                bio.seek(0)
-                return pd.read_csv(bio, sep=None, engine="python")
             except Exception:
-                bio.seek(0)
-                return pd.read_csv(bio)
         else:
             try:
-                bio.seek(0)
-                return pd.read_excel(bio, engine="openpyxl")
             except Exception:
                 try:
-                    bio.seek(0)
-                    return pd.read_csv(bio, sep=None, engine="python")
                 except Exception:
-                    bio.seek(0)
-                    return pd.read_csv(bio)
-    # 3) bytes
     if isinstance(file_input, (bytes, bytearray)):
         bio = io.BytesIO(file_input)
         try:
-            bio.seek(0)
-            return pd.read_excel(bio, engine="openpyxl")
         except Exception:
             try:
-                bio.seek(0)
-                return pd.read_csv(bio, sep=None, engine="python")
             except Exception:
-                bio.seek(0)
-                return pd.read_csv(bio)
     raise ValueError("不支援的檔案型態，請上傳 .xlsx 或 .csv 檔。")
@@ -94,14 +82,11 @@ def extract_and_rename(df: pd.DataFrame) -> pd.DataFrame:
     existing_positions = [i for i in TARGET_INDICES if i < n_cols]
     if not existing_positions:
         raise ValueError("上傳的資料欄位數不足，無法擷取指定欄位（A,B,K,L,M,V,W,X,Y）。")
     out = df.iloc[:, existing_positions].copy()
     name_map = []
     for pos in existing_positions:
         idx_in_targets = TARGET_INDICES.index(pos)
         name_map.append(TARGET_NAMES[idx_in_targets])
     out.columns = name_map
     return out
@@ -123,97 +108,126 @@ def parse_time_to_seconds(h, m, s):
     return h * 3600 + m * 60 + s
 def _hhmmss_int_to_seconds(n: int):
-    """將整數 HHMMSS（例如 93005）轉成秒；不合法回傳 pd.NA。"""
-    if n < 0 or n > 235959:
-        return pd.NA
-    ss = n % 100
-    n //= 100
-    mm = n % 100
-    n //= 100
     hh = n % 100
     if 0 <= hh <= 23 and 0 <= mm <= 59 and 0 <= ss <= 59:
-        return hh * 3600 + mm * 60 + ss
     return pd.NA
 def series_time_to_seconds_of_day(series: pd.Series) -> pd.Series:
-    """
-    將 'time' 欄位轉成 0~86399 的秒數。
-    支援：
-      - pandas datetime64[ns] / datetime64[ns, tz]
-      - timedelta64[ns]
-      - 文字：'YYYY-mm-dd HH:MM:SS' / 'HH:MM:SS(.fff)' / 'AM/PM'
-      - Excel 序列（包含日期+時間，像 45213.5）
-      - 純數字 HHMMSS（93005 -> 09:30:05）
-      - Python datetime.time
-    未能解析者回傳 NaN。
-    """
     s = series.copy()
-    # 1) 若已是 datetime64，直接取時分秒
     if pd.api.types.is_datetime64_any_dtype(s):
-        sec = (s.dt.hour * 3600 + s.dt.minute * 60 + s.dt.second).astype("float")
-        return sec
-    # 2) 若是 timedelta（少見），取一天內秒數
     if pd.api.types.is_timedelta64_dtype(s):
         total_sec = s.dt.total_seconds()
         return (total_sec % 86400).astype("float")
-    # 3) 嘗試一般字串/物件 → datetime
     parsed = pd.to_datetime(s, errors="coerce")
-    sec_parsed = (parsed.dt.hour * 3600 + parsed.dt.minute * 60 + parsed.dt.second).astype("float")
-    # 4) Excel 序列時間（含日期部分），任何數值都取小數部分 * 86400
     num = pd.to_numeric(s, errors="coerce")
-    sec_excel = ((num % 1) * 86400).round().astype("float")  # 45213.5 -> 0.5 天 -> 43200 秒
-    # 僅在 parsed 失敗時使用 excel 轉換
     result = sec_parsed.where(~sec_parsed.isna(), other=np.nan)
     result = np.where(np.isnan(result), sec_excel, result)
     result = pd.Series(result, index=s.index, dtype="float")
-    # 5) 純數字 HHMMSS（ex: 93005）
     mask_intlike = num.notna() & (num == np.floor(num))
     sec_hhmmss = pd.Series(np.nan, index=s.index, dtype="float")
     if mask_intlike.any():
         ints = num[mask_intlike].astype("int64")
         sec_hhmmss.loc[mask_intlike] = ints.map(_hhmmss_int_to_seconds).astype("float")
-    # 僅在前兩招皆 NaN 時，採用 HHMMSS 轉換
     fill_mask = result.isna() & sec_hhmmss.notna()
     result.loc[fill_mask] = sec_hhmmss.loc[fill_mask]
-    # 6) Python datetime.time 物件
     if result.isna().any():
         obj_mask = result.isna()
         subset = s[obj_mask]
         def time_obj_to_sec(x):
             if isinstance(x, time):
-                return x.hour * 3600 + x.minute * 60 + x.second
             return np.nan
         result.loc[obj_mask] = subset.map(time_obj_to_sec)
-    # 最終返回（仍可能有 NaN，代表無法解析）
     return result.astype("float")
 def pad_time(h, m, s):
-    def to2(x):
-        return "??" if x is None else f"{int(x):02d}"
     return f"{to2(h)}:{to2(m)}:{to2(s)}"
-with gr.Blocks(title="Excel/CSV 指定欄位擷取器（含時間區段）") as demo:
-    gr.Markdown("### 指定欄位擷取（A,B,K,L,M,V,W,X,Y）→ 重新命名為 data,time,⊿Ptop,⊿Pmid,⊿Pbot,H2%,CO%,CO2%,CH4% ；可依 **時間區段 (hh:mm:ss)** 過濾。")
     inp = gr.File(label="上傳 .xlsx 或 .csv 檔案", file_types=[".xlsx", ".csv"], type="filepath")
     with gr.Row():
-        gr.Markdown("**開始時間 (hh:mm:ss)** — 三個欄位：")
     with gr.Row():
         sh = gr.Number(label="Start HH (0-23)", value=None)
         sm = gr.Number(label="Start MM (0-59)", value=None)
         ss = gr.Number(label="Start SS (0-59)", value=None)
     with gr.Row():
-        gr.Markdown("**結束時間 (hh:mm:ss)** — 三個欄位：")
     with gr.Row():
         eh = gr.Number(label="End HH (0-23)", value=None)
         em = gr.Number(label="End MM (0-59)", value=None)
@@ -225,66 +239,96 @@ with gr.Blocks(title="Excel/CSV 指定欄位擷取器（含時間區段）") as
     msg = gr.Markdown()
     preview = gr.Dataframe(label="預覽（前 20 列）", wrap=True)
     def run_pipeline(file_path_str, sh_, sm_, ss_, eh_, em_, es_):
         if not file_path_str:
-            return gr.update(visible=False), "請先上傳檔案。", pd.DataFrame()
         try:
             df = load_dataframe(file_path_str)
             out = extract_and_rename(df)
         except Exception as e:
-            return gr.update(visible=False), f"處理失敗：{e}", pd.DataFrame()
         original_rows = len(out)
-        # 嘗試時間過濾
         try:
             start_sec = parse_time_to_seconds(sh_, sm_, ss_)
             end_sec   = parse_time_to_seconds(eh_, em_, es_)
         except Exception as e:
-            return gr.update(visible=False), f"時間輸入錯誤：{e}", pd.DataFrame()
         parsed_ok = None
         if (start_sec is not None) and (end_sec is not None):
             if "time" not in out.columns:
-                return gr.update(visible=False), "找不到 'time' 欄，無法做時間過濾。", pd.DataFrame()
             secs = series_time_to_seconds_of_day(out["time"])
             parsed_ok = int(secs.notna().sum())
             valid_mask = secs.notna()
             secs_valid = secs.where(valid_mask, other=-1)
             if start_sec <= end_sec:
                 keep = valid_mask & (secs_valid >= start_sec) & (secs_valid <= end_sec)
             else:
                 keep = valid_mask & ((secs_valid >= start_sec) | (secs_valid <= end_sec))
             out = out.loc[keep].reset_index(drop=True)
-        # 寫檔輸出
         ts = datetime.now().strftime("%Y%m%d_%H%M%S")
         out_path = f"/tmp/extracted_columns_{ts}.xlsx"
         try:
             out.to_excel(out_path, index=False, engine="openpyxl")
         except Exception as e:
-            return gr.update(visible=False), f"輸出 Excel 失敗：{e}", pd.DataFrame()
-        # 訊息摘要
-        note_lines = [f"完成！原始列數：**{original_rows}**"]
         if parsed_ok is not None:
-            note_lines.append(f"可解析時間列數：**{parsed_ok}**")
-            note_lines.append(f"時間區段：**{pad_time(sh_, sm_, ss_)} → {pad_time(eh_, em_, es_)}**")
-        note_lines.append(f"輸出列數：**{len(out)}**")
-        note_lines.append("下方預覽、右側可下載 Excel。")
         note = "｜".join(note_lines)
-        return gr.update(value=out_path, visible=True), note, out.head(20)
     run_btn.click(
         run_pipeline,
         inputs=[inp, sh, sm, ss, eh, em, es],
-        outputs=[file_out, msg, preview]
     )
 if __name__ == "__main__":

 import numpy as np
 import io
 import os
+from datetime import datetime, time
 from typing import Union
+import matplotlib.pyplot as plt
 EXCEL_LETTERS = ["A", "B", "K", "L", "M", "V", "W", "X", "Y"]
 TARGET_NAMES  = ["data", "time", "⊿Ptop", "⊿Pmid", "⊿Pbot", "H2%", "CO%", "CO2%", "CH4%"]
 def load_dataframe(file_input) -> pd.DataFrame:
     lower_name = get_lower_name(file_input)
     if isinstance(file_input, (str, os.PathLike)):
         path = str(file_input)
         if lower_name.endswith((".xlsx", ".xls")):
                 except Exception:
                     return pd.read_csv(path)
     if hasattr(file_input, "read"):
         raw = file_input.read()
         bio = io.BytesIO(raw)
         if lower_name.endswith((".xlsx", ".xls")):
+            bio.seek(0); return pd.read_excel(bio, engine="openpyxl")
         elif lower_name.endswith(".csv"):
             try:
+                bio.seek(0); return pd.read_csv(bio, sep=None, engine="python")
             except Exception:
+                bio.seek(0); return pd.read_csv(bio)
         else:
             try:
+                bio.seek(0); return pd.read_excel(bio, engine="openpyxl")
             except Exception:
                 try:
+                    bio.seek(0); return pd.read_csv(bio, sep=None, engine="python")
                 except Exception:
+                    bio.seek(0); return pd.read_csv(bio)
     if isinstance(file_input, (bytes, bytearray)):
         bio = io.BytesIO(file_input)
         try:
+            bio.seek(0); return pd.read_excel(bio, engine="openpyxl")
         except Exception:
             try:
+                bio.seek(0); return pd.read_csv(bio, sep=None, engine="python")
             except Exception:
+                bio.seek(0); return pd.read_csv(bio)
     raise ValueError("不支援的檔案型態，請上傳 .xlsx 或 .csv 檔。")
     existing_positions = [i for i in TARGET_INDICES if i < n_cols]
     if not existing_positions:
         raise ValueError("上傳的資料欄位數不足，無法擷取指定欄位（A,B,K,L,M,V,W,X,Y）。")
     out = df.iloc[:, existing_positions].copy()
     name_map = []
     for pos in existing_positions:
         idx_in_targets = TARGET_INDICES.index(pos)
         name_map.append(TARGET_NAMES[idx_in_targets])
     out.columns = name_map
     return out
     return h * 3600 + m * 60 + s
 def _hhmmss_int_to_seconds(n: int):
+    if n < 0 or n > 235959: return pd.NA
+    ss = n % 100; n //= 100
+    mm = n % 100; n //= 100
     hh = n % 100
     if 0 <= hh <= 23 and 0 <= mm <= 59 and 0 <= ss <= 59:
+        return hh*3600 + mm*60 + ss
     return pd.NA
 def series_time_to_seconds_of_day(series: pd.Series) -> pd.Series:
     s = series.copy()
     if pd.api.types.is_datetime64_any_dtype(s):
+        return (s.dt.hour*3600 + s.dt.minute*60 + s.dt.second).astype("float")
     if pd.api.types.is_timedelta64_dtype(s):
         total_sec = s.dt.total_seconds()
         return (total_sec % 86400).astype("float")
     parsed = pd.to_datetime(s, errors="coerce")
+    sec_parsed = (parsed.dt.hour*3600 + parsed.dt.minute*60 + parsed.dt.second).astype("float")
     num = pd.to_numeric(s, errors="coerce")
+    sec_excel = ((num % 1) * 86400).round().astype("float")
     result = sec_parsed.where(~sec_parsed.isna(), other=np.nan)
     result = np.where(np.isnan(result), sec_excel, result)
     result = pd.Series(result, index=s.index, dtype="float")
     mask_intlike = num.notna() & (num == np.floor(num))
     sec_hhmmss = pd.Series(np.nan, index=s.index, dtype="float")
     if mask_intlike.any():
         ints = num[mask_intlike].astype("int64")
         sec_hhmmss.loc[mask_intlike] = ints.map(_hhmmss_int_to_seconds).astype("float")
     fill_mask = result.isna() & sec_hhmmss.notna()
     result.loc[fill_mask] = sec_hhmmss.loc[fill_mask]
     if result.isna().any():
         obj_mask = result.isna()
         subset = s[obj_mask]
         def time_obj_to_sec(x):
             if isinstance(x, time):
+                return x.hour*3600 + x.minute*60 + x.second
             return np.nan
         result.loc[obj_mask] = subset.map(time_obj_to_sec)
     return result.astype("float")
 def pad_time(h, m, s):
+    def to2(x): return "??" if x is None else f"{int(x):02d}"
     return f"{to2(h)}:{to2(m)}:{to2(s)}"
+def make_scatter_with_trend(df: pd.DataFrame, x_col: str, y_cols: list):
+    if df is None or len(df) == 0:
+        raise ValueError("沒有可繪圖的資料。")
+    if not x_col or not y_cols:
+        raise ValueError("請選擇 X 與至少一個 Y 欄位。")
+    for c in [x_col, *y_cols]:
+        if c not in df.columns:
+            raise ValueError(f"找不到欄位：{c}")
+    # 轉 x 為數值
+    x = df[x_col]
+    if x_col == "time" or x.dtype == object:
+        x_num = series_time_to_seconds_of_day(x)
+    else:
+        x_num = pd.to_numeric(x, errors="coerce")
+    if x_num.notna().sum() < 2:
+        raise ValueError("X 軸無法解析為數值或有效點數不足。")
+    fig, ax = plt.subplots(figsize=(7, 4.5))  # 一張圖、單軸
+    lines = 0
+    for y_col in y_cols:
+        y = pd.to_numeric(df[y_col], errors="coerce")
+        mask = x_num.notna() & y.notna()
+        if mask.sum() < 2:
+            continue
+        xs = x_num[mask].values
+        ys = y[mask].values
+        # 散佈圖
+        ax.scatter(xs, ys, label=f"{y_col}", alpha=0.8)
+        # 線性趨勢線（最小平方法）
+        try:
+            slope, intercept = np.polyfit(xs, ys, 1)
+            x_line = np.linspace(xs.min(), xs.max(), 200)
+            y_line = slope * x_line + intercept
+            ax.plot(x_line, y_line, linewidth=2)
+            lines += 1
+        except Exception:
+            pass
+    ax.set_xlabel(x_col)
+    if len(y_cols) == 1:
+        ax.set_ylabel(y_cols[0])
+    else:
+        ax.set_ylabel("Selected Y")
+    ax.grid(True, alpha=0.3)
+    ax.legend()
+    fig.tight_layout()
+    return fig, lines
+with gr.Blocks(title="Excel/CSV 指定欄位擷取器（含時間過濾＋繪圖）") as demo:
+    gr.Markdown("### 指定欄位擷取（A,B,K,L,M,V,W,X,Y）→ 重新命名為 data,time,⊿Ptop,⊿Pmid,⊿Pbot,H2%,CO%,CO2%,CH4% ；支援 **時間區段 (hh:mm:ss)** 過濾與 **散佈圖＋直線** 繪製（Y 可複選）。")
+    df_state = gr.State(value=None)  # 保存處理後的 DataFrame
     inp = gr.File(label="上傳 .xlsx 或 .csv 檔案", file_types=[".xlsx", ".csv"], type="filepath")
     with gr.Row():
+        gr.Markdown("**開始時間 (hh:mm:ss)**")
     with gr.Row():
         sh = gr.Number(label="Start HH (0-23)", value=None)
         sm = gr.Number(label="Start MM (0-59)", value=None)
         ss = gr.Number(label="Start SS (0-59)", value=None)
     with gr.Row():
+        gr.Markdown("**結束時間 (hh:mm:ss)**")
     with gr.Row():
         eh = gr.Number(label="End HH (0-23)", value=None)
         em = gr.Number(label="End MM (0-59)", value=None)
     msg = gr.Markdown()
     preview = gr.Dataframe(label="預覽（前 20 列）", wrap=True)
+    gr.Markdown("### 繪圖設定")
+    with gr.Row():
+        x_sel = gr.Dropdown(label="X 軸欄位", choices=[], value=None)
+        y_sel = gr.Dropdown(label="Y 軸欄位（可複選）", choices=[], value=None, multiselect=True)
+    plot_btn = gr.Button("繪圖（散佈＋直線）")
+    plot_out = gr.Plot(label="散佈圖（含線性趨勢線）")
+    plot_msg = gr.Markdown()
     def run_pipeline(file_path_str, sh_, sm_, ss_, eh_, em_, es_):
         if not file_path_str:
+            return gr.update(visible=False), "請先上傳檔案。", pd.DataFrame(), None, gr.update(choices=[], value=None), gr.update(choices=[], value=None)
         try:
             df = load_dataframe(file_path_str)
             out = extract_and_rename(df)
         except Exception as e:
+            return gr.update(visible=False), f"處理失敗：{e}", pd.DataFrame(), None, gr.update(choices=[], value=None), gr.update(choices=[], value=None)
         original_rows = len(out)
         try:
             start_sec = parse_time_to_seconds(sh_, sm_, ss_)
             end_sec   = parse_time_to_seconds(eh_, em_, es_)
         except Exception as e:
+            return gr.update(visible=False), f"時間輸入錯誤：{e}", pd.DataFrame(), None, gr.update(choices=[], value=None), gr.update(choices=[], value=None)
         parsed_ok = None
         if (start_sec is not None) and (end_sec is not None):
             if "time" not in out.columns:
+                return gr.update(visible=False), "找不到 'time' 欄，無法做時間過濾。", pd.DataFrame(), None, gr.update(choices=[], value=None), gr.update(choices=[], value=None)
             secs = series_time_to_seconds_of_day(out["time"])
             parsed_ok = int(secs.notna().sum())
             valid_mask = secs.notna()
             secs_valid = secs.where(valid_mask, other=-1)
             if start_sec <= end_sec:
                 keep = valid_mask & (secs_valid >= start_sec) & (secs_valid <= end_sec)
             else:
                 keep = valid_mask & ((secs_valid >= start_sec) | (secs_valid <= end_sec))
             out = out.loc[keep].reset_index(drop=True)
         ts = datetime.now().strftime("%Y%m%d_%H%M%S")
         out_path = f"/tmp/extracted_columns_{ts}.xlsx"
         try:
             out.to_excel(out_path, index=False, engine="openpyxl")
         except Exception as e:
+            return gr.update(visible=False), f"輸出 Excel 失敗：{e}", pd.DataFrame(), None, gr.update(choices=[], value=None), gr.update(choices=[], value=None)
+        # 更新下拉選單
+        cols = out.columns.tolist()
+        default_x = "time" if "time" in cols else (cols[0] if cols else None)
+        default_y = [c for c in ["H2%", "CO%", "CO2%", "CH4%"] if c in cols] or ([cols[1]] if len(cols) > 1 else cols)
+        note_lines = [f"完成！原始列數：**{original_rows}**",
+                      f"輸出列數：**{len(out)}**"]
         if parsed_ok is not None:
+            note_lines.insert(1, f"可解析時間列數：**{parsed_ok}**")
+            note_lines.insert(2, f"時間區段：**{pad_time(sh_, sm_, ss_)} → {pad_time(eh_, em_, es_)}**")
+        note_lines.append("下方預覽、右側可下載 Excel；並可於下方選擇欄位繪圖。")
         note = "｜".join(note_lines)
+        return (
+            gr.update(value=out_path, visible=True),
+            note,
+            out.head(20),
+            out,  # 存入 State
+            gr.update(choices=cols, value=default_x),
+            gr.update(choices=cols, value=default_y)
+        )
+    def plot_handler(df, x_col, y_cols):
+        if df is None:
+            return None, "尚未有可用資料，請先完成上方處理。"
+        try:
+            fig, lines = make_scatter_with_trend(df, x_col, y_cols or [])
+            msg = f"完成繪圖：共 {len(y_cols or [])} 個 Y；已繪製 {lines} 條線性趨勢線。"
+            return fig, msg
+        except Exception as e:
+            return None, f"繪圖失敗：{e}"
     run_btn.click(
         run_pipeline,
         inputs=[inp, sh, sm, ss, eh, em, es],
+        outputs=[file_out, msg, preview, df_state, x_sel, y_sel]
+    )
+    plot_btn.click(
+        plot_handler,
+        inputs=[df_state, x_sel, y_sel],
+        outputs=[plot_out, plot_msg]
     )
 if __name__ == "__main__":