Spaces:

hiroki0008
/

FIT_data

Sleeping

App Files Files Community

hiroki0008 committed on Sep 15, 2025

Commit

1012ab1

verified ·

1 Parent(s): 3fa0fec

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -113

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from bs4 import BeautifulSoup
 PUBLIC_URL = "https://www.fit-portal.go.jp/PublicInfo"
 OUTDIR = "data_fit"
-# ---- ユーティリティ ---------------------------------------------------------
 def normalize_filename(name: str) -> str:
     name = unicodedata.normalize("NFKC", name)
@@ -45,6 +45,7 @@ def pick_sheet_name(xls_path: str, preferred: str | None) -> str | None:
         xl = pd.ExcelFile(xls_path)
         if preferred and preferred in xl.sheet_names:
             return preferred
         for candidate in ["代表地番", "代表地番のみ", "代表地番シート"]:
             if candidate in xl.sheet_names:
                 return candidate
@@ -78,56 +79,66 @@ def download_one(session: requests.Session, url: str, outdir: str, pref: str) ->
         fname = guess_filename_from_headers(r, f"{pref}_{file_id}.xlsx")
         path = os.path.join(outdir, fname)
         with open(path, "wb") as f:
-            # ✅ ここを修正：通常の iter_content ループに
             for chunk in r.iter_content(chunk_size=1 << 15):
                 if chunk:
                     f.write(chunk)
     return path
 def choose_names_from_multiindex(mi: pd.MultiIndex) -> list[str]:
     """
-    3段ヘッダ(MultiIndex: 大,中,小)から列名を選ぶ。
-    優先順: 中分類(第2段) → 小分類(第3段) → 大分類(第1段)。
-    空/NaN/空白は無視。重複は .1, .2… を付与。
     """
-    def clean(x) -> str:
-        if x is None:
-            return ""
-        s = str(x).strip()
-        return "" if s.lower() == "nan" else s
-    # 優先で選択
-    picked = []
     for tpl in mi:
-        a = clean(tpl[0]) if len(tpl) >= 1 else ""  # 大
-        b = clean(tpl[1]) if len(tpl) >= 2 else ""  # 中
-        c = clean(tpl[2]) if len(tpl) >= 3 else ""  # 小
-        name = b or c or a or "col"                # ★ 中 > 小 > 大
-        picked.append(name)
     # 重複解消
     seen = {}
-    uniq = []
-    for n in picked:
         if n not in seen:
             seen[n] = 0
-            uniq.append(n)
         else:
             seen[n] += 1
-            uniq.append(f"{n}.{seen[n]}")
-    return uniq
-# ---- 3段ヘッダ → 1枚目のみ採用／他はスキップ行数で読込 ----------------------
-HEADER_ROWS = [1, 2, 3]  # 0行目は削除、1/2/3行目を列名として結合
-SKIP_ROWS_OTHERS = 4     # 2枚目以降は 0〜3 行目をスキップ
-def load_excel_first(xls_path: str, sheet_pref: str | None) -> tuple[pd.DataFrame, list]:
     """
-    1枚目: 0行目は使わず、1/2/3行目を列名にする（MultiIndex）。
-    さらに「一番左の列」を削除して返す。
-    戻り値: (df, columns_multiindex)
     """
     sheet = pick_sheet_name(xls_path, sheet_pref)
     if not sheet:
@@ -139,37 +150,37 @@ def load_excel_first(xls_path: str, sheet_pref: str | None) -> tuple[pd.DataFram
         header=HEADER_ROWS,
         dtype=str
     )
-    # 左端の余計な列を削除
     df = df.iloc[:, 1:]
-    # 列名を選択（中 > 小 > 大）
     if isinstance(df.columns, pd.MultiIndex):
         chosen = choose_names_from_multiindex(df.columns)
     else:
-        # 単層ヘッダの場合のフォールバック
-        raw = []
-        for c in df.columns:
-            s = "" if c is None else str(c).strip()
-            raw.append("" if s.lower() == "nan" else s)
-        raw = [r if r else "col" for r in raw]
         seen = {}
         chosen = []
         for n in raw:
-            if n in seen:
-                seen[n] += 1
-                chosen.append(f"{n}.{seen[n]}")
-            else:
                 seen[n] = 0
                 chosen.append(n)
     df.columns = chosen
-    return df, cols
-def load_excel_other(xls_path: str, sheet_pref: str | None, target_cols: list) -> pd.DataFrame | None:
     """
-    2枚目以降: 3行目までスキップしてデータのみ読み込み。
-    左端列を削除後、1枚目の列（MultiIndex）を適用。
     """
     sheet = pick_sheet_name(xls_path, sheet_pref)
     if not sheet:
@@ -182,58 +193,27 @@ def load_excel_other(xls_path: str, sheet_pref: str | None, target_cols: list) -
         skiprows=SKIP_ROWS_OTHERS,
         dtype=str
     )
-    # 左端の余計な列を削除
     df = df.iloc[:, 1:]
     # 前後空白トリム
     for c in df.select_dtypes(include=["object"]).columns:
         df[c] = df[c].str.strip()
-    # 列数が合わない場合は合わせられる範囲で調整（警告付き）
     if df.shape[1] != len(target_cols):
         print(f"[WARN] 列数不一致: file={os.path.basename(xls_path)} "
               f"read={df.shape[1]} vs target={len(target_cols)} -> 自動調整")
         if df.shape[1] > len(target_cols):
             df = df.iloc[:, :len(target_cols)]
         else:
-            # 足りない場合は欠損列を追加
-            for _ in range(len(target_cols) - df.shape[1]):
-                df[pd.util.hash_pandas_object(df).name or f"_pad_{_}"] = None
-            # 列順を並べ替え
             df = df.iloc[:, :len(target_cols)]
     df.columns = target_cols
     return df
-# ---- 列名フラット化 ---------------------------------------------------------
-def flatten_columns(cols, sep: str = "_") -> list[str]:
-    """
-    MultiIndex 列を '上位_中位_下位' にフラット化。
-    None / NaN / 空白は除去。重複名は .1, .2... を付与。
-    """
-    def as_str(x):
-        s = "" if x is None else str(x)
-        s = s.strip()
-        return "" if s.lower() == "nan" else s
-    if isinstance(cols, pd.MultiIndex):
-        raw = []
-        for tpl in cols:
-            parts = [as_str(p) for p in tpl if as_str(p)]
-            raw.append(sep.join(parts) if parts else "col")
-    else:
-        raw = [as_str(c) or "col" for c in cols]
-    seen, out = {}, []
-    for c in raw:
-        if c not in seen:
-            seen[c] = 0
-            out.append(c)
-        else:
-            seen[c] += 1
-            out.append(f"{c}.{seen[c]}")
-    return out
 def zip_paths(paths: list[str], out_zip: str) -> str:
     with zipfile.ZipFile(out_zip, "w", compression=zipfile.ZIP_DEFLATED) as z:
         for p in paths:
@@ -241,21 +221,22 @@ def zip_paths(paths: list[str], out_zip: str) -> str:
                 z.write(p, arcname=os.path.basename(p))
     return out_zip
-# ---- メイン（Gradioから呼び出し） -------------------------------------------
-def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress=gr.Progress(track_tqdm=False)):
     progress(0, desc="初期化中…")
     session = requests.Session()
     session.headers.update({
-        "User-Agent": "Mozilla/5.0 (compatible; FITCollector/1.2; +https://huggingface.co/spaces)",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     })
     # 1) リンク収集
     links = collect_pref_links(session)
     if not links:
-        return ("都道府県ファイルのリンク検出に失敗しました。", None, None, None, None)
     if limit and limit > 0:
         links = links[:int(limit)]
     progress(0.1, desc=f"リンク検出 {len(links)} 件")
@@ -284,8 +265,8 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
     if not downloaded:
         return ("ダウンロードに失敗しました。", None, None, None, None)
-    # 3) 読み込み・1枚目
-    progress(0.75, desc="1枚目を読み込み（列名生成）")
     first_path = downloaded[0]
     try:
         df0, cols0 = load_excel_first(first_path, sheet_name if sheet_name else None)
@@ -295,7 +276,7 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
     frames = [df0]
-    # 4) 読み込み・2枚目以降
     for j, p in enumerate(downloaded[1:], start=2):
         progress(0.75 + 0.25 * (j - 1) / max(1, len(downloaded) - 1),
                  desc=f"{j}枚目を読み込み")
@@ -308,11 +289,7 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
     # 5) 縦結合
     combined = pd.concat(frames, ignore_index=True)
-    # 6) 列名のフラット化（既定ON）
-    if do_flatten:
-        combined.columns = flatten_columns(combined.columns, sep=sep or "_")
-    # 7) 出力
     os.makedirs(OUTDIR, exist_ok=True)
     out_xlsx = os.path.join(OUTDIR, "combined_fit.xlsx")
     out_parq = os.path.join(OUTDIR, "combined_fit.parquet")
@@ -320,11 +297,11 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
         combined.to_excel(w, index=False, sheet_name="combined")
     combined.to_parquet(out_parq, index=False)
-    # 8) ZIP（取得ファイル一式）
     raw_zip = os.path.join(OUTDIR, "raw_excels.zip")
     zip_paths(downloaded, raw_zip)
-    # 9) プレビュー
     preview_csv = os.path.join(OUTDIR, "combined_head.csv")
     combined.head(1000).to_csv(preview_csv, index=False)
@@ -335,21 +312,22 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
         f"・Parquet: combined_fit.parquet\n"
         f"・Raw ZIP: raw_excels.zip\n"
         f"・プレビュー: combined_head.csv\n"
-        f"・列名フラット化: {'ON' if do_flatten else 'OFF'}（区切り: '{sep or '_'}'）"
     )
     return (msg, out_xlsx, out_parq, raw_zip, preview_csv)
-# ---- Gradio UI -------------------------------------------------------------
 with gr.Blocks(title="FIT 公表（都道府県別Excel）一括取得＆結合") as demo:
     gr.Markdown(
         """
         # FIT 公表（都道府県別Excel）一括取得 & 結合
-        **要件に沿った処理**:
-        - 1枚目のみ「0行目を削除」「1/2/3行目を結合して列名」。
-        - 2枚目以降は「3行目までスキップ」してデータのみ。
-        - すべてのファイルで**左端の列を削除**。
-        - ファイル/シート名などのメタ列は**付与しません**。
         """
     )
     with gr.Row():
@@ -358,9 +336,6 @@ with gr.Blocks(title="FIT 公表（都道府県別Excel）一括取得＆結合"
     with gr.Row():
         limit = gr.Number(value=None, precision=0, label="先頭N県のみ（テスト用・空欄は全県）")
         reget = gr.Checkbox(label="既存ファイルがあっても再ダウンロードする", value=False)
-    with gr.Accordion("列名オプション", open=True):
-        do_flatten = gr.Checkbox(label="列名を1段にフラット化（推奨）", value=True)
-        sep = gr.Textbox(label="フラット化セパレータ", value="_", placeholder="例）_, /, | など")
     run_btn = gr.Button("実行", variant="primary")
     out_msg = gr.Markdown()
@@ -371,7 +346,7 @@ with gr.Blocks(title="FIT 公表（都道府県別Excel）一括取得＆結合"
     run_btn.click(
         fn=run_job,
-        inputs=[sheet, sleep, limit, reget, do_flatten, sep],
         outputs=[out_msg, out_xlsx, out_parq, out_zip, out_preview]
     )

 PUBLIC_URL = "https://www.fit-portal.go.jp/PublicInfo"
 OUTDIR = "data_fit"
+# -------------------- ユーティリティ --------------------
 def normalize_filename(name: str) -> str:
     name = unicodedata.normalize("NFKC", name)
         xl = pd.ExcelFile(xls_path)
         if preferred and preferred in xl.sheet_names:
             return preferred
+        # 一般的に「代表地番」を優先
         for candidate in ["代表地番", "代表地番のみ", "代表地番シート"]:
             if candidate in xl.sheet_names:
                 return candidate
         fname = guess_filename_from_headers(r, f"{pref}_{file_id}.xlsx")
         path = os.path.join(outdir, fname)
         with open(path, "wb") as f:
             for chunk in r.iter_content(chunk_size=1 << 15):
                 if chunk:
                     f.write(chunk)
     return path
+# -------------------- 列名選択: 小分類 > 中分類 > 大分類 --------------------
+def _clean_cell(x) -> str:  # Normalize one header cell into a stripped string.
+    if x is None:  # a missing cell yields an empty name
+        return ""
+    s = str(x).strip()
+    if s.lower() == "nan":  # text "nan" (any letter case) is treated as empty
+        return ""
+    return s
 def choose_names_from_multiindex(mi: pd.MultiIndex) -> list[str]:
     """
+    Choose one column name per entry of a 3-level header (MultiIndex).
+    Rule: use the minor category (3rd level) if it has a value, otherwise the
+           middle category (2nd level), otherwise the major category (1st level);
+    if all levels are empty use 'col'. Finally, duplicates get .1, .2, ... suffixes.
     """
+    names = []
     for tpl in mi:
+        # tpl is expected to be (major, middle, minor)
+        if len(tpl) < 3:
+            # Safety fallback in case fewer than three levels are present
+            a = _clean_cell(tpl[0]) if len(tpl) >= 1 else ""
+            b = _clean_cell(tpl[1]) if len(tpl) >= 2 else ""
+            c = ""
+        else:
+            a, b, c = (_clean_cell(tpl[0]), _clean_cell(tpl[1]), _clean_cell(tpl[2]))
+        name = c or b or a or "col"
+        names.append(name)
     # Resolve duplicates: the first occurrence keeps its name, later ones get a counter suffix
     seen = {}
+    out = []
+    for n in names:
         if n not in seen:
             seen[n] = 0
+            out.append(n)
         else:
             seen[n] += 1
+            out.append(f"{n}.{seen[n]}")
+    return out
+# -------------------- Loading rules --------------------
+# Row 0 is dropped; rows 1/2/3 are used as the header (= header=[1,2,3])
+HEADER_ROWS = [1, 2, 3]
+# From the second file onward, skip rows 0-3 (= skiprows=4) and read data only with header=None
+SKIP_ROWS_OTHERS = 4
+def load_excel_first(xls_path: str, sheet_pref: str | None) -> tuple[pd.DataFrame, list[str]]:
     """
+    1枚目:
+      - header=[1,2,3] で3段ヘッダを読み込み（0行目は自動的に使われない）
+      - 左端の列を削除
+      - MultiIndex から列名を「小＞中＞大」の優先で単一行に変換
+    戻り値: (df, chosen_names)
     """
     sheet = pick_sheet_name(xls_path, sheet_pref)
     if not sheet:
         header=HEADER_ROWS,
         dtype=str
     )
+    # 左端の列を削除
     df = df.iloc[:, 1:]
+    # 前後空白トリム
+    for c in df.select_dtypes(include=["object"]).columns:
+        df[c] = df[c].str.strip()
+    # 列名を選択
     if isinstance(df.columns, pd.MultiIndex):
         chosen = choose_names_from_multiindex(df.columns)
     else:
+        # 念のため単層だった場合もクリーニング＆重複解消
+        raw = [_clean_cell(c) or "col" for c in df.columns]
         seen = {}
         chosen = []
         for n in raw:
+            if n not in seen:
                 seen[n] = 0
                 chosen.append(n)
+            else:
+                seen[n] += 1
+                chosen.append(f"{n}.{seen[n]}")
     df.columns = chosen
+    return df, chosen
+def load_excel_other(xls_path: str, sheet_pref: str | None, target_cols: list[str]) -> pd.DataFrame | None:
     """
+    2枚目以降:
+      - skiprows=4, header=None でデータのみ
+      - 左端の列を削除
+      - 列数が合わなければ切り詰め/ダミー列追加で合わせる
+      - 列名を 1枚目の chosen に置換
     """
     sheet = pick_sheet_name(xls_path, sheet_pref)
     if not sheet:
         skiprows=SKIP_ROWS_OTHERS,
         dtype=str
     )
+    # 左端の列を削除
     df = df.iloc[:, 1:]
     # 前後空白トリム
     for c in df.select_dtypes(include=["object"]).columns:
         df[c] = df[c].str.strip()
+    # 列数調整
     if df.shape[1] != len(target_cols):
         print(f"[WARN] 列数不一致: file={os.path.basename(xls_path)} "
               f"read={df.shape[1]} vs target={len(target_cols)} -> 自動調整")
         if df.shape[1] > len(target_cols):
             df = df.iloc[:, :len(target_cols)]
         else:
+            # 足りないときは None 列を追加
+            for k in range(len(target_cols) - df.shape[1]):
+                df[f"_pad_{k}"] = None
             df = df.iloc[:, :len(target_cols)]
     df.columns = target_cols
     return df
 def zip_paths(paths: list[str], out_zip: str) -> str:
     with zipfile.ZipFile(out_zip, "w", compression=zipfile.ZIP_DEFLATED) as z:
         for p in paths:
                 z.write(p, arcname=os.path.basename(p))
     return out_zip
+# -------------------- メイン実行（Gradioから呼ぶ） --------------------
+def run_job(sheet_name, sleep_sec, limit, re_download, progress=gr.Progress(track_tqdm=False)):
     progress(0, desc="初期化中…")
     session = requests.Session()
     session.headers.update({
+        "User-Agent": "Mozilla/5.0 (compatible; FITCollector/1.3; +https://huggingface.co/spaces)",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     })
     # 1) リンク収集
     links = collect_pref_links(session)
     if not links:
+        return ("都道府県ファイルのリンク検出に失敗しました。ページ構成の変更/一時的な制限の可能性があります。",
+                None, None, None, None)
     if limit and limit > 0:
         links = links[:int(limit)]
     progress(0.1, desc=f"リンク検出 {len(links)} 件")
     if not downloaded:
         return ("ダウンロードに失敗しました。", None, None, None, None)
+    # 3) 読み込み（1枚目で列名確定）
+    progress(0.75, desc="1枚目を読み込み（列名を確定）")
     first_path = downloaded[0]
     try:
         df0, cols0 = load_excel_first(first_path, sheet_name if sheet_name else None)
     frames = [df0]
+    # 4) 読み込み（2枚目以降）
     for j, p in enumerate(downloaded[1:], start=2):
         progress(0.75 + 0.25 * (j - 1) / max(1, len(downloaded) - 1),
                  desc=f"{j}枚目を読み込み")
     # 5) 縦結合
     combined = pd.concat(frames, ignore_index=True)
+    # 6) 出力
     os.makedirs(OUTDIR, exist_ok=True)
     out_xlsx = os.path.join(OUTDIR, "combined_fit.xlsx")
     out_parq = os.path.join(OUTDIR, "combined_fit.parquet")
         combined.to_excel(w, index=False, sheet_name="combined")
     combined.to_parquet(out_parq, index=False)
+    # 7) ZIP（取得ファイル一式）
     raw_zip = os.path.join(OUTDIR, "raw_excels.zip")
     zip_paths(downloaded, raw_zip)
+    # 8) プレビュー
     preview_csv = os.path.join(OUTDIR, "combined_head.csv")
     combined.head(1000).to_csv(preview_csv, index=False)
         f"・Parquet: combined_fit.parquet\n"
         f"・Raw ZIP: raw_excels.zip\n"
         f"・プレビュー: combined_head.csv\n"
+        f"・列名は『小分類＞中分類＞大分類』の優先で単一行化（結合は不実施）"
     )
     return (msg, out_xlsx, out_parq, raw_zip, preview_csv)
+# -------------------- Gradio UI --------------------
 with gr.Blocks(title="FIT 公表（都道府県別Excel）一括取得＆結合") as demo:
     gr.Markdown(
         """
         # FIT 公表（都道府県別Excel）一括取得 & 結合
+        **列名ポリシー**:
+        - 1枚目: 0行目を使わず、1/2/3行目をヘッダとして読み込み（3段）。
+        - 列名は **小分類に値があれば小分類、無ければ中分類のみ**（結合しません）。
+        - 2枚目以降: 0〜3行目をスキップし、データのみ読み込み。
+        - すべてのファイルで **左端の列は削除**。
+        - ファイル名／シート名などのメタ列は付与しません。
         """
     )
     with gr.Row():
     with gr.Row():
         limit = gr.Number(value=None, precision=0, label="先頭N県のみ（テスト用・空欄は全県）")
         reget = gr.Checkbox(label="既存ファイルがあっても再ダウンロードする", value=False)
     run_btn = gr.Button("実行", variant="primary")
     out_msg = gr.Markdown()
     run_btn.click(
         fn=run_job,
+        inputs=[sheet, sleep, limit, reget],
         outputs=[out_msg, out_xlsx, out_parq, out_zip, out_preview]
     )