Spaces:

hiroki0008
/

FIT_data

Sleeping

App Files Files Community

hiroki0008 committed on Sep 15, 2025

Commit

395503f

verified ·

1 Parent(s): 18f56a2

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -110

app.py CHANGED Viewed

@@ -11,9 +11,9 @@ import pandas as pd
 from bs4 import BeautifulSoup
 PUBLIC_URL = "https://www.fit-portal.go.jp/PublicInfo"
-OUTDIR = "data_fit"  # 保存先
-# -------------------- ユーティリティ --------------------
 def normalize_filename(name: str) -> str:
     name = unicodedata.normalize("NFKC", name)
@@ -45,7 +45,6 @@ def pick_sheet_name(xls_path: str, preferred: str | None) -> str | None:
         xl = pd.ExcelFile(xls_path)
         if preferred and preferred in xl.sheet_names:
             return preferred
-        # 代表地番を優先
         for candidate in ["代表地番", "代表地番のみ", "代表地番シート"]:
             if candidate in xl.sheet_names:
                 return candidate
@@ -60,13 +59,9 @@ def collect_pref_links(session: requests.Session) -> list[dict]:
     links = []
     for a in soup.find_all("a"):
         if is_pref_link(a):
-            links.append({
-                "pref": extract_pref_name(a),
-                "href": urljoin(PUBLIC_URL, a.get("href")),
-            })
     # 重複除去
-    seen = set()
-    uniq = []
     for item in links:
         key = (item["pref"], item["href"])
         if key not in seen:
@@ -76,7 +71,6 @@ def collect_pref_links(session: requests.Session) -> list[dict]:
 def download_one(session: requests.Session, url: str, outdir: str, pref: str) -> str:
     os.makedirs(outdir, exist_ok=True)
-    from urllib.parse import urlparse, parse_qs
     qs = parse_qs(urlparse(url).query)
     file_id = (qs.get("file", ["unknown"])[0])[:18]
     with session.get(url, timeout=180, stream=True) as r:
@@ -84,87 +78,108 @@ def download_one(session: requests.Session, url: str, outdir: str, pref: str) ->
         fname = guess_filename_from_headers(r, f"{pref}_{file_id}.xlsx")
         path = os.path.join(outdir, fname)
         with open(path, "wb") as f:
-            for chunk in r.iter_content(chunk_size=1 << 15):
                 if chunk:
                     f.write(chunk)
     return path
-# ---- 3段ヘッダー対応：1枚目のみ利用、他は削除（skiprows） ----
def load_excel(xls_path: str, sheet_pref: str | None, pref_name: str, use_header: bool) -> pd.DataFrame | None:
    """Read one workbook sheet and prepend three metadata columns.

    When ``use_header`` is true the top three rows are read as the column
    header (``header=[0, 1, 2]``); otherwise those three rows are skipped and
    the frame is read without column names (``header=None, skiprows=3``).
    In both modes the prefecture name, the source file name and the sheet
    name are inserted as the first three columns.

    Args:
        xls_path: Path of the workbook to read.
        sheet_pref: Preferred sheet name forwarded to ``pick_sheet_name``.
        pref_name: Value written to the prefecture metadata column.
        use_header: Whether the top three rows supply the column names.

    Returns:
        The loaded frame, or ``None`` when no sheet was selected or when
        anything in the read/transform steps raised.
    """
    sheet = pick_sheet_name(xls_path, sheet_pref)
    if not sheet:
        return None

    # Only the header-related arguments differ between the two read modes.
    if use_header:
        header_kwargs = {"header": [0, 1, 2]}
    else:
        header_kwargs = {"header": None, "skiprows": 3}

    try:
        frame = pd.read_excel(
            xls_path,
            sheet_name=sheet,
            engine="openpyxl",
            dtype=str,
            **header_kwargs,
        )
        # Remember the header labels before the metadata columns are inserted;
        # headerless frames get their labels from the caller later.
        header_labels = list(frame.columns) if use_header else None

        # Trim surrounding whitespace in every object-typed column.
        for col in frame.select_dtypes(include=["object"]).columns:
            frame[col] = frame[col].str.strip()

        # Metadata columns, in the order they appear at the left of the frame.
        meta_values = (
            ("都道府県", pref_name),
            ("元ファイル", os.path.basename(xls_path)),
            ("読込シート", sheet),
        )
        for position, (label, value) in enumerate(meta_values):
            frame.insert(position, label, value)

        # With a multi-level header, rebuild the columns so the metadata
        # columns carry explicit three-level labels as well.
        if use_header and isinstance(pd.Index(header_labels), pd.MultiIndex):
            meta_labels = [("meta", label, "") for label, _ in meta_values]
            frame.columns = pd.MultiIndex.from_tuples(meta_labels + header_labels)
        return frame
    except Exception as e:
        print(f"[WARN] 読み込み失敗: {xls_path} ({e})")
        return None
-# ---- 3段 → 1段 へフラット化 ----
 def flatten_columns(cols, sep: str = "_") -> list[str]:
     """
-    MultiIndex 列を '上位_中位_下位' の1段に変換。
     None / NaN / 空白は除去。重複名は .1, .2... を付与。
     """
-    # 1) 一旦すべて文字列へ
     def as_str(x):
         s = "" if x is None else str(x)
         s = s.strip()
         return "" if s.lower() == "nan" else s
-    flat = []
     if isinstance(cols, pd.MultiIndex):
         for tpl in cols:
-            parts = [as_str(p) for p in tpl]
-            parts = [p for p in parts if p]  # 空除去
-            name = sep.join(parts) if parts else "col"
-            flat.append(name)
     else:
-        flat = [as_str(c) or "col" for c in cols]
-    # 2) 重複名に連番を付与
-    seen = {}
-    uniq = []
-    for c in flat:
         if c not in seen:
             seen[c] = 0
-            uniq.append(c)
         else:
             seen[c] += 1
-            uniq.append(f"{c}.{seen[c]}")
-    return uniq
 def zip_paths(paths: list[str], out_zip: str) -> str:
     with zipfile.ZipFile(out_zip, "w", compression=zipfile.ZIP_DEFLATED) as z:
@@ -173,34 +188,23 @@ def zip_paths(paths: list[str], out_zip: str) -> str:
                 z.write(p, arcname=os.path.basename(p))
     return out_zip
-# -------------------- メイン実行（Gradioから呼ぶ） --------------------
 def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress=gr.Progress(track_tqdm=False)):
-    """
-    sheet_name: "代表地番" 等。空欄なら自動
-    sleep_sec: ダウンロード間隔
-    limit: 先頭N件のみ（テスト用）
-    re_download: 既存ファイルがあっても再取得する
-    do_flatten: 列名を1段にフラット化する
-    sep: フラット化時のセパレータ
-    """
     progress(0, desc="初期化中…")
     session = requests.Session()
     session.headers.update({
-        "User-Agent": "Mozilla/5.0 (compatible; FITCollector/1.1; +https://huggingface.co/spaces)",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     })
     # 1) リンク収集
     links = collect_pref_links(session)
     if not links:
-        return ("都道府県ファイルのリンク検出に失敗しました。ページ構成の変更や一時的なブロックの可能性があります。",
-                None, None, None, None)
     if limit and limit > 0:
         links = links[:int(limit)]
     progress(0.1, desc=f"リンク検出 {len(links)} 件")
     # 2) ダウンロード
@@ -209,7 +213,6 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
         progress(0.1 + 0.6 * i / max(1, len(links)),
                  desc=f"ダウンロード {i}/{len(links)}: {item['pref']}")
         try:
-            # 既存ファイルを利用（高速化）
             existing = None
             if not re_download and os.path.isdir(OUTDIR):
                 for fn in os.listdir(OUTDIR):
@@ -221,44 +224,42 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
             else:
                 path = download_one(session, item["href"], OUTDIR, item["pref"])
                 time.sleep(float(sleep_sec))
-            downloaded.append({"pref": item["pref"], "path": path})
         except Exception as e:
             print(f"[WARN] ダウンロード失敗: {item['pref']} {e}")
     if not downloaded:
-        return ("ダウンロードに失敗しました。ネットワークやサイト側制限をご確認ください。",
                 None, None, None, None)
-    # 3) 読み込み
-    frames = []
-    for i, it in enumerate(downloaded, start=1):
-        progress(0.72 + 0.18 * i / max(1, len(downloaded)),
-                 desc=f"読み込み {i}/{len(downloaded)}: {os.path.basename(it['path'])}")
-        df = load_excel(
-            it["path"],
-            sheet_name if sheet_name else None,
-            it["pref"],
-            use_header=(i == 1)  # 1枚目だけ上3行を列名に
-        )
         if df is not None and len(df) > 0:
             frames.append(df)
-    if not frames:
-        return ("Excelは取得できましたが、読み込めるデータがありませんでした（シート名の指定を見直してください）。",
-                None, None, None, None)
-    # 2枚目以降は列名が無いので、1枚目の列名を適用
-    if len(frames) > 1:
-        frames[1:] = [f.set_axis(frames[0].columns, axis=1) for f in frames[1:]]
-    # 4) 縦結合
     combined = pd.concat(frames, ignore_index=True)
-    # 5) 列名のフラット化（推奨）
     if do_flatten:
         combined.columns = flatten_columns(combined.columns, sep=sep or "_")
-    # 6) 出力
     os.makedirs(OUTDIR, exist_ok=True)
     out_xlsx = os.path.join(OUTDIR, "combined_fit.xlsx")
     out_parq = os.path.join(OUTDIR, "combined_fit.parquet")
@@ -266,16 +267,15 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
         combined.to_excel(w, index=False, sheet_name="combined")
     combined.to_parquet(out_parq, index=False)
-    # 7) 生ファイル一式のZIP
     raw_zip = os.path.join(OUTDIR, "raw_excels.zip")
-    zip_paths([it["path"] for it in downloaded], raw_zip)
-    # 8) プレビュー
     preview_csv = os.path.join(OUTDIR, "combined_head.csv")
     combined.head(1000).to_csv(preview_csv, index=False)
     progress(1.0, desc=f"完了（{len(combined):,} 行）")
     msg = (
         f"✅ 結合完了: 行数 = {len(combined):,}\n"
         f"・Excel: combined_fit.xlsx\n"
@@ -284,17 +284,19 @@ def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress
         f"・プレビュー: combined_head.csv\n"
         f"・列名フラット化: {'ON' if do_flatten else 'OFF'}（区切り: '{sep or '_'}'）"
     )
     return (msg, out_xlsx, out_parq, raw_zip, preview_csv)
-# -------------------- Gradio UI --------------------
 with gr.Blocks(title="FIT 公表（都道府県別Excel）一括取得＆結合") as demo:
     gr.Markdown(
         """
         # FIT 公表（都道府県別Excel）一括取得 & 結合
-        - 公表ページから都道府県別のExcelを取得し、**1枚目のみ上3行を列名**として採用、**2枚目以降は列名を削除**して縦結合します。
-        - 列名はオプションで**フラット化**（例：`大分類_中分類_小分類`）できます（推奨）。
         """
     )
     with gr.Row():

 from bs4 import BeautifulSoup
 PUBLIC_URL = "https://www.fit-portal.go.jp/PublicInfo"
+OUTDIR = "data_fit"
+# ---- ユーティリティ ---------------------------------------------------------
 def normalize_filename(name: str) -> str:
     name = unicodedata.normalize("NFKC", name)
         xl = pd.ExcelFile(xls_path)
         if preferred and preferred in xl.sheet_names:
             return preferred
         for candidate in ["代表地番", "代表地番のみ", "代表地番シート"]:
             if candidate in xl.sheet_names:
                 return candidate
     links = []
     for a in soup.find_all("a"):
         if is_pref_link(a):
+            links.append({"pref": extract_pref_name(a), "href": urljoin(PUBLIC_URL, a.get("href"))})
     # 重複除去
+    seen, uniq = set(), []
     for item in links:
         key = (item["pref"], item["href"])
         if key not in seen:
 def download_one(session: requests.Session, url: str, outdir: str, pref: str) -> str:
     os.makedirs(outdir, exist_ok=True)
     qs = parse_qs(urlparse(url).query)
     file_id = (qs.get("file", ["unknown"])[0])[:18]
     with session.get(url, timeout=180, stream=True) as r:
         fname = guess_filename_from_headers(r, f"{pref}_{file_id}.xlsx")
         path = os.path.join(outdir, fname)
         with open(path, "wb") as f:
+            for chunk in r.iter_iterable = r.iter_content(chunk_size=1 << 15)
+            for chunk in iter_iterable:
                 if chunk:
                     f.write(chunk)
     return path
# ---- Three-row header: taken from the first workbook only; later workbooks skip it ----
HEADER_ROWS = [1, 2, 3]  # row 0 is discarded; rows 1/2/3 are combined into the column names
SKIP_ROWS_OTHERS = 4     # second and later workbooks skip rows 0-3


def load_excel_first(xls_path: str, sheet_pref: str | None) -> tuple[pd.DataFrame, list]:
    """Load the first workbook, which defines the column names for all others.

    The sheet is read with ``header=HEADER_ROWS`` and every cell as ``str``.
    The leftmost column is then dropped and surrounding whitespace is trimmed
    in every object-typed column.

    Args:
        xls_path: Path of the workbook to read.
        sheet_pref: Preferred sheet name forwarded to ``pick_sheet_name``.

    Returns:
        ``(frame, columns)`` where ``columns`` is ``list(frame.columns)``
        taken after the leftmost column was removed.

    Raises:
        RuntimeError: When ``pick_sheet_name`` yields no sheet.
    """
    sheet = pick_sheet_name(xls_path, sheet_pref)
    if not sheet:
        raise RuntimeError("シートが見つかりません")

    read_options = {
        "sheet_name": sheet,
        "engine": "openpyxl",
        "header": HEADER_ROWS,
        "dtype": str,
    }
    # Read, then discard the surplus leftmost column.
    frame = pd.read_excel(xls_path, **read_options).iloc[:, 1:]

    # Trim leading/trailing whitespace.
    for col in frame.select_dtypes(include=["object"]).columns:
        frame[col] = frame[col].str.strip()

    # The labels are handed back separately so later workbooks can reuse them.
    return frame, list(frame.columns)
def load_excel_other(xls_path: str, sheet_pref: str | None, target_cols: list) -> pd.DataFrame | None:
    """Load a second-or-later workbook as data only and apply the first workbook's columns.

    The first ``SKIP_ROWS_OTHERS`` rows are skipped, the leftmost column is
    dropped, object-typed columns are whitespace-trimmed, and the frame is
    truncated or padded with empty columns so its width matches
    ``target_cols`` before those labels are assigned.

    Args:
        xls_path: Path of the workbook to read.
        sheet_pref: Preferred sheet name forwarded to ``pick_sheet_name``.
        target_cols: Column labels taken from the first workbook.

    Returns:
        The relabelled frame, or ``None`` when no sheet was selected or the
        workbook could not be read.
    """
    sheet = pick_sheet_name(xls_path, sheet_pref)
    if not sheet:
        return None

    # Best effort: the caller skips a workbook that yields None, so one
    # unreadable file must not abort the whole combine job.
    try:
        df = pd.read_excel(
            xls_path,
            sheet_name=sheet,
            engine="openpyxl",
            header=None,
            skiprows=SKIP_ROWS_OTHERS,
            dtype=str,
        )
    except Exception as e:
        print(f"[WARN] 読み込み失敗: {xls_path} ({e})")
        return None

    # Drop the surplus leftmost column.
    df = df.iloc[:, 1:]

    # Trim leading/trailing whitespace.
    for c in df.select_dtypes(include=["object"]).columns:
        df[c] = df[c].str.strip()

    # Reconcile a width mismatch (with a warning) so the labels can be applied.
    expected = len(target_cols)
    actual = df.shape[1]
    if actual != expected:
        print(f"[WARN] 列数不一致: file={os.path.basename(xls_path)} "
              f"read={actual} vs target={expected} -> 自動調整")
        if actual > expected:
            df = df.iloc[:, :expected]
        else:
            # Append empty placeholder columns; the names are temporary and
            # are overwritten by target_cols just below.
            for pad_index in range(expected - actual):
                df[f"_pad_{pad_index}"] = None

    df.columns = target_cols
    return df
+# ---- 列名フラット化 ---------------------------------------------------------
 def flatten_columns(cols, sep: str = "_") -> list[str]:
     """
+    MultiIndex 列を '上位_中位_下位' にフラット化。
     None / NaN / 空白は除去。重複名は .1, .2... を付与。
     """
     def as_str(x):
         s = "" if x is None else str(x)
         s = s.strip()
         return "" if s.lower() == "nan" else s
     if isinstance(cols, pd.MultiIndex):
+        raw = []
         for tpl in cols:
+            parts = [as_str(p) for p in tpl if as_str(p)]
+            raw.append(sep.join(parts) if parts else "col")
     else:
+        raw = [as_str(c) or "col" for c in cols]
+    seen, out = {}, []
+    for c in raw:
         if c not in seen:
             seen[c] = 0
+            out.append(c)
         else:
             seen[c] += 1
+            out.append(f"{c}.{seen[c]}")
+    return out
 def zip_paths(paths: list[str], out_zip: str) -> str:
     with zipfile.ZipFile(out_zip, "w", compression=zipfile.ZIP_DEFLATED) as z:
                 z.write(p, arcname=os.path.basename(p))
     return out_zip
+# ---- メイン（Gradioから呼び出し） -------------------------------------------
 def run_job(sheet_name, sleep_sec, limit, re_download, do_flatten, sep, progress=gr.Progress(track_tqdm=False)):
     progress(0, desc="初期化中…")
     session = requests.Session()
     session.headers.update({
+        "User-Agent": "Mozilla/5.0 (compatible; FITCollector/1.2; +https://huggingface.co/spaces)",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     })
     # 1) リンク収集
     links = collect_pref_links(session)
     if not links:
+        return ("都道府県ファイルのリンク検出に失敗しました。", None, None, None, None)
     if limit and limit > 0:
         links = links[:int(limit)]
     progress(0.1, desc=f"リンク検出 {len(links)} 件")
     # 2) ダウンロード
         progress(0.1 + 0.6 * i / max(1, len(links)),
                  desc=f"ダウンロード {i}/{len(links)}: {item['pref']}")
         try:
             existing = None
             if not re_download and os.path.isdir(OUTDIR):
                 for fn in os.listdir(OUTDIR):
             else:
                 path = download_one(session, item["href"], OUTDIR, item["pref"])
                 time.sleep(float(sleep_sec))
+            downloaded.append(path)
         except Exception as e:
             print(f"[WARN] ダウンロード失敗: {item['pref']} {e}")
     if not downloaded:
+        return ("ダウンロードに失敗しました。", None, None, None, None)
+    # 3) 読み込み・1枚目
+    progress(0.75, desc="1枚目を読み込み（列名生成）")
+    first_path = downloaded[0]
+    try:
+        df0, cols0 = load_excel_first(first_path, sheet_name if sheet_name else None)
+    except Exception as e:
+        return (f"1枚目の読み込みに失敗しました: {os.path.basename(first_path)} / {e}",
                 None, None, None, None)
+    frames = [df0]
+    # 4) 読み込み・2枚目以降
+    for j, p in enumerate(downloaded[1:], start=2):
+        progress(0.75 + 0.25 * (j - 1) / max(1, len(downloaded) - 1),
+                 desc=f"{j}枚目を読み込み")
+        df = load_excel_other(p, sheet_name if sheet_name else None, cols0)
         if df is not None and len(df) > 0:
             frames.append(df)
+        else:
+            print(f"[WARN] 読み込みスキップ: {os.path.basename(p)}")
+    # 5) 縦結合
     combined = pd.concat(frames, ignore_index=True)
+    # 6) 列名のフラット化（既定ON）
     if do_flatten:
         combined.columns = flatten_columns(combined.columns, sep=sep or "_")
+    # 7) 出力
     os.makedirs(OUTDIR, exist_ok=True)
     out_xlsx = os.path.join(OUTDIR, "combined_fit.xlsx")
     out_parq = os.path.join(OUTDIR, "combined_fit.parquet")
         combined.to_excel(w, index=False, sheet_name="combined")
     combined.to_parquet(out_parq, index=False)
+    # 8) ZIP（取得ファイル一式）
     raw_zip = os.path.join(OUTDIR, "raw_excels.zip")
+    zip_paths(downloaded, raw_zip)
+    # 9) プレビュー
     preview_csv = os.path.join(OUTDIR, "combined_head.csv")
     combined.head(1000).to_csv(preview_csv, index=False)
     progress(1.0, desc=f"完了（{len(combined):,} 行）")
     msg = (
         f"✅ 結合完了: 行数 = {len(combined):,}\n"
         f"・Excel: combined_fit.xlsx\n"
         f"・プレビュー: combined_head.csv\n"
         f"・列名フラット化: {'ON' if do_flatten else 'OFF'}（区切り: '{sep or '_'}'）"
     )
     return (msg, out_xlsx, out_parq, raw_zip, preview_csv)
+# ---- Gradio UI -------------------------------------------------------------
 with gr.Blocks(title="FIT 公表（都道府県別Excel）一括取得＆結合") as demo:
     gr.Markdown(
         """
         # FIT 公表（都道府県別Excel）一括取得 & 結合
+        **要件に沿った処理**:
+        - 1枚目のみ「0行目を削除」「1/2/3行目を結合して列名」。
+        - 2枚目以降は「3行目までスキップ」してデータのみ。
+        - すべてのファイルで**左端の列を削除**。
+        - ファイル/シート名などのメタ列は**付与しません**。
         """
     )
     with gr.Row():