Spaces:

hiroki0008
/

FIT_data

Sleeping

App Files Files Community

hiroki0008 committed on Sep 15, 2025

Commit

7c5a3be

verified ·

1 Parent(s): 7a6ea38

Update app.py

Browse files

Files changed (1) hide show

app.py +241 -16

app.py CHANGED Viewed

@@ -1,21 +1,246 @@
-# FIT 公表（都道府県別Excel）一括取得 & 結合（Hugging Face Spaces）
-このSpaceは、[FIT-PORTAL 公表ページ](https://www.fit-portal.go.jp/PublicInfo) から都道府県別のExcelを取得して結合します。
-## 使い方
-1. 上部の「実行」ボタンを押す
-2. **読み込むシート名**を指定（例：`代表地番`、未指定なら自動選択）
-3. **ダウンロード間隔（秒）**でアクセス間隔を調整（デフォルト 1.0 秒）
-4. 完了後、以下の成果物をダウンロードできます
-   - `combined_fit.xlsx`（結合Excel）
-   - `combined_fit.parquet`（同内容のParquet）
-   - `raw_excels.zip`（取得した都道府県ファイル一式）
-   - `combined_head.csv`（先頭1000行プレビュー）
-## 注意事項
-- 公表ページの構成変更やアクセス制限により取得できない場合があります。
-- 本ツールはページの **HTMLリンクを直接解析** してダウンロードしています。サイトの最新の利用規約・robots.txt・著作権等を遵守のうえ、適切にご利用ください。
-- サーバ負荷軽減のため、間隔（sleep）を十分に確保してください。
-- 列名・体裁は月ごとに微修正される可能性があるため、まずは **全列を縦結合** し、後段の整形処理で標準化することを推奨します。

+import os
+import re
+import time
+import zipfile
+import unicodedata
+from io import BytesIO
+from urllib.parse import urljoin, urlparse, parse_qs, unquote
+import gradio as gr
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+PUBLIC_URL = "https://www.fit-portal.go.jp/PublicInfo"
+OUTDIR = "data_fit"  # Spacesの永続領域（リポジトリ直下）に保存
+# -------------------- ユーティリティ --------------------
def normalize_filename(name: str) -> str:
    """Return a filesystem-safe file name derived from *name*.

    The name is NFKC-normalized first so that full-width look-alikes of
    reserved characters (e.g. a full-width slash) are folded to their ASCII
    form and then replaced like the ASCII ones.

    Args:
        name: Raw file name, possibly taken from an untrusted HTTP header.

    Returns:
        The sanitized name, or ``"file"`` when nothing usable remains.
    """
    name = unicodedata.normalize("NFKC", name).strip()
    # Besides the characters reserved on Windows, replace control characters:
    # a NUL byte makes open() raise and newlines produce unusable names.
    name = re.sub(r'[\\/:*?"<>|\x00-\x1f\x7f]+', "_", name)
    # Trailing dots/spaces are invalid on Windows, and a dot-only name
    # ("." or "..") would refer to a directory instead of a file.
    name = name.strip().rstrip(". ")
    return name or "file"
def guess_filename_from_headers(resp: requests.Response, fallback: str) -> str:
    """Derive a safe local file name from a response's Content-Disposition.

    The extended ``filename*=charset'lang'value`` parameter is preferred over
    the plain ``filename=`` parameter, because the plain one is only an ASCII
    fallback when both are present.

    Args:
        resp: Response whose ``headers`` mapping is inspected.
        fallback: Name to use when the header carries no usable file name.

    Returns:
        The sanitized file name (see ``normalize_filename``).
    """
    cd = resp.headers.get("Content-Disposition", "") or ""

    # Extended form: charset, optional language tag, percent-encoded value.
    ext = re.search(
        r"""filename\*\s*=\s*"?([^'";\s]*)'[^']*'([^";]+)"?""",
        cd,
        flags=re.IGNORECASE,
    )
    if ext:
        charset = ext.group(1) or "utf-8"
        raw = ext.group(2).strip()
        try:
            fn = unquote(raw, encoding=charset, errors="replace")
        except LookupError:
            # Unknown charset label: fall back to UTF-8 decoding.
            fn = unquote(raw)
        if fn.strip():
            return normalize_filename(fn)

    # Plain form, quoted or bare. ``\*?`` keeps accepting a malformed
    # ``filename*=value`` that lacks the charset prefix.
    plain = re.search(
        r'filename\*?\s*=\s*(?:"([^"]*)"|([^";]+))',
        cd,
        flags=re.IGNORECASE,
    )
    if plain:
        value = plain.group(1) if plain.group(1) is not None else plain.group(2)
        # Some servers percent-encode the plain parameter as well.
        fn = unquote(value.strip())
        if fn.strip():
            return normalize_filename(fn)

    return normalize_filename(fallback)
def is_pref_link(a_tag) -> bool:
    """Tell whether an anchor points at a file-download servlet URL.

    An anchor qualifies when its ``href`` contains both the
    ``servlet.FileDownload`` marker and a ``file=`` query parameter.
    """
    target = a_tag.get("href")
    if not target:
        return False
    required_markers = ("servlet.FileDownload", "file=")
    return all(marker in target for marker in required_markers)
def extract_pref_name(a_tag) -> str:
    """Return the anchor's visible text, trimmed, or ``"pref"`` if it is empty."""
    label = a_tag.get_text()
    cleaned = label.strip() if label else ""
    if cleaned:
        return cleaned
    return "pref"
+def pick_sheet_name(xls_path: str, preferred: str | None) -> str | None:
+    try:
+        xl = pd.ExcelFile(xls_path)
+        if preferred and preferred in xl.sheet_names:
+            return preferred
+        # 代表地番を優先
+        for candidate in ["代表地番", "代表地番のみ", "代表地番シート"]:
+            if candidate in xl.sheet_names:
+                return candidate
+        return xl.sheet_names[0] if xl.sheet_names else None
+    except Exception:
+        return None
def collect_pref_links(session: requests.Session) -> list[dict]:
    """Fetch the public page and list its per-prefecture download links.

    Returns:
        One ``{"pref": <anchor text>, "href": <absolute URL>}`` dict per
        distinct (pref, href) pair, in page order.

    Raises:
        requests.HTTPError: If the page responds with an error status.
    """
    response = session.get(PUBLIC_URL, timeout=60)
    response.raise_for_status()
    document = BeautifulSoup(response.text, "html.parser")

    # A dict keyed by (pref, href) removes duplicates while keeping the
    # first occurrence and the original page order.
    unique: dict[tuple, dict] = {}
    for anchor in document.find_all("a"):
        if not is_pref_link(anchor):
            continue
        entry = {
            "pref": extract_pref_name(anchor),
            "href": urljoin(PUBLIC_URL, anchor.get("href")),
        }
        unique.setdefault((entry["pref"], entry["href"]), entry)
    return list(unique.values())
def download_one(session: requests.Session, url: str, outdir: str, pref: str) -> str:
    """Download one prefecture workbook into *outdir* and return its path.

    The body is streamed into a ``.part`` file and renamed onto the final
    name only after the transfer completes, so an interrupted download never
    leaves a truncated ``.xlsx`` that a later run would reuse as valid.

    Args:
        session: HTTP session used for the request.
        url: Absolute download URL; its ``file`` query parameter is used to
            build the fallback file name.
        outdir: Destination directory (created if missing).
        pref: Prefecture label used in the fallback file name.

    Returns:
        Path of the saved file.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    os.makedirs(outdir, exist_ok=True)
    qs = parse_qs(urlparse(url).query)
    file_id = qs.get("file", ["unknown"])[0][:18]
    with session.get(url, timeout=180, stream=True) as r:
        r.raise_for_status()
        fname = guess_filename_from_headers(r, f"{pref}_{file_id}.xlsx")
        path = os.path.join(outdir, fname)
        # The ".part" suffix keeps the partial file out of any "*.xlsx" scan.
        tmp_path = path + ".part"
        try:
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 15):
                    if chunk:
                        f.write(chunk)
            os.replace(tmp_path, path)
        except BaseException:
            # Remove the partial file, then let the caller see the error.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise
    return path
def load_excel(xls_path: str, sheet_pref: str | None, pref_name: str) -> pd.DataFrame | None:
    """Read one prefecture workbook into a DataFrame with provenance columns.

    All cells are read as strings and trimmed. Three columns are prepended:
    prefecture label, source file name and the sheet that was read.

    Args:
        xls_path: Path to the workbook.
        sheet_pref: Preferred sheet name, or None to auto-select.
        pref_name: Prefecture label stored in the first column.

    Returns:
        The loaded frame, or None when no sheet could be chosen or reading
        failed (the reason is printed as a warning).
    """
    sheet = pick_sheet_name(xls_path, sheet_pref)
    if not sheet:
        return None
    try:
        df = pd.read_excel(xls_path, sheet_name=sheet, engine="openpyxl", dtype=str)
        # Trim leading/trailing whitespace in text columns.
        for c in df.select_dtypes(include=["object"]).columns:
            df[c] = df[c].str.strip()
        provenance = (
            ("都道府県", pref_name),
            ("元ファイル", os.path.basename(xls_path)),
            ("読込シート", sheet),
        )
        for position, (label, value) in enumerate(provenance):
            # DataFrame.insert raises if the label already exists in the
            # sheet; pick a free label instead of losing the whole file.
            while label in df.columns:
                label += "_meta"
            df.insert(position, label, value)
        return df
    except Exception as e:
        # Best-effort per file, but say why the file was skipped.
        print(f"[WARN] Excel読み込み失敗: {xls_path} [{sheet}] {e}")
        return None
def zip_paths(paths: list[str], out_zip: str) -> str:
    """Bundle the existing files in *paths* into a flat ZIP archive.

    Each file is stored under its base name. Missing paths are skipped, and
    a base name is archived only once so repeated paths do not create
    duplicate archive members.

    Args:
        paths: Files to include.
        out_zip: Path of the archive to create (overwritten if present).

    Returns:
        *out_zip*, for convenience.
    """
    archived: set[str] = set()
    with zipfile.ZipFile(out_zip, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for p in paths:
            arcname = os.path.basename(p)
            if arcname in archived or not os.path.exists(p):
                continue
            z.write(p, arcname=arcname)
            archived.add(arcname)
    return out_zip
+# -------------------- メイン実行（Gradioから呼ぶ） --------------------
def run_job(sheet_name, sleep_sec, limit, re_download, progress=gr.Progress(track_tqdm=False)):
    """Collect, download and merge the per-prefecture workbooks (Gradio handler).

    Args:
        sheet_name: Sheet to read, e.g. "代表地番"; blank selects automatically.
        sleep_sec: Pause in seconds between consecutive downloads.
        limit: Process only the first N links (for testing); empty or
            non-positive means all.
        re_download: Download again even if a matching file already exists.
        progress: Gradio progress reporter.

    Returns:
        A 5-tuple ``(message, xlsx_path, parquet_path, raw_zip_path,
        preview_csv_path)``. On failure the paths are None and the message
        explains the problem. ``xlsx_path`` is also None when the merged data
        does not fit into a single Excel worksheet.
    """
    progress(0, desc="初期化中…")

    # A worksheet holds 1,048,576 rows and one of them is the header row.
    excel_max_data_rows = 1_048_576 - 1

    # Parse the interval once, outside the download try-block, so that a bad
    # value can never discard an already completed download.
    try:
        delay = max(0.0, float(sleep_sec))
    except (TypeError, ValueError):
        delay = 1.0  # same as the UI default

    # The session is closed on every exit path, including early returns.
    with requests.Session() as session:
        # Identify the client politely.
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; FITCollector/1.0; +https://huggingface.co/spaces)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        })

        # 1) Collect links.
        try:
            links = collect_pref_links(session)
        except requests.RequestException as e:
            # Report network/HTTP failures in the UI instead of crashing.
            return (f"公表ページの取得に失敗しました: {e}",
                    None, None, None, None)
        if not links:
            return ("都道府県ファイルのリンク検出に失敗しました。ページ構成の変更や一時的なブロックの可能性があります。",
                    None, None, None, None)
        if limit and limit > 0:
            links = links[:int(limit)]
        progress(0.1, desc=f"リンク検出 {len(links)} 件")

        # 2) Download.
        downloaded = []
        for i, item in enumerate(links, start=1):
            progress(0.1 + 0.6 * i / max(1, len(links)),
                     desc=f"ダウンロード {i}/{len(links)}: {item['pref']}")
            hit_network = False
            try:
                # Reuse an existing workbook whose name contains the
                # prefecture label (loose match) unless re-download is forced.
                existing = None
                if not re_download and os.path.isdir(OUTDIR):
                    for fn in os.listdir(OUTDIR):
                        if fn.lower().endswith(".xlsx") and item["pref"] in fn:
                            existing = os.path.join(OUTDIR, fn)
                            break
                if existing and os.path.exists(existing):
                    path = existing
                else:
                    hit_network = True
                    path = download_one(session, item["href"], OUTDIR, item["pref"])
                downloaded.append({"pref": item["pref"], "path": path})
            except Exception as e:
                print(f"[WARN] ダウンロード失敗: {item['pref']} {e}")
            # Pause only between actual requests, not after the last one.
            if hit_network and i < len(links):
                time.sleep(delay)

    if not downloaded:
        return ("ダウンロードに失敗しました。ネットワークやサイト側制限をご確認ください。",
                None, None, None, None)

    # 3) Merge.
    frames = []
    for i, it in enumerate(downloaded, start=1):
        progress(0.72 + 0.18 * i / max(1, len(downloaded)),
                 desc=f"読み込み {i}/{len(downloaded)}: {os.path.basename(it['path'])}")
        df = load_excel(it["path"], sheet_name if sheet_name else None, it["pref"])
        if df is not None and len(df) > 0:
            frames.append(df)
    if not frames:
        return ("Excelは取得できましたが、読み込めるデータがありませんでした（シート名の指定を見直してください）。",
                None, None, None, None)
    combined = pd.concat(frames, ignore_index=True)

    # 4) Write outputs.
    os.makedirs(OUTDIR, exist_ok=True)
    out_parq = os.path.join(OUTDIR, "combined_fit.parquet")
    combined.to_parquet(out_parq, index=False)
    if len(combined) <= excel_max_data_rows:
        out_xlsx = os.path.join(OUTDIR, "combined_fit.xlsx")
        with pd.ExcelWriter(out_xlsx, engine="openpyxl") as w:
            combined.to_excel(w, index=False, sheet_name="combined")
        excel_line = "・Excel: combined_fit.xlsx\n"
    else:
        # Too many rows for one worksheet: skip the Excel export instead of
        # failing the job; the Parquet file still carries every row.
        out_xlsx = None
        excel_line = "・Excel: 行数がExcelの上限を超えるため出力を省略しました\n"

    # 5) Bundle the raw per-prefecture workbooks for users who want them.
    raw_zip = os.path.join(OUTDIR, "raw_excels.zip")
    zip_paths([it["path"] for it in downloaded], raw_zip)

    # Lightweight preview of the first rows.
    preview_csv = os.path.join(OUTDIR, "combined_head.csv")
    combined.head(1000).to_csv(preview_csv, index=False)

    # 6) Done. Gradio File outputs accept a path (or None for "no file").
    progress(1.0, desc=f"完了（{len(combined):,} 行）")
    msg = (f"✅ 結合完了: 行数 = {len(combined):,}\n"
           f"{excel_line}"
           f"・Parquet: combined_fit.parquet\n"
           f"・Raw ZIP: raw_excels.zip\n")
    return (msg, out_xlsx, out_parq, raw_zip, preview_csv)
+# -------------------- Gradio UI --------------------
# Page layout: intro text, two rows of inputs, a run button and the outputs.
with gr.Blocks(title="FIT 公表（都道府県別Excel）一括取得＆結合") as demo:
    gr.Markdown(
        """
        # FIT 公表（都道府県別Excel）一括取得 & 結合
        - 公表ページから都道府県別のExcelを取得し、縦結合します。
        - サーバ負荷配慮のため**間隔（sleep）**を入れています。
        - 出力：`combined_fit.xlsx` / `combined_fit.parquet` / 生ファイル一式`raw_excels.zip`
        """
    )

    # Row 1: which sheet to read and how long to pause between downloads.
    with gr.Row():
        sheet = gr.Textbox(
            placeholder="例）代表地番 / 全地番",
            label="読み込むシート名（空欄=自動）",
        )
        sleep = gr.Slider(
            minimum=0.0,
            maximum=5.0,
            step=0.1,
            value=1.0,
            label="ダウンロード間隔（秒）",
        )

    # Row 2: optional cap on the number of links, and the cache override.
    with gr.Row():
        limit = gr.Number(
            label="先頭N県のみ（テスト用・空欄は全県）",
            precision=0,
            value=None,
        )
        reget = gr.Checkbox(
            value=False,
            label="既存ファイルがあっても再ダウンロードする",
        )

    run_btn = gr.Button("実行", variant="primary")

    # Outputs, in the same order as the tuple returned by run_job.
    out_msg = gr.Markdown()
    out_xlsx = gr.File(label="結合Excel（combined_fit.xlsx）")
    out_parq = gr.File(label="結合Parquet（combined_fit.parquet）")
    out_zip = gr.File(label="取得した都道府県Excel一式（zip）")
    out_preview = gr.File(label="先頭1000行プレビュー（CSV）")

    run_btn.click(
        fn=run_job,
        inputs=[sheet, sleep, limit, reget],
        outputs=[out_msg, out_xlsx, out_parq, out_zip, out_preview],
    )

if __name__ == "__main__":
    # Enable the request queue (at most 20 waiting jobs) before launching.
    demo.queue(max_size=20)
    demo.launch()