hiroki0008 commited on
Commit
ee18469
·
verified ·
1 Parent(s): db0cf7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +321 -287
app.py CHANGED
@@ -1,320 +1,354 @@
1
- # app.py (Folium + 無料タイル / data:URL不使用 / File出力)
2
- # pip install folium gradio pandas numpy requests openpyxl
3
-
4
  import os
5
  import re
6
  import time
7
- import tempfile
 
 
 
 
8
  import requests
9
  import pandas as pd
10
- import numpy as np
11
- import gradio as gr
 
 
 
 
12
 
13
# ----------------------------
# Configuration
# ----------------------------
# UA sent with every GSI geocoder request; include a reachable contact.
GSI_USER_AGENT = os.environ.get(
    "GSI_USER_AGENT",
    "jp-gsi-geocoding-demo (contact: your_email@example.com)"
)
# Per-request timeout (seconds) for geocoder calls.
GSI_TIMEOUT_SEC = float(os.environ.get("GSI_TIMEOUT_SEC", "10"))
# Pause between consecutive geocoder calls; 0.0 disables throttling.
GEOCODE_DELAY_SEC = float(os.environ.get("GSI_RATE_LIMIT_SEC", "0.0"))

# GSI address-search endpoint (returns a list of feature-like dicts).
GSI_GEOCODE_URL = "https://msearch.gsi.go.jp/address-search/AddressSearch"

# On-disk geocode cache shared across runs.
CACHE_DIR = "data/cache"
os.makedirs(CACHE_DIR, exist_ok=True)  # NOTE: side effect at import time
CACHE_PATH = os.path.join(CACHE_DIR, "geocode_cache.csv")
28
-
29
# ----------------------------
# Cache
# ----------------------------
def load_cache():
    """Load the geocode cache CSV.

    Returns an empty frame (same columns) when the file is missing,
    unreadable, or lacks the expected columns. CF/lat/lon are coerced
    to numeric; unparsable cells become NaN.
    """
    empty = pd.DataFrame(columns=["address_input", "lat", "lon", "CF"])
    if os.path.exists(CACHE_PATH):
        try:
            cached = pd.read_csv(CACHE_PATH)
            required = {"address_input", "lat", "lon", "CF"}
            if required.issubset(cached.columns):
                for col in ("CF", "lat", "lon"):
                    cached[col] = pd.to_numeric(cached[col], errors="coerce")
                return cached
        except Exception:
            # Corrupt cache: fall through and start fresh.
            pass
    return empty
 
 
 
 
 
 
 
 
 
45
 
46
def save_cache(df_cache):
    """Best-effort persist of the cache frame to CACHE_PATH.

    Persistence is optional: any failure (I/O, permissions, …) is
    swallowed so it can never break a geocoding request.
    """
    try:
        df_cache.to_csv(CACHE_PATH, index=False)
    except Exception:
        pass
51
-
52
# ----------------------------
# GSI (Geospatial Information Authority of Japan) geocoder
# ----------------------------
def make_gsi_session() -> requests.Session:
    """Return a requests session carrying the project User-Agent."""
    session = requests.Session()
    session.headers.update({"User-Agent": GSI_USER_AGENT})
    return session
59
 
60
def gsi_geocode_once(address: str, session: requests.Session) -> tuple[float, float]:
    """Query the GSI address-search API once and return (lat, lon).

    The API payload stores coordinates as [lon, lat]; the pair is
    swapped here. Every failure path — blank/placeholder address,
    HTTP error, malformed JSON — collapses to (nan, nan).
    """
    try:
        if not address:
            return (np.nan, np.nan)
        text = str(address).strip()
        if text == "" or text.lower() in ("nan", "none"):
            return (np.nan, np.nan)

        resp = session.get(GSI_GEOCODE_URL, params={"q": address}, timeout=GSI_TIMEOUT_SEC)
        if resp.ok:
            payload = resp.json()
            if isinstance(payload, list) and payload:
                geometry = (payload[0].get("geometry") or {})
                coords = geometry.get("coordinates") or []
                if isinstance(coords, (list, tuple)) and len(coords) >= 2:
                    # Upstream order is [lon, lat].
                    lon_val, lat_val = float(coords[0]), float(coords[1])
                    return (lat_val, lon_val)
    except Exception:
        pass
    return (np.nan, np.nan)
82
-
83
def geocode_with_cache(addresses, CFs, use_internet=True):
    """Geocode addresses through the GSI API with a persistent CSV cache.

    Args:
        addresses: iterable of address strings (None/NaN tolerated).
        CFs: iterable of values paired 1:1 with addresses; coerced numeric.
        use_internet: when False, only cache hits resolve — misses get NaN.

    Returns:
        DataFrame with columns address_input, CF, lat, lon (all numeric
        except address_input).
    """
    cache = load_cache()
    cache_map = {row["address_input"]: (row["lat"], row["lon"], row["CF"]) for _, row in cache.iterrows()}
    results = []
    session = make_gsi_session() if use_internet else None

    for a, cf in zip(addresses, CFs):
        # Normalize the address to a plain stripped string.
        a = "" if (a is None or (isinstance(a, float) and np.isnan(a))) else str(a).strip()
        cf_num = pd.to_numeric(cf, errors="coerce")

        # Cache hit: reuse coordinates but keep the caller's CF value.
        if a in cache_map:
            lat, lon, _cached_cf = cache_map[a]
            if pd.notna(lat) and pd.notna(lon):
                results.append({"address_input": a, "CF": cf_num, "lat": float(lat), "lon": float(lon)})
                continue

        if not use_internet:
            results.append({"address_input": a, "CF": cf_num, "lat": np.nan, "lon": np.nan})
            continue

        lat, lon = gsi_geocode_once(a, session)

        # Optional politeness delay between API calls.
        if GEOCODE_DELAY_SEC > 0:
            time.sleep(GEOCODE_DELAY_SEC)

        # Update the on-disk cache. Failures (NaN) are stored too, but the
        # notna() check above makes them eligible for retry next run.
        cache = cache[cache["address_input"] != a]
        cache = pd.concat(
            [cache, pd.DataFrame([{"address_input": a, "lat": lat, "lon": lon, "CF": cf_num}])],
            ignore_index=True
        )
        save_cache(cache)
        results.append({"address_input": a, "CF": cf_num, "lat": lat, "lon": lon})

    df = pd.DataFrame(results)
    df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
    df["lon"] = pd.to_numeric(df["lon"], errors="coerce")
    df["CF"] = pd.to_numeric(df["CF"], errors="coerce")
    return df
123
-
124
# ----------------------------
# Folium map rendering (free tile providers, no Mapbox key needed)
# ----------------------------
import folium

# Display name -> XYZ tile URL template for the selectable base layers.
TILE_CATALOG = {
    "GSI 標準地図": "https://cyberjapandata.gsi.go.jp/xyz/std/{z}/{x}/{y}.png",
    "GSI 淡色地図": "https://cyberjapandata.gsi.go.jp/xyz/pale/{z}/{x}/{y}.png",
    "GSI 写真(シームレス)": "https://cyberjapandata.gsi.go.jp/xyz/seamlessphoto/{z}/{x}/{y}.jpg",
    "OpenStreetMap": "https://tile.openstreetmap.org/{z}/{x}/{y}.png",
}
135
-
136
def _build_folium_map_html(df_points: pd.DataFrame, base_name: str) -> str:
    """Render the geocoded points as a Folium map and return its HTML.

    Args:
        df_points: frame with address_input / CF / lat / lon columns;
            rows with NaN lat or lon are dropped.
        base_name: selected base-map key from TILE_CATALOG.
            NOTE(review): currently unused — every catalog layer is added
            and the layer control decides visibility; confirm whether the
            selection should make the chosen layer the default.

    Returns:
        Fully rendered standalone HTML document for the map.
    """
    df_valid = df_points.dropna(subset=["lat", "lon"]).copy()
    if df_valid.empty:
        # No usable coordinates: fall back to a wide view of Japan.
        center_lat, center_lon, zoom = 35.0, 135.0, 4
    else:
        center_lat = float(df_valid["lat"].median())
        center_lon = float(df_valid["lon"].median())
        zoom = 6

    # Base maps (switchable via the layer control below).
    m = folium.Map(location=[center_lat, center_lon], zoom_start=zoom, control_scale=True, tiles=None)
    for name, url in TILE_CATALOG.items():
        folium.TileLayer(
            tiles=url,
            name=name,
            attr=f"© {name}",
            overlay=False,
            control=True,
            max_zoom=20,
        ).add_to(m)

    # Marker radius scales with CF when any CF value is present.
    if "CF" in df_valid.columns and df_valid["CF"].notna().any():
        cf = df_valid["CF"].clip(lower=0)
        # Min-max normalize; epsilon guards against a zero range.
        cf_norm = (cf - cf.min()) / (cf.max() - cf.min() + 1e-9)
        sizes = (cf_norm * 12 + 3).fillna(6).tolist()
    else:
        sizes = [6] * len(df_valid)

    for (_, row), r in zip(df_valid.iterrows(), sizes):
        lat, lon = float(row["lat"]), float(row["lon"])
        addr = str(row.get("address_input", ""))
        cfv = row.get("CF", np.nan)
        popup_html = f"<b>住所:</b> {addr}<br><b>CF:</b> {'' if pd.isna(cfv) else cfv}"

        folium.CircleMarker(
            location=(lat, lon),
            radius=float(r),
            weight=1,
            color="#117a8b",
            fill=True,
            fill_opacity=0.8,
            fill_color="#12939A",
            popup=folium.Popup(popup_html, max_width=260),
        ).add_to(m)

    folium.LayerControl(position="topright").add_to(m)
    return m.get_root().render()
184
-
185
- def _rewrite_leaflet_cdn(html_text: str, host: str) -> str:
186
  """
187
- Folium が出力する Leaflet の CDN(通常 jsDelivr)を、必要に応じて置換。
188
- SRI不整合を避けるため integrity/crossorigin を除去する。
 
 
 
189
  """
190
- # integrity / crossorigin を削除(SRIミスマッチ回避)
191
- html_text = re.sub(r'\sintegrity="[^"]+"', "", html_text)
192
- html_text = re.sub(r'\scrossorigin="[^"]+"', "", html_text)
193
-
194
- if host == "jsdelivr":
195
- return html_text # 置換しない
196
- elif host == "cdnjs":
197
- html_text = html_text.replace(
198
- "https://cdn.jsdelivr.net/npm/leaflet@", "https://cdnjs.cloudflare.com/ajax/libs/leaflet/"
199
- )
200
- html_text = html_text.replace("/dist/leaflet.css", "/leaflet.css")
201
- html_text = html_text.replace("/dist/leaflet.js", "/leaflet.js")
202
- return html_text
203
- elif host == "unpkg":
204
- html_text = html_text.replace(
205
- "https://cdn.jsdelivr.net/npm/", "https://unpkg.com/"
206
- )
207
- return html_text
 
208
  else:
209
- return html_text
210
-
211
- def _save_map_html_file(html_text: str) -> str:
212
- """地図HTMLを実ファイルに保存(Gradio Fileに渡すパスを返す)"""
213
- fd, path = tempfile.mkstemp(suffix=".html")
214
- os.close(fd)
215
- with open(path, "w", encoding="utf-8") as f:
216
- f.write(html_text)
217
- return path
218
-
219
- # ----------------------------
220
- # 実行パイプライン
221
- # ----------------------------
222
- def _parse_indexer(x):
223
- try:
224
- return int(x)
225
- except Exception:
226
- return x
227
-
228
def run(excel_file, sheet_name, header_row, address_col, power_col, use_inet, base_name, leaflet_cdn):
    """End-to-end pipeline: Excel -> geocode -> Folium HTML file.

    Args:
        excel_file: Gradio file object (needs a .name path) or None.
        sheet_name: worksheet name to read.
        header_row: 0-based header row index.
        address_col / power_col: column name (str) or 0-based position (int).
        use_inet: False restricts geocoding to cache hits only.
        base_name: base-map key from TILE_CATALOG.
        leaflet_cdn: "jsdelivr" / "cdnjs" / "unpkg".

    Returns:
        (message, result DataFrame, info text, map file path or None).
    """
    # Load the workbook; bail out with a user-facing message on failure.
    if excel_file is None or not hasattr(excel_file, "name"):
        table_df = pd.DataFrame(columns=["address_input", "CF", "lat", "lon"])
        return ("Excelファイルを指定してください。", table_df, "", None)
    try:
        df = pd.read_excel(excel_file.name, sheet_name=sheet_name, header=int(header_row))
    except Exception as e:
        empty_df = pd.DataFrame(columns=["address_input", "CF", "lat", "lon"])
        return (f"Excel の読み込みに失敗しました: {e}", empty_df, "", None)

    # Column lookup supports both positional (int) and named (str) access.
    addr_series = df.iloc[:, address_col] if isinstance(address_col, int) else df[address_col]
    cf_series = df.iloc[:, power_col] if isinstance(power_col, int) else df[power_col]

    addresses = addr_series.astype(str).tolist()
    cfs = cf_series.tolist()

    # Geocoding (with persistent cache).
    geo_df = geocode_with_cache(addresses, cfs, use_internet=bool(use_inet))
    table_df = geo_df[["address_input", "CF", "lat", "lon"]].copy()

    # Map HTML -> CDN rewrite -> real file handed to the gr.File output.
    try:
        html_text = _build_folium_map_html(table_df, base_name=base_name)
        html_text = _rewrite_leaflet_cdn(html_text, host=leaflet_cdn)
        map_file_path = _save_map_html_file(html_text)

        msg = (
            "✅ 地図HTMLを生成しました。下の **地図HTMLファイル** をクリックして新規タブで開いてください。\n"
            "(埋め込みではなく実ファイル配信なので、CSPが厳しい環境でも表示できるはずです)"
        )
        info = f"ポイント数(有効座標): {int(table_df[['lat','lon']].dropna().shape[0])} / {len(table_df)}"
        return (msg, table_df, info, map_file_path)
    except Exception as e:
        return (f"地図描画に失敗しました: {e}", table_df, "", None)
265
 
266
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(title="Excel住所 → Folium(無料タイル・File配信)") as demo:
    gr.Markdown(
        "## Excelの住所を国土地理院APIでジオコーディング → Folium(Leaflet)で地図表示(無料タイル・Mapbox不要)\n"
        "- 地図は **実ファイル(.html)** として配信します(CSPが厳しい環境でもOK)。\n"
        "- タイル=地理院/OSM、CDNは必要に応じて切替できます。"
    )

    with gr.Row():
        xlsx_in = gr.File(label="Excelファイル(住所付き)", file_count="single", file_types=[".xlsx", ".xls"])

    with gr.Row():
        sheet = gr.Textbox(label="シート名", value="認定設備")
        header_row = gr.Number(label="ヘッダー行番号(0始まり)", value=2, precision=0)

    with gr.Row():
        address_col = gr.Textbox(label="住所列(列名 or 0始まり列番号)", value="発電設備の所在地")
        power_col = gr.Textbox(label="数値列(任意:列名 or 0始まり列番号)", value="発電出力(kW)")

    with gr.Row():
        use_inet = gr.Checkbox(label="国土地理院APIに問い合わせ(オフでキャッシュのみ使用)", value=True)
        base_name = gr.Dropdown(choices=list(TILE_CATALOG.keys()), value="GSI 標準地図", label="ベースマップ")
        leaflet_cdn = gr.Dropdown(
            choices=["jsdelivr", "cdnjs", "unpkg"], value="jsdelivr",
            label="Leaflet CDN(遮断時に切替)"
        )

    run_btn = gr.Button("描画")

    out_html = gr.HTML(label="案内メッセージ")
    out_table = gr.Dataframe(label="ジオコーディング結果(住所・緯度・経度・CF)", wrap=True)
    out_info = gr.Textbox(label="メタ情報", lines=2)
    out_file = gr.File(label="地図HTMLファイル(クリックで開く/ダウンロード)")

    def _parse(x):
        # Same int-or-name coercion as the module-level _parse_indexer.
        try:
            return int(x)
        except Exception:
            return x

    def app_run(xls, s, h, a, p, inet, base, cdn):
        # Adapt raw widget values to run()'s expected argument types.
        return run(
            xls, s, int(h), _parse(a), _parse(p), inet, base, cdn
        )

    run_btn.click(
        fn=app_run,
        inputs=[xlsx_in, sheet, header_row, address_col, power_col, use_inet, base_name, leaflet_cdn],
        outputs=[out_html, out_table, out_info, out_file],
    )

if __name__ == "__main__":
    demo.launch()
 
 
 
 
1
  import os
2
  import re
3
  import time
4
+ import zipfile
5
+ import unicodedata
6
+ from urllib.parse import urljoin, urlparse, parse_qs, unquote
7
+
8
+ import gradio as gr
9
  import requests
10
  import pandas as pd
11
+ from bs4 import BeautifulSoup
12
+
13
# FIT portal page that lists the per-prefecture Excel download links.
PUBLIC_URL = "https://www.fit-portal.go.jp/PublicInfo"
# Working directory for raw downloads and generated outputs.
OUTDIR = "data_fit"
15
+
16
+ # -------------------- ユーティリティ --------------------
17
 
18
def normalize_filename(name: str) -> str:
    """Return *name* made safe for use as a local file name.

    NFKC-normalizes the text, collapses path/shell metacharacters into
    "_", trims surrounding whitespace, and falls back to "file" when
    nothing remains.
    """
    cleaned = unicodedata.normalize("NFKC", name)
    cleaned = re.sub(r'[\\/:*?"<>|]+', "_", cleaned)
    cleaned = cleaned.strip()
    if not cleaned:
        return "file"
    return cleaned
23
+
24
def guess_filename_from_headers(resp: requests.Response, fallback: str) -> str:
    """Derive a download file name from the Content-Disposition header.

    Handles both filename= and RFC 5987 filename*=UTF-8'' forms; falls
    back to *fallback* when the header is absent or unparsable. The
    result always goes through normalize_filename().
    """
    disposition = resp.headers.get("Content-Disposition", "")
    match = re.search(
        r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?',
        disposition,
        flags=re.IGNORECASE,
    )
    if match is None:
        return normalize_filename(fallback)
    raw = match.group(1)
    try:
        raw = unquote(raw)
    except Exception:
        pass  # keep the percent-encoded form when decoding fails
    return normalize_filename(raw)
34
+
35
def is_pref_link(a_tag) -> bool:
    """True when the anchor points at a prefecture file-download URL."""
    href = a_tag.get("href") or ""
    if "servlet.FileDownload" not in href:
        return False
    return "file=" in href
38
+
39
def extract_pref_name(a_tag) -> str:
    """Visible link text, stripped; 'pref' when the anchor has none."""
    label = (a_tag.get_text() or "").strip()
    if label:
        return label
    return "pref"
42
 
43
+ def pick_sheet_name(xls_path: str, preferred: str | None) -> str | None:
44
  try:
45
+ xl = pd.ExcelFile(xls_path)
46
+ if preferred and preferred in xl.sheet_names:
47
+ return preferred
48
+ # 一般的に「代表地番」を優先
49
+ for candidate in ["代表地番", "代表地番のみ", "代表地番シート"]:
50
+ if candidate in xl.sheet_names:
51
+ return candidate
52
+ return xl.sheet_names[0] if xl.sheet_names else None
53
  except Exception:
54
+ return None
55
+
56
def collect_pref_links(session: requests.Session) -> list[dict]:
    """Scrape the FIT public page for per-prefecture download links.

    Returns [{"pref": <link text>, "href": <absolute URL>}, ...] with
    exact (pref, href) duplicates removed and page order preserved.
    """
    response = session.get(PUBLIC_URL, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    found = []
    for anchor in soup.find_all("a"):
        if not is_pref_link(anchor):
            continue
        found.append({
            "pref": extract_pref_name(anchor),
            "href": urljoin(PUBLIC_URL, anchor.get("href")),
        })

    # De-duplicate while keeping the first occurrence of each pair.
    seen = set()
    unique = []
    for entry in found:
        key = (entry["pref"], entry["href"])
        if key in seen:
            continue
        seen.add(key)
        unique.append(entry)
    return unique
72
+
73
def download_one(session: requests.Session, url: str, outdir: str, pref: str) -> str:
    """Stream one prefecture Excel into *outdir*; return its local path.

    The file name comes from Content-Disposition when available,
    otherwise "<pref>_<file-id>.xlsx" built from the URL's file= query
    parameter (truncated to 18 chars).
    """
    os.makedirs(outdir, exist_ok=True)
    query = parse_qs(urlparse(url).query)
    file_id = query.get("file", ["unknown"])[0][:18]
    with session.get(url, timeout=180, stream=True) as response:
        response.raise_for_status()
        filename = guess_filename_from_headers(response, f"{pref}_{file_id}.xlsx")
        target = os.path.join(outdir, filename)
        # Stream in 32 KiB chunks to keep memory flat on big workbooks.
        with open(target, "wb") as sink:
            for block in response.iter_content(chunk_size=1 << 15):
                if block:
                    sink.write(block)
    return target
86
+
87
+ # -------------------- 列名選択: 小分類 > 中分類 > 大分類 --------------------
88
+
89
+ def _clean_cell(x) -> str:
90
+ if x is None:
91
+ return ""
92
+ s = str(x).strip()
93
+ if s.lower() == "nan":
94
+ return ""
95
  return s
96
 
97
def choose_names_from_multiindex(mi: pd.MultiIndex) -> list[str]:
    """Flatten a 3-level header MultiIndex into single column names.

    For each tuple (large, middle, small) the most specific non-empty
    level wins: small, else middle, else large, else the literal
    'col'. Duplicates are disambiguated as name.1, name.2, ...
    """
    def _norm(cell) -> str:
        # Same cleaning contract as the module's _clean_cell():
        # trimmed str(), with None and literal 'nan' mapped to ''.
        if cell is None:
            return ""
        text = str(cell).strip()
        return "" if text.lower() == "nan" else text

    picked = []
    for levels in mi:
        if len(levels) >= 3:
            top, middle, leaf = _norm(levels[0]), _norm(levels[1]), _norm(levels[2])
        else:
            # Defensive: tolerate a shallower index than expected.
            top = _norm(levels[0]) if len(levels) >= 1 else ""
            middle = _norm(levels[1]) if len(levels) >= 2 else ""
            leaf = ""
        picked.append(leaf or middle or top or "col")

    # Disambiguate duplicates with .1, .2, ... suffixes.
    counts = {}
    renamed = []
    for name in picked:
        if name in counts:
            counts[name] += 1
            renamed.append(f"{name}.{counts[name]}")
        else:
            counts[name] = 0
            renamed.append(name)
    return renamed
128
+
129
# -------------------- Read rules --------------------
# First workbook: row 0 is discarded and rows 1/2/3 form a 3-level
# header (pandas: header=[1, 2, 3]).
HEADER_ROWS = [1, 2, 3]
# Later workbooks: rows 0-3 are header noise — skip them and read data
# only (pandas: skiprows=4, header=None).
SKIP_ROWS_OTHERS = 4
134
+
135
def load_excel_first(xls_path: str, sheet_pref: str | None) -> tuple[pd.DataFrame, list[str]]:
    """Read the first workbook and fix the canonical column names.

    - Reads with header=HEADER_ROWS (=[1,2,3]; row 0 is skipped
      implicitly), everything as str.
    - Drops the leftmost column.
    - Collapses the 3-level header via choose_names_from_multiindex()
      (small > middle > large priority).

    Returns:
        (df, chosen_names) — chosen_names is reused for every later file.

    Raises:
        RuntimeError: when no readable sheet exists.
    """
    sheet = pick_sheet_name(xls_path, sheet_pref)
    if not sheet:
        raise RuntimeError("シートが見つかりません")
    df = pd.read_excel(
        xls_path,
        sheet_name=sheet,
        engine="openpyxl",
        header=HEADER_ROWS,
        dtype=str
    )
    # Drop the leftmost column.
    df = df.iloc[:, 1:]
    # Trim surrounding whitespace on text columns.
    for c in df.select_dtypes(include=["object"]).columns:
        df[c] = df[c].str.strip()

    # Collapse the header to single-level names.
    if isinstance(df.columns, pd.MultiIndex):
        chosen = choose_names_from_multiindex(df.columns)
    else:
        # Fallback: already single level — clean and de-duplicate here.
        raw = [_clean_cell(c) or "col" for c in df.columns]
        seen = {}
        chosen = []
        for n in raw:
            if n not in seen:
                seen[n] = 0
                chosen.append(n)
            else:
                seen[n] += 1
                chosen.append(f"{n}.{seen[n]}")
    df.columns = chosen
    return df, chosen
176
+
177
def load_excel_other(xls_path: str, sheet_pref: str | None, target_cols: list[str]) -> pd.DataFrame | None:
    """Read a follow-up workbook as data-only rows.

    - skiprows=SKIP_ROWS_OTHERS (=4), header=None, everything as str.
    - Drops the leftmost column.
    - Pads or truncates to len(target_cols), then renames the columns
      to target_cols (the names fixed by the first workbook).

    Returns None when no readable sheet exists.
    """
    sheet = pick_sheet_name(xls_path, sheet_pref)
    if not sheet:
        return None
    df = pd.read_excel(
        xls_path,
        sheet_name=sheet,
        engine="openpyxl",
        header=None,
        skiprows=SKIP_ROWS_OTHERS,
        dtype=str
    )
    # Drop the leftmost column.
    df = df.iloc[:, 1:]
    # Trim surrounding whitespace on text columns.
    for c in df.select_dtypes(include=["object"]).columns:
        df[c] = df[c].str.strip()

    # Reconcile the column count with the first file's layout.
    if df.shape[1] != len(target_cols):
        print(f"[WARN] 列数不一致: file={os.path.basename(xls_path)} "
              f"read={df.shape[1]} vs target={len(target_cols)} -> 自動調整")
        if df.shape[1] > len(target_cols):
            df = df.iloc[:, :len(target_cols)]
        else:
            # Too few columns: pad with empty columns on the right.
            for k in range(len(target_cols) - df.shape[1]):
                df[f"_pad_{k}"] = None
            df = df.iloc[:, :len(target_cols)]

    df.columns = target_cols
    return df
216
 
217
def zip_paths(paths: list[str], out_zip: str) -> str:
    """Deflate-compress the existing files in *paths* into *out_zip*.

    Missing paths are silently skipped; archive members keep only
    their base names. Returns out_zip.
    """
    with zipfile.ZipFile(out_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for candidate in paths:
            if not os.path.exists(candidate):
                continue
            archive.write(candidate, arcname=os.path.basename(candidate))
    return out_zip
223
+
224
# -------------------- Main job (invoked from Gradio) --------------------

def run_job(sheet_name, sleep_sec, limit, re_download, progress=gr.Progress(track_tqdm=False)):
    """Collect, download, and merge every prefecture Excel file.

    Args:
        sheet_name: preferred worksheet name ('' / None = auto-pick).
        sleep_sec: pause between downloads (politeness throttle).
        limit: when > 0, only the first N prefecture links are used.
        re_download: False reuses a matching .xlsx already in OUTDIR.
        progress: Gradio progress hook (the gr.Progress() default is
            Gradio's injection convention, not shared mutable state).

    Returns:
        (message, combined xlsx path, parquet path, raw zip path,
        preview csv path) — the path slots are None on failure.
    """
    progress(0, desc="初期化中…")

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (compatible; FITCollector/1.3; +https://huggingface.co/spaces)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    })

    # 1) Collect the prefecture download links.
    links = collect_pref_links(session)
    if not links:
        return ("都道府県ファイルのリンク検出に失敗しました。ページ構成の変更/一時的な制限の可能性があります。",
                None, None, None, None)
    if limit and limit > 0:
        links = links[:int(limit)]
    progress(0.1, desc=f"リンク検出 {len(links)} 件")

    # 2) Download (best-effort; failures are logged and skipped).
    downloaded = []
    for i, item in enumerate(links, start=1):
        progress(0.1 + 0.6 * i / max(1, len(links)),
                 desc=f"ダウンロード {i}/{len(links)}: {item['pref']}")
        try:
            existing = None
            # Reuse a previously downloaded file for this prefecture.
            if not re_download and os.path.isdir(OUTDIR):
                for fn in os.listdir(OUTDIR):
                    if fn.lower().endswith(".xlsx") and item["pref"] in fn:
                        existing = os.path.join(OUTDIR, fn)
                        break
            if existing and os.path.exists(existing):
                path = existing
            else:
                path = download_one(session, item["href"], OUTDIR, item["pref"])
                time.sleep(float(sleep_sec))
            downloaded.append(path)
        except Exception as e:
            print(f"[WARN] ダウンロード失敗: {item['pref']} {e}")

    if not downloaded:
        return ("ダウンロードに失敗しました。", None, None, None, None)

    # 3) First workbook fixes the canonical column names.
    progress(0.75, desc="1枚目を読み込み(列名を確定)")
    first_path = downloaded[0]
    try:
        df0, cols0 = load_excel_first(first_path, sheet_name if sheet_name else None)
    except Exception as e:
        return (f"1枚目の読み込みに失敗しました: {os.path.basename(first_path)} / {e}",
                None, None, None, None)

    frames = [df0]

    # 4) Remaining workbooks reuse those column names.
    for j, p in enumerate(downloaded[1:], start=2):
        progress(0.75 + 0.25 * (j - 1) / max(1, len(downloaded) - 1),
                 desc=f"{j}枚目を読み込み")
        df = load_excel_other(p, sheet_name if sheet_name else None, cols0)
        if df is not None and len(df) > 0:
            frames.append(df)
        else:
            print(f"[WARN] 読み込みスキップ: {os.path.basename(p)}")

    # 5) Vertical concatenation.
    combined = pd.concat(frames, ignore_index=True)

    # 6) Outputs (Excel + Parquet).
    os.makedirs(OUTDIR, exist_ok=True)
    out_xlsx = os.path.join(OUTDIR, "combined_fit.xlsx")
    out_parq = os.path.join(OUTDIR, "combined_fit.parquet")
    with pd.ExcelWriter(out_xlsx, engine="openpyxl") as w:
        combined.to_excel(w, index=False, sheet_name="combined")
    combined.to_parquet(out_parq, index=False)

    # 7) ZIP of all raw downloads.
    raw_zip = os.path.join(OUTDIR, "raw_excels.zip")
    zip_paths(downloaded, raw_zip)

    # 8) CSV preview of the first 1000 rows.
    preview_csv = os.path.join(OUTDIR, "combined_head.csv")
    combined.head(1000).to_csv(preview_csv, index=False)

    progress(1.0, desc=f"完了({len(combined):,} 行)")
    msg = (
        f"✅ 結合完了: 行数 = {len(combined):,}\n"
        f"・Excel: combined_fit.xlsx\n"
        f"・Parquet: combined_fit.parquet\n"
        f"・Raw ZIP: raw_excels.zip\n"
        f"・プレビュー: combined_head.csv\n"
        f"・列名は『小分類>中分類>大分類』の優先で単一行化(結合は不実施)"
    )
    return (msg, out_xlsx, out_parq, raw_zip, preview_csv)
318
 
319
# -------------------- Gradio UI --------------------
# Widget layout for the collection job; run_job returns
# (message, xlsx, parquet, zip, preview) matching the outputs list.

with gr.Blocks(title="FIT 公表(都道府県別Excel)一括取得&結合") as demo:
    gr.Markdown(
        """
        # FIT 公表(都道府県別Excel)一括取得 & 結合
        **列名ポリシー**:
        - 1枚目: 0行目を使わず、1/2/3行目をヘッダとして読み込み(3段)。
        - 列名は **小分類に値があれば小分類、無ければ中分類のみ**(結合しません)。
        - 2枚目以降: 0〜3行目をスキップし、データのみ読み込み。
        - すべてのファイルで **左端の列は削除**。
        - ファイル名/シート名などのメタ列は付与しません。
        """
    )

    with gr.Row():
        sheet = gr.Textbox(label="読み込むシート名(空欄=自動)", placeholder="例)代表地番 / 全地番")
        sleep = gr.Slider(0.0, 5.0, value=1.0, step=0.1, label="ダウンロード間隔(秒)")

    with gr.Row():
        limit = gr.Number(value=None, precision=0, label="先頭N県のみ(テスト用・空欄は全県)")
        reget = gr.Checkbox(label="既存ファイルがあっても再ダウンロードする", value=False)

    run_btn = gr.Button("実行", variant="primary")
    out_msg = gr.Markdown()
    out_xlsx = gr.File(label="結合Excel(combined_fit.xlsx)")
    out_parq = gr.File(label="結合Parquet(combined_fit.parquet)")
    out_zip = gr.File(label="取得した都道府県Excel一式(zip)")
    out_preview = gr.File(label="先頭1000行プレビュー(CSV)")

    run_btn.click(
        fn=run_job,
        inputs=[sheet, sleep, limit, reget],
        outputs=[out_msg, out_xlsx, out_parq, out_zip, out_preview]
    )

if __name__ == "__main__":
    # queue() serializes long-running jobs so progress updates stream.
    demo.queue(max_size=20).launch()