Update services/web_sources.py
Browse files- services/web_sources.py +91 -36
services/web_sources.py
CHANGED
|
@@ -8,6 +8,56 @@ from .chain_filters import is_chain
|
|
| 8 |
UA = {"User-Agent": "HF-Space-Trip-Planner/1.0 (+web-sources)"}
|
| 9 |
OVERPASS = "https://overpass-api.de/api/interpreter"
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# ========== OSM(Overpass) ==========
|
| 12 |
|
| 13 |
def _overpass(q: str) -> dict:
|
|
@@ -30,6 +80,9 @@ def _doc_from_osm(e: dict, tags: dict, extra_tags: List[str]) -> dict | None:
|
|
| 30 |
name = tags.get("name") or tags.get("name:ja") or tags.get("name:en")
|
| 31 |
if not name:
|
| 32 |
return None
|
|
|
|
|
|
|
|
|
|
| 33 |
lat, lon = c
|
| 34 |
title = name
|
| 35 |
text = ""
|
|
@@ -39,7 +92,10 @@ def _doc_from_osm(e: dict, tags: dict, extra_tags: List[str]) -> dict | None:
|
|
| 39 |
"source": "osm",
|
| 40 |
"tags": extra_tags
|
| 41 |
}
|
| 42 |
-
#
|
|
|
|
|
|
|
|
|
|
| 43 |
if "opening_hours" in tags:
|
| 44 |
meta["hours"] = str(tags["opening_hours"])
|
| 45 |
return {"id": meta["osm_id"], "text": text, "meta": meta}
|
|
@@ -61,7 +117,6 @@ out center tags {limit};
|
|
| 61 |
out = []
|
| 62 |
for e in j.get("elements", []):
|
| 63 |
tags = e.get("tags", {}) or {}
|
| 64 |
-
# fast_food を除外
|
| 65 |
if tags.get("amenity") == "fast_food":
|
| 66 |
continue
|
| 67 |
name = tags.get("name") or ""
|
|
@@ -74,21 +129,19 @@ out center tags {limit};
|
|
| 74 |
except Exception:
|
| 75 |
return []
|
| 76 |
|
| 77 |
-
def _osm_sights(center: Tuple[float, float], radius_km: float, limit: int =
|
| 78 |
lat, lon = center
|
| 79 |
radius_m = int(radius_km * 1000)
|
| 80 |
-
#
|
| 81 |
q = f"""
|
| 82 |
[out:json][timeout:25];
|
| 83 |
(
|
| 84 |
-
node["tourism"~"museum|gallery|attraction"](around:{radius_m},{lat},{lon});
|
| 85 |
-
way["tourism"~"museum|gallery|attraction"](around:{radius_m},{lat},{lon});
|
| 86 |
-
node["leisure"
|
| 87 |
-
way["leisure"
|
| 88 |
-
node["amenity"
|
| 89 |
-
way["amenity"
|
| 90 |
-
node["amenity"="arts_centre"](around:{radius_m},{lat},{lon});
|
| 91 |
-
node["amenity"="theatre"](around:{radius_m},{lat},{lon});
|
| 92 |
node["historic"](around:{radius_m},{lat},{lon});
|
| 93 |
way["historic"](around:{radius_m},{lat},{lon});
|
| 94 |
);
|
|
@@ -99,22 +152,24 @@ out center tags {limit};
|
|
| 99 |
out = []
|
| 100 |
for e in j.get("elements", []):
|
| 101 |
tags = e.get("tags", {}) or {}
|
|
|
|
|
|
|
| 102 |
name = tags.get("name") or ""
|
| 103 |
-
# 劇場/映画館は候補に入れるが後段で上限をかける
|
| 104 |
ex_tags: List[str] = []
|
| 105 |
-
tourism = tags.get("tourism")
|
| 106 |
-
amenity = tags.get("amenity")
|
| 107 |
-
leisure = tags.get("leisure")
|
| 108 |
-
historic = tags.get("historic")
|
| 109 |
|
| 110 |
if tourism in ("museum","gallery"):
|
| 111 |
ex_tags = ["Culture","indoor","museum"]
|
| 112 |
-
elif
|
| 113 |
-
ex_tags = ["
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
elif amenity == "place_of_worship":
|
| 115 |
ex_tags = ["Heritage","outdoor","temple"]
|
| 116 |
-
elif amenity in ("arts_centre","
|
| 117 |
-
ex_tags = ["
|
| 118 |
elif tourism == "attraction" or historic:
|
| 119 |
ex_tags = ["Sightseeing","outdoor","attraction"]
|
| 120 |
else:
|
|
@@ -133,7 +188,6 @@ def _wikipedia_places(center: Tuple[float, float], radius_km: float, limit: int
|
|
| 133 |
lat, lon = center
|
| 134 |
radius_m = int(radius_km * 1000)
|
| 135 |
try:
|
| 136 |
-
# 近傍ページ検索
|
| 137 |
r = requests.get(
|
| 138 |
"https://ja.wikipedia.org/w/api.php",
|
| 139 |
params={
|
|
@@ -147,7 +201,6 @@ def _wikipedia_places(center: Tuple[float, float], radius_km: float, limit: int
|
|
| 147 |
if not gs:
|
| 148 |
return []
|
| 149 |
page_ids = [str(x["pageid"]) for x in gs if "pageid" in x]
|
| 150 |
-
# 抜粋取得
|
| 151 |
r2 = requests.get(
|
| 152 |
"https://ja.wikipedia.org/w/api.php",
|
| 153 |
params={
|
|
@@ -164,8 +217,15 @@ def _wikipedia_places(center: Tuple[float, float], radius_km: float, limit: int
|
|
| 164 |
if not p:
|
| 165 |
continue
|
| 166 |
title = p.get("title") or "Unknown"
|
| 167 |
-
extract = (p.get("extract") or "")[:
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
match = next((x for x in gs if str(x.get("pageid")) == gid), None)
|
| 170 |
if not match:
|
| 171 |
continue
|
|
@@ -182,17 +242,14 @@ def gather_web_docs(center: Tuple[float, float], interests: List[str], date: str
|
|
| 182 |
"""
|
| 183 |
Web only / Hybrid 用の外部ソース収集。
|
| 184 |
- OSM/Overpass: 独立系レストラン(チェーン除外)
|
| 185 |
-
- OSM/Overpass: 観光POI(
|
| 186 |
-
- Wikipedia: 近傍の観光スポット解説
|
| 187 |
"""
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
# Wikipedia(軽め)
|
| 193 |
-
wiki = _wikipedia_places(center, min(radius_km, 8.0), limit=20)
|
| 194 |
|
| 195 |
-
# 連結+重複排除(title
|
| 196 |
def key(d): return (d.get("meta", {}).get("title") or "").strip().lower()
|
| 197 |
seen = set(); merged: List[dict] = []
|
| 198 |
for d in sights + restos + wiki:
|
|
@@ -201,6 +258,4 @@ def gather_web_docs(center: Tuple[float, float], interests: List[str], date: str
|
|
| 201 |
continue
|
| 202 |
seen.add(k)
|
| 203 |
merged.append(d)
|
| 204 |
-
|
| 205 |
-
# 興味があれば軽くブースト…は planner 側スコアに任せるためここではそのまま返す
|
| 206 |
return merged
|
|
|
|
| 8 |
UA = {"User-Agent": "HF-Space-Trip-Planner/1.0 (+web-sources)"}
|
| 9 |
OVERPASS = "https://overpass-api.de/api/interpreter"
|
| 10 |
|
| 11 |
+
# ------------ ヘルパ(非レジャー/閉館検出) ------------
|
| 12 |
+
|
| 13 |
+
_NON_LEISURE_NEG = {
|
| 14 |
+
# 行政・公共
|
| 15 |
+
"区役所","市役所","県庁","都庁","役所","庁舎","合同庁舎","裁判所","税務署","警察署","消防署",
|
| 16 |
+
# オフィス/ビジネス
|
| 17 |
+
"本社","オフィス","事務所","本部","支社","支店","株式会社","財団法人","社屋","企業",
|
| 18 |
+
# 学術・医療(観光目的外)
|
| 19 |
+
"大学","研究所","病院","クリニック","高校","中学校","小学校",
|
| 20 |
+
# コンベンション/展示
|
| 21 |
+
"コンベンション","会議","展示場","見本市","フォーラム","国際フォーラム","カンファレンス","conference","exhibition"
|
| 22 |
+
}
|
| 23 |
+
_LEISURE_POS = {
|
| 24 |
+
# レジャー/商業の肯定表現
|
| 25 |
+
"美術館","博物館","水族館","動物園","公園","庭園","展望","遊園地","テーマパーク","市場","商店街",
|
| 26 |
+
"温泉","銭湯","スパ","サウナ","ミュージアム","ギャラリー","ショッピング","アミューズメント","アート",
|
| 27 |
+
"神社","寺","城","史跡","遺跡","mall","shopping","market","museum","gallery","aquarium","zoo","park","theme park",
|
| 28 |
+
}
|
| 29 |
+
def _text_has_any(s: str, keys: set[str]) -> bool:
|
| 30 |
+
s = (s or "").lower()
|
| 31 |
+
return any(k.lower() in s for k in keys)
|
| 32 |
+
|
| 33 |
+
# Phrases indicating that a venue is closed, out of business, temporarily
# shut, or abolished. Kept at module level so the set is built only once.
_CLOSED_KEYWORDS = {
    "閉館", "閉業", "休館中", "廃止",
    "permanently closed", "closed in", "defunct", "abolished",
}


def _is_closed_text(s: str) -> bool:
    """Return True if *s* (a Wikipedia extract) mentions a closure keyword.

    The check is a case-insensitive substring match delegated to
    ``_text_has_any``.

    NOTE(review): because matching is by substring, "閉館" also fires on
    phrases such as "閉館時間" (closing time) -- confirm that is acceptable.
    """
    return _text_has_any(s, _CLOSED_KEYWORDS)
|
| 36 |
+
|
| 37 |
+
def _is_non_leisure_wiki(title: str, extract: str) -> bool:
    """Return True if a Wikipedia page should be dropped as non-leisure.

    A page is rejected only when its combined title and extract contain at
    least one ``_NON_LEISURE_NEG`` keyword and no ``_LEISURE_POS`` keyword;
    any positive leisure keyword overrides the negative match.

    Args:
        title: Page title.
        extract: Page extract text.
    """
    combined = f"{title} {extract}"
    if not _text_has_any(combined, _NON_LEISURE_NEG):
        return False
    return not _text_has_any(combined, _LEISURE_POS)
|
| 41 |
+
|
| 42 |
+
def _is_osm_non_leisure(tags: dict) -> bool:
|
| 43 |
+
# OSMタグから非レジャー/オフィス系を弾く
|
| 44 |
+
if not isinstance(tags, dict):
|
| 45 |
+
return False
|
| 46 |
+
if tags.get("office"): # office=*
|
| 47 |
+
return True
|
| 48 |
+
amenity = (tags.get("amenity") or "").lower()
|
| 49 |
+
building = (tags.get("building") or "").lower()
|
| 50 |
+
banned_amenity = {
|
| 51 |
+
"townhall","police","fire_station","embassy","courthouse","clinic","hospital",
|
| 52 |
+
"university","school","college","kindergarten","bank","post_office","conference_centre"
|
| 53 |
+
}
|
| 54 |
+
banned_building = {"office","public","civic","government"}
|
| 55 |
+
if amenity in banned_amenity:
|
| 56 |
+
return True
|
| 57 |
+
if building in banned_building:
|
| 58 |
+
return True
|
| 59 |
+
return False
|
| 60 |
+
|
| 61 |
# ========== OSM(Overpass) ==========
|
| 62 |
|
| 63 |
def _overpass(q: str) -> dict:
|
|
|
|
| 80 |
name = tags.get("name") or tags.get("name:ja") or tags.get("name:en")
|
| 81 |
if not name:
|
| 82 |
return None
|
| 83 |
+
if _is_osm_non_leisure(tags):
|
| 84 |
+
return None
|
| 85 |
+
# チェーン/ファストフードは上位で判定(飲食系のみ)
|
| 86 |
lat, lon = c
|
| 87 |
title = name
|
| 88 |
text = ""
|
|
|
|
| 92 |
"source": "osm",
|
| 93 |
"tags": extra_tags
|
| 94 |
}
|
| 95 |
+
# 参考タグ(チェーン検出用)
|
| 96 |
+
for k in ("brand","operator","network","brand:wikidata","brand:wikipedia","amenity","building"):
|
| 97 |
+
if k in tags:
|
| 98 |
+
meta[k] = tags[k]
|
| 99 |
if "opening_hours" in tags:
|
| 100 |
meta["hours"] = str(tags["opening_hours"])
|
| 101 |
return {"id": meta["osm_id"], "text": text, "meta": meta}
|
|
|
|
| 117 |
out = []
|
| 118 |
for e in j.get("elements", []):
|
| 119 |
tags = e.get("tags", {}) or {}
|
|
|
|
| 120 |
if tags.get("amenity") == "fast_food":
|
| 121 |
continue
|
| 122 |
name = tags.get("name") or ""
|
|
|
|
| 129 |
except Exception:
|
| 130 |
return []
|
| 131 |
|
| 132 |
+
def _osm_sights(center: Tuple[float, float], radius_km: float, limit: int = 80) -> List[dict]:
|
| 133 |
lat, lon = center
|
| 134 |
radius_m = int(radius_km * 1000)
|
| 135 |
+
# レジャー/商業寄りを拡充(aquarium/zoo/theme_park/marketplace/stadium など)
|
| 136 |
q = f"""
|
| 137 |
[out:json][timeout:25];
|
| 138 |
(
|
| 139 |
+
node["tourism"~"museum|gallery|attraction|aquarium|zoo|theme_park"](around:{radius_m},{lat},{lon});
|
| 140 |
+
way["tourism"~"museum|gallery|attraction|aquarium|zoo|theme_park"](around:{radius_m},{lat},{lon});
|
| 141 |
+
node["leisure"~"park|stadium|water_park|amusement_arcade|garden"](around:{radius_m},{lat},{lon});
|
| 142 |
+
way["leisure"~"park|stadium|water_park|amusement_arcade|garden"](around:{radius_m},{lat},{lon});
|
| 143 |
+
node["amenity"~"place_of_worship|arts_centre|marketplace|spa|public_bath"](around:{radius_m},{lat},{lon});
|
| 144 |
+
way["amenity"~"place_of_worship|arts_centre|marketplace|spa|public_bath"](around:{radius_m},{lat},{lon});
|
|
|
|
|
|
|
| 145 |
node["historic"](around:{radius_m},{lat},{lon});
|
| 146 |
way["historic"](around:{radius_m},{lat},{lon});
|
| 147 |
);
|
|
|
|
| 152 |
out = []
|
| 153 |
for e in j.get("elements", []):
|
| 154 |
tags = e.get("tags", {}) or {}
|
| 155 |
+
if _is_osm_non_leisure(tags):
|
| 156 |
+
continue
|
| 157 |
name = tags.get("name") or ""
|
|
|
|
| 158 |
ex_tags: List[str] = []
|
| 159 |
+
tourism = tags.get("tourism"); amenity = tags.get("amenity"); leisure = tags.get("leisure"); historic = tags.get("historic")
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
if tourism in ("museum","gallery"):
|
| 162 |
ex_tags = ["Culture","indoor","museum"]
|
| 163 |
+
elif tourism in ("aquarium","zoo","theme_park"):
|
| 164 |
+
ex_tags = ["Leisure","outdoor",tourism]
|
| 165 |
+
elif leisure in ("park","garden"):
|
| 166 |
+
ex_tags = ["Nature","outdoor",leisure]
|
| 167 |
+
elif leisure in ("stadium","water_park","amusement_arcade"):
|
| 168 |
+
ex_tags = ["Leisure","outdoor",leisure]
|
| 169 |
elif amenity == "place_of_worship":
|
| 170 |
ex_tags = ["Heritage","outdoor","temple"]
|
| 171 |
+
elif amenity in ("arts_centre","marketplace","spa","public_bath"):
|
| 172 |
+
ex_tags = ["Leisure","indoor",amenity]
|
| 173 |
elif tourism == "attraction" or historic:
|
| 174 |
ex_tags = ["Sightseeing","outdoor","attraction"]
|
| 175 |
else:
|
|
|
|
| 188 |
lat, lon = center
|
| 189 |
radius_m = int(radius_km * 1000)
|
| 190 |
try:
|
|
|
|
| 191 |
r = requests.get(
|
| 192 |
"https://ja.wikipedia.org/w/api.php",
|
| 193 |
params={
|
|
|
|
| 201 |
if not gs:
|
| 202 |
return []
|
| 203 |
page_ids = [str(x["pageid"]) for x in gs if "pageid" in x]
|
|
|
|
| 204 |
r2 = requests.get(
|
| 205 |
"https://ja.wikipedia.org/w/api.php",
|
| 206 |
params={
|
|
|
|
| 217 |
if not p:
|
| 218 |
continue
|
| 219 |
title = p.get("title") or "Unknown"
|
| 220 |
+
extract = (p.get("extract") or "")[:700]
|
| 221 |
+
|
| 222 |
+
# 閉館/閉業/休館の除外
|
| 223 |
+
if _is_closed_text(extract):
|
| 224 |
+
continue
|
| 225 |
+
# 非レジャー(オフィス/行政/コンベンション等)の除外
|
| 226 |
+
if _is_non_leisure_wiki(title, extract):
|
| 227 |
+
continue
|
| 228 |
+
|
| 229 |
match = next((x for x in gs if str(x.get("pageid")) == gid), None)
|
| 230 |
if not match:
|
| 231 |
continue
|
|
|
|
| 242 |
"""
|
| 243 |
Web only / Hybrid 用の外部ソース収集。
|
| 244 |
- OSM/Overpass: 独立系レストラン(チェーン除外)
|
| 245 |
+
- OSM/Overpass: 観光/レジャーPOI(商業・娯楽を拡充)
|
| 246 |
+
- Wikipedia: 近傍の観光スポット解説(オフィス/行政/閉館ページは除外)
|
| 247 |
"""
|
| 248 |
+
sights = _osm_sights(center, radius_km, limit=100)
|
| 249 |
+
restos = _osm_restaurants(center, radius_km, limit=60)
|
| 250 |
+
wiki = _wikipedia_places(center, min(radius_km, 8.0), limit=24)
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
+
# 連結+重複排除(title小文字キー)
|
| 253 |
def key(d): return (d.get("meta", {}).get("title") or "").strip().lower()
|
| 254 |
seen = set(); merged: List[dict] = []
|
| 255 |
for d in sights + restos + wiki:
|
|
|
|
| 258 |
continue
|
| 259 |
seen.add(k)
|
| 260 |
merged.append(d)
|
|
|
|
|
|
|
| 261 |
return merged
|