Corin1998 committed on
Commit
fc2737c
·
verified ·
1 Parent(s): 671cc48

Update services/web_sources.py

Browse files
Files changed (1) hide show
  1. services/web_sources.py +91 -36
services/web_sources.py CHANGED
@@ -8,6 +8,56 @@ from .chain_filters import is_chain
8
  UA = {"User-Agent": "HF-Space-Trip-Planner/1.0 (+web-sources)"}
9
  OVERPASS = "https://overpass-api.de/api/interpreter"
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # ========== OSM(Overpass) ==========
12
 
13
  def _overpass(q: str) -> dict:
@@ -30,6 +80,9 @@ def _doc_from_osm(e: dict, tags: dict, extra_tags: List[str]) -> dict | None:
30
  name = tags.get("name") or tags.get("name:ja") or tags.get("name:en")
31
  if not name:
32
  return None
 
 
 
33
  lat, lon = c
34
  title = name
35
  text = ""
@@ -39,7 +92,10 @@ def _doc_from_osm(e: dict, tags: dict, extra_tags: List[str]) -> dict | None:
39
  "source": "osm",
40
  "tags": extra_tags
41
  }
42
- # opening_hours はそのままメモ。厳密解釈は別途今回は昼/夜の枠で調整
 
 
 
43
  if "opening_hours" in tags:
44
  meta["hours"] = str(tags["opening_hours"])
45
  return {"id": meta["osm_id"], "text": text, "meta": meta}
@@ -61,7 +117,6 @@ out center tags {limit};
61
  out = []
62
  for e in j.get("elements", []):
63
  tags = e.get("tags", {}) or {}
64
- # fast_food を除外
65
  if tags.get("amenity") == "fast_food":
66
  continue
67
  name = tags.get("name") or ""
@@ -74,21 +129,19 @@ out center tags {limit};
74
  except Exception:
75
  return []
76
 
77
- def _osm_sights(center: Tuple[float, float], radius_km: float, limit: int = 60) -> List[dict]:
78
  lat, lon = center
79
  radius_m = int(radius_km * 1000)
80
- # ミューアム/ギラリー/神社仏閣/公園/展望/歴史/アート関連など
81
  q = f"""
82
  [out:json][timeout:25];
83
  (
84
- node["tourism"~"museum|gallery|attraction"](around:{radius_m},{lat},{lon});
85
- way["tourism"~"museum|gallery|attraction"](around:{radius_m},{lat},{lon});
86
- node["leisure"="park"](around:{radius_m},{lat},{lon});
87
- way["leisure"="park"](around:{radius_m},{lat},{lon});
88
- node["amenity"="place_of_worship"](around:{radius_m},{lat},{lon});
89
- way["amenity"="place_of_worship"](around:{radius_m},{lat},{lon});
90
- node["amenity"="arts_centre"](around:{radius_m},{lat},{lon});
91
- node["amenity"="theatre"](around:{radius_m},{lat},{lon});
92
  node["historic"](around:{radius_m},{lat},{lon});
93
  way["historic"](around:{radius_m},{lat},{lon});
94
  );
@@ -99,22 +152,24 @@ out center tags {limit};
99
  out = []
100
  for e in j.get("elements", []):
101
  tags = e.get("tags", {}) or {}
 
 
102
  name = tags.get("name") or ""
103
- # 劇場/映画館は候補に入れるが後段で上限をかける
104
  ex_tags: List[str] = []
105
- tourism = tags.get("tourism")
106
- amenity = tags.get("amenity")
107
- leisure = tags.get("leisure")
108
- historic = tags.get("historic")
109
 
110
  if tourism in ("museum","gallery"):
111
  ex_tags = ["Culture","indoor","museum"]
112
- elif leisure == "park":
113
- ex_tags = ["Nature","outdoor","park"]
 
 
 
 
114
  elif amenity == "place_of_worship":
115
  ex_tags = ["Heritage","outdoor","temple"]
116
- elif amenity in ("arts_centre","theatre","cinema"):
117
- ex_tags = ["PerformingArts","indoor","performing_arts"]
118
  elif tourism == "attraction" or historic:
119
  ex_tags = ["Sightseeing","outdoor","attraction"]
120
  else:
@@ -133,7 +188,6 @@ def _wikipedia_places(center: Tuple[float, float], radius_km: float, limit: int
133
  lat, lon = center
134
  radius_m = int(radius_km * 1000)
135
  try:
136
- # 近傍ページ検索
137
  r = requests.get(
138
  "https://ja.wikipedia.org/w/api.php",
139
  params={
@@ -147,7 +201,6 @@ def _wikipedia_places(center: Tuple[float, float], radius_km: float, limit: int
147
  if not gs:
148
  return []
149
  page_ids = [str(x["pageid"]) for x in gs if "pageid" in x]
150
- # 抜粋取得
151
  r2 = requests.get(
152
  "https://ja.wikipedia.org/w/api.php",
153
  params={
@@ -164,8 +217,15 @@ def _wikipedia_places(center: Tuple[float, float], radius_km: float, limit: int
164
  if not p:
165
  continue
166
  title = p.get("title") or "Unknown"
167
- extract = (p.get("extract") or "")[:500]
168
- # 座標は geosearch の方に入っている
 
 
 
 
 
 
 
169
  match = next((x for x in gs if str(x.get("pageid")) == gid), None)
170
  if not match:
171
  continue
@@ -182,17 +242,14 @@ def gather_web_docs(center: Tuple[float, float], interests: List[str], date: str
182
  """
183
  Web only / Hybrid 用の外部ソース収集。
184
  - OSM/Overpass: 独立系レストラン(チェーン除外)
185
- - OSM/Overpass: 観光POI(博物館/公園/神社仏閣/歴史/劇場ほか
186
- - Wikipedia: 近傍の観光スポット解説
187
  """
188
- # OSM
189
- sights = _osm_sights(center, radius_km, limit=80)
190
- restos = _osm_restaurants(center, radius_km, limit=50)
191
-
192
- # Wikipedia(軽め)
193
- wiki = _wikipedia_places(center, min(radius_km, 8.0), limit=20)
194
 
195
- # 連結+重複排除(titleベース、小文字)
196
  def key(d): return (d.get("meta", {}).get("title") or "").strip().lower()
197
  seen = set(); merged: List[dict] = []
198
  for d in sights + restos + wiki:
@@ -201,6 +258,4 @@ def gather_web_docs(center: Tuple[float, float], interests: List[str], date: str
201
  continue
202
  seen.add(k)
203
  merged.append(d)
204
-
205
- # 興味があれば軽くブースト…は planner 側スコアに任せるためここではそのまま返す
206
  return merged
 
8
  UA = {"User-Agent": "HF-Space-Trip-Planner/1.0 (+web-sources)"}
9
  OVERPASS = "https://overpass-api.de/api/interpreter"
10
 
11
+ # ------------ ヘルパ(非レジャー/閉館検出) ------------
12
+
13
# Keyword tables used for substring matching against Wikipedia titles/extracts.
# Both are read-only lookup data, so they are frozensets: nothing can mutate
# them by accident at runtime.

# Words that mark a page as a non-leisure place.
_NON_LEISURE_NEG = frozenset({
    # Government / public administration
    "区役所", "市役所", "県庁", "都庁", "役所", "庁舎", "合同庁舎", "裁判所", "税務署", "警察署", "消防署",
    # Offices / business
    "本社", "オフィス", "事務所", "本部", "支社", "支店", "株式会社", "財団法人", "社屋", "企業",
    # Academic / medical (not a sightseeing purpose)
    "大学", "研究所", "病院", "クリニック", "高校", "中学校", "小学校",
    # Conventions / exhibitions
    "コンベンション", "会議", "展示場", "見本市", "フォーラム", "国際フォーラム", "カンファレンス", "conference", "exhibition",
})

# Words that positively mark a page as a leisure / commercial destination.
_LEISURE_POS = frozenset({
    "美術館", "博物館", "水族館", "動物園", "公園", "庭園", "展望", "遊園地", "テーマパーク", "市場", "商店街",
    "温泉", "銭湯", "スパ", "サウナ", "ミュージアム", "ギャラリー", "ショッピング", "アミューズメント", "アート",
    "神社", "寺", "城", "史跡", "遺跡", "mall", "shopping", "market", "museum", "gallery", "aquarium", "zoo", "park", "theme park",
})
29
+ def _text_has_any(s: str, keys: set[str]) -> bool:
30
+ s = (s or "").lower()
31
+ return any(k.lower() in s for k in keys)
32
+
33
def _is_closed_text(s: str) -> bool:
    """Report whether *s* contains wording that marks a place as closed.

    Used on Wikipedia extracts. The check is a keyword search for Japanese
    and English closure terms, delegated to ``_text_has_any``.
    """
    closure_markers = {
        "閉館",
        "閉業",
        "休館中",
        "廃止",
        "permanently closed",
        "closed in",
        "defunct",
        "abolished",
    }
    return _text_has_any(s, closure_markers)
36
+
37
def _is_non_leisure_wiki(title: str, extract: str) -> bool:
    """Decide whether a Wikipedia page should be dropped as non-leisure.

    A page is rejected only when its title/extract contains a keyword from
    ``_NON_LEISURE_NEG`` and none from ``_LEISURE_POS``; a single positive
    leisure keyword is enough to keep the page.
    """
    combined = f"{title} {extract}"
    if not _text_has_any(combined, _NON_LEISURE_NEG):
        # No administrative/office/convention wording at all: keep the page.
        return False
    return not _text_has_any(combined, _LEISURE_POS)
41
+
42
def _is_osm_non_leisure(tags: dict) -> bool:
    """Return True when OSM tags describe an office or other non-leisure place.

    An element is rejected when any of the following holds:

    * it carries an ``office=*`` tag (other than the explicit negation
      ``office=no``),
    * its ``amenity`` is an administrative, medical, educational, banking,
      postal or conference facility,
    * its ``building`` is an office, public, civic or government building.

    Tag values are compared case-insensitively with surrounding whitespace
    removed. Semicolon-separated multi-values (e.g. ``"hospital;clinic"``)
    are split, and the element is rejected if any component is banned.

    Args:
        tags: The element's tag mapping. Anything that is not a ``dict``
            is treated as "no tags" and is not rejected.

    Returns:
        True if the element should be filtered out, otherwise False.
    """
    if not isinstance(tags, dict):
        return False

    def _values(key: str) -> set[str]:
        # Normalise one tag into a set of lower-cased, stripped components.
        # str() guards against non-string values, which would otherwise make
        # .lower() raise.
        raw = tags.get(key)
        if raw is None:
            return set()
        return {part.strip().lower() for part in str(raw).split(";")} - {""}

    banned_amenity = {
        "townhall", "police", "fire_station", "embassy", "courthouse", "clinic", "hospital",
        "university", "school", "college", "kindergarten", "bank", "post_office", "conference_centre",
    }
    banned_building = {"office", "public", "civic", "government"}

    # office=* marks an office of any kind; "no" is the explicit negation.
    if _values("office") - {"no"}:
        return True
    if _values("amenity") & banned_amenity:
        return True
    return bool(_values("building") & banned_building)
60
+
61
  # ========== OSM(Overpass) ==========
62
 
63
  def _overpass(q: str) -> dict:
 
80
  name = tags.get("name") or tags.get("name:ja") or tags.get("name:en")
81
  if not name:
82
  return None
83
+ if _is_osm_non_leisure(tags):
84
+ return None
85
+ # チェーン/ファストフードは上位で判定(飲食系のみ)
86
  lat, lon = c
87
  title = name
88
  text = ""
 
92
  "source": "osm",
93
  "tags": extra_tags
94
  }
95
+ # 参考タグチェーン検出用
96
+ for k in ("brand","operator","network","brand:wikidata","brand:wikipedia","amenity","building"):
97
+ if k in tags:
98
+ meta[k] = tags[k]
99
  if "opening_hours" in tags:
100
  meta["hours"] = str(tags["opening_hours"])
101
  return {"id": meta["osm_id"], "text": text, "meta": meta}
 
117
  out = []
118
  for e in j.get("elements", []):
119
  tags = e.get("tags", {}) or {}
 
120
  if tags.get("amenity") == "fast_food":
121
  continue
122
  name = tags.get("name") or ""
 
129
  except Exception:
130
  return []
131
 
132
+ def _osm_sights(center: Tuple[float, float], radius_km: float, limit: int = 80) -> List[dict]:
133
  lat, lon = center
134
  radius_m = int(radius_km * 1000)
135
+ # レジャー/商業寄りを拡充(aquarium/zoo/theme_park/marketplace/stadium など)
136
  q = f"""
137
  [out:json][timeout:25];
138
  (
139
+ node["tourism"~"museum|gallery|attraction|aquarium|zoo|theme_park"](around:{radius_m},{lat},{lon});
140
+ way["tourism"~"museum|gallery|attraction|aquarium|zoo|theme_park"](around:{radius_m},{lat},{lon});
141
+ node["leisure"~"park|stadium|water_park|amusement_arcade|garden"](around:{radius_m},{lat},{lon});
142
+ way["leisure"~"park|stadium|water_park|amusement_arcade|garden"](around:{radius_m},{lat},{lon});
143
+ node["amenity"~"place_of_worship|arts_centre|marketplace|spa|public_bath"](around:{radius_m},{lat},{lon});
144
+ way["amenity"~"place_of_worship|arts_centre|marketplace|spa|public_bath"](around:{radius_m},{lat},{lon});
 
 
145
  node["historic"](around:{radius_m},{lat},{lon});
146
  way["historic"](around:{radius_m},{lat},{lon});
147
  );
 
152
  out = []
153
  for e in j.get("elements", []):
154
  tags = e.get("tags", {}) or {}
155
+ if _is_osm_non_leisure(tags):
156
+ continue
157
  name = tags.get("name") or ""
 
158
  ex_tags: List[str] = []
159
+ tourism = tags.get("tourism"); amenity = tags.get("amenity"); leisure = tags.get("leisure"); historic = tags.get("historic")
 
 
 
160
 
161
  if tourism in ("museum","gallery"):
162
  ex_tags = ["Culture","indoor","museum"]
163
+ elif tourism in ("aquarium","zoo","theme_park"):
164
+ ex_tags = ["Leisure","outdoor",tourism]
165
+ elif leisure in ("park","garden"):
166
+ ex_tags = ["Nature","outdoor",leisure]
167
+ elif leisure in ("stadium","water_park","amusement_arcade"):
168
+ ex_tags = ["Leisure","outdoor",leisure]
169
  elif amenity == "place_of_worship":
170
  ex_tags = ["Heritage","outdoor","temple"]
171
+ elif amenity in ("arts_centre","marketplace","spa","public_bath"):
172
+ ex_tags = ["Leisure","indoor",amenity]
173
  elif tourism == "attraction" or historic:
174
  ex_tags = ["Sightseeing","outdoor","attraction"]
175
  else:
 
188
  lat, lon = center
189
  radius_m = int(radius_km * 1000)
190
  try:
 
191
  r = requests.get(
192
  "https://ja.wikipedia.org/w/api.php",
193
  params={
 
201
  if not gs:
202
  return []
203
  page_ids = [str(x["pageid"]) for x in gs if "pageid" in x]
 
204
  r2 = requests.get(
205
  "https://ja.wikipedia.org/w/api.php",
206
  params={
 
217
  if not p:
218
  continue
219
  title = p.get("title") or "Unknown"
220
+ extract = (p.get("extract") or "")[:700]
221
+
222
+ # 閉館/閉業/休館の除外
223
+ if _is_closed_text(extract):
224
+ continue
225
+ # 非レジャー(オフィス/行政/コンベンション等)の除外
226
+ if _is_non_leisure_wiki(title, extract):
227
+ continue
228
+
229
  match = next((x for x in gs if str(x.get("pageid")) == gid), None)
230
  if not match:
231
  continue
 
242
  """
243
  Web only / Hybrid 用の外部ソース収集。
244
  - OSM/Overpass: 独立系レストラン(チェーン除外)
245
+ - OSM/Overpass: 観光/レジャーPOI(商業・娯楽を拡充)
246
+ - Wikipedia: 近傍の観光スポット解説(オフィス/行政/閉館ページは除外)
247
  """
248
+ sights = _osm_sights(center, radius_km, limit=100)
249
+ restos = _osm_restaurants(center, radius_km, limit=60)
250
+ wiki = _wikipedia_places(center, min(radius_km, 8.0), limit=24)
 
 
 
251
 
252
+ # 連結+重複排除(titleの小文字キー)
253
  def key(d): return (d.get("meta", {}).get("title") or "").strip().lower()
254
  seen = set(); merged: List[dict] = []
255
  for d in sights + restos + wiki:
 
258
  continue
259
  seen.add(k)
260
  merged.append(d)
 
 
261
  return merged