Jongpal12 committed on
Commit
ca4b63f
·
verified ·
1 Parent(s): 06b0d5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -291
app.py CHANGED
@@ -1,23 +1,55 @@
1
  # -*- coding: utf-8 -*-
2
- import os, pathlib, io, json, random, requests
3
- import pandas as pd
4
- import streamlit as st
5
- from streamlit.components.v1 import html
6
 
7
- # ──────────────────────────────── 캐시/환경 경로 설정 ────────────────────────────────
8
- HOME = pathlib.Path.home()
9
  APP_DIR = pathlib.Path(__file__).parent.resolve()
 
 
10
  STREAMLIT_DIR = HOME / ".streamlit"
11
  STREAMLIT_DIR.mkdir(parents=True, exist_ok=True)
12
  os.environ["STREAMLIT_HOME"] = str(STREAMLIT_DIR)
13
  os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
14
  os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from huggingface_hub import hf_hub_download
 
 
 
 
17
 
18
- # (선택) 데이터셋 리포를 쓰고 싶으면 환경변수로 지정하세요.
19
- # 예: HF_DATASET_REPO="yourname/moai-travel-data", HF_DATASET_REV="main"
20
- HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", None) # None이면 로컬 우선
21
  HF_DATASET_REV = os.getenv("HF_DATASET_REV", "main")
22
 
23
  def _is_pointer_bytes(b: bytes) -> bool:
@@ -25,12 +57,11 @@ def _is_pointer_bytes(b: bytes) -> bool:
25
  return (
26
  "version https://git-lfs.github.com/spec/v1" in head
27
  or "git-lfs" in head
28
- or "xet" in head
29
  or "pointer size" in head
30
  )
31
 
32
  def _read_csv_bytes(b: bytes) -> pd.DataFrame:
33
- # utf-8 → cp949 순으로 시도
34
  try:
35
  return pd.read_csv(io.BytesIO(b), encoding="utf-8")
36
  except UnicodeDecodeError:
@@ -38,81 +69,41 @@ def _read_csv_bytes(b: bytes) -> pd.DataFrame:
38
 
39
  def load_csv_smart(local_path: str,
40
  hub_filename: str | None = None,
41
- repo_id: str | None = HF_DATASET_REPO,
42
  repo_type: str = "dataset",
43
  revision: str = HF_DATASET_REV) -> pd.DataFrame:
44
- """
45
- 1) 로컬 파일이 있으면 즉시 사용
46
- 2) 없고 repo_id가 있으면 HF Hub에서 받아서 사용
47
- 3) 둘 다 실패하면 Streamlit 에러
48
- """
49
  if hub_filename is None:
50
  hub_filename = os.path.basename(local_path)
51
-
52
- # 1) 로컬 우선
53
  if os.path.exists(local_path):
54
  with open(local_path, "rb") as f:
55
  data = f.read()
56
  if not _is_pointer_bytes(data):
57
  return _read_csv_bytes(data)
58
-
59
- # 2) 허브에서 받기 (repo_id가 설정된 경우)
60
- if repo_id:
61
- try:
62
- cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
63
- repo_type=repo_type, revision=revision)
64
- # 파일 자체를 다시 읽어서 인코딩 안전처리
65
- with open(cached, "rb") as f:
66
- data = f.read()
67
- return _read_csv_bytes(data)
68
- except Exception as e:
69
- st.error(f"Hub에서 {hub_filename} 받기 실패: {e}")
70
-
71
- # 3) 최종 실패
72
- st.error(f"데이터 파일을 찾을 수 없습니다: {local_path} (또는 Hub: {hub_filename})")
73
- st.stop()
74
 
75
  def load_json_smart(local_path: str,
76
  hub_filename: str | None = None,
77
- repo_id: str | None = HF_DATASET_REPO,
78
  repo_type: str = "dataset",
79
  revision: str = HF_DATASET_REV):
80
  if hub_filename is None:
81
  hub_filename = os.path.basename(local_path)
82
-
83
- # 1) 로컬 우선
84
  if os.path.exists(local_path):
85
  with open(local_path, "rb") as f:
86
  data = f.read()
87
  if not _is_pointer_bytes(data):
88
- try:
89
- return json.loads(data.decode("utf-8"))
90
- except Exception:
91
- return json.loads(data.decode("cp949"))
92
-
93
- # 2) 허브
94
- if repo_id:
95
- try:
96
- cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
97
- repo_type=repo_type, revision=revision)
98
- with open(cached, "r", encoding="utf-8") as f:
99
- return json.load(f)
100
- except Exception as e:
101
- st.error(f"Hub에서 {hub_filename} 받기 실패: {e}")
102
-
103
- # 3) 최종 실패
104
- st.error(f"JSON 파일을 찾을 수 없습니다: {local_path} (또는 Hub: {hub_filename})")
105
- st.stop()
106
- # ──────────────────────────────── CSV 안전 로더 ────────────────────────────────
107
- def read_csv_safe(path, encodings=("utf-8", "cp949")):
108
- last_err = None
109
- for enc in encodings:
110
- try:
111
- return pd.read_csv(path, encoding=enc)
112
- except Exception as e:
113
- last_err = e
114
- raise last_err
115
 
 
116
  travel_df = load_csv_smart("trip_emotions.csv", "trip_emotions.csv")
117
  external_score_df = load_csv_smart("external_scores.csv", "external_scores.csv")
118
  festival_df = load_csv_smart("festivals.csv", "festivals.csv")
@@ -120,229 +111,3 @@ weather_df = load_csv_smart("weather.csv", "weather.csv")
120
  package_df = load_csv_smart("packages.csv", "packages.csv")
121
  master_df = load_csv_smart("countries_cities.csv", "countries_cities.csv")
122
  theme_title_phrases = load_json_smart("theme_title_phrases.json", "theme_title_phrases.json")
123
-
124
-
125
- # ──────────────────────────────── theme_title_phrases ────────────────────────────────
126
- def load_theme_title_phrases(json_path="theme_title_phrases.json"):
127
- default_map = {
128
- "힐링": ["휴양 가볍게", "조용히 쉬기", "잔잔한 힐링"],
129
- "액티비티": ["스릴 가득", "체험 중심", "짜릿한 하루"],
130
- "미식": ["현지 미식 탐방", "숨은 맛집", "식도락 여행"],
131
- "자연": ["자연 한가운데", "풍경 맛집", "자연 충전"],
132
- "도시": ["핫플 모음", "핵심만 알차게", "도심 산책"],
133
- "문화": ["역사와 예술", "전통과 현대", "아카이빙 투어"],
134
- "가성비": ["알뜰 추천", "가심비 만족", "똑똑한 선택"],
135
- "추천": ["핵심 하이라이트", "이번엔 여기", "요즘 뜨는 곳"]
136
- }
137
- if os.path.exists(json_path):
138
- try:
139
- with open(json_path, "r", encoding="utf-8") as f:
140
- data = json.load(f)
141
- if isinstance(data, dict) and data:
142
- return data
143
- except Exception:
144
- pass
145
- with open(json_path, "w", encoding="utf-8") as f:
146
- json.dump(default_map, f, ensure_ascii=False, indent=2)
147
- return default_map
148
-
149
- theme_title_phrases = load_theme_title_phrases("theme_title_phrases.json")
150
-
151
- # ──────────────────────────────── chat_a 모듈 ────────────────────────────────
152
- from chat_a import (
153
- analyze_emotion, detect_intent, extract_themes,
154
- recommend_places_by_theme, detect_location_filter,
155
- generate_intro_message, theme_ui_map, ui_to_theme_map,
156
- theme_opening_lines, intent_opening_lines, apply_weighted_score_filter,
157
- get_highlight_message, get_weather_message, get_intent_intro_message,
158
- recommend_packages, handle_selected_place, generate_region_intro,
159
- parse_companion_and_age, filter_packages_by_companion_age,
160
- make_top2_description_custom, format_summary_tags_custom,
161
- make_companion_age_message
162
- )
163
-
164
- # ──────────────────────────────── Ollama LLM (gemma2:9b) ────────────────────────────────
165
- OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
166
- OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "gemma2:9b")
167
- OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "60"))
168
-
169
- def _call_ollama_chat(messages, model=OLLAMA_MODEL,
170
- temperature=0.8, top_p=0.9, top_k=40, repeat_penalty=1.1,
171
- system_prompt=None):
172
- url = f"{OLLAMA_HOST}/api/chat"
173
- _msgs = []
174
- if system_prompt:
175
- _msgs.append({"role": "system", "content": system_prompt})
176
- _msgs.extend(messages)
177
- payload = {
178
- "model": model,
179
- "messages": _msgs,
180
- "options": {
181
- "temperature": temperature,
182
- "top_p": top_p,
183
- "top_k": top_k,
184
- "repeat_penalty": repeat_penalty,
185
- },
186
- "stream": False,
187
- }
188
- try:
189
- r = requests.post(url, json=payload, timeout=OLLAMA_TIMEOUT)
190
- r.raise_for_status()
191
- j = r.json() or {}
192
- return (j.get("message") or {}).get("content", "") or ""
193
- except Exception:
194
- return ""
195
-
196
- STRUCTURED_EXTRACTION_SYSTEM = """\
197
- You are a travel assistant that extracts structured fields from Korean user queries.
198
- Return ONLY a valid JSON object:
199
- {
200
- "emotion": "happy|sad|stressed|excited|tired|none",
201
- "intent": "beach|hiking|shopping|food|museum|relaxing|none",
202
- "country_hint": "",
203
- "city_hint": "",
204
- "themes_hint": ["<0..3 words>"],
205
- "notes": "<very short reasoning in Korean>"
206
- }
207
- If unknown, use "none" or "" and NEVER add extra text outside JSON.
208
- """
209
-
210
- def _build_structured_user_prompt(user_text: str) -> str:
211
- return (
212
- "다음 한국어 문장에서 감정/의도/지역/테마 힌트를 추출해 주세요. "
213
- "오직 유효한 JSON만 반환하세요.\n\n"
214
- f"문장: {user_text}\n"
215
- )
216
-
217
- def _llm_structured_extract(user_text: str):
218
- out = _call_ollama_chat([
219
- {"role": "system", "content": STRUCTURED_EXTRACTION_SYSTEM},
220
- {"role": "user", "content": _build_structured_user_prompt(user_text)}
221
- ])
222
- try:
223
- data = json.loads(out)
224
- except Exception:
225
- data = {}
226
- data.setdefault("emotion", "none")
227
- data.setdefault("intent", "none")
228
- data.setdefault("country_hint", "")
229
- data.setdefault("city_hint", "")
230
- data.setdefault("themes_hint", [])
231
- data.setdefault("notes", "")
232
- return data
233
-
234
- # ──────────────────────────────── 규칙/LLM 신호 병합 ────────────────────────────────
235
- def _merge_signals(user_input: str,
236
- travel_df: pd.DataFrame,
237
- use_llm: bool = True,
238
- intent_threshold: float = 0.70):
239
- country_rb, city_rb, loc_mode = detect_location_filter(user_input)
240
- intent_rb, intent_score = detect_intent(user_input)
241
- llm = _llm_structured_extract(user_input) if use_llm else {
242
- "emotion": "none", "intent": "none",
243
- "country_hint": "", "city_hint": "",
244
- "themes_hint": [], "notes": ""
245
- }
246
- country = country_rb or (llm["country_hint"] or "")
247
- city = city_rb or (llm["city_hint"] or "")
248
- city_exists = bool(city) and city in travel_df["여행도시"].values
249
- country_exists = bool(country) and country in travel_df["여행나라"].values
250
- if intent_score >= intent_threshold:
251
- intent = intent_rb
252
- else:
253
- intent = llm["intent"] if llm["intent"] != "none" else intent_rb
254
- if city_exists or country_exists:
255
- mode = "region"
256
- elif intent and intent_score >= intent_threshold:
257
- mode = "intent"
258
- elif country or city:
259
- mode = "unknown"
260
- else:
261
- mode = "emotion"
262
- return mode, country, city, intent, llm
263
-
264
- def _llm_place_copy(city: str, place: str) -> str:
265
- sys = "You are a Korean copywriter for a travel agency."
266
- prompt = (
267
- f"'{city} - {place}'를 2문장으로 매력적으로 소개해줘. "
268
- "첫 문장은 감성 한 줄, 둘째 문장은 활동/포인트 3개를 쉼표로 요약. 존댓말, 과장 금지."
269
- )
270
- out = _call_ollama_chat([
271
- {"role": "system", "content": sys},
272
- {"role": "user", "content": prompt}
273
- ], temperature=0.6, top_p=0.9)
274
- return out.strip()
275
- # ──────────────────────────────── Streamlit UI + main ────────────────────────────────
276
- st.set_page_config(page_title="여행은 모두투어 : 모아(MoAi)", layout="centered")
277
-
278
- st.sidebar.subheader("⚙️ 대화 표시")
279
- st.sidebar.selectbox("테마", ["피스타치오", "스카이블루", "크리미오트"], key="bubble_theme")
280
- st.sidebar.toggle("타임스탬프 표시", value=False, key="show_time")
281
- st.sidebar.toggle("타자 효과", value=False, key="typewriter_on")
282
-
283
- # LLM 옵션
284
- st.sidebar.toggle("🧠 LLM 보강 사용", value=True, key="use_llm")
285
- st.sidebar.slider("의도 인식 임계값", 0.5, 0.95, 0.70, 0.01, key="intent_threshold")
286
-
287
- from css import render_message, render_chip_buttons, log_and_render, replay_log, _get_colors
288
-
289
- def init_session():
290
- if "chat_log" not in st.session_state:
291
- st.session_state.chat_log = []
292
- if "mode" not in st.session_state:
293
- st.session_state.mode = None
294
- if "user_input" not in st.session_state:
295
- st.session_state.user_input = ""
296
-
297
- def main():
298
- init_session()
299
- chat_container = st.container()
300
-
301
- if "chat_log" in st.session_state and st.session_state.chat_log:
302
- replay_log(chat_container)
303
-
304
- if not st.session_state.get("greeting_rendered", False):
305
- greeting_message = (
306
- "안녕하세요. <strong>모아(MoAi)</strong>입니다.🤖<br><br>"
307
- "요즘 어떤 여행이 떠오르세요?<br>""모아가 딱 맞는 여행지를 찾아드릴게요."
308
- )
309
- log_and_render(greeting_message, sender="bot", chat_container=chat_container, key="greeting")
310
- st.session_state["greeting_rendered"] = True
311
-
312
- user_input = st.text_input("입력창",
313
- placeholder="ex) '요즘 힐링이 필요해요', '가족 여행 어디가 좋을까요?'",
314
- key="user_input", label_visibility="collapsed")
315
-
316
- if user_input:
317
- mode, country_filter, city_filter, intent, llm_dbg = _merge_signals(
318
- user_input=user_input,
319
- travel_df=travel_df,
320
- use_llm=st.session_state.get("use_llm", True),
321
- intent_threshold=st.session_state.get("intent_threshold", 0.70)
322
- )
323
- if st.session_state.get("use_llm") and llm_dbg.get("notes"):
324
- log_and_render(f"🧩 LLM 해석: {llm_dbg['notes']}",
325
- sender="bot", chat_container=chat_container,
326
- key=f"llm_notes_{random.randint(1,999999)}")
327
-
328
- if mode == "region":
329
- region_ui(travel_df, external_score_df, festival_df, weather_df, package_df,
330
- country_filter, city_filter, chat_container, log_and_render)
331
- return
332
- elif mode == "intent":
333
- intent_ui(travel_df, external_score_df, festival_df, weather_df, package_df,
334
- country_filter, city_filter, chat_container, intent, log_and_render)
335
- return
336
- elif mode == "unknown":
337
- unknown_ui(country_filter, city_filter, chat_container, log_and_render)
338
- return
339
- else:
340
- top_emotions, emotion_groups = analyze_emotion(user_input)
341
- candidate_themes = extract_themes(emotion_groups, intent, force_mode=False)
342
- emotion_ui(travel_df, external_score_df, festival_df, weather_df, package_df,
343
- country_filter, city_filter, chat_container,
344
- candidate_themes, intent, emotion_groups, top_emotions, log_and_render)
345
- return
346
-
347
- if __name__ == "__main__":
348
- main()
 
1
  # -*- coding: utf-8 -*-
2
+ # ──────────────────────────────── BOOTSTRAP (must be first) ────────────────────────────────
3
+ import os, pathlib, io, json, random
 
 
4
 
5
+ HOME = pathlib.Path.home() # 실행 사용자 디렉터리 (쓰기 가능)
 
6
  APP_DIR = pathlib.Path(__file__).parent.resolve()
7
+
8
+ # Streamlit 홈/설정
9
  STREAMLIT_DIR = HOME / ".streamlit"
10
  STREAMLIT_DIR.mkdir(parents=True, exist_ok=True)
11
  os.environ["STREAMLIT_HOME"] = str(STREAMLIT_DIR)
12
  os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
13
  os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
14
 
15
+ # ✅ HF/Transformers 캐시: 홈 밑의 .cache 사용 (필요 시 HF_CACHE_ROOT로 오버라이드 가능)
16
+ CACHE_ROOT = pathlib.Path(os.environ.get("HF_CACHE_ROOT", HOME / ".cache" / f"u{os.getuid()}"))
17
+ HF_HOME = CACHE_ROOT / "hf-home"
18
+ TRANSFORMERS_CACHE = CACHE_ROOT / "hf-cache"
19
+ HUB_CACHE = CACHE_ROOT / "hf-cache"
20
+ TORCH_HOME = CACHE_ROOT / "torch-cache"
21
+ XDG_CACHE_HOME = CACHE_ROOT / "xdg-cache"
22
+
23
+ # 폴더 생성 (권한 오류가 나면 /tmp로 자동 폴백)
24
+ try:
25
+ for p in [HF_HOME, TRANSFORMERS_CACHE, HUB_CACHE, TORCH_HOME, XDG_CACHE_HOME]:
26
+ p.mkdir(parents=True, exist_ok=True)
27
+ except PermissionError:
28
+ TMP_ROOT = pathlib.Path("/tmp") / f"hf-cache-u{os.getuid()}"
29
+ HF_HOME = TMP_ROOT / "hf-home"
30
+ TRANSFORMERS_CACHE = TMP_ROOT / "hf-cache"
31
+ HUB_CACHE = TMP_ROOT / "hf-cache"
32
+ TORCH_HOME = TMP_ROOT / "torch-cache"
33
+ XDG_CACHE_HOME = TMP_ROOT / "xdg-cache"
34
+ for p in [HF_HOME, TRANSFORMERS_CACHE, HUB_CACHE, TORCH_HOME, XDG_CACHE_HOME]:
35
+ p.mkdir(parents=True, exist_ok=True)
36
+
37
+ os.environ["HF_HOME"] = str(HF_HOME)
38
+ os.environ["TRANSFORMERS_CACHE"] = str(TRANSFORMERS_CACHE)
39
+ os.environ["HUGGINGFACE_HUB_CACHE"] = str(HUB_CACHE)
40
+ os.environ["TORCH_HOME"] = str(TORCH_HOME)
41
+ os.environ["XDG_CACHE_HOME"] = str(XDG_CACHE_HOME)
42
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
43
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
44
+
45
  from huggingface_hub import hf_hub_download
46
+ import pandas as pd
47
+ import streamlit as st
48
+ from streamlit.components.v1 import html
49
+ from css import render_message, render_chip_buttons, log_and_render, replay_log, _get_colors
50
 
51
+ # ──────────────────────────────── Dataset Repo 설정 ────────────────────────────────
52
+ HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "emisdfde/moai-travel-data")
 
53
  HF_DATASET_REV = os.getenv("HF_DATASET_REV", "main")
54
 
55
  def _is_pointer_bytes(b: bytes) -> bool:
 
57
  return (
58
  "version https://git-lfs.github.com/spec/v1" in head
59
  or "git-lfs" in head
60
+ or "xet" in head # e.g. xet 포인터
61
  or "pointer size" in head
62
  )
63
 
64
  def _read_csv_bytes(b: bytes) -> pd.DataFrame:
 
65
  try:
66
  return pd.read_csv(io.BytesIO(b), encoding="utf-8")
67
  except UnicodeDecodeError:
 
69
 
70
  def load_csv_smart(local_path: str,
71
  hub_filename: str | None = None,
72
+ repo_id: str = HF_DATASET_REPO,
73
  repo_type: str = "dataset",
74
  revision: str = HF_DATASET_REV) -> pd.DataFrame:
 
 
 
 
 
75
  if hub_filename is None:
76
  hub_filename = os.path.basename(local_path)
 
 
77
  if os.path.exists(local_path):
78
  with open(local_path, "rb") as f:
79
  data = f.read()
80
  if not _is_pointer_bytes(data):
81
  return _read_csv_bytes(data)
82
+ cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
83
+ repo_type=repo_type, revision=revision)
84
+ try:
85
+ return pd.read_csv(cached, encoding="utf-8")
86
+ except UnicodeDecodeError:
87
+ return pd.read_csv(cached, encoding="cp949")
 
 
 
 
 
 
 
 
 
 
88
 
89
  def load_json_smart(local_path: str,
90
  hub_filename: str | None = None,
91
+ repo_id: str = HF_DATASET_REPO,
92
  repo_type: str = "dataset",
93
  revision: str = HF_DATASET_REV):
94
  if hub_filename is None:
95
  hub_filename = os.path.basename(local_path)
 
 
96
  if os.path.exists(local_path):
97
  with open(local_path, "rb") as f:
98
  data = f.read()
99
  if not _is_pointer_bytes(data):
100
+ return json.loads(data.decode("utf-8"))
101
+ cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
102
+ repo_type=repo_type, revision=revision)
103
+ with open(cached, "r", encoding="utf-8") as f:
104
+ return json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
+ # ──────────────────────────────── 데이터 로드 ────────────────────────────────
107
  travel_df = load_csv_smart("trip_emotions.csv", "trip_emotions.csv")
108
  external_score_df = load_csv_smart("external_scores.csv", "external_scores.csv")
109
  festival_df = load_csv_smart("festivals.csv", "festivals.csv")
 
111
  package_df = load_csv_smart("packages.csv", "packages.csv")
112
  master_df = load_csv_smart("countries_cities.csv", "countries_cities.csv")
113
  theme_title_phrases = load_json_smart("theme_title_phrases.json", "theme_title_phrases.json")