Spaces:

chatbotMOAI
/

MOAI

Sleeping

App Files Files Community

Jongpal12 committed on Sep 8, 2025

Commit

ca4b63f

verified ·

1 Parent(s): 06b0d5c

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -291

app.py CHANGED Viewed

@@ -1,23 +1,55 @@
 # -*- coding: utf-8 -*-
-import os, pathlib, io, json, random, requests
-import pandas as pd
-import streamlit as st
-from streamlit.components.v1 import html
-# ──────────────────────────────── 캐시/환경 경로 설정 ────────────────────────────────
-HOME = pathlib.Path.home()
 APP_DIR = pathlib.Path(__file__).parent.resolve()
 STREAMLIT_DIR = HOME / ".streamlit"
 STREAMLIT_DIR.mkdir(parents=True, exist_ok=True)
 os.environ["STREAMLIT_HOME"] = str(STREAMLIT_DIR)
 os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
 os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
 from huggingface_hub import hf_hub_download
-# (선택) 데이터셋 리포를 쓰고 싶으면 환경변수로 지정하세요.
-# 예: HF_DATASET_REPO="yourname/moai-travel-data", HF_DATASET_REV="main"
-HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", None)  # None이면 로컬 우선
 HF_DATASET_REV  = os.getenv("HF_DATASET_REV", "main")
 def _is_pointer_bytes(b: bytes) -> bool:
@@ -25,12 +57,11 @@ def _is_pointer_bytes(b: bytes) -> bool:
     return (
         "version https://git-lfs.github.com/spec/v1" in head
         or "git-lfs" in head
-        or "xet" in head
         or "pointer size" in head
     )
 def _read_csv_bytes(b: bytes) -> pd.DataFrame:
-    # utf-8 → cp949 순으로 시도
     try:
         return pd.read_csv(io.BytesIO(b), encoding="utf-8")
     except UnicodeDecodeError:
@@ -38,81 +69,41 @@ def _read_csv_bytes(b: bytes) -> pd.DataFrame:
 def load_csv_smart(local_path: str,
                    hub_filename: str | None = None,
-                   repo_id: str | None = HF_DATASET_REPO,
                    repo_type: str = "dataset",
                    revision: str = HF_DATASET_REV) -> pd.DataFrame:
-    """
-    1) 로컬 파일이 있으면 즉시 사용
-    2) 없고 repo_id가 있으면 HF Hub에서 받아서 사용
-    3) 둘 다 실패하면 Streamlit 에러
-    """
     if hub_filename is None:
         hub_filename = os.path.basename(local_path)
-    # 1) 로컬 우선
     if os.path.exists(local_path):
         with open(local_path, "rb") as f:
             data = f.read()
         if not _is_pointer_bytes(data):
             return _read_csv_bytes(data)
-    # 2) 허브에서 받기 (repo_id가 설정된 경우)
-    if repo_id:
-        try:
-            cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
-                                     repo_type=repo_type, revision=revision)
-            # 파일 자체를 다시 읽어서 인코딩 안전처리
-            with open(cached, "rb") as f:
-                data = f.read()
-            return _read_csv_bytes(data)
-        except Exception as e:
-            st.error(f"Hub에서 {hub_filename} 받기 실패: {e}")
-    # 3) 최종 실패
-    st.error(f"데이터 파일을 찾을 수 없습니다: {local_path} (또는 Hub: {hub_filename})")
-    st.stop()
 def load_json_smart(local_path: str,
                     hub_filename: str | None = None,
-                    repo_id: str | None = HF_DATASET_REPO,
                     repo_type: str = "dataset",
                     revision: str = HF_DATASET_REV):
     if hub_filename is None:
         hub_filename = os.path.basename(local_path)
-    # 1) 로컬 우선
     if os.path.exists(local_path):
         with open(local_path, "rb") as f:
             data = f.read()
         if not _is_pointer_bytes(data):
-            try:
-                return json.loads(data.decode("utf-8"))
-            except Exception:
-                return json.loads(data.decode("cp949"))
-    # 2) 허브
-    if repo_id:
-        try:
-            cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
-                                     repo_type=repo_type, revision=revision)
-            with open(cached, "r", encoding="utf-8") as f:
-                return json.load(f)
-        except Exception as e:
-            st.error(f"Hub에서 {hub_filename} 받기 실패: {e}")
-    # 3) 최종 실패
-    st.error(f"JSON 파일을 찾을 수 없습니다: {local_path} (또는 Hub: {hub_filename})")
-    st.stop()
-# ──────────────────────────────── CSV 안전 로더 ────────────────────────────────
-def read_csv_safe(path, encodings=("utf-8", "cp949")):
-    last_err = None
-    for enc in encodings:
-        try:
-            return pd.read_csv(path, encoding=enc)
-        except Exception as e:
-            last_err = e
-    raise last_err
 travel_df         = load_csv_smart("trip_emotions.csv",      "trip_emotions.csv")
 external_score_df = load_csv_smart("external_scores.csv",    "external_scores.csv")
 festival_df       = load_csv_smart("festivals.csv",          "festivals.csv")
@@ -120,229 +111,3 @@ weather_df        = load_csv_smart("weather.csv",            "weather.csv")
 package_df        = load_csv_smart("packages.csv",           "packages.csv")
 master_df         = load_csv_smart("countries_cities.csv",   "countries_cities.csv")
 theme_title_phrases = load_json_smart("theme_title_phrases.json", "theme_title_phrases.json")
-# ──────────────────────────────── theme_title_phrases ────────────────────────────────
-def load_theme_title_phrases(json_path="theme_title_phrases.json"):
-    default_map = {
-        "힐링": ["휴양 가볍게", "조용히 쉬기", "잔잔한 힐링"],
-        "액티비티": ["스릴 가득", "체험 중심", "짜릿한 하루"],
-        "미식": ["현지 미식 탐방", "숨은 맛집", "식도락 여행"],
-        "자연": ["자연 한가운데", "풍경 맛집", "자연 충전"],
-        "도시": ["핫플 모음", "핵심만 알차게", "도심 산책"],
-        "문화": ["역사와 예술", "전통과 현대", "아카이빙 투어"],
-        "가성비": ["알뜰 추천", "가심비 만족", "똑똑한 선택"],
-        "추천": ["핵심 하이라이트", "이번엔 여기", "요즘 뜨는 곳"]
-    }
-    if os.path.exists(json_path):
-        try:
-            with open(json_path, "r", encoding="utf-8") as f:
-                data = json.load(f)
-            if isinstance(data, dict) and data:
-                return data
-        except Exception:
-            pass
-    with open(json_path, "w", encoding="utf-8") as f:
-        json.dump(default_map, f, ensure_ascii=False, indent=2)
-    return default_map
-theme_title_phrases = load_theme_title_phrases("theme_title_phrases.json")
-# ──────────────────────────────── chat_a 모듈 ────────────────────────────────
-from chat_a import (
-    analyze_emotion, detect_intent, extract_themes,
-    recommend_places_by_theme, detect_location_filter,
-    generate_intro_message, theme_ui_map, ui_to_theme_map,
-    theme_opening_lines, intent_opening_lines, apply_weighted_score_filter,
-    get_highlight_message, get_weather_message, get_intent_intro_message,
-    recommend_packages, handle_selected_place, generate_region_intro,
-    parse_companion_and_age, filter_packages_by_companion_age,
-    make_top2_description_custom, format_summary_tags_custom,
-    make_companion_age_message
-)
-# ──────────────────────────────── Ollama LLM (gemma2:9b) ────────────────────────────────
-OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
-OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "gemma2:9b")
-OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "60"))
-def _call_ollama_chat(messages, model=OLLAMA_MODEL,
-                      temperature=0.8, top_p=0.9, top_k=40, repeat_penalty=1.1,
-                      system_prompt=None):
-    url = f"{OLLAMA_HOST}/api/chat"
-    _msgs = []
-    if system_prompt:
-        _msgs.append({"role": "system", "content": system_prompt})
-    _msgs.extend(messages)
-    payload = {
-        "model": model,
-        "messages": _msgs,
-        "options": {
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repeat_penalty": repeat_penalty,
-        },
-        "stream": False,
-    }
-    try:
-        r = requests.post(url, json=payload, timeout=OLLAMA_TIMEOUT)
-        r.raise_for_status()
-        j = r.json() or {}
-        return (j.get("message") or {}).get("content", "") or ""
-    except Exception:
-        return ""
-STRUCTURED_EXTRACTION_SYSTEM = """\
-You are a travel assistant that extracts structured fields from Korean user queries.
-Return ONLY a valid JSON object:
-{
-  "emotion": "happy|sad|stressed|excited|tired|none",
-  "intent": "beach|hiking|shopping|food|museum|relaxing|none",
-  "country_hint": "",
-  "city_hint": "",
-  "themes_hint": ["<0..3 words>"],
-  "notes": "<very short reasoning in Korean>"
-}
-If unknown, use "none" or "" and NEVER add extra text outside JSON.
-"""
-def _build_structured_user_prompt(user_text: str) -> str:
-    return (
-        "다음 한국어 문장에서 감정/의도/지역/테마 힌트를 추출해 주세요. "
-        "오직 유효한 JSON만 반환하세요.\n\n"
-        f"문장: {user_text}\n"
-    )
-def _llm_structured_extract(user_text: str):
-    out = _call_ollama_chat([
-        {"role": "system", "content": STRUCTURED_EXTRACTION_SYSTEM},
-        {"role": "user", "content": _build_structured_user_prompt(user_text)}
-    ])
-    try:
-        data = json.loads(out)
-    except Exception:
-        data = {}
-    data.setdefault("emotion", "none")
-    data.setdefault("intent", "none")
-    data.setdefault("country_hint", "")
-    data.setdefault("city_hint", "")
-    data.setdefault("themes_hint", [])
-    data.setdefault("notes", "")
-    return data
-# ──────────────────────────────── 규칙/LLM 신호 병합 ────────────────────────────────
-def _merge_signals(user_input: str,
-                   travel_df: pd.DataFrame,
-                   use_llm: bool = True,
-                   intent_threshold: float = 0.70):
-    country_rb, city_rb, loc_mode = detect_location_filter(user_input)
-    intent_rb, intent_score = detect_intent(user_input)
-    llm = _llm_structured_extract(user_input) if use_llm else {
-        "emotion": "none", "intent": "none",
-        "country_hint": "", "city_hint": "",
-        "themes_hint": [], "notes": ""
-    }
-    country = country_rb or (llm["country_hint"] or "")
-    city    = city_rb    or (llm["city_hint"] or "")
-    city_exists    = bool(city)    and city    in travel_df["여행도시"].values
-    country_exists = bool(country) and country in travel_df["여행나라"].values
-    if intent_score >= intent_threshold:
-        intent = intent_rb
-    else:
-        intent = llm["intent"] if llm["intent"] != "none" else intent_rb
-    if city_exists or country_exists:
-        mode = "region"
-    elif intent and intent_score >= intent_threshold:
-        mode = "intent"
-    elif country or city:
-        mode = "unknown"
-    else:
-        mode = "emotion"
-    return mode, country, city, intent, llm
-def _llm_place_copy(city: str, place: str) -> str:
-    sys = "You are a Korean copywriter for a travel agency."
-    prompt = (
-        f"'{city} - {place}'를 2문장으로 매력적으로 소개해줘. "
-        "첫 문장은 감성 한 줄, 둘째 문장은 활동/포인트 3개를 쉼표로 요약. 존댓말, 과장 금지."
-    )
-    out = _call_ollama_chat([
-        {"role": "system", "content": sys},
-        {"role": "user", "content": prompt}
-    ], temperature=0.6, top_p=0.9)
-    return out.strip()
-# ──────────────────────────────── Streamlit UI + main ────────────────────────────────
-st.set_page_config(page_title="여행은 모두투어 : 모아(MoAi)", layout="centered")
-st.sidebar.subheader("⚙️ 대화 표시")
-st.sidebar.selectbox("테마", ["피스타치오", "스카이블루", "크리미오트"], key="bubble_theme")
-st.sidebar.toggle("타임스탬프 표시", value=False, key="show_time")
-st.sidebar.toggle("타자 효과", value=False, key="typewriter_on")
-# LLM 옵션
-st.sidebar.toggle("🧠 LLM 보강 사용", value=True, key="use_llm")
-st.sidebar.slider("의도 인식 임계값", 0.5, 0.95, 0.70, 0.01, key="intent_threshold")
-from css import render_message, render_chip_buttons, log_and_render, replay_log, _get_colors
-def init_session():
-    if "chat_log" not in st.session_state:
-        st.session_state.chat_log = []
-    if "mode" not in st.session_state:
-        st.session_state.mode = None
-    if "user_input" not in st.session_state:
-        st.session_state.user_input = ""
-def main():
-    init_session()
-    chat_container = st.container()
-    if "chat_log" in st.session_state and st.session_state.chat_log:
-        replay_log(chat_container)
-    if not st.session_state.get("greeting_rendered", False):
-        greeting_message = (
-            "안녕하세요. <strong>모아(MoAi)</strong>입니다.🤖<br><br>"
-            "요즘 어떤 여행이 떠오르세요?<br>""모아가 딱 맞는 여행지를 찾아드릴게요."
-        )
-        log_and_render(greeting_message, sender="bot", chat_container=chat_container, key="greeting")
-        st.session_state["greeting_rendered"] = True
-    user_input = st.text_input("입력창",
-                               placeholder="ex) '요즘 힐링이 필요해요', '가족 여행 어디가 좋을까요?'",
-                               key="user_input", label_visibility="collapsed")
-    if user_input:
-        mode, country_filter, city_filter, intent, llm_dbg = _merge_signals(
-            user_input=user_input,
-            travel_df=travel_df,
-            use_llm=st.session_state.get("use_llm", True),
-            intent_threshold=st.session_state.get("intent_threshold", 0.70)
-        )
-        if st.session_state.get("use_llm") and llm_dbg.get("notes"):
-            log_and_render(f"🧩 LLM 해석: {llm_dbg['notes']}",
-                           sender="bot", chat_container=chat_container,
-                           key=f"llm_notes_{random.randint(1,999999)}")
-        if mode == "region":
-            region_ui(travel_df, external_score_df, festival_df, weather_df, package_df,
-                      country_filter, city_filter, chat_container, log_and_render)
-            return
-        elif mode == "intent":
-            intent_ui(travel_df, external_score_df, festival_df, weather_df, package_df,
-                      country_filter, city_filter, chat_container, intent, log_and_render)
-            return
-        elif mode == "unknown":
-            unknown_ui(country_filter, city_filter, chat_container, log_and_render)
-            return
-        else:
-            top_emotions, emotion_groups = analyze_emotion(user_input)
-            candidate_themes = extract_themes(emotion_groups, intent, force_mode=False)
-            emotion_ui(travel_df, external_score_df, festival_df, weather_df, package_df,
-                       country_filter, city_filter, chat_container,
-                       candidate_themes, intent, emotion_groups, top_emotions, log_and_render)
-            return
-if __name__ == "__main__":
-    main()

 # -*- coding: utf-8 -*-
+# ──────────────────────────────── BOOTSTRAP (must be first) ────────────────────────────────
+import os, pathlib, io, json, random
+HOME = pathlib.Path.home()                               # ✅ 실행 사용자 홈 디렉터리 (쓰기 가능)
 APP_DIR = pathlib.Path(__file__).parent.resolve()
+# Streamlit 홈/설정
 STREAMLIT_DIR = HOME / ".streamlit"
 STREAMLIT_DIR.mkdir(parents=True, exist_ok=True)
 os.environ["STREAMLIT_HOME"] = str(STREAMLIT_DIR)
 os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
 os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
+# ✅ HF/Transformers 캐시: 홈 밑의 .cache 사용 (필요 시 HF_CACHE_ROOT로 오버라이드 가능)
+CACHE_ROOT = pathlib.Path(os.environ.get("HF_CACHE_ROOT", HOME / ".cache" / f"u{os.getuid()}"))
+HF_HOME              = CACHE_ROOT / "hf-home"
+TRANSFORMERS_CACHE   = CACHE_ROOT / "hf-cache"
+HUB_CACHE            = CACHE_ROOT / "hf-cache"
+TORCH_HOME           = CACHE_ROOT / "torch-cache"
+XDG_CACHE_HOME       = CACHE_ROOT / "xdg-cache"
+# 폴더 생성 (권한 오류가 나면 /tmp로 자동 폴백)
+try:
+    for p in [HF_HOME, TRANSFORMERS_CACHE, HUB_CACHE, TORCH_HOME, XDG_CACHE_HOME]:
+        p.mkdir(parents=True, exist_ok=True)
+except PermissionError:
+    TMP_ROOT = pathlib.Path("/tmp") / f"hf-cache-u{os.getuid()}"
+    HF_HOME              = TMP_ROOT / "hf-home"
+    TRANSFORMERS_CACHE   = TMP_ROOT / "hf-cache"
+    HUB_CACHE            = TMP_ROOT / "hf-cache"
+    TORCH_HOME           = TMP_ROOT / "torch-cache"
+    XDG_CACHE_HOME       = TMP_ROOT / "xdg-cache"
+    for p in [HF_HOME, TRANSFORMERS_CACHE, HUB_CACHE, TORCH_HOME, XDG_CACHE_HOME]:
+        p.mkdir(parents=True, exist_ok=True)
+os.environ["HF_HOME"]               = str(HF_HOME)
+os.environ["TRANSFORMERS_CACHE"]    = str(TRANSFORMERS_CACHE)
+os.environ["HUGGINGFACE_HUB_CACHE"] = str(HUB_CACHE)
+os.environ["TORCH_HOME"]            = str(TORCH_HOME)
+os.environ["XDG_CACHE_HOME"]        = str(XDG_CACHE_HOME)
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 from huggingface_hub import hf_hub_download
+import pandas as pd
+import streamlit as st
+from streamlit.components.v1 import html
+from css import render_message, render_chip_buttons, log_and_render, replay_log, _get_colors
+# ──────────────────────────────── Dataset Repo 설정 ────────────────────────────────
+HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "emisdfde/moai-travel-data")
 HF_DATASET_REV  = os.getenv("HF_DATASET_REV", "main")
 def _is_pointer_bytes(b: bytes) -> bool:
     return (
         "version https://git-lfs.github.com/spec/v1" in head
         or "git-lfs" in head
+        or "xet" in head          # e.g. xet 포인터
         or "pointer size" in head
     )
 def _read_csv_bytes(b: bytes) -> pd.DataFrame:
     try:
         return pd.read_csv(io.BytesIO(b), encoding="utf-8")
     except UnicodeDecodeError:
 def load_csv_smart(local_path: str,
                    hub_filename: str | None = None,
+                   repo_id: str = HF_DATASET_REPO,
                    repo_type: str = "dataset",
                    revision: str = HF_DATASET_REV) -> pd.DataFrame:
     if hub_filename is None:
         hub_filename = os.path.basename(local_path)
     if os.path.exists(local_path):
         with open(local_path, "rb") as f:
             data = f.read()
         if not _is_pointer_bytes(data):
             return _read_csv_bytes(data)
+    cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
+                             repo_type=repo_type, revision=revision)
+    try:
+        return pd.read_csv(cached, encoding="utf-8")
+    except UnicodeDecodeError:
+        return pd.read_csv(cached, encoding="cp949")
 def load_json_smart(local_path: str,
                     hub_filename: str | None = None,
+                    repo_id: str = HF_DATASET_REPO,
                     repo_type: str = "dataset",
                     revision: str = HF_DATASET_REV):
     if hub_filename is None:
         hub_filename = os.path.basename(local_path)
     if os.path.exists(local_path):
         with open(local_path, "rb") as f:
             data = f.read()
         if not _is_pointer_bytes(data):
+            return json.loads(data.decode("utf-8"))
+    cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
+                             repo_type=repo_type, revision=revision)
+    with open(cached, "r", encoding="utf-8") as f:
+        return json.load(f)
+# ──────────────────────────────── 데이터 로드 ────────────────────────────────
 travel_df         = load_csv_smart("trip_emotions.csv",      "trip_emotions.csv")
 external_score_df = load_csv_smart("external_scores.csv",    "external_scores.csv")
 festival_df       = load_csv_smart("festivals.csv",          "festivals.csv")
 package_df        = load_csv_smart("packages.csv",           "packages.csv")
 master_df         = load_csv_smart("countries_cities.csv",   "countries_cities.csv")
 theme_title_phrases = load_json_smart("theme_title_phrases.json", "theme_title_phrases.json")