wenbemi commited on
Commit
a6c552a
·
verified Β·
1 Parent(s): 3308e49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -95
app.py CHANGED
@@ -1,45 +1,54 @@
1
- # streamlit 라이브러리가 import 되기 전에,
2
- # 설정 파일 경로를 앱 내부의 쓰기 가능한 경로로 강제 지정합니다.
3
- import os, pathlib, io
4
  APP_DIR = pathlib.Path(__file__).parent.resolve()
5
 
6
- # 모든 HF/Transformers 캐시를 /tmp 쪽으로 강제
7
- os.environ.setdefault("HF_HOME", "/tmp/hf-home")
8
- os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf-cache")
9
- os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/tmp/hf-cache")
10
- os.environ.setdefault("TORCH_HOME", "/tmp/torch-cache")
11
- os.environ.setdefault("XDG_CACHE_HOME", "/tmp/xdg-cache")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 
13
 
14
- # 디렉터리 보장
15
- for p in ["/tmp/hf-home", "/tmp/hf-cache", "/tmp/torch-cache", "/tmp/xdg-cache"]:
16
- os.makedirs(p, exist_ok=True)
17
-
18
  from huggingface_hub import hf_hub_download
19
  import pandas as pd
20
- import json
21
- import random
22
-
23
- APP_DIR = pathlib.Path(__file__).parent.resolve()
24
- os.environ.setdefault("HOME", str(APP_DIR)) # '~'가 /가 아니라 /app으로 가도록
25
- CONFIG_DIR = APP_DIR / ".streamlit"
26
- CONFIG_DIR.mkdir(parents=True, exist_ok=True)
27
 
28
- os.environ["STREAMLIT_HOME"] = str(CONFIG_DIR)
29
- os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
30
- os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false" # 선택: metrics 파일 생성 줄이기
31
 
32
- HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "emisdfde/moai-travel-data") # ← 본인 리포
 
33
  HF_DATASET_REV = os.getenv("HF_DATASET_REV", "main")
34
 
35
  def _is_pointer_bytes(b: bytes) -> bool:
36
  head = b[:2048].decode(errors="ignore").lower()
37
- # git-lfs / xet 포인터 텍스트 패턴 모두 감지
38
  return (
39
- "version https://git-lfs.github.com/spec/v1" in head or
40
- "git-lfs" in head or
41
- "xet" in head or # e.g. "Xet backed hash"
42
- "pointer size" in head
43
  )
44
 
45
  def _read_csv_bytes(b: bytes) -> pd.DataFrame:
@@ -52,88 +61,59 @@ def load_csv_smart(local_path: str,
52
  hub_filename: str | None = None,
53
  repo_id: str = HF_DATASET_REPO,
54
  repo_type: str = "dataset",
55
- revision: str = HF_DATASET_REV):
 
56
  if hub_filename is None:
57
  hub_filename = os.path.basename(local_path)
 
58
  if os.path.exists(local_path):
59
  with open(local_path, "rb") as f:
60
  data = f.read()
61
  if not _is_pointer_bytes(data):
62
- try:
63
- return pd.read_csv(io.BytesIO(data), encoding="utf-8")
64
- except UnicodeDecodeError:
65
- return pd.read_csv(io.BytesIO(data), encoding="cp949")
66
  cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
67
  repo_type=repo_type, revision=revision)
68
  try:
69
  return pd.read_csv(cached, encoding="utf-8")
70
  except UnicodeDecodeError:
71
  return pd.read_csv(cached, encoding="cp949")
72
-
73
-
74
 
75
- import streamlit as st
76
- from streamlit.components.v1 import html
77
- from css import render_message, render_chip_buttons, log_and_render, replay_log
78
-
79
- import streamlit as st, pandas as pd, requests, json
80
-
81
- st.success("🎉 앱이 성공적으로 시작되었습니다! 라이브러리 설치 성공!")
82
-
83
- @st.cache_data(show_spinner=False)
84
- def load_csv_any(p):
85
- return pd.read_csv(p) if str(p).startswith(("http://","https://")) else pd.read_csv(p)
86
-
87
- # 데이터 로딩을 위한 함수
88
- @st.cache_data
89
- def load_travel_data(file_path):
90
- print(f"Caching {file_path}...") # 캐시가 언제 실행되는지 확인용
91
- return pd.read_csv(file_path)
92
-
93
- @st.cache_data
94
- def load_json_data(file_path):
95
- print(f"Caching {file_path}...")
96
- with open(file_path, "r", encoding="utf-8") as f:
97
  return json.load(f)
98
 
99
- @st.cache_data
100
- def load_data(path):
101
- try:
102
- # UTF-8 인코딩으로 먼저 시도
103
- df = pd.read_csv(path, encoding='utf-8')
104
- print(f"✅ SUCCESS (utf-8): {path} 로드 성공. 컬럼: {df.columns.tolist()}")
105
- return df
106
- except UnicodeDecodeError:
107
- # 실패하면 'cp949' (한국어 윈도우 환경) 인코딩으로 재시도
108
- print(f"⚠️ INFO: {path} utf-8 디코딩 실패. cp949로 재시도합니다.")
109
- df = pd.read_csv(path, encoding='cp949')
110
- print(f"✅ SUCCESS (cp949): {path} 로드 성공. 컬럼: {df.columns.tolist()}")
111
- return df
112
- except Exception as e:
113
- print(f"❌ ERROR: {path} 로드 중 에러 발생: {e}")
114
- return pd.DataFrame() # 오류 발생 시 빈 데이터프레임 반환
115
-
116
-
117
- # ───────────────────────────────────── 데이터 로드
118
- # trip_url = st.secrets.get("TRIPDATA_URL")
119
- # if not trip_url:
120
- # st.error("TRIPDATA_URL 미설정: Streamlit Secrets에 URL을 넣어주세요.")
121
- # st.stop()
122
-
123
- travel_df = load_csv_smart("trip_emotions.csv")
124
- external_score_df = load_csv_smart("external_scores.csv")
125
- festival_df = load_csv_smart("festivals.csv")
126
- weather_df = load_csv_smart("weather.csv")
127
- package_df = load_csv_smart("packages.csv")
128
- master_df = load_csv_smart("countries_cities.csv")
129
- theme_title_phrases = load_json_data("theme_title_phrases.json")
130
-
131
- # travel_df가 성공적으로 로드되었는지 최종 확인
132
- if '여행나라' not in travel_df.columns:
133
- st.error(f"데이터 로딩 후에도 'travel_df'에 '여행나라' 컬럼이 없습니다. 실제 컬럼: {travel_df.columns.tolist()}")
134
- st.stop()
135
-
136
  from chat_a import (
 
137
  analyze_emotion,
138
  detect_intent,
139
  extract_themes,
@@ -157,6 +137,17 @@ from chat_a import (
157
  format_summary_tags_custom,
158
  make_companion_age_message
159
  )
 
 
 
 
 
 
 
 
 
 
 
160
  # ───────────────────────────────────── streamlit용 함수
161
  def init_session():
162
  if "chat_log" not in st.session_state:
 
1
+ # -*- coding: utf-8 -*-
2
+ # ──────────────────────────────── BOOTSTRAP (must be first) ────────────────────────────────
3
+ import os, pathlib, io, json, random
4
  APP_DIR = pathlib.Path(__file__).parent.resolve()
5
 
6
+ # Streamlit 홈/설정
7
+ os.environ["HOME"] = str(APP_DIR)
8
+ CONFIG_DIR = APP_DIR / ".streamlit"
9
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
10
+ os.environ["STREAMLIT_HOME"] = str(CONFIG_DIR)
11
+ os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
12
+ os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
13
+
14
+ # HF/Transformers 캐시를 **/data**로 고정 (Spaces에서 쓰기 가능)
15
+ CACHE_ROOT = os.environ.get("HF_CACHE_ROOT", "/data")
16
+ ENV_DIRS = {
17
+ "HF_HOME": f"{CACHE_ROOT}/hf-home",
18
+ "TRANSFORMERS_CACHE": f"{CACHE_ROOT}/hf-cache",
19
+ "HUGGINGFACE_HUB_CACHE": f"{CACHE_ROOT}/hf-cache",
20
+ "TORCH_HOME": f"{CACHE_ROOT}/torch-cache",
21
+ "XDG_CACHE_HOME": f"{CACHE_ROOT}/xdg-cache",
22
+ }
23
+ for k, v in ENV_DIRS.items():
24
+ os.environ[k] = v
25
+ os.makedirs(v, exist_ok=True)
26
+ try:
27
+ os.chmod(v, 0o777)
28
+ except Exception:
29
+ pass
30
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
31
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
32
 
 
 
 
 
33
  from huggingface_hub import hf_hub_download
34
  import pandas as pd
35
+ import streamlit as st
36
+ from streamlit.components.v1 import html
37
+ from css import render_message, render_chip_buttons, log_and_render, replay_log
 
 
 
 
38
 
39
+ st.success("🎉 앱이 성공적으로 시작되었습니다! 라이브러리 설치 성공!")
 
 
40
 
41
+ # ──────────────────────────────── Dataset Repo 설정 ────────────────────────────────
42
+ HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "emisdfde/moai-travel-data")
43
  HF_DATASET_REV = os.getenv("HF_DATASET_REV", "main")
44
 
45
  def _is_pointer_bytes(b: bytes) -> bool:
46
  head = b[:2048].decode(errors="ignore").lower()
 
47
  return (
48
+ "version https://git-lfs.github.com/spec/v1" in head
49
+ or "git-lfs" in head
50
+ or "xet" in head # e.g. xet 포인터
51
+ or "pointer size" in head
52
  )
53
 
54
  def _read_csv_bytes(b: bytes) -> pd.DataFrame:
 
61
  hub_filename: str | None = None,
62
  repo_id: str = HF_DATASET_REPO,
63
  repo_type: str = "dataset",
64
+ revision: str = HF_DATASET_REV) -> pd.DataFrame:
65
+ # hub_filename 생략 시 로컬 파일명 사용
66
  if hub_filename is None:
67
  hub_filename = os.path.basename(local_path)
68
+ # 1) 로컬 우선
69
  if os.path.exists(local_path):
70
  with open(local_path, "rb") as f:
71
  data = f.read()
72
  if not _is_pointer_bytes(data):
73
+ return _read_csv_bytes(data)
74
+ # 2) 허브 다운로드
 
 
75
  cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
76
  repo_type=repo_type, revision=revision)
77
  try:
78
  return pd.read_csv(cached, encoding="utf-8")
79
  except UnicodeDecodeError:
80
  return pd.read_csv(cached, encoding="cp949")
 
 
81
 
82
+ def load_json_smart(local_path: str,
83
+ hub_filename: str | None = None,
84
+ repo_id: str = HF_DATASET_REPO,
85
+ repo_type: str = "dataset",
86
+ revision: str = HF_DATASET_REV):
87
+ if hub_filename is None:
88
+ hub_filename = os.path.basename(local_path)
89
+ if os.path.exists(local_path):
90
+ with open(local_path, "rb") as f:
91
+ data = f.read()
92
+ if not _is_pointer_bytes(data):
93
+ return json.loads(data.decode("utf-8"))
94
+ cached = hf_hub_download(repo_id=repo_id, filename=hub_filename,
95
+ repo_type=repo_type, revision=revision)
96
+ with open(cached, "r", encoding="utf-8") as f:
 
 
 
 
 
 
 
97
  return json.load(f)
98
 
99
+ # ──────────────────────────────── 데이터 로드 ────────────────────────────────
100
+ travel_df = load_csv_smart("trip_emotions.csv", "trip_emotions.csv")
101
+ external_score_df = load_csv_smart("external_scores.csv", "external_scores.csv")
102
+ festival_df = load_csv_smart("festivals.csv", "festivals.csv")
103
+ weather_df = load_csv_smart("weather.csv", "weather.csv")
104
+ package_df = load_csv_smart("packages.csv", "packages.csv")
105
+ master_df = load_csv_smart("countries_cities.csv", "countries_cities.csv")
106
+ theme_title_phrases = load_json_smart("theme_title_phrases.json", "theme_title_phrases.json")
107
+
108
+ # 필수 컬럼 가드
109
+ for col in ("여행나라", "여행도시", "여행지"):
110
+ if col not in travel_df.columns:
111
+ st.error(f"'travel_df'에 '{col}' 컬럼이 없습니다. 실제 컬럼: {travel_df.columns.tolist()}")
112
+ st.stop()
113
+
114
+ # ──────────────────────────────── chat_a import &amp; 초기화 ────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  from chat_a import (
116
+ init_datasets, # ⬅️ 새로 추가된 지연 초기화 함수
117
  analyze_emotion,
118
  detect_intent,
119
  extract_themes,
 
137
  format_summary_tags_custom,
138
  make_companion_age_message
139
  )
140
+
141
+ # 지연 초기화: import 시점에는 데이터 접근 금지, 여기서 한 번만 주입
142
+ init_datasets(
143
+ travel_df=travel_df,
144
+ festival_df=festival_df,
145
+ external_score_df=external_score_df,
146
+ weather_df=weather_df,
147
+ package_df=package_df,
148
+ master_df=master_df,
149
+ theme_title_phrases=theme_title_phrases,
150
+ )
151
  # ───────────────────────────────────── streamlit용 함수
152
  def init_session():
153
  if "chat_log" not in st.session_state: