fix

app.py CHANGED
@@ -14,16 +14,14 @@ import matplotlib.pyplot as plt
 import matplotlib.font_manager as fm
 from sklearn.manifold import TSNE
 import warnings
-import gensim   # gensim import, used for FastText
-import hashlib  # added for cache-key generation
-
 warnings.filterwarnings('ignore')

-# ---
 # Page settings
 st.set_page_config(
-    page_title="Korean Word Semantic Network Visualization",
-    page_icon="π€",
     layout="wide"
 )
@@ -32,122 +30,108 @@ DATA_FOLDER = 'data'
 UPLOAD_FOLDER = 'uploads'

 # Create folders
-if not os.path.exists(DATA_FOLDER):
-    os.makedirs(DATA_FOLDER)
 if not os.path.exists(UPLOAD_FOLDER):
     os.makedirs(UPLOAD_FOLDER)

-
-# Specify the full path to the downloaded Korean FastText model file (.bin).
-# Example: "C:/Users/YourUser/Downloads/cc.ko.300.bin" or "/home/user/models/cc.ko.300.bin"
-# Model download: see https://fasttext.cc/docs/en/crawl-vectors.html
-FASTTEXT_MODEL_PATH = "YOUR_PATH_TO/cc.ko.300.bin"  # <--- enter the actual file path here!!!
-
-# --- Session-state initialization ---
-if 'fasttext_model' not in st.session_state:
-    st.session_state.fasttext_model = None  # stores the model object
 if 'embeddings_cache' not in st.session_state:
-    st.session_state.embeddings_cache = {}
 if 'graph_cache' not in st.session_state:
     st.session_state.graph_cache = {}
 if 'data_files' not in st.session_state:
     st.session_state.data_files = {}
 if 'selected_files' not in st.session_state:
-    st.session_state.selected_files = []
 if 'threshold' not in st.session_state:
-    st.session_state.threshold = 0.
-if 'perplexity' not in st.session_state:
-    st.session_state.perplexity = 30
-if 'learning_rate' not in st.session_state:
-    st.session_state.learning_rate = 'auto'
-if 'n_iter' not in st.session_state:
-    st.session_state.n_iter = 1000
 if 'generate_clicked' not in st.session_state:
     st.session_state.generate_clicked = False
 if 'fig' not in st.session_state:
     st.session_state.fig = None

-
-# --- FastText model loading function (cached) ---
-@st.cache_resource  # the model object is large, so use resource caching
-def load_fasttext_model(model_path):
-    """Loads the FastText model from the given path."""
-    if not os.path.exists(model_path):
-        st.error(f"Error: FastText model file not found: {model_path}")
-        st.error("Download a Korean model (cc.ko.300.bin recommended) from the FastText website and set the `FASTTEXT_MODEL_PATH` variable at the top of the code.")
-        return None
-    try:
-        st.info(f"Loading FastText model... ({os.path.basename(model_path)}) This can take a while depending on the model size.")
-        # use load_facebook_model to load the .bin file
-        model = gensim.models.fasttext.load_facebook_model(model_path)
-        st.info("FastText model loading complete.")
-        return model
-    except Exception as e:
-        st.error(f"Error while loading the FastText model: {e}")
-        return None

 # --- Korean font setup function ---
 def set_korean_font():
-    """
     system_name = platform.system()
-    plotly_font_name =
-
-    if font_path:
-        fm.fontManager.addfont(font_path)
-        prop = fm.FontProperties(fname=font_path)
-        font_name = prop.get_name()
-        plotly_font_name = font_name  # Plotly uses the font name
-    else:  # search the system font manager
         available_fonts = [f.name for f in fm.fontManager.ttflist]
         nanum_fonts = [name for name in available_fonts if 'Nanum' in name]
         if nanum_fonts:
             font_name = nanum_fonts[0]
-
         else:
-
     plt.rc('font', family=font_name)
     plt.rc('axes', unicode_minus=False)
     print(f"Matplotlib font set to: {font_name}")
-
-        print("
         plt.rcdefaults()
         plt.rc('axes', unicode_minus=False)
-
-        print(f"Error setting Korean font: {e}")
         plt.rcdefaults()
         plt.rc('axes', unicode_minus=False)

     print(f"Plotly font name to use: {plotly_font_name}")
-

 # --- Data loading function ---
 def load_words_from_json(filepath):
@@ -155,9 +139,11 @@ def load_words_from_json(filepath):
     try:
         with open(filepath, 'r', encoding='utf-8') as f:
             data = json.load(f)
         if isinstance(data, list):
-            words = [item.get('word', '') for item in data if isinstance(item, dict) and item.get('word')]
-
             if not words:
                 st.warning(f"Warning: no valid entries with a 'word' key were found in file '{os.path.basename(filepath)}'.")
                 return None
@@ -175,327 +161,428 @@ def load_words_from_json(filepath):
         st.error(f"Error while loading data from '{os.path.basename(filepath)}': {e}")
         return None

 def scan_data_files():
-    """Scans the data
     data_files = {}
-    # default data folder
     try:
         for file_path in glob.glob(os.path.join(DATA_FOLDER, '*.json')):
-            file_id = f"default_{os.path.basename(file_path)}"
             file_name = os.path.basename(file_path)
             words = load_words_from_json(file_path)
-            if words:
-                data_files[file_id] = {
-
     except Exception as e:
         st.error(f"Error while scanning the default data folder: {e}")
-
     try:
         for file_path in glob.glob(os.path.join(UPLOAD_FOLDER, '*.json')):
-            file_id = f"uploaded_{os.path.basename(file_path)}"
             file_name = os.path.basename(file_path)
             words = load_words_from_json(file_path)
-            if words:
-                data_files[file_id] = {
-
     except Exception as e:
         st.error(f"Error while scanning the upload folder: {e}")
     return data_files

-
     """Loads words from the selected files, deduplicates them, and merges them."""
-    all_words =
     if not file_ids:
         return []

     for file_id in file_ids:
         if file_id in current_data_files:
             file_path = current_data_files[file_id]['path']
             words = load_words_from_json(file_path)
             if words:
-                all_words.
         else:
-            st.warning(f"Selected file ID '{file_id}' could not be found. Refreshing the list.")
-            # file
-

-# --- Word embedding function (uses FastText) ---
-def encode_words_fasttext(words, normalize=True):
-    """Converts a word list into semantic embeddings using the FastText model."""
-    model = st.session_state.get('fasttext_model')
-
     if not words:
         return np.array([])

     embeddings = []
-
-            try:
-                vector = model.wv[word]
-                if np.all(vector == 0):
-                    oov_count += 1
-                if normalize:
-                    norm = np.linalg.norm(vector)
-                    vector = vector / norm if norm > 0 else np.zeros(vector_size)
-                embeddings.append(vector)
-            except Exception as e:
-                st.warning(f"Error while processing word '{word}' (possibly OOV): {e}. Substituting a zero vector.")
-                embeddings.append(np.zeros(vector_size))
-                oov_count += 1
-
-    if oov_count > 0:
-        st.warning(f"Could not obtain valid vectors for {oov_count} of {len(words)} words (OOV, etc.).")
-
-    result_embeddings = np.array(embeddings)
-
-    if result_embeddings.size == 0 and len(words) > 0:
-        st.error("The embedding result is empty.")
-        return None
-    elif result_embeddings.shape[0] != len(words):
-        st.error(f"Input word count ({len(words)}) does not match the number of generated embeddings ({result_embeddings.shape[0]}).")
-        return None
-
-    return result_embeddings
-
-# --- Graph generation function ---
-def generate_graph(file_ids, similarity_threshold, perplexity, learning_rate, n_iter):
-    """Generates a 3D graph based on semantic similarity."""
-    # build the graph cache key (includes the file IDs, threshold, and t-SNE parameters)
-    param_str = f"t{similarity_threshold}_p{perplexity}_lr{learning_rate}_i{n_iter}"
-    sorted_fids = "-".join(sorted(file_ids))
-    # hashing the word list itself into the cache key (more accurate but can be slower)
-    # word_list_for_key = merge_word_lists(file_ids, st.session_state.data_files)
-    # word_hash = hashlib.sha256(str(word_list_for_key).encode()).hexdigest()[:8]
-    # cache_key = f"{sorted_fids}_{word_hash}_{param_str}_fasttext"
-    cache_key = f"{sorted_fids}_{param_str}_fasttext"  # cache keyed by file IDs
-
     if not file_ids:
         st.error("No files were selected to generate a graph from.")
         return None
-    if st.session_state.get('fasttext_model') is None:
-        st.error("The FastText model is not loaded, so graph generation cannot proceed.")
-        return None
-
     if not word_list:
         st.error("Could not load valid words from the selected files.")
         return None
     if len(word_list) < 2:
         st.warning("At least 2 unique words are required to generate a graph.")
         return None

-
     if embeddings is None or embeddings.shape[0] == 0 or embeddings.shape[1] == 0:
-        st.error("
         return None

-    #
     embeddings_3d = None
-
-    if n_samples
-
     else:
-
         return None

-    #
     edges = []
     edge_weights = []
-    with st.spinner('Computing word-to-word
-
-    # --- Build the NetworkX graph ---
     G = nx.Graph()
-
     for i, word in enumerate(word_list):
-
-            G.add_node(word, pos=(embeddings_3d[i, 0], embeddings_3d[i, 1], embeddings_3d[i, 2]))
-            valid_nodes_count += 1
-        else:
-            st.warning(f"Word '{word}' is missing coordinates.")  # missing-coordinate warning

-
-        st.warning(f"Failed to create nodes for {len(word_list)-valid_nodes_count} words.")
-
-    valid_edges_count = 0
     for edge, weight in zip(edges, edge_weights):
-
         G.add_edge(edge[0], edge[1], weight=weight)
-        valid_edges_count += 1

-    #
     edge_x, edge_y, edge_z = [], [], []
     if G.number_of_edges() > 0:
         for edge in G.edges():
             try:
                 pos0 = G.nodes[edge[0]]['pos']
                 pos1 = G.nodes[edge[1]]['pos']
-                edge_x.extend([pos0[0], pos1[0], None])
                 edge_y.extend([pos0[1], pos1[1], None])
                 edge_z.extend([pos0[2], pos1[2], None])
             except KeyError as e:
-                st.warning(f"Edge
-                continue
-
-        raw_sizes = np.log1p(degrees) * 3 + 6
-        node_sizes_list = np.clip(raw_sizes, 5, 20).tolist()
-
     )
-
-    file_names_used = [
     file_info_str = ", ".join(file_names_used) if file_names_used else "unknown"

     layout = go.Layout(
         title=dict(
-            text=f'<b>Lexical Semantic Similarity 3D
             font=dict(size=16, family=plotly_font),
-            x=0.5,
         ),
-        showlegend=False,
-        margin=dict(l=10, r=10, b=10, t=80),
         scene=dict(
-            xaxis=dict(
-
         ),
         hovermode='closest'
     )

     fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

-    #
     st.session_state.graph_cache[cache_key] = fig

     return fig

-
 def handle_uploaded_file(uploaded_file):
-    """
     if uploaded_file is not None:
-
         file_path = os.path.join(UPLOAD_FOLDER, file_name)

         try:
             with open(file_path, 'wb') as f:
                 f.write(uploaded_file.getbuffer())
-            st.info(f"File '{uploaded_file.name}' saved. Validating its contents...")

             words = load_words_from_json(file_path)
-
             else:
-                st.
-
-            new_file_id = f"uploaded_{file_name}"
-            return new_file_id
         except Exception as e:
-            st.error(f"Error while processing the uploaded file
-

 def delete_file(file_id):
-    """
-    if file_id not in current_data_files:
         st.error('Could not find the file to delete.')
         return False

-    file_info =
     if file_info.get('type') != 'uploaded':
         st.error('Default data files cannot be deleted.')
         return False
@@ -503,218 +590,235 @@ def delete_file(file_id):
     file_path = file_info.get('path')
     file_name = file_info.get('name', 'unknown')

     try:
-
         os.remove(file_path)
-        st.info(f"File '{file_name}' deleted.")
         else:
-

-    # session
     del st.session_state.data_files[file_id]
-    if file_id in st.session_state.selected_files:
-        st.session_state.selected_files.remove(file_id)

-    # related
-
-    for key in
         del st.session_state.graph_cache[key]
-    if keys_to_remove: st.info(f"Deleted {len(keys_to_remove)} related graph-cache entries.")

-    st.
     return True

     except Exception as e:
         st.error(f"Error while deleting the file: {e}")
         return False

-
 def clear_cache():
-    """
     st.session_state.graph_cache = {}
-
-    st.session_state.fig = None
-    st.success('The graph cache has been cleared.')
-    st.
-

-# ==============================================================================
-# --- Streamlit app execution ---
-# ==============================================================================

-# --- App start
-# attempt to load the FastText model
-if 'fasttext_model' not in st.session_state or st.session_state.fasttext_model is None:
-    st.session_state.fasttext_model = load_fasttext_model(FASTTEXT_MODEL_PATH)

-# scan the data files
 if 'data_files' not in st.session_state or not st.session_state.data_files:
     st.session_state.data_files = scan_data_files()

 # Title and introduction
-st.title('Korean Word Semantic Network Visualization
 st.markdown("""
-This tool reads JSON
-
 """)

-#
-if st.session_state.get('fasttext_model') is None:
-    st.error("FastText model loading failed. Check the `FASTTEXT_MODEL_PATH` setting at the top of the code and restart the app.")
-    st.stop()  # stop the app when there is no model
-
-
-# --- Sidebar ---
 st.sidebar.title('⚙️ Settings & Controls')

-#
 threshold = st.sidebar.slider(
-    'Similarity Threshold',
-
 )
 if threshold != st.session_state.threshold:
     st.session_state.threshold = threshold
-    st.session_state.fig = None  #
-    st.session_state.generate_clicked = False
-
-st.sidebar.divider()
-
-# 2. t-SNE parameters (fine-tuning the visualization)
-st.sidebar.header("t-SNE parameters (advanced)")
-perplexity = st.sidebar.slider(
-    "Perplexity", 5, 50, st.session_state.perplexity, 1,
-    help="Related to the number of neighbors each point considers. Affects cluster shape."
-)
-learning_rate = st.sidebar.select_slider(
-    "Learning Rate", options=[10, 50, 100, 200, 500, 1000, 'auto'], value=st.session_state.learning_rate,
-    help="Optimization learning rate. Affects the distance between clusters."
-)
-n_iter = st.sidebar.select_slider(
-    "Iterations", options=[250, 500, 1000, 2000, 5000], value=st.session_state.n_iter,
-    help="Number of optimization iterations. Higher is more stable but slower."
-)
-# update the state and reset the graph when t-SNE parameters change
-if (perplexity != st.session_state.perplexity or
-    learning_rate != st.session_state.learning_rate or
-    n_iter != st.session_state.n_iter):
-    st.session_state.perplexity = perplexity
-    st.session_state.learning_rate = learning_rate
-    st.session_state.n_iter = n_iter
-    st.session_state.fig = None
-    st.session_state.generate_clicked = False

 st.sidebar.divider()

-#
 st.sidebar.header('File Upload')
 uploaded_file = st.sidebar.file_uploader(
-    "Upload a JSON file
 )
 if uploaded_file is not None:
     with st.spinner("Processing the uploaded file..."):
         new_file_id = handle_uploaded_file(uploaded_file)
         if new_file_id:
-            st.sidebar.success(f"File '{uploaded_file.name}' uploaded!")
-            # automatically add the newly uploaded file to the selection list
             if new_file_id not in st.session_state.selected_files:
                 st.session_state.selected_files.append(new_file_id)
-

 st.sidebar.divider()

-#
 st.sidebar.header('Data File Selection')
-
-if
-
     selected_files_temp = []
-

     for file_id in sorted_file_ids:
-        if file_id not in
-        file_info =
         file_label = f"{file_info['name']} ({file_info['word_count']} words)"
         file_type_tag = "[default]" if file_info['type'] == 'default' else "[uploaded]"
         label_full = f"{file_label} {file_type_tag}"
         is_selected = file_id in st.session_state.selected_files

-        # checkbox
-
             selected_files_temp.append(file_id)
-
         with st.sidebar.expander("View file info", expanded=False):
-            st.markdown(f"
             if file_info['type'] == 'uploaded':
-

     if sorted(selected_files_temp) != sorted(st.session_state.selected_files):
         st.session_state.selected_files = selected_files_temp
-        st.session_state.fig = None
-        st.session_state.generate_clicked = False
-

 st.sidebar.divider()

-#
 if st.session_state.selected_files:
     if st.sidebar.button('Generate/Update Graph', key='generate_button', type="primary"):
-
-        #
     else:
-        st.sidebar.warning('To generate a graph
 else:
-    st.sidebar.

-st.sidebar.divider()

-#
 if st.sidebar.button('Clear Cache', key='clear_cache_button'):
     clear_cache()

-
 # --- Main content area ---
 st.header("3D Word Network Visualization")

 # Graph display logic
 if st.session_state.selected_files:
-    # conditions under which the graph must be (re)generated
     should_generate_graph = st.session_state.generate_clicked or \
-        (st.session_state.fig is None and st.session_state.selected_files)  #

-    if should_generate_graph
-        with st.spinner('
         try:
-            # call generate_graph
-            fig = generate_graph(
-
-                st.session_state.n_iter
-            )
-            st.session_state.fig = fig  # store fig on success
         except Exception as e:
-            st.error(f"Error during graph generation
-            st.session_state.fig = None  #
-
-        st.session_state.generate_clicked = False  # reset the click flag once the work is done

-    # display the graph if one has been generated
     if st.session_state.get('fig') is not None:
         st.plotly_chart(st.session_state.fig, use_container_width=True)

         # Show info about the current graph
         try:
             num_nodes = len(st.session_state.fig.data[1].x) if len(st.session_state.fig.data) > 1 and hasattr(st.session_state.fig.data[1], 'x') else 0
             num_edges = len(st.session_state.fig.data[0].x) // 3 if len(st.session_state.fig.data) > 0 and hasattr(st.session_state.fig.data[0], 'x') and st.session_state.fig.data[0].x else 0

-            # get the names of the files used (after the data has loaded)
-            current_data_files = st.session_state.get('data_files', {})
-            selected_file_names = [current_data_files[fid]['name'] for fid in st.session_state.selected_files if fid in current_data_files]

             st.info(f"""
             **Current graph info**
@@ -725,35 +829,73 @@ if st.session_state.selected_files:
         except Exception as info_e:
             st.warning(f"Error while displaying the graph info: {info_e}")

     # Usage notes
     with st.expander("💡 How to navigate the graph"):
         st.markdown("""
-        - **Zoom:** scroll the mouse wheel
         - **Rotate:** drag while holding the left mouse button
-        - **Pan:** drag while holding the right mouse button
-        - **Word info:** hover the cursor over a word (marker) to see its name and
-        -
        """)
-
     st.info("In the sidebar, click the 'Generate/Update Graph' button to start the visualization.")

-# when no files are selected
 elif not st.session_state.data_files:
     st.warning("No data files found. Upload a file, or add valid JSON files to the `data` folder.")
-else:
     st.info("Select the data files to analyze in the sidebar.")

-
 # --- Footer info section ---
 st.divider()
 with st.expander("ℹ️ About this visualization tool"):
-    st.markdown(
     This tool visualizes Korean word networks through the following steps:

     1. **Data loading:** extracts word lists that carry a 'word' field from user-provided JSON files.
-    2. **Word
-    3.
-    4.
-    5. **Graph
     """)
 import matplotlib.font_manager as fm
 from sklearn.manifold import TSNE
 import warnings
 warnings.filterwarnings('ignore')

+# --- (the code above is unchanged) ---
+
 # Page settings
 st.set_page_config(
+    page_title="Korean Word Semantic Network Visualization",
+    page_icon="π€",
     layout="wide"
 )

 UPLOAD_FOLDER = 'uploads'

 # Create folders
 if not os.path.exists(UPLOAD_FOLDER):
     os.makedirs(UPLOAD_FOLDER)

+# Session-state initialization
+if 'model' not in st.session_state:
+    st.session_state.model = None
 if 'embeddings_cache' not in st.session_state:
+    st.session_state.embeddings_cache = {}
 if 'graph_cache' not in st.session_state:
     st.session_state.graph_cache = {}
 if 'data_files' not in st.session_state:
     st.session_state.data_files = {}
 if 'selected_files' not in st.session_state:
+    st.session_state.selected_files = []  # initialize as a list
 if 'threshold' not in st.session_state:
+    st.session_state.threshold = 0.7
 if 'generate_clicked' not in st.session_state:
     st.session_state.generate_clicked = False
 if 'fig' not in st.session_state:
     st.session_state.fig = None

+# --- (the function definitions are the same: set_korean_font, load_words_from_json, ...) ---

 # --- Korean font setup function ---
 def set_korean_font():
+    """
+    Tries to configure a Korean font suited to the current OS for matplotlib and Plotly,
+    and returns the font name for Plotly to use.
+    """
     system_name = platform.system()
+    plotly_font_name = None  # font name for Plotly
+
+    # Matplotlib font setup
+    if system_name == "Windows":
+        font_name = "Malgun Gothic"
+        plotly_font_name = "Malgun Gothic"
+    elif system_name == "Darwin":  # macOS
+        font_name = "AppleGothic"
+        plotly_font_name = "AppleGothic"
+    elif system_name == "Linux":
+        # preferred Korean font path or name on Linux
+        font_path = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
+        plotly_font_name_linux = "NanumGothic"  # Plotly mostly works with the font *name*
+
+        if os.path.exists(font_path):
+            prop = fm.FontProperties(fname=font_path)
+            fm.fontManager.addfont(font_path)  # register the font with the system (may be needed)
+            font_name = prop.get_name()
+            plotly_font_name = plotly_font_name_linux
+        else:
+            # try to find a font containing 'Nanum' on the system
+            try:
                 available_fonts = [f.name for f in fm.fontManager.ttflist]
                 nanum_fonts = [name for name in available_fonts if 'Nanum' in name]
                 if nanum_fonts:
                     font_name = nanum_fonts[0]
+                    # pick a similar name for Plotly (the exact name can differ per system)
+                    plotly_font_name = font_name if 'Nanum' in font_name else plotly_font_name_linux
                 else:
+                    # try fonts from other OSes (rare on Linux)
+                    if "Malgun Gothic" in available_fonts:
+                        font_name = "Malgun Gothic"
+                        plotly_font_name = "Malgun Gothic"
+                    elif "AppleGothic" in available_fonts:
+                        font_name = "AppleGothic"
+                        plotly_font_name = "AppleGothic"
+                    else:
+                        font_name = None
+
+            except Exception as e:
+                print(f"Linux font search error: {e}")
+                font_name = None

+        if not font_name:
+            font_name = None
+            plotly_font_name = None  # let Plotly use its default
+
+    else:  # other OSes
+        font_name = None
+        plotly_font_name = None
+
+    # Apply the Matplotlib font setting
+    if font_name:
+        try:
             plt.rc('font', family=font_name)
             plt.rc('axes', unicode_minus=False)
             print(f"Matplotlib font set to: {font_name}")
+        except Exception as e:
+            print(f"Failed to set Matplotlib font '{font_name}': {e}")
             plt.rcdefaults()
             plt.rc('axes', unicode_minus=False)
+    else:
+        print("No suitable Korean font found for Matplotlib. Using default.")
         plt.rcdefaults()
         plt.rc('axes', unicode_minus=False)

+    if not plotly_font_name:
+        plotly_font_name = 'sans-serif'  # fall back to the Plotly default
     print(f"Plotly font name to use: {plotly_font_name}")
+
+    return plotly_font_name  # return the font name for Plotly to use
+

 # --- Data loading function ---
 def load_words_from_json(filepath):
     try:
         with open(filepath, 'r', encoding='utf-8') as f:
             data = json.load(f)
+        # assume data is a list
         if isinstance(data, list):
+            words = [item.get('word', '') for item in data if isinstance(item, dict) and item.get('word')]  # keep only dicts that have a 'word' key
+            # drop empty strings
+            words = [word for word in words if word]
             if not words:
                 st.warning(f"Warning: no valid entries with a 'word' key were found in file '{os.path.basename(filepath)}'.")
                 return None
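
For reference, a minimal input file that passes this loader's checks would look like the sketch below (the sample words and path are hypothetical):

# Sketch of the expected input format: a UTF-8 JSON array of objects,
# each carrying a non-empty 'word' key.
import json

sample = [{"word": "사과"}, {"word": "바나나"}, {"word": "포도"}]
with open("data/sample.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False)
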
     st.error(f"Error while loading data from '{os.path.basename(filepath)}': {e}")
         return None

+
 def scan_data_files():
+    """Scans the data folders for every usable JSON file and returns their info."""
     data_files = {}
+    # scan the default data folder
     try:
         for file_path in glob.glob(os.path.join(DATA_FOLDER, '*.json')):
+            file_id = f"default_{os.path.basename(file_path)}"  # unique-ID scheme changed
             file_name = os.path.basename(file_path)
             words = load_words_from_json(file_path)
+            if words:  # words is neither None nor empty
+                data_files[file_id] = {
+                    'path': file_path,
+                    'name': file_name,
+                    'word_count': len(words),
+                    'type': 'default',
+                    'sample_words': words[:5]  # sample word count is adjustable
+                }
     except Exception as e:
         st.error(f"Error while scanning the default data folder: {e}")
+
+    # scan the upload folder
     try:
         for file_path in glob.glob(os.path.join(UPLOAD_FOLDER, '*.json')):
+            file_id = f"uploaded_{os.path.basename(file_path)}"  # unique-ID scheme changed
             file_name = os.path.basename(file_path)
             words = load_words_from_json(file_path)
+            if words:  # words is neither None nor empty
+                data_files[file_id] = {
+                    'path': file_path,
+                    'name': file_name,
+                    'word_count': len(words),
+                    'type': 'uploaded',
+                    'sample_words': words[:5]  # sample word count is adjustable
+                }
     except Exception as e:
         st.error(f"Error while scanning the upload folder: {e}")
+
     return data_files

+
+def merge_word_lists(file_ids):
     """Loads words from the selected files, deduplicates them, and merges them."""
+    all_words = []
     if not file_ids:
         return []

+    # make sure the data_files state is current (may be needed after an upload/delete)
+    current_data_files = st.session_state.get('data_files', {})
+
     for file_id in file_ids:
         if file_id in current_data_files:
             file_path = current_data_files[file_id]['path']
             words = load_words_from_json(file_path)
             if words:
+                all_words.extend(words)
         else:
+            st.warning(f"Selected file ID '{file_id}' was not found in the current file list. Refreshing the list.")
+            # rescan the file list and retry (optional)
+            st.session_state.data_files = scan_data_files()
+            if file_id in st.session_state.data_files:
+                words = load_words_from_json(st.session_state.data_files[file_id]['path'])
+                if words: all_words.extend(words)
+            else:
+                st.error(f"File '{file_id}' still could not be found.")

+    # deduplicate and sort
+    unique_words = sorted(list(set(all_words)))
+    return unique_words

+
+def encode_words(words, normalize=True):
+    """Converts a word list into embeddings. (Improved TF-IDF-style embedding)"""
     if not words:
         return np.array([])

     embeddings = []
+    # build the vocabulary from every unique character that appears across the words
+    unique_chars = set(char for word in words for char in word)
+    char_to_idx = {char: i for i, char in enumerate(sorted(list(unique_chars)))}
+    dim = len(char_to_idx)

+    if dim == 0:  # in case there are no words/characters at all
+        return np.array([])

+    for word in words:
+        embed = np.zeros(dim)
+        word_len = len(word)
+        if word_len == 0:  # handle empty strings
+            embeddings.append(embed)
+            continue
+
+        # TF (term frequency): character counts within the word
+        tf = {}
+        for char in word:
+            if char in char_to_idx:
+                tf[char] = tf.get(char, 0) + 1
+
+        for char, count in tf.items():
+            if char in char_to_idx:
+                # TF computation (plain counts here; log scaling etc. could be applied if needed)
+                embed[char_to_idx[char]] = count / word_len  # normalize by word length
+
+        # L2 normalization (applied for cosine similarity)
+        if normalize:
+            norm = np.linalg.norm(embed)
+            if norm > 0:
+                embed = embed / norm
+
+        embeddings.append(embed)

+    return np.array(embeddings)
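
A quick worked example of what encode_words yields, assuming the function above and scikit-learn are available (the two sample words are hypothetical): each vector has one component per distinct character, so cosine similarity reflects character overlap.

# Two words sharing one character ('사') out of two characters each:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

emb = encode_words(["사과", "사람"], normalize=True)  # shape (2, 3): vocabulary {과, 람, 사}
sim = cosine_similarity(emb)[0, 1]                    # 0.5: one shared character of two per word
print(emb.shape, round(float(sim), 3))
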

+
+def generate_graph(file_ids, similarity_threshold=0.7):
+    """Loads words from multiple files and builds the graph."""
     if not file_ids:
         st.error("No files were selected to generate a graph from.")
         return None

+    # Build the cache key (file-ID list plus threshold, sorted so the order is stable)
+    cache_key = f"{'-'.join(sorted(file_ids))}_{similarity_threshold}"
+    if cache_key in st.session_state.graph_cache:
+        # return the cached result
+        return st.session_state.graph_cache[cache_key]
+
+    # Korean font setup
+    plotly_font = set_korean_font()
+
+    # load and merge the words from the selected files
+    word_list = merge_word_lists(file_ids)

     if not word_list:
         st.error("Could not load valid words from the selected files.")
         return None
+
     if len(word_list) < 2:
         st.warning("At least 2 unique words are required to generate a graph.")
         return None

+
+    # Generate the embeddings
+    embeddings = None
+    with st.spinner('Generating word embeddings...'):
+        # check the cache (keyed by the file IDs)
+        embedding_cache_key = '-'.join(sorted(file_ids))
+        if embedding_cache_key in st.session_state.embeddings_cache:
+            word_list_cached, embeddings = st.session_state.embeddings_cache[embedding_cache_key]
+            # regenerate if the cached word list differs from the current one
+            if sorted(word_list_cached) != sorted(word_list):
+                embeddings = encode_words(word_list, normalize=True)
+                st.session_state.embeddings_cache[embedding_cache_key] = (word_list, embeddings)
+        else:
+            embeddings = encode_words(word_list, normalize=True)
+            st.session_state.embeddings_cache[embedding_cache_key] = (word_list, embeddings)
+
     if embeddings is None or embeddings.shape[0] == 0 or embeddings.shape[1] == 0:
+        st.error("Failed to generate the word embeddings.")
         return None

+    # Generate 3D coordinates with t-SNE
     embeddings_3d = None
+    with st.spinner('Computing word coordinates (t-SNE)...'):
+        # t-SNE parameters (adjusted dynamically to the data size)
+        n_samples = embeddings.shape[0]
+        # perplexity must be smaller than n_samples - 1
+        effective_perplexity = min(30, max(5, n_samples - 1))  # at least 5, at most 30 or n_samples - 1
+        # iteration count
+        max_iter = max(250, min(1000, n_samples * 5))  # min/max bounds scaled by the sample count
+        # learning rate
+        learning_rate = max(10, min(200, n_samples / 12)) if n_samples > 12 else 'auto'  # sample-count based; 'auto' when too small
+
+        if n_samples <= 3:  # t-SNE works best with at least 4 samples
+            st.warning(f"t-SNE needs at least 4 words (currently {n_samples}). Using PCA instead.")
+            from sklearn.decomposition import PCA
+            pca = PCA(n_components=min(3, n_samples), random_state=42)  # at most 3 dims, or the sample count
+            embeddings_3d_pca = pca.fit_transform(embeddings)
+            # pad to 3 dimensions (fill with zeros if there are fewer)
+            embeddings_3d = np.zeros((n_samples, 3))
+            embeddings_3d[:, :embeddings_3d_pca.shape[1]] = embeddings_3d_pca
         else:
+            try:
+                # compute and assign max_iter dynamically
+                max_iter = max(250, min(1000, n_samples * 5))  # <--- this line added/enabled in the actual code
+
+                tsne = TSNE(n_components=3, random_state=42,
+                            perplexity=effective_perplexity,
+                            n_iter=max_iter,  # use the max_iter defined above
+                            init='pca',
+                            learning_rate=learning_rate,
+                            n_jobs=-1)
+                embeddings_3d = tsne.fit_transform(embeddings)
+            except Exception as e:
+                st.error(f"Error while running t-SNE: {e}. Falling back to PCA.")
+                from sklearn.decomposition import PCA
+                pca = PCA(n_components=3, random_state=42)
+                embeddings_3d = pca.fit_transform(embeddings)
+
+    if embeddings_3d is None:
+        st.error("Failed to generate the word coordinates.")
         return None
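
One caveat with the guard above: scikit-learn's TSNE requires perplexity to be strictly less than n_samples, and min(30, max(5, n_samples - 1)) yields 5 when n_samples is 4 or 5, which raises a ValueError (the except branch then silently falls back to PCA). A tighter guard would be a one-line sketch:

# Keep perplexity strictly below n_samples in every case.
effective_perplexity = max(1, min(30, n_samples - 1))
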
+    # Compute similarities and select the edges
     edges = []
     edge_weights = []
+    with st.spinner('Computing word similarities and building edges...'):
+        # similarity matrix
+        similarity_matrix = cosine_similarity(embeddings)
+
+        # only add edges at or above the threshold
+        for i in range(n_samples):
+            for j in range(i + 1, n_samples):  # avoids duplicates and self-connections
+                similarity = similarity_matrix[i, j]
+                if similarity >= similarity_threshold:  # inclusive (values equal to the threshold also connect)
+                    edges.append((word_list[i], word_list[j]))
+                    edge_weights.append(similarity)
+
+    # Build the NetworkX graph
     G = nx.Graph()
+    # add the nodes (each word with its 3D coordinates)
     for i, word in enumerate(word_list):
+        G.add_node(word, pos=(embeddings_3d[i, 0], embeddings_3d[i, 1], embeddings_3d[i, 2]))

+    # add the edges with their weights
     for edge, weight in zip(edges, edge_weights):
+        # guard against self-loops (cannot occur with the logic above, in theory)
+        if edge[0] != edge[1]:
             G.add_edge(edge[0], edge[1], weight=weight)

+    # Build the Plotly figure
     edge_x, edge_y, edge_z = [], [], []
     if G.number_of_edges() > 0:
         for edge in G.edges():
             try:
                 pos0 = G.nodes[edge[0]]['pos']
                 pos1 = G.nodes[edge[1]]['pos']
+                edge_x.extend([pos0[0], pos1[0], None])  # None breaks the line
                 edge_y.extend([pos0[1], pos1[1], None])
                 edge_z.extend([pos0[2], pos1[2], None])
             except KeyError as e:
+                st.warning(f"Node-key error while building an edge: {e}. Skipping this edge.")
+                continue  # skip problematic edges
+
+    # edge trace
+    edge_trace = go.Scatter3d(
+        x=edge_x, y=edge_y, z=edge_z,
+        mode='lines',
+        line=dict(width=1, color='#888'),
+        hoverinfo='none'  # no hover info on edges
+    )

+    # node coordinates and text info
+    node_x, node_y, node_z, node_text = [], [], [], []
+    node_adjacencies = []  # connection counts (degree)
+    node_hover_text = []  # hover text

+    nodes_data = []
+    for node in G.nodes():
+        try:
+            pos = G.nodes[node]['pos']
+            degree = G.degree(node)  # the node's connection count
+            nodes_data.append({
+                'x': pos[0], 'y': pos[1], 'z': pos[2],
+                'text': node,
+                'degree': degree,
+                'hover_text': f'{node}<br>Connections: {degree}'
+            })
+        except KeyError:
+            st.warning(f"'pos' key error while processing node '{node}'. Skipping this node.")
+            continue  # skip nodes without position info
+
+    # only proceed when there is node data
+    if nodes_data:
+        # scale node size by connection count (e.g., log scaling)
+        degrees = np.array([data['degree'] for data in nodes_data])
+        # apply log scaling (+1 to handle zeros), and clamp the minimum/maximum size
+        node_sizes = np.log1p(degrees) * 3 + 6  # base size 6, grows with more connections
+        node_sizes = np.clip(node_sizes, 5, 20)  # at least 5, at most 20
+
+        # split out the node data
+        node_x = [data['x'] for data in nodes_data]
+        node_y = [data['y'] for data in nodes_data]
+        node_z = [data['z'] for data in nodes_data]
+        node_text = [data['text'] for data in nodes_data]
+        node_hover_text = [data['hover_text'] for data in nodes_data]
+
+        # node trace
+        node_trace = go.Scatter3d(
+            x=node_x, y=node_y, z=node_z,
+            mode='markers+text',  # show markers and labels together
+            text=node_text,  # text shown next to each node
+            hovertext=node_hover_text,  # text shown on mouse-over
+            hoverinfo='text',  # show only hovertext on hover
+            textposition='top center',  # label position
+            textfont=dict(
+                size=10,
+                color='black',
+                family=plotly_font  # use the configured Korean font
+            ),
+            marker=dict(
+                size=node_sizes,  # sizes scaled by connection count
+                color=node_z,  # map color to the Z-axis value
+                colorscale='Viridis',  # color scale
+                opacity=0.9,
+                colorbar=dict(thickness=15, title='Node Depth (Z)', xanchor='left', titleside='right')
+            )
         )
+    else:
+        # create an empty trace when there is no node data
+        node_trace = go.Scatter3d(x=[], y=[], z=[], mode='markers')

+
+    # build the list of file names used
+    file_names_used = []
+    if 'data_files' in st.session_state:
+        file_names_used = [st.session_state.data_files[fid]['name'] for fid in file_ids if fid in st.session_state.data_files]
     file_info_str = ", ".join(file_names_used) if file_names_used else "unknown"

+
+    # layout settings
     layout = go.Layout(
         title=dict(
+            text=f'<b>Lexical Semantic Similarity 3D Graph</b><br>Threshold: {similarity_threshold:.2f} | Data: {file_info_str}',
             font=dict(size=16, family=plotly_font),
+            x=0.5,  # center the title
+            xanchor='center'
         ),
+        showlegend=False,  # hide the legend
+        margin=dict(l=10, r=10, b=10, t=80),  # margins (room for the title)
         scene=dict(
+            xaxis=dict(
+                title='TSNE-1', showticklabels=False,  # hide the axis ticks
+                backgroundcolor="rgb(240, 240, 240)", gridcolor="white", zerolinecolor="white"
+            ),
+            yaxis=dict(
+                title='TSNE-2', showticklabels=False,
+                backgroundcolor="rgb(240, 240, 240)", gridcolor="white", zerolinecolor="white"
+            ),
+            zaxis=dict(
+                title='TSNE-3', showticklabels=False,
+                backgroundcolor="rgb(240, 240, 240)", gridcolor="white", zerolinecolor="white"
+            ),
+            aspectratio=dict(x=1, y=1, z=0.8),  # aspect-ratio adjustment
+            camera=dict(
+                eye=dict(x=1.2, y=1.2, z=0.8)  # initial camera position
+            )
         ),
+        # hover mode (snap to the closest data point)
         hovermode='closest'
     )

+    # create the Figure object
     fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

+    # store the result in the cache
     st.session_state.graph_cache[cache_key] = fig

     return fig
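
The O(n²) edge-selection loop above is fine for small word lists; for larger ones the same selection can be vectorized. A sketch under the same variable names (n_samples, similarity_matrix, word_list, similarity_threshold):

# Vectorized equivalent of the thresholding loop:
# the upper-triangle indices enumerate each unordered pair exactly once.
import numpy as np

iu, ju = np.triu_indices(n_samples, k=1)                  # all i < j pairs, no self-pairs
mask = similarity_matrix[iu, ju] >= similarity_threshold
edges = [(word_list[i], word_list[j]) for i, j in zip(iu[mask], ju[mask])]
edge_weights = similarity_matrix[iu[mask], ju[mask]].tolist()
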

+
 def handle_uploaded_file(uploaded_file):
+    """Processes an uploaded file and adds it to the data-file list."""
     if uploaded_file is not None:
+        # sanitize the file name (uuid recommended) and build the storage path
+        # original_name = uploaded_file.name
+        unique_id = str(uuid.uuid4())  # generate a unique ID
+        # file_extension = os.path.splitext(original_name)[1]
+        # file_name = f"{unique_id}{file_extension}"  # name the file by its unique ID
+        file_name = f"{unique_id}_{uploaded_file.name}"  # keep part of the original name (optional)
         file_path = os.path.join(UPLOAD_FOLDER, file_name)

         try:
+            # save the file
             with open(file_path, 'wb') as f:
                 f.write(uploaded_file.getbuffer())
+            st.info(f"File '{uploaded_file.name}' ({file_name}) saved. Validating its contents...")

+            # validate the uploaded file (try loading the words)
             words = load_words_from_json(file_path)
+
+            if words is None or not words:  # load failed, or an empty list
+                try:
+                    os.remove(file_path)  # delete the file when it is invalid
+                    st.error(f"No valid 'word' data was found in uploaded file '{uploaded_file.name}'. Check the file format (a UTF-8-encoded JSON array, each object carrying a 'word' key). The file has been deleted.")
+                except OSError as e:
+                    st.error(f"Error while deleting the invalid file: {e}")
+                return None  # return None on failure
+
+            st.success(f"File '{uploaded_file.name}' validated. Found {len(words)} words.")
+
+            # rescan the data files so the new file is included (update session state)
+            st.session_state.data_files = scan_data_files()
+
+            # find the file_id of the new file (use the ID produced by scan_data_files)
+            new_file_id = f"uploaded_{file_name}"  # same ID logic as scan_data_files
+            if new_file_id in st.session_state.data_files:
+                return new_file_id  # return the file ID on success
             else:
+                st.error("Could not find the new file ID even after updating the file list.")
+                return None
+
         except Exception as e:
+            st.error(f"Error while uploading and processing the file: {e}")
+            # try to remove the uploaded file after an error
+            try:
+                if os.path.exists(file_path):
+                    os.remove(file_path)
+            except OSError as del_e:
+                st.warning(f"Failed to delete the file after the error: {del_e}")
+            return None  # return None on failure
+

 def delete_file(file_id):
+    """Deletes a file."""
+    if file_id not in st.session_state.get('data_files', {}):
         st.error('Could not find the file to delete.')
         return False

+    file_info = st.session_state.data_files[file_id]
+
+    # only uploaded files may be deleted
     if file_info.get('type') != 'uploaded':
         st.error('Default data files cannot be deleted.')
         return False

     file_path = file_info.get('path')
     file_name = file_info.get('name', 'unknown')

+    if not file_path:
+        st.error(f"File '{file_name}' has no path information.")
+        return False
+
     try:
+        # remove the file from the filesystem
+        if os.path.exists(file_path):
             os.remove(file_path)
+            st.info(f"Deleted '{file_name}' from the filesystem.")
         else:
+            st.warning(f"'{file_name}' ({file_path}) no longer exists on the filesystem.")

+        # remove the file info from session state
         del st.session_state.data_files[file_id]

+        # remove the related cache entries (graph, embeddings)
+        keys_to_remove_graph = [k for k in st.session_state.graph_cache if file_id in k]
+        for key in keys_to_remove_graph:
             del st.session_state.graph_cache[key]

+        keys_to_remove_embed = [k for k in st.session_state.embeddings_cache if file_id in k]
+        for key in keys_to_remove_embed:
+            del st.session_state.embeddings_cache[key]
+
+        # also remove it from the currently selected files
+        if file_id in st.session_state.selected_files:
+            st.session_state.selected_files.remove(file_id)
+
+        st.success(f"File '{file_name}' and its related caches have been removed.")
         return True

     except Exception as e:
         st.error(f"Error while deleting the file: {e}")
         return False
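
The substring test `file_id in k` used above works because both cache-key schemes embed the raw file IDs verbatim; a sketch of the two key shapes (the IDs are hypothetical):

# Why `file_id in key` finds every affected cache entry:
file_ids = ["uploaded_a.json", "default_b.json"]
graph_key = f"{'-'.join(sorted(file_ids))}_0.7"  # 'default_b.json-uploaded_a.json_0.7'
embed_key = '-'.join(sorted(file_ids))           # 'default_b.json-uploaded_a.json'
assert "uploaded_a.json" in graph_key and "uploaded_a.json" in embed_key
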
|
| 627 |
|
| 628 |
+
|
| 629 |
def clear_cache():
|
| 630 |
+
"""κ·Έλν λ° μλ² λ© μΊμλ₯Ό μ΄κΈ°νν©λλ€."""
|
| 631 |
st.session_state.graph_cache = {}
|
| 632 |
+
st.session_state.embeddings_cache = {}
|
| 633 |
+
st.session_state.fig = None # νμ¬ νμμ€μΈ κ·Έλνλ μ΄κΈ°ν
|
| 634 |
+
st.success('κ·Έλν λ° μλ² λ© μΊμκ° μ΄κΈ°νλμμ΅λλ€.')
|
| 635 |
+
st.experimental_rerun() # μΊμ ν΄λ¦¬μ΄ ν UI κ°±μ
|
|
|
|
| 636 |
|
|
|
|
|
|
|
|
|
|
| 637 |
|
| 638 |
+
# --- μ± μ€ν μμ ---
|
|
|
|
|
|
|
|
|
|
| 639 |
|
| 640 |
+
# λ°μ΄ν° νμΌ μ€μΊ (μ± μμ μ λλ νμ μ)
|
| 641 |
if 'data_files' not in st.session_state or not st.session_state.data_files:
|
| 642 |
st.session_state.data_files = scan_data_files()
|
| 643 |
|
| 644 |
# νμ΄ν λ° μκ°
|
| 645 |
+
st.title('νκ΅μ΄ λ¨μ΄ μλ―Έ λ€νΈμν¬ μκ°ν')
|
| 646 |
st.markdown("""
|
| 647 |
+
μ΄ λꡬλ μ 곡λ JSON νμΌμμ νκ΅μ΄ λ¨μ΄ λͺ©λ‘μ μ½μ΄λ€μ¬, λ¨μ΄ κ°μ μλ―Έμ μ μ¬μ±(μ¬κΈ°μλ λ¬Έμ κ΅¬μ± κΈ°λ° μ μ¬μ±)μ κ³μ°νκ³ ,
|
| 648 |
+
κ·Έ κ΄κ³λ₯Ό μΈν°λν°λΈν 3D λ€νΈμν¬ κ·Έλνλ‘ μκ°νν©λλ€.
|
| 649 |
""")
|
| 650 |
|
| 651 |
+
# --- μ¬μ΄λλ° μ€μ ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 652 |
st.sidebar.title('βοΈ μ€μ λ° μ μ΄')
|
| 653 |
|
| 654 |
+
# μκ³κ° μ€μ
|
| 655 |
threshold = st.sidebar.slider(
|
| 656 |
+
'μ μ¬λ μκ³κ° (Similarity Threshold)',
|
| 657 |
+
min_value=0.1,
|
| 658 |
+
max_value=0.95, # μ΅λκ° μ½κ° λλ¦Ό
|
| 659 |
+
value=st.session_state.threshold,
|
| 660 |
+
step=0.05,
|
| 661 |
+
help='μ΄ κ°λ³΄λ€ μ μ¬λκ° λμ λ¨μ΄λ€λ§ μ°κ²°μ (μ£μ§)μΌλ‘ μ΄μ΄μ§λλ€. κ°μ΄ λμμλ‘ μ°κ²°μ΄ λ μ격ν΄μ§λλ€.'
|
| 662 |
)
|
| 663 |
+
# μ¬λΌμ΄λ κ°μ΄ λ³κ²½λλ©΄ μΈμ
μν μ
λ°μ΄νΈ (μ½λ°± μ¬μ©μ΄ λ ν¨μ¨μ μΌ μ μμ)
|
| 664 |
if threshold != st.session_state.threshold:
|
| 665 |
st.session_state.threshold = threshold
|
| 666 |
+
st.session_state.fig = None # μκ³κ° λ³κ²½ μ νμ¬ κ·Έλν μ΄κΈ°ν (μ¬μμ± νμ μλ¦Ό)
|
| 667 |
+
st.session_state.generate_clicked = False # ν΄λ¦ μνλ 리μ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
st.sidebar.divider()
|
| 670 |
|
| 671 |
+
# νμΌ μ
λ‘λ
|
| 672 |
st.sidebar.header('π νμΌ μ
λ‘λ')
|
| 673 |
uploaded_file = st.sidebar.file_uploader(
|
| 674 |
+
"JSON νμΌ μ
λ‘λ",
|
| 675 |
+
type=['json'],
|
| 676 |
+
help="λ¨μ΄ λͺ©λ‘μ΄ ν¬ν¨λ JSON νμΌμ μ
λ‘λνμΈμ. νμ: [{'word': 'λ¨μ΄1'}, {'word': 'λ¨μ΄2'}, ...]"
|
| 677 |
)
|
| 678 |
+
|
| 679 |
if uploaded_file is not None:
|
| 680 |
+
# μ
λ‘λ λ²νΌ λμ νμΌμ΄ μμΌλ©΄ λ°λ‘ μ²λ¦¬ μλ (μ¬μ©μ κ²½ν κ°μ )
|
| 681 |
+
# if st.sidebar.button('μ
λ‘λ μ²λ¦¬', key='upload_button'): # λ²νΌ μ κ±°
|
| 682 |
with st.spinner("μ
λ‘λλ νμΌ μ²λ¦¬ μ€..."):
|
| 683 |
new_file_id = handle_uploaded_file(uploaded_file)
|
| 684 |
if new_file_id:
|
| 685 |
+
st.sidebar.success(f"νμΌ '{uploaded_file.name}' μ
λ‘λ λ° μ²λ¦¬ μλ£!")
|
| 686 |
+
# μλ‘ μ
λ‘λλ νμΌμ μλμΌλ‘ μ ν λͺ©λ‘μ μΆκ°νκ³ μ ν μνλ‘ λ§λ¦
|
| 687 |
if new_file_id not in st.session_state.selected_files:
|
| 688 |
st.session_state.selected_files.append(new_file_id)
|
| 689 |
+
# μ€ν¬λ¦½νΈ μ¬μ€ννμ¬ UI μ
λ°μ΄νΈ
|
| 690 |
+
st.experimental_rerun()
|
| 691 |
+
else:
|
| 692 |
+
# handle_uploaded_file λ΄λΆμμ μ€λ₯ λ©μμ§ νμλ¨
|
| 693 |
+
pass
|
| 694 |
+
# μ
λ‘λ μμ ― μ΄κΈ°νλ₯Ό μν΄ None ν λΉ (μ νμ )
|
| 695 |
+
# uploaded_file = None # μ΄λ κ² νλ©΄ νμΌ μ ν μ°½μ΄ λ€μ λνλ¨, νμμ λ°λΌ μ‘°μ
|
| 696 |
|
| 697 |
st.sidebar.divider()
|
| 698 |
|
| 699 |
# File selection area
st.sidebar.header('🗂️ Select data files')

if st.session_state.data_files:
    # Checkboxes for choosing which files to use
    st.sidebar.markdown("**Select the files to use (multiple selection allowed):**")

    # Temporary list for tracking the selection state
    selected_files_temp = []
    # Sort the file list by name
    sorted_file_ids = sorted(st.session_state.data_files.keys(), key=lambda fid: st.session_state.data_files[fid]['name'])

    # Render a checkbox and file info for each file
    for file_id in sorted_file_ids:
        if file_id not in st.session_state.data_files: continue  # skip files deleted mid-loop
        file_info = st.session_state.data_files[file_id]

        file_label = f"{file_info['name']} ({file_info['word_count']} words)"
        file_type_tag = "[default]" if file_info['type'] == 'default' else "[uploaded]"
        label_full = f"{file_label} {file_type_tag}"

        # Check whether this file is currently selected (according to session state)
        is_selected = file_id in st.session_state.selected_files

        # Create the checkbox
        checkbox_key = f"cb_{file_id}"  # unique key
        # Instead of an on-change callback, collect values in the loop and compare afterwards
        if st.sidebar.checkbox(label_full, value=is_selected, key=checkbox_key):
            # Add to the temporary list when checked
            selected_files_temp.append(file_id)

        # Sample words, plus a delete button (uploaded files only)
        with st.sidebar.expander("View file info", expanded=False):
            st.markdown(f"**Sample words:** `{'`, `'.join(file_info['sample_words'])}`")
            if file_info['type'] == 'uploaded':
                delete_button_key = f"del_{file_id}"
                if st.button('🗑️ Delete this file', key=delete_button_key, help=f"Permanently deletes '{file_info['name']}'."):
                    with st.spinner(f"Deleting '{file_info['name']}'..."):
                        if delete_file(file_id):
                            # On success, also remove it from selected_files_temp (required)
                            if file_id in selected_files_temp:
                                selected_files_temp.remove(file_id)
                            # data_files changed, so a rerun is needed
                            st.experimental_rerun()
                        else:
                            st.error("Failed to delete the file.")
        # st.sidebar.markdown("---")  # separator removed; adjust styling if needed

    # --- Important: update the selection state ---
    # Update only when the checkbox state (selected_files_temp) differs from the
    # session state (st.session_state.selected_files)
    # Sort before comparing so that ordering does not matter
    if sorted(selected_files_temp) != sorted(st.session_state.selected_files):
        st.session_state.selected_files = selected_files_temp
        st.session_state.fig = None  # reset the graph when the file selection changes
        st.session_state.generate_clicked = False  # reset the click state as well
        # Rerun immediately so the UI reflects the change (optional, but better UX)
        st.experimental_rerun()

    st.sidebar.divider()

    # Graph generation button
    # Enabled only when at least one file is selected
    if st.session_state.selected_files:
        if st.sidebar.button('🚀 Generate/update graph', key='generate_button', type="primary"):
            # On click, set the generate_clicked flag
            # Double-check that files are still selected (guards against unlikely race conditions)
            if st.session_state.selected_files:
                st.session_state.generate_clicked = True
                # No st.experimental_rerun() call here! A button click already triggers a rerun
            else:
                st.sidebar.warning('Please select the files to build the graph from first.')
                st.session_state.generate_clicked = False  # reset, just in case
    else:
        st.sidebar.warning('Select at least one file to generate the graph.')

else:
    st.sidebar.warning('No data files available. Upload a file or add JSON files to the `data` folder.')

# Cache clear button (always visible)
if st.sidebar.button('🔄 Clear cache', key='clear_cache_button'):
    clear_cache()
# --- Main content area ---
st.header("🔍 3D word network visualization")

# Graph display logic:
# 1. Files must be selected
# 2. The 'Generate graph' button was clicked (generate_clicked == True),
# 3. or an already generated graph exists in session state (st.session_state.fig is not None)

if st.session_state.selected_files:
    # Generate when the button-click flag is True, or when fig became None after a threshold/selection change
    should_generate_graph = st.session_state.generate_clicked or \
                            (st.session_state.fig is None and st.session_state.selected_files)  # files selected but no figure yet

    if should_generate_graph:
        with st.spinner('Generating graph... please wait.'):
            try:
                # Call generate_graph
                fig = generate_graph(st.session_state.selected_files, st.session_state.threshold)
                # Store in session state on success
                st.session_state.fig = fig
                # Reset the click flag once generation is done
                st.session_state.generate_clicked = False
            except Exception as e:
                st.error(f"Error while generating the graph: {e}")
                st.session_state.fig = None  # reset fig on error
                st.session_state.generate_clicked = False  # reset the flag

    # Display the graph if one exists in session state
    if st.session_state.get('fig') is not None:
        st.plotly_chart(st.session_state.fig, use_container_width=True)

        # Show information about the current graph
        try:
            selected_file_names = [st.session_state.data_files[fid]['name'] for fid in st.session_state.selected_files if fid in st.session_state.data_files]
            total_word_count = sum(st.session_state.data_files[fid]['word_count'] for fid in st.session_state.selected_files if fid in st.session_state.data_files)
            # Read node/edge counts from the generated figure itself (requires inspecting the fig object)
            num_nodes = len(st.session_state.fig.data[1].x) if len(st.session_state.fig.data) > 1 and hasattr(st.session_state.fig.data[1], 'x') else 0
            num_edges = len(st.session_state.fig.data[0].x) // 3 if len(st.session_state.fig.data) > 0 and hasattr(st.session_state.fig.data[0], 'x') and st.session_state.fig.data[0].x else 0

            st.info(f"""
            **Current graph info**
            - Files used: {', '.join(selected_file_names)}
            - Total input words: {total_word_count}
            - Nodes displayed: {num_nodes} / Edges displayed: {num_edges}
            """)
        except Exception as info_e:
            st.warning(f"Error while displaying graph info: {info_e}")

        # Usage instructions
        with st.expander("💡 How to interact with the graph"):
            st.markdown("""
            - **Zoom in/out:** mouse wheel scroll, or pinch with two fingers on a touchscreen
            - **Rotate:** drag while holding the left mouse button
            - **Pan:** drag while holding the right mouse button, or Shift + left-button drag
            - **Inspect a word:** hover the cursor over a word (marker) to see its name and the number of words connected to it.
            - **Toolbar:** the toolbar icons at the top right of the graph provide further view options (download, zoom-area selection, and so on).
            """)
    elif not should_generate_graph and not st.session_state.selected_files:
        st.info("👈 Select the data files to analyze in the sidebar.")
    elif not should_generate_graph and st.session_state.selected_files and st.session_state.fig is None:
        # Files are selected but the generate button has not been pressed yet, or generation failed
        st.info("👈 Click the '🚀 Generate/update graph' button in the sidebar to start the visualization.")

elif not st.session_state.data_files:
    st.warning("No data files found. Upload a file or add valid JSON files to the `data` folder.")
else:
    # data_files exist but none are selected
    st.info("👈 Select the data files to analyze in the sidebar.")
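# ---------------------------------------------------------------------------
# Illustrative sketch (not part of app.py): the num_edges computation above
# divides by 3 because of a common Plotly convention that this app appears to
# follow -- a single Scatter3d trace draws many disconnected edges by putting
# a None break after each pair of endpoints, so every edge contributes exactly
# three x entries (x0, x1, None).
import plotly.graph_objects as go

edge_x, edge_y, edge_z = [], [], []
for (x0, y0, z0), (x1, y1, z1) in [((0, 0, 0), (1, 1, 1)), ((1, 1, 1), (2, 0, 1))]:
    edge_x += [x0, x1, None]  # None tells Plotly to lift the pen between edges
    edge_y += [y0, y1, None]
    edge_z += [z0, z1, None]

edge_trace = go.Scatter3d(x=edge_x, y=edge_y, z=edge_z, mode='lines',
                          line=dict(width=2), hoverinfo='none')
assert len(edge_x) // 3 == 2  # two edges drawn by one trace
# ---------------------------------------------------------------------------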

# --- Footer info section ---
st.divider()

with st.expander("ℹ️ About this visualization tool"):
    st.markdown("""
    This tool visualizes a Korean word network through the following steps:

    1. **Data loading:** extracts the word list from the 'word' field of the JSON files you provide.
    2. **Word embedding:** represents each word in a high-dimensional vector space. The current version uses a **character-composition-based, TF-IDF-style embedding** that builds each vector from the frequencies of the characters making up the word. (Pre-trained models such as Word2Vec or FastText could be used in the future.)
    3. **Dimensionality reduction:** reduces the high-dimensional embedding vectors to a visualizable 3D space using **t-SNE (t-Distributed Stochastic Neighbor Embedding)**, which preserves the structure of the data while reducing dimensions. (PCA is used when there are only a few words.)
    4. **Similarity computation:** before the 3D reduction, computes the **cosine similarity** between the original embedding vectors to measure the semantic (here, compositional) similarity of each word pair.
    5. **Graph construction:** connects word pairs whose similarity is at or above the configured **similarity threshold** with edges to build the network graph. Each word appears as a node.
    6. **3D visualization:** renders the resulting network graph in an interactive 3D space with the **Plotly** library. Node positions follow the t-SNE coordinates; color and size can reflect the Z-axis value or the node degree.

    This lets you visually explore how words cluster and connect according to their similarity.
    """)
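# ---------------------------------------------------------------------------
# Illustrative sketch (not part of app.py): step 3 in code form. t-SNE
# requires perplexity < n_samples, which is one reason the PCA fallback for
# small word lists mentioned above makes sense. Assumes at least 3 words and
# 3 embedding dimensions.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def reduce_to_3d(vectors: np.ndarray) -> np.ndarray:
    """Reduce (n_words, dim) embeddings to (n_words, 3) plot coordinates."""
    n = vectors.shape[0]
    if n <= 10:  # too few samples for a meaningful t-SNE run
        return PCA(n_components=3).fit_transform(vectors)
    tsne = TSNE(n_components=3, perplexity=min(30, n - 1), random_state=42)
    return tsne.fit_transform(vectors)
# ---------------------------------------------------------------------------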
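# ---------------------------------------------------------------------------
# Illustrative sketch (not part of app.py): step 5, connecting word pairs
# whose similarity is at or above the threshold, given the similarity matrix
# from the earlier sketch. The default of 0.2 is an arbitrary example value.
import itertools

def build_edges(words, sim_matrix, threshold=0.2):
    """Return (word_i, word_j, similarity) for pairs at/above the threshold."""
    edges = []
    for i, j in itertools.combinations(range(len(words)), 2):
        if sim_matrix[i, j] >= threshold:
            edges.append((words[i], words[j], float(sim_matrix[i, j])))
    return edges
# ---------------------------------------------------------------------------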

with st.expander("📄 JSON file format guide"):
    st.markdown("""
    JSON files that you upload or place in the `data` folder must be **UTF-8 encoded** and follow this format:

    ```json
    [
        {"word": "학교"},
        {"word": "선생님"},
        {"word": "학생"},
        {"word": "교실"},
        {"word": "컴퓨터", "description": "this field is ignored"}
    ]
    ```

    - The top-level structure of the file must be an **array (list)** (`[...]`).
    - Each element of the array must be an **object (dictionary)** (`{...}`).
    - Each object must contain a `"word"` key whose value is the **Korean word string** to analyze.
    - Keys other than `"word"` are allowed, but the current version does not use them and they are ignored.
    - If the file is not UTF-8 encoded, Korean characters may be corrupted or errors may occur.
    """)
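# ---------------------------------------------------------------------------
# Illustrative sketch (not part of app.py): a standard-library check that a
# file follows the format rules listed above before you upload it. The path
# 'data/example.json' is a hypothetical example.
import json

def validate_word_file(path):
    """Return the word list if the file matches the expected format."""
    with open(path, encoding='utf-8') as f:  # files must be UTF-8
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("Top-level structure must be a JSON array.")
    words = []
    for i, item in enumerate(data):
        if not isinstance(item, dict) or not isinstance(item.get('word'), str):
            raise ValueError(f"Element {i} must be an object with a string 'word' key.")
        words.append(item['word'])
    return words

# Example: print(validate_word_file('data/example.json'))
# ---------------------------------------------------------------------------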