Spaces:
Sleeping
Sleeping
fix
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
import streamlit as st
|
| 3 |
import json
|
| 4 |
import os
|
|
@@ -15,12 +14,16 @@ import matplotlib.pyplot as plt
|
|
| 15 |
import matplotlib.font_manager as fm
|
| 16 |
from sklearn.manifold import TSNE
|
| 17 |
import warnings
|
|
|
|
|
|
|
|
|
|
| 18 |
warnings.filterwarnings('ignore')
|
| 19 |
|
|
|
|
| 20 |
# νμ΄μ§ μ€μ
|
| 21 |
st.set_page_config(
|
| 22 |
-
page_title="νκ΅μ΄ λ¨μ΄ μλ―Έ λ€νΈμν¬ μκ°ν",
|
| 23 |
-
page_icon="
|
| 24 |
layout="wide"
|
| 25 |
)
|
| 26 |
|
|
@@ -29,100 +32,122 @@ DATA_FOLDER = 'data'
|
|
| 29 |
UPLOAD_FOLDER = 'uploads'
|
| 30 |
|
| 31 |
# ν΄λ μμ±
|
|
|
|
|
|
|
| 32 |
if not os.path.exists(UPLOAD_FOLDER):
|
| 33 |
os.makedirs(UPLOAD_FOLDER)
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if 'embeddings_cache' not in st.session_state:
|
| 40 |
-
st.session_state.embeddings_cache = {}
|
| 41 |
-
|
| 42 |
if 'graph_cache' not in st.session_state:
|
| 43 |
st.session_state.graph_cache = {}
|
| 44 |
-
|
| 45 |
if 'data_files' not in st.session_state:
|
| 46 |
st.session_state.data_files = {}
|
| 47 |
-
|
| 48 |
if 'selected_files' not in st.session_state:
|
| 49 |
st.session_state.selected_files = []
|
| 50 |
-
|
| 51 |
if 'threshold' not in st.session_state:
|
| 52 |
-
st.session_state.threshold = 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# --- νκΈ ν°νΈ μ€μ ν¨μ ---
|
| 55 |
def set_korean_font():
|
| 56 |
-
"""
|
| 57 |
-
νμ¬ μ΄μ체μ μ λ§λ νκΈ ν°νΈλ₯Ό matplotlib λ° Plotlyμ©μΌλ‘ μ€μ μλνκ³ ,
|
| 58 |
-
Plotlyμμ μ¬μ©ν ν°νΈ μ΄λ¦μ λ°νν©λλ€.
|
| 59 |
-
"""
|
| 60 |
system_name = platform.system()
|
| 61 |
-
plotly_font_name =
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
available_fonts = [f.name for f in fm.fontManager.ttflist]
|
| 82 |
nanum_fonts = [name for name in available_fonts if 'Nanum' in name]
|
| 83 |
if nanum_fonts:
|
| 84 |
font_name = nanum_fonts[0]
|
| 85 |
-
|
| 86 |
-
plotly_font_name = font_name if 'Nanum' in font_name else plotly_font_name_linux
|
| 87 |
else:
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
plotly_font_name = "Malgun Gothic"
|
| 92 |
-
elif "AppleGothic" in available_fonts:
|
| 93 |
-
font_name = "AppleGothic"
|
| 94 |
-
plotly_font_name = "AppleGothic"
|
| 95 |
-
else:
|
| 96 |
-
font_name = None
|
| 97 |
-
|
| 98 |
-
except Exception as e:
|
| 99 |
-
font_name = None
|
| 100 |
-
|
| 101 |
-
if not font_name:
|
| 102 |
-
font_name = None
|
| 103 |
-
plotly_font_name = None # Plotlyλ κΈ°λ³Έκ° μ¬μ©
|
| 104 |
-
|
| 105 |
-
else: # κΈ°ν OS
|
| 106 |
-
font_name = None
|
| 107 |
-
plotly_font_name = None
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
try:
|
| 112 |
plt.rc('font', family=font_name)
|
| 113 |
plt.rc('axes', unicode_minus=False)
|
| 114 |
-
|
|
|
|
|
|
|
| 115 |
plt.rcdefaults()
|
| 116 |
plt.rc('axes', unicode_minus=False)
|
| 117 |
-
|
|
|
|
|
|
|
| 118 |
plt.rcdefaults()
|
| 119 |
plt.rc('axes', unicode_minus=False)
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
return plotly_font_name # Plotlyμμ μ¬μ©ν ν°νΈ μ΄λ¦ λ°ν
|
| 125 |
-
|
| 126 |
|
| 127 |
# --- λ°μ΄ν° λ‘λ ν¨μ ---
|
| 128 |
def load_words_from_json(filepath):
|
|
@@ -130,190 +155,263 @@ def load_words_from_json(filepath):
|
|
| 130 |
try:
|
| 131 |
with open(filepath, 'r', encoding='utf-8') as f:
|
| 132 |
data = json.load(f)
|
| 133 |
-
# dataκ° λ¦¬μ€νΈ ννλΌκ³ κ°μ
|
| 134 |
if isinstance(data, list):
|
| 135 |
-
words = [item.get('word', '') for item in data if item.get('word')]
|
| 136 |
-
# λΉ λ¬Έμμ΄ μ κ±°
|
| 137 |
-
|
|
|
|
|
|
|
| 138 |
return words
|
| 139 |
else:
|
| 140 |
-
st.error(f"μ€λ₯: νμΌ '{filepath}'μ μ΅μμ νμμ΄ λ¦¬μ€νΈκ° μλλλ€.")
|
| 141 |
return None
|
| 142 |
except FileNotFoundError:
|
| 143 |
st.error(f"μ€λ₯: νμΌ '{filepath}'λ₯Ό μ°Ύμ μ μμ΅λλ€.")
|
| 144 |
return None
|
| 145 |
-
except json.JSONDecodeError:
|
| 146 |
-
st.error(f"μ€λ₯: νμΌ '{filepath}'μ JSON νμμ΄ μλͺ»λμμ΅λλ€.")
|
| 147 |
return None
|
| 148 |
except Exception as e:
|
| 149 |
-
st.error(f"
|
| 150 |
return None
|
| 151 |
|
| 152 |
-
|
| 153 |
def scan_data_files():
|
| 154 |
-
"""λ°μ΄ν° ν΄λμμ μ¬μ© κ°λ₯ν
|
| 155 |
data_files = {}
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
}
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
if words:
|
| 177 |
-
data_files[file_id] = {
|
| 178 |
-
'path': file_path,
|
| 179 |
-
'name': file_name,
|
| 180 |
-
'word_count': len(words),
|
| 181 |
-
'type': 'uploaded',
|
| 182 |
-
'sample_words': words[:5] if len(words) > 5 else words
|
| 183 |
-
}
|
| 184 |
-
|
| 185 |
return data_files
|
| 186 |
|
| 187 |
-
|
| 188 |
-
def merge_word_lists(file_ids):
|
| 189 |
"""μ νλ νμΌλ€μμ λ¨μ΄λ₯Ό λ‘λνκ³ μ€λ³΅ μ κ±°νμ¬ λ³ν©ν©λλ€."""
|
| 190 |
-
all_words =
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
for file_id in file_ids:
|
| 193 |
-
if file_id in
|
| 194 |
-
file_path =
|
| 195 |
words = load_words_from_json(file_path)
|
| 196 |
if words:
|
| 197 |
-
all_words.
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
| 201 |
return unique_words
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
-
def encode_words(words, normalize=True):
|
| 205 |
-
"""λ¨μ΄ λͺ©λ‘μ μλ² λ©μΌλ‘ λ³νν©λλ€."""
|
| 206 |
-
# κ°λ¨ν TF-IDF μ€νμΌ μλ² λ© μμ± (μμ)
|
| 207 |
embeddings = []
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
if cache_key in st.session_state.graph_cache:
|
|
|
|
| 233 |
return st.session_state.graph_cache[cache_key]
|
| 234 |
-
|
| 235 |
-
#
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
if not word_list:
|
| 242 |
-
st.error("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
return None
|
| 244 |
-
|
| 245 |
-
#
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
if
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
G = nx.Graph()
|
|
|
|
| 274 |
for i, word in enumerate(word_list):
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
for edge, weight in zip(edges, edge_weights):
|
| 279 |
-
G.
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
edge_y = []
|
| 285 |
-
|
| 286 |
-
if edges:
|
| 287 |
for edge in G.edges():
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
node_trace = go.Scatter3d(
|
| 318 |
x=node_x, y=node_y, z=node_z,
|
| 319 |
mode='markers+text',
|
|
@@ -321,294 +419,341 @@ def generate_graph(file_ids, similarity_threshold=0.7):
|
|
| 321 |
hovertext=node_hover_text,
|
| 322 |
hoverinfo='text',
|
| 323 |
textposition='top center',
|
| 324 |
-
textfont=dict(
|
| 325 |
-
size=10,
|
| 326 |
-
color='black',
|
| 327 |
-
family=plotly_font
|
| 328 |
-
),
|
| 329 |
marker=dict(
|
| 330 |
-
size=
|
| 331 |
-
color=node_z,
|
| 332 |
colorscale='Viridis',
|
| 333 |
opacity=0.9,
|
| 334 |
-
colorbar=dict(thickness=15, title='Node Depth (Z
|
| 335 |
)
|
| 336 |
)
|
| 337 |
-
|
| 338 |
-
#
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
# λ μ΄μμ μ€μ
|
| 344 |
layout = go.Layout(
|
| 345 |
title=dict(
|
| 346 |
-
text=f'
|
| 347 |
-
font=dict(size=16, family=plotly_font)
|
|
|
|
| 348 |
),
|
| 349 |
showlegend=False,
|
| 350 |
-
margin=dict(
|
| 351 |
scene=dict(
|
| 352 |
-
xaxis=dict(title='TSNE
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
)
|
| 360 |
)
|
| 361 |
-
|
| 362 |
-
# Figure μμ±
|
| 363 |
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
|
| 364 |
-
|
| 365 |
-
#
|
| 366 |
st.session_state.graph_cache[cache_key] = fig
|
| 367 |
-
|
| 368 |
-
return fig
|
| 369 |
|
|
|
|
| 370 |
|
|
|
|
| 371 |
def handle_uploaded_file(uploaded_file):
|
| 372 |
-
"""μ
λ‘λλ νμΌμ
|
| 373 |
if uploaded_file is not None:
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
file_name = f"{timestamp}_{uploaded_file.name}"
|
| 377 |
file_path = os.path.join(UPLOAD_FOLDER, file_name)
|
| 378 |
-
|
| 379 |
try:
|
| 380 |
-
# νμΌ μ μ₯
|
| 381 |
with open(file_path, 'wb') as f:
|
| 382 |
f.write(uploaded_file.getbuffer())
|
| 383 |
-
|
| 384 |
-
|
| 385 |
words = load_words_from_json(file_path)
|
| 386 |
-
if not words:
|
| 387 |
-
os.remove(file_path)
|
| 388 |
-
st.error(
|
| 389 |
return None
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
for file_id, file_info in st.session_state.data_files.items():
|
| 397 |
-
if file_info['path'] == file_path:
|
| 398 |
-
new_file_id = file_id
|
| 399 |
-
break
|
| 400 |
-
|
| 401 |
-
return new_file_id
|
| 402 |
-
|
| 403 |
except Exception as e:
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
if os.path.exists(file_path):
|
| 407 |
-
os.remove(file_path)
|
| 408 |
-
except:
|
| 409 |
-
pass
|
| 410 |
-
st.error(f'νμΌ μ
λ‘λ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}')
|
| 411 |
return None
|
| 412 |
|
| 413 |
-
|
| 414 |
def delete_file(file_id):
|
| 415 |
-
"""νμΌμ
|
| 416 |
-
|
| 417 |
-
|
|
|
|
| 418 |
return False
|
| 419 |
-
|
| 420 |
-
file_info =
|
| 421 |
-
|
| 422 |
-
# μ
λ‘λλ νμΌλ§ μμ νμ©
|
| 423 |
-
if file_info['type'] != 'uploaded':
|
| 424 |
st.error('κΈ°λ³Έ λ°μ΄ν° νμΌμ μμ ν μ μμ΅λλ€.')
|
| 425 |
return False
|
| 426 |
-
|
| 427 |
-
# νμΌ μμ
|
| 428 |
-
file_path = file_info['path']
|
| 429 |
-
if os.path.exists(file_path):
|
| 430 |
-
os.remove(file_path)
|
| 431 |
-
|
| 432 |
-
# λ°μ΄ν° νμΌ μ 보 μ
λ°μ΄νΈ
|
| 433 |
-
st.session_state.data_files.pop(file_id)
|
| 434 |
-
|
| 435 |
-
# κ΄λ ¨ μΊμ νλͺ© μμ
|
| 436 |
-
keys_to_remove = []
|
| 437 |
-
for cache_key in st.session_state.graph_cache:
|
| 438 |
-
if file_id in cache_key:
|
| 439 |
-
keys_to_remove.append(cache_key)
|
| 440 |
-
|
| 441 |
-
for key in keys_to_remove:
|
| 442 |
-
st.session_state.graph_cache.pop(key)
|
| 443 |
-
|
| 444 |
-
# μ νλ νμΌ λͺ©λ‘μμλ μ κ±°
|
| 445 |
-
if file_id in st.session_state.selected_files:
|
| 446 |
-
st.session_state.selected_files.remove(file_id)
|
| 447 |
-
|
| 448 |
-
return True
|
| 449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
def clear_cache():
|
| 452 |
-
"""μΊμλ₯Ό μ΄κΈ°νν©λλ€."""
|
| 453 |
st.session_state.graph_cache = {}
|
| 454 |
-
st.session_state.embeddings_cache = {}
|
| 455 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
# λ°μ΄ν° νμΌ μ€μΊ
|
| 459 |
-
st.session_state.data_files
|
|
|
|
| 460 |
|
| 461 |
# νμ΄ν λ° μκ°
|
| 462 |
-
st.title('νκ΅μ΄ λ¨μ΄ μλ―Έ λ€νΈμν¬ μκ°ν')
|
| 463 |
-
st.markdown(
|
|
|
|
|
|
|
|
|
|
| 464 |
|
| 465 |
-
#
|
| 466 |
-
st.
|
|
|
|
|
|
|
| 467 |
|
| 468 |
-
# μκ³κ° μ€μ
|
| 469 |
-
threshold = st.sidebar.slider(
|
| 470 |
-
'μ μ¬λ μκ³κ°',
|
| 471 |
-
min_value=0.1,
|
| 472 |
-
max_value=0.9,
|
| 473 |
-
value=st.session_state.threshold,
|
| 474 |
-
step=0.05,
|
| 475 |
-
help='λμ κ° = λ μ격ν μ°κ²° κΈ°μ€ (μ μ μ£μ§)'
|
| 476 |
-
)
|
| 477 |
-
st.session_state.threshold = threshold
|
| 478 |
|
| 479 |
-
#
|
| 480 |
-
st.sidebar.
|
| 481 |
-
uploaded_file = st.sidebar.file_uploader("JSON νμΌ μ ν", type=['json'], help="'word' νλλ₯Ό κ°μ§ κ°μ²΄ λ°°μ΄μ΄ ν¬ν¨λ JSON νμΌ")
|
| 482 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
if uploaded_file is not None:
|
| 484 |
-
|
| 485 |
new_file_id = handle_uploaded_file(uploaded_file)
|
| 486 |
if new_file_id:
|
| 487 |
-
st.success(f"νμΌ '{uploaded_file.name}'
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
st.
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
# multiselect λμ session_stateλ₯Ό μ§μ μ¬μ©νλ λ°©μμΌλ‘ λ³κ²½
|
| 502 |
-
if st.session_state.data_files:
|
| 503 |
-
# νμΌ μ ν μ΅μ
μμ±
|
| 504 |
-
options = {}
|
| 505 |
-
for file_id, file_info in st.session_state.data_files.items():
|
| 506 |
-
label = f"{file_info['name']} ({file_info['word_count']}κ° λ¨μ΄) {'[κΈ°λ³Έ]' if file_info['type'] == 'default' else '[μ
λ‘λλ¨]'}"
|
| 507 |
-
options[file_id] = label
|
| 508 |
-
|
| 509 |
-
# κΈ°λ³Έκ° μ€μ (μμ§ μ νμ΄ μμΌλ©΄ 첫 λ²μ§Έ νμΌ μ ν)
|
| 510 |
-
if not st.session_state.selected_files and options:
|
| 511 |
-
st.session_state.selected_files = [next(iter(options.keys()))]
|
| 512 |
-
|
| 513 |
-
# 체ν¬λ°μ€λ‘ νμΌ μ ν ꡬν - μ΄κ² key point!
|
| 514 |
-
st.sidebar.subheader("νμΌ μ ν (μ¬λ¬ κ° μ ν κ°λ₯)")
|
| 515 |
-
|
| 516 |
selected_files_temp = []
|
| 517 |
-
|
| 518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
is_selected = file_id in st.session_state.selected_files
|
| 520 |
-
|
| 521 |
-
|
|
|
|
| 522 |
selected_files_temp.append(file_id)
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
if st.session_state.selected_files:
|
| 529 |
-
st.sidebar.
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
st.write(f"**{file_info['name']}**")
|
| 536 |
-
st.write(f"λ¨μ΄ μ: {file_info['word_count']}")
|
| 537 |
-
st.write(f"μν: {', '.join(file_info['sample_words'])}")
|
| 538 |
-
|
| 539 |
-
with col2:
|
| 540 |
-
if file_info['type'] == 'uploaded':
|
| 541 |
-
if st.button('μμ ', key=f"delete_{file_id}"):
|
| 542 |
-
if delete_file(file_id):
|
| 543 |
-
st.success(f"νμΌ '{file_info['name']}'μ΄(κ°) μμ λμμ΅λλ€.")
|
| 544 |
-
# μ€ν¬λ¦½νΈ μ¬μ€ν
|
| 545 |
-
st.experimental_rerun()
|
| 546 |
-
|
| 547 |
-
# κ·Έλν μμ± λ²νΌ
|
| 548 |
-
generate_button = st.sidebar.button('κ·Έλν μμ±')
|
| 549 |
else:
|
| 550 |
-
st.sidebar.
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
# μ¬μ© μ€λͺ
|
| 576 |
-
with st.expander("κ·Έλν μ‘°μ λ°©λ²"):
|
| 577 |
st.markdown("""
|
| 578 |
-
-
|
| 579 |
-
-
|
| 580 |
-
-
|
| 581 |
-
-
|
|
|
|
| 582 |
""")
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
"""
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
{"word": "μ μλ"},
|
| 607 |
-
{"word": "μΉκ΅¬"},
|
| 608 |
-
{"word": "μμ "},
|
| 609 |
-
...
|
| 610 |
-
]
|
| 611 |
-
```
|
| 612 |
-
|
| 613 |
-
κ° νλͺ©μ "word" νλλ₯Ό κ°μ§ κ°μ²΄μ΄λ©°, λ°°μ΄ μμ ν¬ν¨λμ΄μΌ ν©λλ€.
|
| 614 |
-
""")
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import json
|
| 3 |
import os
|
|
|
|
| 14 |
import matplotlib.font_manager as fm
|
| 15 |
from sklearn.manifold import TSNE
|
| 16 |
import warnings
|
| 17 |
+
import gensim # FastText μ¬μ©μ μν gensim import
|
| 18 |
+
import hashlib # μΊμ ν€ μμ±μ μν΄ μΆκ°
|
| 19 |
+
|
| 20 |
warnings.filterwarnings('ignore')
|
| 21 |
|
| 22 |
+
# --- κΈ°λ³Έ μ€μ ---
|
| 23 |
# νμ΄μ§ μ€μ
|
| 24 |
st.set_page_config(
|
| 25 |
+
page_title="νκ΅μ΄ λ¨μ΄ μλ―Έ λ€νΈμν¬ μκ°ν (FastText)",
|
| 26 |
+
page_icon="π§ ", # μμ΄μ½ λ³κ²½
|
| 27 |
layout="wide"
|
| 28 |
)
|
| 29 |
|
|
|
|
| 32 |
UPLOAD_FOLDER = 'uploads'
|
| 33 |
|
| 34 |
# ν΄λ μμ±
|
| 35 |
+
if not os.path.exists(DATA_FOLDER):
|
| 36 |
+
os.makedirs(DATA_FOLDER)
|
| 37 |
if not os.path.exists(UPLOAD_FOLDER):
|
| 38 |
os.makedirs(UPLOAD_FOLDER)
|
| 39 |
|
| 40 |
+
|
| 41 |
+
# --- FastText λͺ¨λΈ μ€μ ---
|
| 42 |
+
# !!! μ¬μ©μ νμ μ€μ !!!
|
| 43 |
+
# λ€μ΄λ‘λν νκ΅μ΄ FastText λͺ¨λΈ νμΌ(.bin)μ μ 체 κ²½λ‘λ₯Ό μ§μ νμΈμ.
|
| 44 |
+
# μμ: "C:/Users/YourUser/Downloads/cc.ko.300.bin" λλ "/home/user/models/cc.ko.300.bin"
|
| 45 |
+
# λͺ¨λΈ λ€μ΄λ‘λ: https://fasttext.cc/docs/en/crawl-vectors.html λ± μ°Έμ‘°
|
| 46 |
+
FASTTEXT_MODEL_PATH = "YOUR_PATH_TO/cc.ko.300.bin" # <--- μ¬κΈ°μ μ€μ νμΌ κ²½λ‘ μ
λ ₯!!!
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# --- μΈμ
μν μ΄κΈ°ν ---
|
| 50 |
+
if 'fasttext_model' not in st.session_state:
|
| 51 |
+
st.session_state.fasttext_model = None # λͺ¨λΈ κ°μ²΄ μ μ₯
|
| 52 |
if 'embeddings_cache' not in st.session_state:
|
| 53 |
+
st.session_state.embeddings_cache = {} # μλ² λ© μΊμλ λ¨μ΄ λͺ©λ‘+λͺ¨λΈ κΈ°λ°μΌλ‘ μ¬κ³ λ € κ°λ₯ (μ¬κΈ°μ λ¨μν)
|
|
|
|
| 54 |
if 'graph_cache' not in st.session_state:
|
| 55 |
st.session_state.graph_cache = {}
|
|
|
|
| 56 |
if 'data_files' not in st.session_state:
|
| 57 |
st.session_state.data_files = {}
|
|
|
|
| 58 |
if 'selected_files' not in st.session_state:
|
| 59 |
st.session_state.selected_files = []
|
|
|
|
| 60 |
if 'threshold' not in st.session_state:
|
| 61 |
+
st.session_state.threshold = 0.6 # μλ―Έ κΈ°λ°μ΄λ―λ‘ μκ³κ° κΈ°λ³Έκ° μ‘°μ κ°λ₯
|
| 62 |
+
if 'perplexity' not in st.session_state:
|
| 63 |
+
st.session_state.perplexity = 30
|
| 64 |
+
if 'learning_rate' not in st.session_state:
|
| 65 |
+
st.session_state.learning_rate = 'auto'
|
| 66 |
+
if 'n_iter' not in st.session_state:
|
| 67 |
+
st.session_state.n_iter = 1000
|
| 68 |
+
if 'generate_clicked' not in st.session_state:
|
| 69 |
+
st.session_state.generate_clicked = False
|
| 70 |
+
if 'fig' not in st.session_state:
|
| 71 |
+
st.session_state.fig = None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# --- FastText λͺ¨λΈ λ‘λ© ν¨μ (μΊμ± μ¬μ©) ---
|
| 75 |
+
@st.cache_resource # λͺ¨λΈ κ°μ²΄λ ν¬λ―λ‘ λ¦¬μμ€ μΊμ± μ¬μ©
|
| 76 |
+
def load_fasttext_model(model_path):
|
| 77 |
+
"""μ§μ λ κ²½λ‘μμ FastText λͺ¨λΈμ λ‘λν©λλ€."""
|
| 78 |
+
if not os.path.exists(model_path):
|
| 79 |
+
st.error(f"μ€λ₯: FastText λͺ¨λΈ νμΌμ μ°Ύμ μ μμ΅λλ€: {model_path}")
|
| 80 |
+
st.error("FastText μΉμ¬μ΄νΈ λ±μμ νκ΅μ΄ λͺ¨λΈ(cc.ko.300.bin μΆμ²)μ λ€μ΄λ‘λνκ³ μ½λ μλ¨μ `FASTTEXT_MODEL_PATH` λ³μλ₯Ό μ νν μ§μ ν΄μ£ΌμΈμ.")
|
| 81 |
+
return None
|
| 82 |
+
try:
|
| 83 |
+
st.info(f"FastText λͺ¨λΈ λ‘λ© μ€... ({os.path.basename(model_path)}) λͺ¨λΈ ν¬κΈ°μ λ°λΌ μκ°μ΄ 걸릴 μ μμ΅λλ€.")
|
| 84 |
+
# .bin νμΌ λ‘λλ₯Ό μν΄ load_facebook_model μ¬μ©
|
| 85 |
+
model = gensim.models.fasttext.load_facebook_model(model_path)
|
| 86 |
+
st.info("FastText λͺ¨λΈ λ‘λ© μλ£.")
|
| 87 |
+
return model
|
| 88 |
+
except Exception as e:
|
| 89 |
+
st.error(f"FastText λͺ¨λΈ λ‘λ© μ€ μ€λ₯ λ°μ: {e}")
|
| 90 |
+
return None
|
| 91 |
|
| 92 |
# --- νκΈ ν°νΈ μ€μ ν¨μ ---
|
| 93 |
def set_korean_font():
|
| 94 |
+
""" μ΄μ체μ μ λ§λ νκΈ ν°νΈλ₯Ό μ€μ νκ³ Plotlyμ© ν°νΈ μ΄λ¦μ λ°νν©λλ€. """
|
|
|
|
|
|
|
|
|
|
| 95 |
system_name = platform.system()
|
| 96 |
+
plotly_font_name = 'sans-serif' # κΈ°λ³Έκ°
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
if system_name == "Windows":
|
| 100 |
+
font_name = "Malgun Gothic"
|
| 101 |
+
plotly_font_name = "Malgun Gothic"
|
| 102 |
+
elif system_name == "Darwin": # MacOS
|
| 103 |
+
font_name = "AppleGothic"
|
| 104 |
+
plotly_font_name = "AppleGothic"
|
| 105 |
+
elif system_name == "Linux":
|
| 106 |
+
# μμ€ν
μμ Nanum ν°νΈ μ°ΎκΈ° μλ
|
| 107 |
+
font_path = None
|
| 108 |
+
possible_paths = [
|
| 109 |
+
"/usr/share/fonts/truetype/nanum/NanumGothic.ttf",
|
| 110 |
+
"/usr/share/fonts/nanum/NanumGothic.ttf",
|
| 111 |
+
# λ€λ₯Έ κ²½λ‘ μΆκ° κ°λ₯
|
| 112 |
+
]
|
| 113 |
+
for path in possible_paths:
|
| 114 |
+
if os.path.exists(path):
|
| 115 |
+
font_path = path
|
| 116 |
+
break
|
| 117 |
+
|
| 118 |
+
if font_path:
|
| 119 |
+
fm.fontManager.addfont(font_path)
|
| 120 |
+
prop = fm.FontProperties(fname=font_path)
|
| 121 |
+
font_name = prop.get_name()
|
| 122 |
+
plotly_font_name = font_name # Plotlyλ μ΄λ¦ μ¬μ©
|
| 123 |
+
else: # μμ€ν
ν°νΈ λ§€λμ μμ κ²μ
|
| 124 |
available_fonts = [f.name for f in fm.fontManager.ttflist]
|
| 125 |
nanum_fonts = [name for name in available_fonts if 'Nanum' in name]
|
| 126 |
if nanum_fonts:
|
| 127 |
font_name = nanum_fonts[0]
|
| 128 |
+
plotly_font_name = font_name
|
|
|
|
| 129 |
else:
|
| 130 |
+
font_name = None # μ°ΎκΈ° μ€ν¨
|
| 131 |
+
else:
|
| 132 |
+
font_name = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
+
# Matplotlib μ€μ μ μ©
|
| 135 |
+
if font_name:
|
|
|
|
| 136 |
plt.rc('font', family=font_name)
|
| 137 |
plt.rc('axes', unicode_minus=False)
|
| 138 |
+
print(f"Matplotlib font set to: {font_name}")
|
| 139 |
+
else:
|
| 140 |
+
print("Suitable Korean font not found for Matplotlib. Using default.")
|
| 141 |
plt.rcdefaults()
|
| 142 |
plt.rc('axes', unicode_minus=False)
|
| 143 |
+
|
| 144 |
+
except Exception as e:
|
| 145 |
+
print(f"Error setting Korean font: {e}")
|
| 146 |
plt.rcdefaults()
|
| 147 |
plt.rc('axes', unicode_minus=False)
|
| 148 |
|
| 149 |
+
print(f"Plotly font name to use: {plotly_font_name}")
|
| 150 |
+
return plotly_font_name
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
# --- λ°μ΄ν° λ‘λ ν¨μ ---
|
| 153 |
def load_words_from_json(filepath):
|
|
|
|
| 155 |
try:
|
| 156 |
with open(filepath, 'r', encoding='utf-8') as f:
|
| 157 |
data = json.load(f)
|
|
|
|
| 158 |
if isinstance(data, list):
|
| 159 |
+
words = [item.get('word', '') for item in data if isinstance(item, dict) and item.get('word')]
|
| 160 |
+
words = [word for word in words if word] # λΉ λ¬Έμμ΄ μ κ±°
|
| 161 |
+
if not words:
|
| 162 |
+
st.warning(f"κ²½κ³ : νμΌ '{os.path.basename(filepath)}'μμ 'word' ν€λ₯Ό κ°μ§ μ ν¨ν λ°μ΄ν°λ₯Ό μ°Ύμ μ μμ΅λλ€.")
|
| 163 |
+
return None
|
| 164 |
return words
|
| 165 |
else:
|
| 166 |
+
st.error(f"μ€λ₯: νμΌ '{os.path.basename(filepath)}'μ μ΅μμ νμμ΄ λ¦¬μ€νΈκ° μλλλ€.")
|
| 167 |
return None
|
| 168 |
except FileNotFoundError:
|
| 169 |
st.error(f"μ€λ₯: νμΌ '{filepath}'λ₯Ό μ°Ύμ μ μμ΅λλ€.")
|
| 170 |
return None
|
| 171 |
+
except json.JSONDecodeError as e:
|
| 172 |
+
st.error(f"μ€λ₯: νμΌ '{os.path.basename(filepath)}'μ JSON νμμ΄ μλͺ»λμμ΅λλ€. μ€λ₯: {e}")
|
| 173 |
return None
|
| 174 |
except Exception as e:
|
| 175 |
+
st.error(f"'{os.path.basename(filepath)}' λ°οΏ½οΏ½ν° λ‘λ© μ€ μ€λ₯ λ°μ: {e}")
|
| 176 |
return None
|
| 177 |
|
|
|
|
| 178 |
def scan_data_files():
|
| 179 |
+
"""λ°μ΄ν° ν΄λ λ° μ
λ‘λ ν΄λμμ μ¬μ© κ°λ₯ν JSON νμΌμ μ€μΊν©λλ€."""
|
| 180 |
data_files = {}
|
| 181 |
+
# κΈ°λ³Έ λ°μ΄ν° ν΄λ
|
| 182 |
+
try:
|
| 183 |
+
for file_path in glob.glob(os.path.join(DATA_FOLDER, '*.json')):
|
| 184 |
+
file_id = f"default_{os.path.basename(file_path)}"
|
| 185 |
+
file_name = os.path.basename(file_path)
|
| 186 |
+
words = load_words_from_json(file_path)
|
| 187 |
+
if words:
|
| 188 |
+
data_files[file_id] = {'path': file_path, 'name': file_name, 'word_count': len(words), 'type': 'default', 'sample_words': words[:5]}
|
| 189 |
+
except Exception as e:
|
| 190 |
+
st.error(f"κΈ°λ³Έ λ°μ΄ν° ν΄λ μ€μΊ μ€ μ€λ₯: {e}")
|
| 191 |
+
# μ
λ‘λ ν΄λ
|
| 192 |
+
try:
|
| 193 |
+
for file_path in glob.glob(os.path.join(UPLOAD_FOLDER, '*.json')):
|
| 194 |
+
file_id = f"uploaded_{os.path.basename(file_path)}"
|
| 195 |
+
file_name = os.path.basename(file_path)
|
| 196 |
+
words = load_words_from_json(file_path)
|
| 197 |
+
if words:
|
| 198 |
+
data_files[file_id] = {'path': file_path, 'name': file_name, 'word_count': len(words), 'type': 'uploaded', 'sample_words': words[:5]}
|
| 199 |
+
except Exception as e:
|
| 200 |
+
st.error(f"μ
λ‘λ ν΄λ μ€μΊ μ€ μ€λ₯: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
return data_files
|
| 202 |
|
| 203 |
+
def merge_word_lists(file_ids, current_data_files):
|
|
|
|
| 204 |
"""μ νλ νμΌλ€μμ λ¨μ΄λ₯Ό λ‘λνκ³ μ€λ³΅ μ κ±°νμ¬ λ³ν©ν©λλ€."""
|
| 205 |
+
all_words = set() # μ€λ³΅ μ κ±°λ₯Ό μν΄ set μ¬μ©
|
| 206 |
+
if not file_ids:
|
| 207 |
+
return []
|
| 208 |
+
|
| 209 |
for file_id in file_ids:
|
| 210 |
+
if file_id in current_data_files:
|
| 211 |
+
file_path = current_data_files[file_id]['path']
|
| 212 |
words = load_words_from_json(file_path)
|
| 213 |
if words:
|
| 214 |
+
all_words.update(words) # setμ μΆκ°
|
| 215 |
+
else:
|
| 216 |
+
st.warning(f"μ νλ νμΌ ID '{file_id}'λ₯Ό μ°Ύμ μ μμ΅λλ€. λͺ©λ‘μ μλ‘κ³ μΉ¨ν©λλ€.")
|
| 217 |
+
# νμΌ λͺ©λ‘ μ¬μ€μΊ λ‘μ§μ 볡μ‘ν΄μ§ μ μμΌλ―λ‘ μ¬κΈ°μλ κ²½κ³ λ§ νμ
|
| 218 |
+
# μ λ ¬λ 리μ€νΈλ‘ λ°ν
|
| 219 |
+
unique_words = sorted(list(all_words))
|
| 220 |
return unique_words
|
| 221 |
|
| 222 |
+
# --- λ¨μ΄ μλ² λ© ν¨μ (FastText μ¬μ©) ---
|
| 223 |
+
def encode_words_fasttext(words, normalize=True):
|
| 224 |
+
"""FastText λͺ¨λΈμ μ¬μ©νμ¬ λ¨μ΄ λͺ©λ‘μ μλ―Έ μλ² λ©μΌλ‘ λ³νν©λλ€."""
|
| 225 |
+
model = st.session_state.get('fasttext_model')
|
| 226 |
+
|
| 227 |
+
if model is None:
|
| 228 |
+
st.error("FastText λͺ¨λΈμ΄ λ‘λλμ§ μμ μλ² λ©μ μμ±ν μ μμ΅λλ€.")
|
| 229 |
+
return None
|
| 230 |
+
|
| 231 |
+
if not words:
|
| 232 |
+
return np.array([])
|
| 233 |
|
|
|
|
|
|
|
|
|
|
| 234 |
embeddings = []
|
| 235 |
+
oov_count = 0
|
| 236 |
+
vector_size = model.vector_size
|
| 237 |
+
|
| 238 |
+
with st.spinner(f"λ¨μ΄ {len(words)}κ°μ λν μλ―Έ μλ² λ© μμ± μ€ (FastText)..."):
|
| 239 |
+
for word in words:
|
| 240 |
+
try:
|
| 241 |
+
vector = model.wv[word]
|
| 242 |
+
if np.all(vector == 0):
|
| 243 |
+
oov_count += 1
|
| 244 |
+
if normalize:
|
| 245 |
+
norm = np.linalg.norm(vector)
|
| 246 |
+
vector = vector / norm if norm > 0 else np.zeros(vector_size)
|
| 247 |
+
embeddings.append(vector)
|
| 248 |
+
except Exception as e:
|
| 249 |
+
st.warning(f"λ¨μ΄ '{word}' μ²λ¦¬ μ€ μ€λ₯ λ°μ (νΉμ OOV): {e}. 0벑ν°λ‘ λ체ν©λλ€.")
|
| 250 |
+
embeddings.append(np.zeros(vector_size))
|
| 251 |
+
oov_count += 1
|
| 252 |
+
|
| 253 |
+
if oov_count > 0:
|
| 254 |
+
st.warning(f"μ΄ {len(words)}κ° λ¨μ΄ μ€ {oov_count}κ°μ λν΄ μ ν¨ λ²‘ν°λ₯Ό μ»μ§ λͺ»νμ΅λλ€(OOV λ±).")
|
| 255 |
+
|
| 256 |
+
result_embeddings = np.array(embeddings)
|
| 257 |
+
|
| 258 |
+
if result_embeddings.size == 0 and len(words) > 0:
|
| 259 |
+
st.error("μλ² λ© μμ± κ²°κ³Όκ° λΉμ΄ μμ΅λλ€.")
|
| 260 |
+
return None
|
| 261 |
+
elif result_embeddings.shape[0] != len(words):
|
| 262 |
+
st.error(f"μ
λ ₯ λ¨μ΄ μ({len(words)})μ μμ±λ μλ² λ© μ({result_embeddings.shape[0]}) λΆμΌμΉ.")
|
| 263 |
+
return None
|
| 264 |
+
|
| 265 |
+
return result_embeddings
|
| 266 |
+
|
| 267 |
+
# --- κ·Έλν μμ± ν¨μ ---
|
| 268 |
+
def generate_graph(file_ids, similarity_threshold, perplexity, learning_rate, n_iter):
|
| 269 |
+
""" μλ―Έ μ μ¬μ± κΈ°λ° 3D κ·Έλνλ₯Ό μμ±ν©λλ€. """
|
| 270 |
+
# κ·Έλν μΊμ ν€ μμ± (νμΌ ID, μκ³κ°, t-SNE νλΌλ―Έν° ν¬ν¨)
|
| 271 |
+
param_str = f"t{similarity_threshold}_p{perplexity}_lr{learning_rate}_i{n_iter}"
|
| 272 |
+
sorted_fids = "-".join(sorted(file_ids))
|
| 273 |
+
# λ¨μ΄ λͺ©λ‘ μ체λ₯Ό ν΄μνμ¬ μΊμ ν€μ ν¬ν¨ (λ μ ννμ§λ§ λ릴 μ μμ)
|
| 274 |
+
# word_list_for_key = merge_word_lists(file_ids, st.session_state.data_files)
|
| 275 |
+
# word_hash = hashlib.sha256(str(word_list_for_key).encode()).hexdigest()[:8]
|
| 276 |
+
# cache_key = f"{sorted_fids}_{word_hash}_{param_str}_fasttext"
|
| 277 |
+
cache_key = f"{sorted_fids}_{param_str}_fasttext" # νμΌ ID κΈ°λ° μΊμ
|
| 278 |
+
|
| 279 |
if cache_key in st.session_state.graph_cache:
|
| 280 |
+
st.info("μΊμλ κ·Έλνλ₯Ό μ¬μ©ν©λλ€.")
|
| 281 |
return st.session_state.graph_cache[cache_key]
|
| 282 |
+
|
| 283 |
+
# --- νμ λ°μ΄ν° λ‘λ λ° κ²μ¦ ---
|
| 284 |
+
if not file_ids:
|
| 285 |
+
st.error("κ·Έλνλ₯Ό μμ±ν νμΌμ΄ μ νλμ§ μμμ΅λλ€.")
|
| 286 |
+
return None
|
| 287 |
+
if st.session_state.get('fasttext_model') is None:
|
| 288 |
+
st.error("FastText λͺ¨λΈμ΄ λ‘λλμ§ μμ κ·Έλν μμ±μ μ§νν μ μμ΅λλ€.")
|
| 289 |
+
return None
|
| 290 |
+
|
| 291 |
+
plotly_font = set_korean_font() # νκΈ ν°νΈ μ€μ
|
| 292 |
+
word_list = merge_word_lists(file_ids, st.session_state.data_files) # λ¨μ΄ λͺ©λ‘ λ³ν©
|
| 293 |
+
|
| 294 |
if not word_list:
|
| 295 |
+
st.error("μ νλ νμΌμμ μ ν¨ν λ¨μ΄λ₯Ό λ‘λν μ μμ΅λλ€.")
|
| 296 |
+
return None
|
| 297 |
+
if len(word_list) < 2:
|
| 298 |
+
st.warning("κ·Έλνλ₯Ό μμ±νλ €λ©΄ μ΅μ 2κ° μ΄μμ κ³ μ λ¨μ΄κ° νμν©λλ€.")
|
| 299 |
+
return None
|
| 300 |
+
|
| 301 |
+
# --- μλ² λ© μμ± ---
|
| 302 |
+
embeddings = encode_words_fasttext(word_list, normalize=True)
|
| 303 |
+
if embeddings is None or embeddings.shape[0] == 0 or embeddings.shape[1] == 0:
|
| 304 |
+
st.error("μ ν¨ν λ¨μ΄ μλ² λ© μμ± μ€ν¨.")
|
| 305 |
return None
|
| 306 |
+
|
| 307 |
+
# --- μ°¨μ μΆμ (t-SNE) ---
|
| 308 |
+
embeddings_3d = None
|
| 309 |
+
n_samples = embeddings.shape[0]
|
| 310 |
+
with st.spinner(f'λ¨μ΄ {n_samples}κ° μ’ν κ³μ° μ€ (t-SNE)...'):
|
| 311 |
+
effective_perplexity = min(perplexity, max(5, n_samples - 1))
|
| 312 |
+
if effective_perplexity != perplexity:
|
| 313 |
+
st.warning(f"Perplexityκ° μν μμ λ§κ² {effective_perplexity}(μΌ)λ‘ μ‘°μ λμμ΅λλ€.")
|
| 314 |
+
effective_lr = learning_rate if isinstance(learning_rate, (int, float)) else 200.0 if learning_rate == 'auto' else learning_rate
|
| 315 |
+
effective_iter = n_iter
|
| 316 |
+
|
| 317 |
+
if n_samples <= 3:
|
| 318 |
+
st.warning(f"λ¨μ΄ μκ° {n_samples}κ°λ‘ μ μ΄ PCAλ₯Ό μ¬μ©ν©λλ€.")
|
| 319 |
+
from sklearn.decomposition import PCA
|
| 320 |
+
pca = PCA(n_components=min(3, n_samples), random_state=42)
|
| 321 |
+
embeddings_3d_pca = pca.fit_transform(embeddings)
|
| 322 |
+
embeddings_3d = np.zeros((n_samples, 3))
|
| 323 |
+
embeddings_3d[:, :embeddings_3d_pca.shape[1]] = embeddings_3d_pca
|
| 324 |
+
else:
|
| 325 |
+
try:
|
| 326 |
+
tsne = TSNE(n_components=3, random_state=42,
|
| 327 |
+
perplexity=effective_perplexity,
|
| 328 |
+
n_iter=effective_iter,
|
| 329 |
+
init='pca',
|
| 330 |
+
learning_rate=effective_lr,
|
| 331 |
+
n_jobs=-1)
|
| 332 |
+
embeddings_3d = tsne.fit_transform(embeddings)
|
| 333 |
+
except Exception as e:
|
| 334 |
+
st.error(f"t-SNE μ€λ₯: {e}. PCAλ‘ λ체ν©λλ€.")
|
| 335 |
+
from sklearn.decomposition import PCA
|
| 336 |
+
pca = PCA(n_components=3, random_state=42)
|
| 337 |
+
embeddings_3d = pca.fit_transform(embeddings) # PCAλ‘ μ¬μλ
|
| 338 |
+
|
| 339 |
+
if embeddings_3d is None or embeddings_3d.shape[0] != len(word_list):
|
| 340 |
+
st.error("λ¨μ΄ 3D μ’ν μμ± μ€ν¨.")
|
| 341 |
+
return None
|
| 342 |
+
|
| 343 |
+
# --- μ μ¬λ κ³μ° λ° κ·Έλν κ΅¬μ± ---
|
| 344 |
+
edges = []
|
| 345 |
+
edge_weights = []
|
| 346 |
+
with st.spinner('λ¨μ΄ κ° μλ―Έ μ μ¬λ κ³μ° λ° μ°κ²° μμ± μ€...'):
|
| 347 |
+
try:
|
| 348 |
+
similarity_matrix = cosine_similarity(embeddings)
|
| 349 |
+
for i in range(n_samples):
|
| 350 |
+
for j in range(i + 1, n_samples):
|
| 351 |
+
similarity = similarity_matrix[i, j]
|
| 352 |
+
if not np.isnan(similarity) and similarity >= similarity_threshold:
|
| 353 |
+
edges.append((word_list[i], word_list[j]))
|
| 354 |
+
edge_weights.append(similarity)
|
| 355 |
+
except Exception as e:
|
| 356 |
+
st.error(f"μ μ¬λ κ³μ° μ€ μ€λ₯ λ°μ: {e}")
|
| 357 |
+
return None
|
| 358 |
+
|
| 359 |
+
# --- NetworkX κ·Έλν μμ± ---
|
| 360 |
G = nx.Graph()
|
| 361 |
+
valid_nodes_count = 0
|
| 362 |
for i, word in enumerate(word_list):
|
| 363 |
+
if i < embeddings_3d.shape[0]: # μ’νκ° μμ±λ λ
Έλλ§ μΆκ°
|
| 364 |
+
G.add_node(word, pos=(embeddings_3d[i, 0], embeddings_3d[i, 1], embeddings_3d[i, 2]))
|
| 365 |
+
valid_nodes_count += 1
|
| 366 |
+
else:
|
| 367 |
+
st.warning(f"'{word}' λ¨μ΄ μ’ν λλ½.") # λλ½ κ²½κ³
|
| 368 |
+
|
| 369 |
+
if valid_nodes_count != len(word_list):
|
| 370 |
+
st.warning(f"{len(word_list)-valid_nodes_count}κ° λ¨μ΄ λ
Έλ μμ± μ€ν¨.")
|
| 371 |
+
|
| 372 |
+
valid_edges_count = 0
|
| 373 |
for edge, weight in zip(edges, edge_weights):
|
| 374 |
+
if G.has_node(edge[0]) and G.has_node(edge[1]): # λ
Έλκ° μλμ§ νμΈ ν μ£μ§ μΆκ°
|
| 375 |
+
G.add_edge(edge[0], edge[1], weight=weight)
|
| 376 |
+
valid_edges_count += 1
|
| 377 |
+
|
| 378 |
+
# --- Plotly μκ°ν κ°μ²΄ μμ± ---
|
| 379 |
+
edge_x, edge_y, edge_z = [], [], []
|
| 380 |
+
if G.number_of_edges() > 0:
|
|
|
|
| 381 |
for edge in G.edges():
|
| 382 |
+
try:
|
| 383 |
+
pos0 = G.nodes[edge[0]]['pos']
|
| 384 |
+
pos1 = G.nodes[edge[1]]['pos']
|
| 385 |
+
edge_x.extend([pos0[0], pos1[0], None])
|
| 386 |
+
edge_y.extend([pos0[1], pos1[1], None])
|
| 387 |
+
edge_z.extend([pos0[2], pos1[2], None])
|
| 388 |
+
except KeyError as e:
|
| 389 |
+
st.warning(f"μ£μ§ {edge} μμ± μ€ λ
Έλ μμΉ μ€λ₯: {e}")
|
| 390 |
+
continue
|
| 391 |
+
|
| 392 |
+
edge_trace = go.Scatter3d(x=edge_x, y=edge_y, z=edge_z, mode='lines', line=dict(width=1, color='#888'), hoverinfo='none')
|
| 393 |
+
|
| 394 |
+
node_x, node_y, node_z, node_text, node_hover_text, node_sizes = [], [], [], [], [], []
|
| 395 |
+
if G.number_of_nodes() > 0:
|
| 396 |
+
degrees = np.array([G.degree(node) for node in G.nodes()])
|
| 397 |
+
# λ‘κ·Έ μ€μΌμΌλ§ + ν¬κΈ° μ ν
|
| 398 |
+
raw_sizes = np.log1p(degrees) * 3 + 6
|
| 399 |
+
node_sizes_list = np.clip(raw_sizes, 5, 20).tolist()
|
| 400 |
+
|
| 401 |
+
for i, node in enumerate(G.nodes()):
|
| 402 |
+
try:
|
| 403 |
+
pos = G.nodes[node]['pos']
|
| 404 |
+
degree = G.degree(node)
|
| 405 |
+
node_x.append(pos[0])
|
| 406 |
+
node_y.append(pos[1])
|
| 407 |
+
node_z.append(pos[2])
|
| 408 |
+
node_text.append(node)
|
| 409 |
+
node_hover_text.append(f'{node}<br>μ°κ²° μ: {degree}')
|
| 410 |
+
# node_sizes 리μ€νΈλ μ΄λ―Έ μμμ κ³μ°λ¨
|
| 411 |
+
except KeyError:
|
| 412 |
+
st.warning(f"λ
Έλ '{node}' μμΉ μ 보 μ€λ₯.")
|
| 413 |
+
continue # ν΄λΉ λ
Έλ 건λλ°κΈ°
|
| 414 |
+
|
| 415 |
node_trace = go.Scatter3d(
|
| 416 |
x=node_x, y=node_y, z=node_z,
|
| 417 |
mode='markers+text',
|
|
|
|
| 419 |
hovertext=node_hover_text,
|
| 420 |
hoverinfo='text',
|
| 421 |
textposition='top center',
|
| 422 |
+
textfont=dict(size=10, color='black', family=plotly_font),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
marker=dict(
|
| 424 |
+
size=node_sizes_list if node_sizes_list else 5, # κ³μ°λ ν¬κΈ° μ¬μ©
|
| 425 |
+
color=node_z, # Zκ°μΌλ‘ μμ λ§€ν
|
| 426 |
colorscale='Viridis',
|
| 427 |
opacity=0.9,
|
| 428 |
+
colorbar=dict(thickness=15, title='Node Depth (Z)', xanchor='left', titleside='right')
|
| 429 |
)
|
| 430 |
)
|
| 431 |
+
|
| 432 |
+
# --- λ μ΄μμ μ€μ λ° Figure μμ± ---
|
| 433 |
+
current_data_files = st.session_state.get('data_files', {})
|
| 434 |
+
file_names_used = [current_data_files[fid]['name'] for fid in file_ids if fid in current_data_files]
|
| 435 |
+
file_info_str = ", ".join(file_names_used) if file_names_used else "μ μ μμ"
|
| 436 |
+
|
|
|
|
| 437 |
layout = go.Layout(
|
| 438 |
title=dict(
|
| 439 |
+
text=f'<b>μ΄ν μλ―Έ μ μ¬μ± κΈ°λ° 3D κ·Έλν (FastText)</b><br>Threshold: {similarity_threshold:.2f} | λ°μ΄ν°: {file_info_str}',
|
| 440 |
+
font=dict(size=16, family=plotly_font),
|
| 441 |
+
x=0.5, xanchor='center'
|
| 442 |
),
|
| 443 |
showlegend=False,
|
| 444 |
+
margin=dict(l=10, r=10, b=10, t=80),
|
| 445 |
scene=dict(
|
| 446 |
+
xaxis=dict(title='TSNE-1', showticklabels=False, backgroundcolor="rgb(230, 230, 230)", gridcolor="white", zerolinecolor="white"),
|
| 447 |
+
yaxis=dict(title='TSNE-2', showticklabels=False, backgroundcolor="rgb(230, 230, 230)", gridcolor="white", zerolinecolor="white"),
|
| 448 |
+
zaxis=dict(title='TSNE-3', showticklabels=False, backgroundcolor="rgb(230, 230, 230)", gridcolor="white", zerolinecolor="white"),
|
| 449 |
+
aspectratio=dict(x=1, y=1, z=0.8),
|
| 450 |
+
camera=dict(eye=dict(x=1.2, y=1.2, z=0.8))
|
| 451 |
+
),
|
| 452 |
+
hovermode='closest'
|
|
|
|
| 453 |
)
|
| 454 |
+
|
|
|
|
| 455 |
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
|
| 456 |
+
|
| 457 |
+
# κ·Έλν μΊμ μ μ₯
|
| 458 |
st.session_state.graph_cache[cache_key] = fig
|
|
|
|
|
|
|
| 459 |
|
| 460 |
+
return fig
|
| 461 |
|
| 462 |
+
# --- νμΌ μ²λ¦¬ ν¨μ ---
|
| 463 |
def handle_uploaded_file(uploaded_file):
    """Persist an uploaded JSON file, validate it, and refresh the data-file registry.

    Args:
        uploaded_file: Streamlit ``UploadedFile`` from ``st.file_uploader`` (or None).

    Returns:
        The new registry id (``"uploaded_<stored name>"``) on success, or None when
        nothing was uploaded, validation failed, or an error occurred.
    """
    if uploaded_file is None:
        return None

    # Prefix with a UUID so repeated uploads of the same filename never collide.
    stored_name = f"{uuid.uuid4()}_{uploaded_file.name}"
    saved_path = os.path.join(UPLOAD_FOLDER, stored_name)

    try:
        with open(saved_path, 'wb') as out:
            out.write(uploaded_file.getbuffer())
        st.info(f"νμΌ '{uploaded_file.name}' μ μ₯ μλ£. λ΄μ© κ²μ¦ μ€...")

        words = load_words_from_json(saved_path)
        if not words:
            # Reject files without usable 'word' entries and clean up the copy on disk.
            os.remove(saved_path)
            st.error(f"μλ‘λλ νμΌ '{uploaded_file.name}'μμ μ ν¨ν 'word' λ°μ΄ν°λ₯Ό μ°Ύμ μ μμ΅λλ€. μμ λμμ΅λλ€.")
            return None

        st.success(f"νμΌ '{uploaded_file.name}' κ²μ¦ μλ£ ({len(words)} λ¨μ΄).")
        # Rescan so the new file shows up immediately in the file selector.
        st.session_state.data_files = scan_data_files()
        return f"uploaded_{stored_name}"

    except Exception as e:
        st.error(f"νμΌ μλ‘λ μ²λ¦¬ μ€ μ€λ₯: {e}")
        # Do not leave a half-processed file behind.
        if os.path.exists(saved_path):
            os.remove(saved_path)
        return None
| 491 |
def delete_file(file_id):
    """Delete an uploaded data file and purge related session state and graph caches.

    Only files of type ``'uploaded'`` may be removed; bundled default files are
    protected.

    Args:
        file_id: Registry key of the file in ``st.session_state.data_files``.

    Returns:
        True when the file entry was removed from the registry, False otherwise.
    """
    current_data_files = st.session_state.get('data_files', {})
    if file_id not in current_data_files:
        st.error('μμ ν  νμΌμ μ°Ύμ μ μμ΅λλ€.')
        return False

    file_info = current_data_files[file_id]
    if file_info.get('type') != 'uploaded':
        st.error('κΈ°λ³Έ λ°μ΄ν° νμΌμ μμ ν  μ μμ΅λλ€.')
        return False

    file_path = file_info.get('path')
    file_name = file_info.get('name', 'μ μ μμ')

    try:
        if file_path and os.path.exists(file_path):
            os.remove(file_path)
            st.info(f"νμΌ '{file_name}' μμ  μλ£.")
        else:
            st.warning(f"νμΌ '{file_name}'({file_path})μ μ°Ύμ μ μκ±°λ μ΄λ―Έ μμ λμμ΅λλ€.")

        # Update session state: drop the registry entry and deselect the file.
        del st.session_state.data_files[file_id]
        if file_id in st.session_state.selected_files:
            st.session_state.selected_files.remove(file_id)

        # Invalidate cached graphs built from this file.
        # BUG FIX: the old check `file_id in k.split('_')[0]` compared against only
        # the first underscore-separated token of the cache key; file ids themselves
        # contain underscores (e.g. "uploaded_<uuid>_<name>"), so it could never
        # match and stale graphs survived deletion. Match the whole key instead
        # (cache key format assumed to embed the file ids — TODO confirm against
        # the cache_key construction in generate_graph).
        keys_to_remove = [k for k in st.session_state.graph_cache if file_id in k]
        for key in keys_to_remove:
            del st.session_state.graph_cache[key]
        if keys_to_remove:
            st.info(f"{len(keys_to_remove)}κ° κ΄λ ¨ κ·Έλν μΊμ μμ .")

        st.success(f"'{file_name}' κ΄λ ¨ μ 보 λ° μΊμ μμ  μλ£.")
        return True

    except Exception as e:
        st.error(f"νμΌ μμ  μ€ μ€λ₯ λ°μ: {e}")
        return False
| 530 |
+
|
| 531 |
+
# --- μΊμ μ΄κΈ°ν ν¨μ ---
|
| 532 |
# --- Cache reset helper ---
def clear_cache():
    """Drop every cached graph figure and force the next render to rebuild it."""
    st.session_state["graph_cache"] = {}
    st.session_state["fig"] = None
    st.success('κ·Έλν μΊμκ° μ΄κΈ°νλμμ΅λλ€.')
    st.rerun()  # refresh the UI immediately
| 540 |
+
|
| 541 |
+
# ==============================================================================
# --- Streamlit application entry point ---
# ==============================================================================

# --- One-time initialisation at app start ---
# Load the FastText model once and keep it in session state.
if st.session_state.get('fasttext_model') is None:
    st.session_state.fasttext_model = load_fasttext_model(FASTTEXT_MODEL_PATH)

# Scan for available data files when the registry is empty.
if not st.session_state.get('data_files'):
    st.session_state.data_files = scan_data_files()

# Title and introduction.
st.title('νκ΅μ΄ λ¨μ΄ μλ―Έ λ€νΈμν¬ μκ°ν (FastText κΈ°λ°)')
st.markdown("""
μ΄ λꡬλ JSON νμΌμ λ¨μ΄ λͺ©λ‘μ **FastText μλ² λ©**μΌλ‘ λ³ννμ¬ μλ―Έμ  μ μ¬μ±μ κ³μ°νκ³ , κ·Έ κ΄κ³λ₯Ό 3D λ€νΈμν¬ κ·Έλνλ‘ μκ°νν©λλ€.
μ μ¬ν μλ―Έμ λ¨μ΄λ€μ΄ μλ‘ κ°κΉκ² λ°°μΉλλ κ²½ν₯μ λ³΄μ λλ€.
""")

# Abort early when the embedding model is unavailable — nothing below can work.
if st.session_state.get('fasttext_model') is None:
    st.error("FastText λͺ¨λΈ λ‘λ© μ€ν¨. μ½λ μλ¨μ `FASTTEXT_MODEL_PATH` μ€μ μ νμΈνκ³  μ±μ μ¬μ€νν΄μ£ΌμΈμ.")
    st.stop()
| 565 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
|
| 567 |
+
# --- Sidebar ---
st.sidebar.title('βοΈ μ€μ  λ° μ μ΄')

# 1. Similarity threshold slider.
threshold = st.sidebar.slider(
    'μ μ¬λ μκ³κ° (Similarity Threshold)', 0.1, 0.95, st.session_state.threshold, 0.05,
    help='μ΄ κ° μ΄μμΌλ‘ μ μ¬ν λ¨μ΄λ§ μ°κ²°ν©λλ€. λμμλ‘ μ°κ²°μ΄ μ격ν΄μ§λλ€.'
)
if st.session_state.threshold != threshold:
    # A new threshold invalidates the current figure.
    st.session_state.threshold = threshold
    st.session_state.fig = None
    st.session_state.generate_clicked = False

st.sidebar.divider()

# 2. t-SNE hyper-parameters (advanced visual tuning).
st.sidebar.header("t-SNE νλΌλ―Έν° (κ³ κΈ)")
perplexity = st.sidebar.slider(
    "Perplexity", 5, 50, st.session_state.perplexity, 1,
    help="κ° μ μ΄ κ³ λ €νλ μ΄μ μμ κ΄λ ¨. κ΅°μ§ ννμ μν₯."
)
learning_rate = st.sidebar.select_slider(
    "Learning Rate", options=[10, 50, 100, 200, 500, 1000, 'auto'], value=st.session_state.learning_rate,
    help="μ΅μ ν νμ΅ μλ. κ΅°μ§ κ° κ±°λ¦¬μ μν₯."
)
n_iter = st.sidebar.select_slider(
    "Iterations", options=[250, 500, 1000, 2000, 5000], value=st.session_state.n_iter,
    help="μ΅μ ν λ°λ³΅ νμ. λμμλ‘ μμ μ μ΄λ μ€λ κ±Έλ¦Ό."
)
# Any t-SNE parameter change also invalidates the cached figure.
if (perplexity, learning_rate, n_iter) != (st.session_state.perplexity,
                                           st.session_state.learning_rate,
                                           st.session_state.n_iter):
    st.session_state.perplexity = perplexity
    st.session_state.learning_rate = learning_rate
    st.session_state.n_iter = n_iter
    st.session_state.fig = None
    st.session_state.generate_clicked = False

st.sidebar.divider()

# 3. File upload.
st.sidebar.header('π νμΌ μλ‘λ')
uploaded_file = st.sidebar.file_uploader(
    "JSON νμΌ μλ‘λ (νμ: [{'word': 'λ¨μ΄1'}, ...])", type=['json']
)
if uploaded_file is not None:
    with st.spinner("μλ‘λλ νμΌ μ²λ¦¬ μ€..."):
        new_file_id = handle_uploaded_file(uploaded_file)
        if new_file_id:
            st.sidebar.success(f"νμΌ '{uploaded_file.name}' μλ‘λ μλ£!")
            # Auto-select the freshly uploaded file.
            if new_file_id not in st.session_state.selected_files:
                st.session_state.selected_files.append(new_file_id)
            st.rerun()  # refresh the UI right away

st.sidebar.divider()

# 4. Data file selection.
st.sidebar.header('ποΈ λ°μ΄ν° νμΌ μ ν')
current_data_files = st.session_state.get('data_files', {})
if current_data_files:
    st.sidebar.markdown("**μ¬μ©ν  νμΌμ μ ννμΈμ:**")
    selected_files_temp = []
    sorted_file_ids = sorted(current_data_files.keys(), key=lambda fid: current_data_files[fid]['name'])

    for file_id in sorted_file_ids:
        if file_id not in current_data_files:
            continue  # defensive: registry may shrink mid-iteration
        file_info = current_data_files[file_id]
        file_label = f"{file_info['name']} ({file_info['word_count']} λ¨μ΄)"
        file_type_tag = "[κΈ°λ³Έ]" if file_info['type'] == 'default' else "[μλ‘λ]"
        label_full = f"{file_label} {file_type_tag}"
        is_selected = file_id in st.session_state.selected_files

        # Checkbox state drives the (temporary) selection list.
        if st.sidebar.checkbox(label_full, value=is_selected, key=f"cb_{file_id}"):
            selected_files_temp.append(file_id)
        # Per-file details expander, with a delete button for uploaded files only.
        with st.sidebar.expander("νμΌ μ 보 보기", expanded=False):
            st.markdown(f"**μν:** `{'`, `'.join(file_info['sample_words'])}`")
            if file_info['type'] == 'uploaded':
                if st.button('ποΈ μ΄ νμΌ μμ ', key=f"del_{file_id}", help=f"'{file_info['name']}' μμ "):
                    if delete_file(file_id):
                        st.rerun()  # refresh UI after successful deletion

    # Persist any selection change and invalidate the figure.
    if sorted(selected_files_temp) != sorted(st.session_state.selected_files):
        st.session_state.selected_files = selected_files_temp
        st.session_state.fig = None
        st.session_state.generate_clicked = False
        st.rerun()

    st.sidebar.divider()

    # 5. Generate button.
    if st.session_state.selected_files:
        if st.sidebar.button('π κ·Έλν μμ±/μλ°μ΄νΈ', key='generate_button', type="primary"):
            # The click itself triggers a rerun; only the flag is set here.
            st.session_state.generate_clicked = True
    else:
        st.sidebar.warning('κ·Έλνλ₯Ό μμ±ν  νμΌμ 1κ° μ΄μ μ νν΄μ£ΌμΈμ.')
else:
    st.sidebar.info('μ¬μ© κ°λ₯ν λ°μ΄ν° νμΌμ΄ μμ΅λλ€. νμΌμ μλ‘λνκ±°λ `data` ν΄λμ JSON νμΌμ μΆκ°νμΈμ.')

st.sidebar.divider()

# 6. Cache reset button.
if st.sidebar.button('π μΊμ μ΄κΈ°ν', key='clear_cache_button'):
    clear_cache()
| 677 |
+
|
| 678 |
+
|
| 679 |
+
# --- Main content area ---
st.header("π 3D λ¨μ΄ λ€νΈμν¬ μκ°ν")

# Graph display logic.
if st.session_state.selected_files:
    # Regenerate when the user clicked the button, or when files are selected
    # but no figure has been produced yet.
    should_generate_graph = st.session_state.generate_clicked or \
                            (st.session_state.fig is None and st.session_state.selected_files)

    if should_generate_graph and st.session_state.get('fasttext_model'):  # model must be loaded
        with st.spinner('μλ―Έ κΈ°λ° κ·Έλν μμ± μ€... μ μλ§ κΈ°λ€λ €μ£ΌμΈμ.'):
            try:
                # Build the graph with every current parameter.
                fig = generate_graph(
                    st.session_state.selected_files,
                    st.session_state.threshold,
                    st.session_state.perplexity,
                    st.session_state.learning_rate,
                    st.session_state.n_iter
                )
                st.session_state.fig = fig  # store on success
            except Exception as e:
                st.error(f"κ·Έλν μμ± μ€ μ¬κ°ν μ€λ₯ λ°μ: {e}")
                st.session_state.fig = None  # reset on failure
            finally:
                st.session_state.generate_clicked = False  # always clear the click flag

    # Show the figure when one exists.
    if st.session_state.get('fig') is not None:
        st.plotly_chart(st.session_state.fig, use_container_width=True)

        # Summarise the current graph (node/edge counts, source files).
        try:
            num_nodes = len(st.session_state.fig.data[1].x) if len(st.session_state.fig.data) > 1 and hasattr(st.session_state.fig.data[1], 'x') else 0
            # The edge trace stores each segment as (x0, x1, None), hence // 3.
            num_edges = len(st.session_state.fig.data[0].x) // 3 if len(st.session_state.fig.data) > 0 and hasattr(st.session_state.fig.data[0], 'x') and st.session_state.fig.data[0].x else 0

            current_data_files = st.session_state.get('data_files', {})
            selected_file_names = [current_data_files[fid]['name'] for fid in st.session_state.selected_files if fid in current_data_files]

            st.info(f"""
            **νμ¬ κ·Έλν μ 보**
            - **λ°μ΄ν° νμΌ:** {', '.join(selected_file_names)}
            - **κ³ μ  λ¨μ΄ μ (λΈλ):** {num_nodes} κ°
            - **μ°κ²°μ  μ (μ£μ§):** {num_edges} κ° (μ μ¬λ β₯ {st.session_state.threshold:.2f})
            """)
        except Exception as info_e:
            st.warning(f"κ·Έλν μ 보 νμ μ€ μ€λ₯: {info_e}")

        # Usage help.
        with st.expander("π‘ κ·Έλν μ‘°μ λ°©λ²"):
            st.markdown("""
            - **νλ/μΆμ:** λ§μ°μ€ ν  μ€ν¬λ‘€
            - **νμ :** λ§μ°μ€ μΌμͺ½ λ²νΌ λλ₯Έ μνλ‘ λλκ·Έ
            - **μ΄λ (Pan):** λ§μ°μ€ μ€λ₯Έμͺ½ λ²νΌ λλ₯Έ μνλ‘ λλκ·Έ
            - **λ¨μ΄ μ 보 νμΈ:** λ§μ°μ€ 컀μλ₯Ό λ¨μ΄(λ§μ»€) μμ μ¬λ¦¬λ©΄ λ¨μ΄ μ΄λ¦κ³Ό μ°κ²° μλ₯Ό λ³Ό μ μμ΅λλ€.
            - **ν΄λ°:** κ·Έλν μ°μΈ‘ μλ¨ ν΄λ° μμ΄μ½μΌλ‘ λ€μν κΈ°λ₯(λ€μ΄λ‘λ, μ΄κΈ°ν λ±) μ¬μ© κ°λ₯.
            """)
    # BUG FIX: the old condition was `not should_generate_graph and fig is None`,
    # which displayed nothing when generation was attempted but FAILED
    # (should_generate_graph stays truthy while fig is None). Checking only
    # `fig is None` covers both "not generated yet" and "generation failed",
    # matching the intent stated by the original comment.
    elif st.session_state.fig is None:
        st.info("π μ¬μ΄λλ°μμ 'π κ·Έλν μμ±/μλ°μ΄νΈ' λ²νΌμ ν΄λ¦νμ¬ μκ°νλ₯Ό μμνμΈμ.")

# No data files available at all.
elif not st.session_state.data_files:
    st.warning("νμν λ°μ΄ν° νμΌμ΄ μμ΅λλ€. νμΌμ μλ‘λνκ±°λ `data` ν΄λμ μ ν¨ν JSON νμΌμ μΆκ°νμΈμ.")
else:
    # Files exist but none is selected.
    st.info("π μ¬μ΄λλ°μμ λΆμν  λ°μ΄ν° νμΌμ μ νν΄μ£ΌμΈμ.")
| 746 |
+
|
| 747 |
+
|
| 748 |
+
# --- Footer information section ---
st.divider()
with st.expander("βΉοΈ μ΄ μκ°ν λκ΅¬μ λνμ¬"):
    # f-string: embeds the live model path, t-SNE parameters, and threshold.
    st.markdown(f"""
    μ΄ λꡬλ λ€μκ³Ό κ°μ κ³Όμ μ ν΅ν΄ νκ΅μ΄ λ¨μ΄ λ€νΈμν¬λ₯Ό μκ°νν©λλ€:

    1. **λ°μ΄ν° λ‘λ©:** μ¬μ©μκ° μ 곡ν JSON νμΌμμ 'word' νλλ₯Ό κ°μ§ λ¨μ΄ λͺ©λ‘μ μΆμΆν©λλ€.
    2. **λ¨μ΄ μλ² λ© (FastText):** κ° λ¨μ΄λ₯Ό **μ¬μ  νμ΅λ FastText λͺ¨λΈ**(`{os.path.basename(FASTTEXT_MODEL_PATH)}` μ¬μ© μ€)μ μ¬μ©νμ¬ κ³ μ°¨μμ μλ―Έ 벑ν°λ‘ λ³νν©λλ€.
    3. **μ μ¬λ κ³μ°:** λ¨μ΄ λ²‘ν° κ°μ **μ½μ¬μΈ μ μ¬λ**λ₯Ό κ³μ°ν©λλ€.
    4. **μ°¨μ μΆμ (t-SNE):** κ³ μ°¨μ 벑ν°λ₯Ό 3μ°¨μμΌλ‘ μΆμνμ¬ μκ°νν©λλ€. t-SNE νλΌλ―Έν°(Perplexity: {st.session_state.perplexity}, Learning Rate: {st.session_state.learning_rate}, Iterations: {st.session_state.n_iter})λ₯Ό μ‘°μ νμ¬ κ΅°μ§ ννλ₯Ό λ―ΈμΈ μ‘°μ ν  μ μμ΅λλ€.
    5. **κ·Έλν μμ± λ° μκ°ν:** μ μ¬λκ° μ€μ λ μκ³κ°(νμ¬: {st.session_state.threshold:.2f}) μ΄μμΈ λ¨μ΄λ€μ μ°κ²°νμ¬ 3D λ€νΈμν¬ κ·Έλνλ₯Ό μμ±νκ³  νμν©λλ€.
    """)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|