jeongsoo commited on
Commit
e50c25d
Β·
1 Parent(s): d9e50af
Files changed (1) hide show
  1. app.py +604 -459
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import streamlit as st
3
  import json
4
  import os
@@ -15,12 +14,16 @@ import matplotlib.pyplot as plt
15
  import matplotlib.font_manager as fm
16
  from sklearn.manifold import TSNE
17
  import warnings
 
 
 
18
  warnings.filterwarnings('ignore')
19
 
 
20
  # νŽ˜μ΄μ§€ μ„€μ •
21
  st.set_page_config(
22
- page_title="ν•œκ΅­μ–΄ 단어 의미 λ„€νŠΈμ›Œν¬ μ‹œκ°ν™”",
23
- page_icon="πŸ”€",
24
  layout="wide"
25
  )
26
 
@@ -29,100 +32,122 @@ DATA_FOLDER = 'data'
29
  UPLOAD_FOLDER = 'uploads'
30
 
31
  # 폴더 생성
 
 
32
  if not os.path.exists(UPLOAD_FOLDER):
33
  os.makedirs(UPLOAD_FOLDER)
34
 
35
- # μ„Έμ…˜ μƒνƒœ μ΄ˆκΈ°ν™”
36
- if 'model' not in st.session_state:
37
- st.session_state.model = None
38
-
 
 
 
 
 
 
 
 
39
  if 'embeddings_cache' not in st.session_state:
40
- st.session_state.embeddings_cache = {}
41
-
42
  if 'graph_cache' not in st.session_state:
43
  st.session_state.graph_cache = {}
44
-
45
  if 'data_files' not in st.session_state:
46
  st.session_state.data_files = {}
47
-
48
  if 'selected_files' not in st.session_state:
49
  st.session_state.selected_files = []
50
-
51
  if 'threshold' not in st.session_state:
52
- st.session_state.threshold = 0.7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  # --- ν•œκΈ€ 폰트 μ„€μ • ν•¨μˆ˜ ---
55
  def set_korean_font():
56
- """
57
- ν˜„μž¬ μš΄μ˜μ²΄μ œμ— λ§žλŠ” ν•œκΈ€ 폰트λ₯Ό matplotlib 및 Plotly용으둜 μ„€μ • μ‹œλ„ν•˜κ³ ,
58
- Plotlyμ—μ„œ μ‚¬μš©ν•  폰트 이름을 λ°˜ν™˜ν•©λ‹ˆλ‹€.
59
- """
60
  system_name = platform.system()
61
- plotly_font_name = None # Plotlyμ—μ„œ μ‚¬μš©ν•  폰트 이름
62
-
63
- # Matplotlib 폰트 μ„€μ •
64
- if system_name == "Windows":
65
- font_name = "Malgun Gothic"
66
- plotly_font_name = "Malgun Gothic"
67
- elif system_name == "Darwin": # MacOS
68
- font_name = "AppleGothic"
69
- plotly_font_name = "AppleGothic"
70
- elif system_name == "Linux":
71
- # Linuxμ—μ„œ μ„ ν˜Έν•˜λŠ” ν•œκΈ€ 폰트 경둜 λ˜λŠ” 이름 μ„€μ •
72
- font_path = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
73
- plotly_font_name_linux = "NanumGothic" # PlotlyλŠ” 폰트 '이름'을 주둜 μ‚¬μš©
74
-
75
- if os.path.exists(font_path):
76
- font_name = fm.FontProperties(fname=font_path).get_name()
77
- plotly_font_name = plotly_font_name_linux
78
- else:
79
- # μ‹œμŠ€ν…œμ—μ„œ 'Nanum' 포함 폰트 μ°ΎκΈ° μ‹œλ„
80
- try:
 
 
 
 
 
 
 
 
81
  available_fonts = [f.name for f in fm.fontManager.ttflist]
82
  nanum_fonts = [name for name in available_fonts if 'Nanum' in name]
83
  if nanum_fonts:
84
  font_name = nanum_fonts[0]
85
- # Plotlyμ—μ„œ μ‚¬μš©ν•  이름도 λΉ„μŠ·ν•˜κ²Œ μ„€μ • (μ •ν™•ν•œ 이름은 μ‹œμŠ€ν…œλ§ˆλ‹€ λ‹€λ₯Ό 수 있음)
86
- plotly_font_name = font_name if 'Nanum' in font_name else plotly_font_name_linux
87
  else:
88
- # λ‹€λ₯Έ OS 폰트 μ‹œλ„
89
- if "Malgun Gothic" in available_fonts:
90
- font_name = "Malgun Gothic"
91
- plotly_font_name = "Malgun Gothic"
92
- elif "AppleGothic" in available_fonts:
93
- font_name = "AppleGothic"
94
- plotly_font_name = "AppleGothic"
95
- else:
96
- font_name = None
97
-
98
- except Exception as e:
99
- font_name = None
100
-
101
- if not font_name:
102
- font_name = None
103
- plotly_font_name = None # Plotly도 κΈ°λ³Έκ°’ μ‚¬μš©
104
-
105
- else: # 기타 OS
106
- font_name = None
107
- plotly_font_name = None
108
 
109
- # Matplotlib 폰트 μ„€μ • 적용
110
- if font_name:
111
- try:
112
  plt.rc('font', family=font_name)
113
  plt.rc('axes', unicode_minus=False)
114
- except Exception as e:
 
 
115
  plt.rcdefaults()
116
  plt.rc('axes', unicode_minus=False)
117
- else:
 
 
118
  plt.rcdefaults()
119
  plt.rc('axes', unicode_minus=False)
120
 
121
- if not plotly_font_name:
122
- plotly_font_name = 'sans-serif' # Plotly κΈ°λ³Έκ°’ μ§€μ •
123
-
124
- return plotly_font_name # Plotlyμ—μ„œ μ‚¬μš©ν•  폰트 이름 λ°˜ν™˜
125
-
126
 
127
  # --- 데이터 λ‘œλ“œ ν•¨μˆ˜ ---
128
  def load_words_from_json(filepath):
@@ -130,190 +155,263 @@ def load_words_from_json(filepath):
130
  try:
131
  with open(filepath, 'r', encoding='utf-8') as f:
132
  data = json.load(f)
133
- # dataκ°€ 리슀트 ν˜•νƒœλΌκ³  κ°€μ •
134
  if isinstance(data, list):
135
- words = [item.get('word', '') for item in data if item.get('word')]
136
- # 빈 λ¬Έμžμ—΄ 제거
137
- words = [word for word in words if word]
 
 
138
  return words
139
  else:
140
- st.error(f"였λ₯˜: 파일 '{filepath}'의 μ΅œμƒμœ„ ν˜•μ‹μ΄ λ¦¬μŠ€νŠΈκ°€ μ•„λ‹™λ‹ˆλ‹€.")
141
  return None
142
  except FileNotFoundError:
143
  st.error(f"였λ₯˜: 파일 '{filepath}'λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
144
  return None
145
- except json.JSONDecodeError:
146
- st.error(f"였λ₯˜: 파일 '{filepath}'의 JSON ν˜•μ‹μ΄ 잘λͺ»λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
147
  return None
148
  except Exception as e:
149
- st.error(f"데이터 λ‘œλ”© 쀑 였λ₯˜ λ°œμƒ: {e}")
150
  return None
151
 
152
-
153
  def scan_data_files():
154
- """데이터 ν΄λ”μ—μ„œ μ‚¬μš© κ°€λŠ₯ν•œ λͺ¨λ“  JSON νŒŒμΌμ„ μŠ€μΊ”ν•˜κ³  정보λ₯Ό λ°˜ν™˜ν•©λ‹ˆλ‹€."""
155
  data_files = {}
156
-
157
- # κΈ°λ³Έ 데이터 폴더 μŠ€μΊ”
158
- for file_path in glob.glob(os.path.join(DATA_FOLDER, '*.json')):
159
- file_id = str(uuid.uuid4())
160
- file_name = os.path.basename(file_path)
161
- words = load_words_from_json(file_path)
162
- if words:
163
- data_files[file_id] = {
164
- 'path': file_path,
165
- 'name': file_name,
166
- 'word_count': len(words),
167
- 'type': 'default',
168
- 'sample_words': words[:5] if len(words) > 5 else words
169
- }
170
-
171
- # μ—…λ‘œλ“œ 폴더 μŠ€μΊ”
172
- for file_path in glob.glob(os.path.join(UPLOAD_FOLDER, '*.json')):
173
- file_id = str(uuid.uuid4())
174
- file_name = os.path.basename(file_path)
175
- words = load_words_from_json(file_path)
176
- if words:
177
- data_files[file_id] = {
178
- 'path': file_path,
179
- 'name': file_name,
180
- 'word_count': len(words),
181
- 'type': 'uploaded',
182
- 'sample_words': words[:5] if len(words) > 5 else words
183
- }
184
-
185
  return data_files
186
 
187
-
188
- def merge_word_lists(file_ids):
189
  """μ„ νƒλœ νŒŒμΌλ“€μ—μ„œ 단어λ₯Ό λ‘œλ“œν•˜κ³  쀑볡 μ œκ±°ν•˜μ—¬ λ³‘ν•©ν•©λ‹ˆλ‹€."""
190
- all_words = []
191
-
 
 
192
  for file_id in file_ids:
193
- if file_id in st.session_state.data_files:
194
- file_path = st.session_state.data_files[file_id]['path']
195
  words = load_words_from_json(file_path)
196
  if words:
197
- all_words.extend(words)
198
-
199
- # 쀑볡 제거 및 μ •λ ¬
200
- unique_words = sorted(list(set(all_words)))
 
 
201
  return unique_words
202
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
- def encode_words(words, normalize=True):
205
- """단어 λͺ©λ‘μ„ μž„λ² λ”©μœΌλ‘œ λ³€ν™˜ν•©λ‹ˆλ‹€."""
206
- # κ°„λ‹¨ν•œ TF-IDF μŠ€νƒ€μΌ μž„λ² λ”© 생성 (μž„μ‹œ)
207
  embeddings = []
208
- vocab = set(" ".join(words))
209
- dim = len(vocab)
210
-
211
- char_to_idx = {char: i for i, char in enumerate(sorted(vocab))}
212
-
213
- for word in words:
214
- embed = np.zeros(dim)
215
- for char in word:
216
- if char in char_to_idx:
217
- embed[char_to_idx[char]] += 1
218
-
219
- # μ •κ·œν™” (선택적)
220
- if normalize and np.sum(embed) > 0:
221
- embed = embed / np.linalg.norm(embed)
222
-
223
- embeddings.append(embed)
224
-
225
- return np.array(embeddings)
226
-
227
-
228
- def generate_graph(file_ids, similarity_threshold=0.7):
229
- """μ—¬λŸ¬ νŒŒμΌμ—μ„œ 단어λ₯Ό λ‘œλ“œν•˜κ³  κ·Έλž˜ν”„λ₯Ό μƒμ„±ν•©λ‹ˆλ‹€."""
230
- # μΊμ‹œ ν‚€ 생성 (파일 ID와 μž„κ³„κ°’ μ‘°ν•©)
231
- cache_key = f"{'-'.join(sorted(file_ids))}_{similarity_threshold}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  if cache_key in st.session_state.graph_cache:
 
233
  return st.session_state.graph_cache[cache_key]
234
-
235
- # ν•œκΈ€ 폰트 μ„€μ •
236
- plotly_font = set_korean_font()
237
-
238
- # μ„ νƒλœ νŒŒμΌλ“€μ—μ„œ 단어 λ‘œλ“œ 및 병합
239
- word_list = merge_word_lists(file_ids)
240
-
 
 
 
 
 
241
  if not word_list:
242
- st.error("데이터λ₯Ό λ‘œλ“œν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
 
 
 
 
 
 
 
 
 
243
  return None
244
-
245
- # μž„λ² λ”© 생성
246
- with st.spinner('μž„λ² λ”© 생성 쀑...'):
247
- embeddings = encode_words(word_list, normalize=True)
248
-
249
- # 3D μ’Œν‘œ 생성 - t-SNE μ‚¬μš©
250
- with st.spinner('차원 μΆ•μ†Œ 쀑 (t-SNE)...'):
251
- effective_perplexity = min(30, len(word_list) - 1)
252
- if effective_perplexity <= 0:
253
- effective_perplexity = 5 # 맀우 μž‘μ€ 데이터셋 λŒ€λΉ„
254
-
255
- tsne = TSNE(n_components=3, random_state=42, perplexity=effective_perplexity,
256
- max_iter=1000, init='pca', learning_rate='auto')
257
- embeddings_3d = tsne.fit_transform(embeddings)
258
-
259
- # μœ μ‚¬λ„ 계산 및 μ—£μ§€ μ •μ˜
260
- with st.spinner('μœ μ‚¬λ„ 계산 쀑...'):
261
- similarity_matrix = cosine_similarity(embeddings)
262
-
263
- edges = []
264
- edge_weights = []
265
- for i in range(len(word_list)):
266
- for j in range(i + 1, len(word_list)):
267
- similarity = similarity_matrix[i, j]
268
- if similarity > similarity_threshold:
269
- edges.append((word_list[i], word_list[j]))
270
- edge_weights.append(similarity)
271
-
272
- # NetworkX κ·Έλž˜ν”„ 생성
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  G = nx.Graph()
 
274
  for i, word in enumerate(word_list):
275
- G.add_node(word, pos=(embeddings_3d[i, 0], embeddings_3d[i, 1], embeddings_3d[i, 2]))
276
-
277
- # 엣지와 κ°€μ€‘μΉ˜ μΆ”κ°€
 
 
 
 
 
 
 
278
  for edge, weight in zip(edges, edge_weights):
279
- G.add_edge(edge[0], edge[1], weight=weight)
280
-
281
- # Plotly κ·Έλž˜ν”„ 생성
282
- # μ—£μ§€ μ’Œν‘œ μΆ”μΆœ
283
- edge_x = []
284
- edge_y = []
285
- edge_z = []
286
- if edges:
287
  for edge in G.edges():
288
- x0, y0, z0 = G.nodes[edge[0]]['pos']
289
- x1, y1, z1 = G.nodes[edge[1]]['pos']
290
- edge_x.extend([x0, x1, None])
291
- edge_y.extend([y0, y1, None])
292
- edge_z.extend([z0, z1, None])
293
-
294
- # μ—£μ§€μš© Scatter3d 트레이슀 생성
295
- edge_trace = go.Scatter3d(
296
- x=edge_x, y=edge_y, z=edge_z,
297
- mode='lines',
298
- line=dict(width=1, color='#888'),
299
- hoverinfo='none'
300
- )
301
- else:
302
- edge_trace = go.Scatter3d(x=[], y=[], z=[], mode='lines')
303
-
304
- # λ…Έλ“œ μœ„μΉ˜μ™€ ν…μŠ€νŠΈ μΆ”μΆœ
305
- node_x = [G.nodes[node]['pos'][0] for node in G.nodes()]
306
- node_y = [G.nodes[node]['pos'][1] for node in G.nodes()]
307
- node_z = [G.nodes[node]['pos'][2] for node in G.nodes()]
308
- node_text = list(G.nodes())
309
- node_adjacencies = []
310
- node_hover_text = []
311
- for node, adjacencies in enumerate(G.adjacency()):
312
- num_connections = len(adjacencies[1])
313
- node_adjacencies.append(num_connections)
314
- node_hover_text.append(f'{node_text[node]}<br>μ—°κ²°: {num_connections}개')
315
-
316
- # λ…Έλ“œμš© Scatter3d 트레이슀 생성
 
 
 
 
317
  node_trace = go.Scatter3d(
318
  x=node_x, y=node_y, z=node_z,
319
  mode='markers+text',
@@ -321,294 +419,341 @@ def generate_graph(file_ids, similarity_threshold=0.7):
321
  hovertext=node_hover_text,
322
  hoverinfo='text',
323
  textposition='top center',
324
- textfont=dict(
325
- size=10,
326
- color='black',
327
- family=plotly_font
328
- ),
329
  marker=dict(
330
- size=6,
331
- color=node_z,
332
  colorscale='Viridis',
333
  opacity=0.9,
334
- colorbar=dict(thickness=15, title='Node Depth (Z-axis)', xanchor='left', title_side='right')
335
  )
336
  )
337
-
338
- # 파일 정보 λ¬Έμžμ—΄ 생성
339
- file_names = [st.session_state.data_files[file_id]['name'] for file_id in file_ids
340
- if file_id in st.session_state.data_files]
341
- file_info = ", ".join(file_names)
342
-
343
- # λ ˆμ΄μ•„μ›ƒ μ„€μ •
344
  layout = go.Layout(
345
  title=dict(
346
- text=f'μ–΄νœ˜ 의미 μœ μ‚¬μ„± 기반 3D κ·Έλž˜ν”„ (Threshold: {similarity_threshold})<br>데이터: {file_info}',
347
- font=dict(size=16, family=plotly_font)
 
348
  ),
349
  showlegend=False,
350
- margin=dict(b=20, l=5, r=5, t=80), # 제λͺ© 높이 확보λ₯Ό μœ„ν•΄ t κ°’ 증가
351
  scene=dict(
352
- xaxis=dict(title='TSNE Dimension 1', showticklabels=False, backgroundcolor="rgb(230, 230,230)",
353
- gridcolor="white", zerolinecolor="white"),
354
- yaxis=dict(title='TSNE Dimension 2', showticklabels=False, backgroundcolor="rgb(230, 230,230)",
355
- gridcolor="white", zerolinecolor="white"),
356
- zaxis=dict(title='TSNE Dimension 3', showticklabels=False, backgroundcolor="rgb(230, 230,230)",
357
- gridcolor="white", zerolinecolor="white"),
358
- aspectratio=dict(x=1, y=1, z=0.8)
359
- )
360
  )
361
-
362
- # Figure 생성
363
  fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
364
-
365
- # κ²°κ³Ό μΊμ‹œ μ €μž₯
366
  st.session_state.graph_cache[cache_key] = fig
367
-
368
- return fig
369
 
 
370
 
 
371
  def handle_uploaded_file(uploaded_file):
372
- """μ—…λ‘œλ“œλœ νŒŒμΌμ„ μ²˜λ¦¬ν•˜οΏ½οΏ½οΏ½ 데이터 파일 λͺ©λ‘μ— μΆ”κ°€ν•©λ‹ˆλ‹€."""
373
  if uploaded_file is not None:
374
- # 파일λͺ… μ•ˆμ „ 처리 및 μ €μž₯
375
- timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
376
- file_name = f"{timestamp}_{uploaded_file.name}"
377
  file_path = os.path.join(UPLOAD_FOLDER, file_name)
378
-
379
  try:
380
- # 파일 μ €μž₯
381
  with open(file_path, 'wb') as f:
382
  f.write(uploaded_file.getbuffer())
383
-
384
- # μ—…λ‘œλ“œλœ 파일 검증
385
  words = load_words_from_json(file_path)
386
- if not words:
387
- os.remove(file_path) # 잘λͺ»λœ ν˜•μ‹μ΄λ©΄ 파일 μ‚­μ œ
388
- st.error('μ—…λ‘œλ“œλœ νŒŒμΌμ—μ„œ 단어λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. 파일 ν˜•μ‹μ„ ν™•μΈν•˜μ„Έμš”.')
389
  return None
390
-
391
- # 데이터 파일 λ‹€μ‹œ μŠ€μΊ”ν•˜μ—¬ μƒˆ 파일 정보 포함
392
- st.session_state.data_files = scan_data_files()
393
-
394
- # μƒˆ νŒŒμΌμ— ν•΄λ‹Ήν•˜λŠ” file_id μ°ΎκΈ°
395
- new_file_id = None
396
- for file_id, file_info in st.session_state.data_files.items():
397
- if file_info['path'] == file_path:
398
- new_file_id = file_id
399
- break
400
-
401
- return new_file_id
402
-
403
  except Exception as e:
404
- # 였λ₯˜ λ°œμƒ μ‹œ μ—…λ‘œλ“œλœ 파일 μ‚­μ œ μ‹œλ„
405
- try:
406
- if os.path.exists(file_path):
407
- os.remove(file_path)
408
- except:
409
- pass
410
- st.error(f'파일 μ—…λ‘œλ“œ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}')
411
  return None
412
 
413
-
414
  def delete_file(file_id):
415
- """νŒŒμΌμ„ μ‚­μ œν•©λ‹ˆλ‹€."""
416
- if file_id not in st.session_state.data_files:
417
- st.error('νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.')
 
418
  return False
419
-
420
- file_info = st.session_state.data_files[file_id]
421
-
422
- # μ—…λ‘œλ“œλœ 파일만 μ‚­μ œ ν—ˆμš©
423
- if file_info['type'] != 'uploaded':
424
  st.error('κΈ°λ³Έ 데이터 νŒŒμΌμ€ μ‚­μ œν•  수 μ—†μŠ΅λ‹ˆλ‹€.')
425
  return False
426
-
427
- # 파일 μ‚­μ œ
428
- file_path = file_info['path']
429
- if os.path.exists(file_path):
430
- os.remove(file_path)
431
-
432
- # 데이터 파일 정보 μ—…λ°μ΄νŠΈ
433
- st.session_state.data_files.pop(file_id)
434
-
435
- # κ΄€λ ¨ μΊμ‹œ ν•­λͺ© μ‚­μ œ
436
- keys_to_remove = []
437
- for cache_key in st.session_state.graph_cache:
438
- if file_id in cache_key:
439
- keys_to_remove.append(cache_key)
440
-
441
- for key in keys_to_remove:
442
- st.session_state.graph_cache.pop(key)
443
-
444
- # μ„ νƒλœ 파일 λͺ©λ‘μ—μ„œλ„ 제거
445
- if file_id in st.session_state.selected_files:
446
- st.session_state.selected_files.remove(file_id)
447
-
448
- return True
449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
 
 
 
 
 
 
 
 
451
  def clear_cache():
452
- """μΊμ‹œλ₯Ό μ΄ˆκΈ°ν™”ν•©λ‹ˆλ‹€."""
453
  st.session_state.graph_cache = {}
454
- st.session_state.embeddings_cache = {}
455
- st.success('μΊμ‹œκ°€ μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€.')
 
 
 
 
 
 
 
456
 
 
 
 
 
457
 
458
  # 데이터 파일 μŠ€μΊ”
459
- st.session_state.data_files = scan_data_files()
 
460
 
461
  # 타이틀 및 μ†Œκ°œ
462
- st.title('ν•œκ΅­μ–΄ 단어 의미 λ„€νŠΈμ›Œν¬ μ‹œκ°ν™”')
463
- st.markdown('이 λ„κ΅¬λŠ” ν•œκ΅­μ–΄ 단어듀 κ°„μ˜ 의미적 관계λ₯Ό 3D κ³΅κ°„μ—μ„œ μ‹œκ°ν™”ν•©λ‹ˆλ‹€.')
 
 
 
464
 
465
- # μ‚¬μ΄λ“œλ°” μ„€μ •
466
- st.sidebar.title('μ„€μ •')
 
 
467
 
468
- # μž„κ³„κ°’ μ„€μ •
469
- threshold = st.sidebar.slider(
470
- 'μœ μ‚¬λ„ μž„κ³„κ°’',
471
- min_value=0.1,
472
- max_value=0.9,
473
- value=st.session_state.threshold,
474
- step=0.05,
475
- help='높은 κ°’ = 더 μ—„κ²©ν•œ μ—°κ²° κΈ°μ€€ (적은 μ—£μ§€)'
476
- )
477
- st.session_state.threshold = threshold
478
 
479
- # 파일 μ—…λ‘œλ“œ
480
- st.sidebar.header('파일 μ—…λ‘œλ“œ')
481
- uploaded_file = st.sidebar.file_uploader("JSON 파일 선택", type=['json'], help="'word' ν•„λ“œλ₯Ό κ°€μ§„ 객체 배열이 ν¬ν•¨λœ JSON 파일")
482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  if uploaded_file is not None:
484
- if st.sidebar.button('파일 μ—…λ‘œλ“œ'):
485
  new_file_id = handle_uploaded_file(uploaded_file)
486
  if new_file_id:
487
- st.success(f"파일 '{uploaded_file.name}'이(κ°€) μ„±κ³΅μ μœΌλ‘œ μ—…λ‘œλ“œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
488
- st.session_state.selected_files = [new_file_id]
489
- # 슀크립트 μž¬μ‹€ν–‰
490
- st.experimental_rerun()
491
-
492
- # μΊμ‹œ μ΄ˆκΈ°ν™” λ²„νŠΌ
493
- if st.sidebar.button('μΊμ‹œ μ΄ˆκΈ°ν™”'):
494
- clear_cache()
495
-
496
- # 파일 선택 μ˜μ—­
497
- st.sidebar.header('데이터 파일 선택')
498
-
499
- # app.py의 핡심 μˆ˜μ • λΆ€λΆ„
500
-
501
- # multiselect λŒ€μ‹  session_stateλ₯Ό 직접 μ‚¬μš©ν•˜λŠ” λ°©μ‹μœΌλ‘œ λ³€κ²½
502
- if st.session_state.data_files:
503
- # 파일 선택 μ˜΅μ…˜ 생성
504
- options = {}
505
- for file_id, file_info in st.session_state.data_files.items():
506
- label = f"{file_info['name']} ({file_info['word_count']}개 단어) {'[κΈ°λ³Έ]' if file_info['type'] == 'default' else '[μ—…λ‘œλ“œλ¨]'}"
507
- options[file_id] = label
508
-
509
- # κΈ°λ³Έκ°’ μ„€μ • (아직 선택이 μ—†μœΌλ©΄ 첫 번째 파일 선택)
510
- if not st.session_state.selected_files and options:
511
- st.session_state.selected_files = [next(iter(options.keys()))]
512
-
513
- # μ²΄ν¬λ°•μŠ€λ‘œ 파일 선택 κ΅¬ν˜„ - 이게 key point!
514
- st.sidebar.subheader("파일 선택 (μ—¬λŸ¬ 개 선택 κ°€λŠ₯)")
515
-
516
  selected_files_temp = []
517
- for file_id, label in options.items():
518
- # ν˜„μž¬ 선택 μƒνƒœλ₯Ό 기반으둜 κΈ°λ³Έκ°’ μ„€μ •
 
 
 
 
 
 
519
  is_selected = file_id in st.session_state.selected_files
520
- # 각 νŒŒμΌμ— λŒ€ν•œ μ²΄ν¬λ°•μŠ€ 생성
521
- if st.sidebar.checkbox(label, value=is_selected, key=f"file_{file_id}"):
 
522
  selected_files_temp.append(file_id)
523
-
524
- # 선택 μƒνƒœ μ—…λ°μ΄νŠΈ
525
- st.session_state.selected_files = selected_files_temp
526
-
527
- # μ„ νƒλœ 파일 미리보기
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  if st.session_state.selected_files:
529
- st.sidebar.subheader('μ„ νƒλœ 파일 미리보기')
530
- for file_id in st.session_state.selected_files:
531
- file_info = st.session_state.data_files[file_id]
532
-
533
- col1, col2 = st.sidebar.columns([3, 1])
534
- with col1:
535
- st.write(f"**{file_info['name']}**")
536
- st.write(f"단어 수: {file_info['word_count']}")
537
- st.write(f"μƒ˜ν”Œ: {', '.join(file_info['sample_words'])}")
538
-
539
- with col2:
540
- if file_info['type'] == 'uploaded':
541
- if st.button('μ‚­μ œ', key=f"delete_{file_id}"):
542
- if delete_file(file_id):
543
- st.success(f"파일 '{file_info['name']}'이(κ°€) μ‚­μ œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
544
- # 슀크립트 μž¬μ‹€ν–‰
545
- st.experimental_rerun()
546
-
547
- # κ·Έλž˜ν”„ 생성 λ²„νŠΌ
548
- generate_button = st.sidebar.button('κ·Έλž˜ν”„ 생성')
549
  else:
550
- st.sidebar.warning('μ‚¬μš© κ°€λŠ₯ν•œ 데이터 파일이 μ—†μŠ΅λ‹ˆλ‹€. νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ„Έμš”.')
551
- generate_button = False
552
-
553
- # 메인 μ½˜ν…μΈ 
554
- if st.session_state.selected_files and (generate_button or 'fig' in st.session_state):
555
- with st.spinner('κ·Έλž˜ν”„ 생성 쀑...'):
556
- fig = generate_graph(st.session_state.selected_files, threshold)
557
- st.session_state.fig = fig
558
-
559
- if fig:
560
- # κ·Έλž˜ν”„ ν‘œμ‹œ
561
- st.plotly_chart(fig, use_container_width=True)
562
-
563
- # μ„ νƒλœ 파일 정보
564
- file_names = [st.session_state.data_files[file_id]['name'] for file_id in st.session_state.selected_files]
565
- word_counts = sum([st.session_state.data_files[file_id]['word_count'] for file_id in st.session_state.selected_files])
566
-
567
- # 차트 정보
568
- st.info(f"""
569
- **ν˜„μž¬ κ·Έλž˜ν”„ 정보**
570
- - 데이터 파일: {', '.join(file_names)}
571
- - 총 단어 수: {word_counts}개
572
- - μœ μ‚¬λ„ μž„κ³„κ°’: {threshold}
573
- """)
574
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  # μ‚¬μš© μ„€λͺ…
576
- with st.expander("κ·Έλž˜ν”„ μ‘°μž‘ 방법"):
577
  st.markdown("""
578
- - **마우슀 휠**: ν™•λŒ€/μΆ•μ†Œ
579
- - **마우슀 λ“œλž˜κ·Έ**: νšŒμ „
580
- - **마우슀 였λ₯Έμͺ½ λ²„νŠΌ λ“œλž˜κ·Έ**: 이동
581
- - **단어에 마우슀 μ˜€λ²„**: 단어 이름 및 μ—°κ²° 수 확인
 
582
  """)
583
- elif not st.session_state.selected_files:
584
- st.info('쒌츑 μ‚¬μ΄λ“œλ°”μ—μ„œ 데이터 νŒŒμΌμ„ μ„ νƒν•œ ν›„ "κ·Έλž˜ν”„ 생성" λ²„νŠΌμ„ ν΄λ¦­ν•˜μ„Έμš”.')
585
-
586
- # 정보 μ„Ήμ…˜
587
- with st.expander("이 μ‹œκ°ν™”μ— λŒ€ν•΄"):
588
- st.markdown("""
589
- 이 λ„κ΅¬λŠ” λ‹€μŒκ³Ό 같은 κΈ°μˆ μ„ μ‚¬μš©ν•˜μ—¬ ν•œκ΅­μ–΄ 단어 λ„€νŠΈμ›Œν¬λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€:
590
-
591
- - **문자 기반 μž„λ² λ”©**: 각 λ‹¨μ–΄μ˜ 문자 ꡬ성을 기반으둜 μž„λ² λ”©μ„ μƒμ„±ν•©λ‹ˆλ‹€.
592
- - **t-SNE 차원 μΆ•μ†Œ**: λ³΅μž‘ν•œ 고차원 벑터λ₯Ό 3D 곡간에 νˆ¬μ˜ν•˜μ—¬ 의미적 관계λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€.
593
- - **코사인 μœ μ‚¬λ„**: 단어 벑터 κ°„ 각도λ₯Ό 기반으둜 의미적 μœ μ‚¬μ„±μ„ μΈ‘μ •ν•©λ‹ˆλ‹€.
594
- - **Plotly μ‹œκ°ν™”**: μΈν„°λž™ν‹°λΈŒν•œ 3D μ‹œκ°ν™”λ₯Ό μ œκ³΅ν•©λ‹ˆλ‹€.
595
-
596
- 각 λ‹¨μ–΄λŠ” 3D κ³΅κ°„μ˜ 점으둜 ν‘œμ‹œλ˜λ©°, μœ μ‚¬λ„κ°€ 높은 단어듀은 μ—°κ²°μ„ (μ—£μ§€)으둜 μ—°κ²°λ©λ‹ˆλ‹€. 색상은 zμΆ• 값에 따라 λ‹€λ₯΄κ²Œ ν‘œμ‹œλ©λ‹ˆλ‹€.
597
- """)
598
-
599
- # JSON 파일 ν˜•μ‹ μ•ˆλ‚΄
600
- with st.expander("JSON 파일 ν˜•μ‹"):
601
- st.markdown("""
602
- μ—…λ‘œλ“œν•˜λŠ” JSON νŒŒμΌμ€ λ‹€μŒ ν˜•μ‹μ„ 따라야 ν•©λ‹ˆλ‹€:
603
- ```json
604
- [
605
- {"word": "학ꡐ"},
606
- {"word": "μ„ μƒλ‹˜"},
607
- {"word": "친ꡬ"},
608
- {"word": "μˆ™μ œ"},
609
- ...
610
- ]
611
- ```
612
-
613
- 각 ν•­λͺ©μ€ "word" ν•„λ“œλ₯Ό κ°€μ§„ 객체이며, λ°°μ—΄ μ•ˆμ— ν¬ν•¨λ˜μ–΄μ•Ό ν•©λ‹ˆλ‹€.
614
- """)
 
 
1
  import streamlit as st
2
  import json
3
  import os
 
14
  import matplotlib.font_manager as fm
15
  from sklearn.manifold import TSNE
16
  import warnings
17
+ import gensim # FastText μ‚¬μš©μ„ μœ„ν•œ gensim import
18
+ import hashlib # μΊμ‹œ ν‚€ 생성을 μœ„ν•΄ μΆ”κ°€
19
+
20
  warnings.filterwarnings('ignore')
21
 
22
+ # --- κΈ°λ³Έ μ„€μ • ---
23
  # νŽ˜μ΄μ§€ μ„€μ •
24
  st.set_page_config(
25
+ page_title="ν•œκ΅­μ–΄ 단어 의미 λ„€νŠΈμ›Œν¬ μ‹œκ°ν™” (FastText)",
26
+ page_icon="🧠", # μ•„μ΄μ½˜ λ³€κ²½
27
  layout="wide"
28
  )
29
 
 
32
  UPLOAD_FOLDER = 'uploads'
33
 
34
  # 폴더 생성
35
+ if not os.path.exists(DATA_FOLDER):
36
+ os.makedirs(DATA_FOLDER)
37
  if not os.path.exists(UPLOAD_FOLDER):
38
  os.makedirs(UPLOAD_FOLDER)
39
 
40
+
41
+ # --- FastText λͺ¨λΈ μ„€μ • ---
42
+ # !!! μ‚¬μš©μž ν•„μˆ˜ μ„€μ • !!!
43
+ # λ‹€μš΄λ‘œλ“œν•œ ν•œκ΅­μ–΄ FastText λͺ¨λΈ 파일(.bin)의 전체 경둜λ₯Ό μ§€μ •ν•˜μ„Έμš”.
44
+ # μ˜ˆμ‹œ: "C:/Users/YourUser/Downloads/cc.ko.300.bin" λ˜λŠ” "/home/user/models/cc.ko.300.bin"
45
+ # λͺ¨λΈ λ‹€μš΄λ‘œλ“œ: https://fasttext.cc/docs/en/crawl-vectors.html λ“± μ°Έμ‘°
46
+ FASTTEXT_MODEL_PATH = "YOUR_PATH_TO/cc.ko.300.bin" # <--- 여기에 μ‹€μ œ 파일 경둜 μž…λ ₯!!!
47
+
48
+
49
+ # --- μ„Έμ…˜ μƒνƒœ μ΄ˆκΈ°ν™” ---
50
+ if 'fasttext_model' not in st.session_state:
51
+ st.session_state.fasttext_model = None # λͺ¨λΈ 객체 μ €μž₯
52
  if 'embeddings_cache' not in st.session_state:
53
+ st.session_state.embeddings_cache = {} # μž„λ² λ”© μΊμ‹œλŠ” 단어 λͺ©λ‘+λͺ¨λΈ 기반으둜 재고렀 κ°€λŠ₯ (μ—¬κΈ°μ„  λ‹¨μˆœν™”)
 
54
  if 'graph_cache' not in st.session_state:
55
  st.session_state.graph_cache = {}
 
56
  if 'data_files' not in st.session_state:
57
  st.session_state.data_files = {}
 
58
  if 'selected_files' not in st.session_state:
59
  st.session_state.selected_files = []
 
60
  if 'threshold' not in st.session_state:
61
+ st.session_state.threshold = 0.6 # 의미 κΈ°λ°˜μ΄λ―€λ‘œ μž„κ³„κ°’ κΈ°λ³Έκ°’ μ‘°μ • κ°€λŠ₯
62
+ if 'perplexity' not in st.session_state:
63
+ st.session_state.perplexity = 30
64
+ if 'learning_rate' not in st.session_state:
65
+ st.session_state.learning_rate = 'auto'
66
+ if 'n_iter' not in st.session_state:
67
+ st.session_state.n_iter = 1000
68
+ if 'generate_clicked' not in st.session_state:
69
+ st.session_state.generate_clicked = False
70
+ if 'fig' not in st.session_state:
71
+ st.session_state.fig = None
72
+
73
+
74
+ # --- FastText λͺ¨λΈ λ‘œλ”© ν•¨μˆ˜ (캐싱 μ‚¬μš©) ---
75
+ @st.cache_resource # λͺ¨λΈ κ°μ²΄λŠ” ν¬λ―€λ‘œ λ¦¬μ†ŒμŠ€ 캐싱 μ‚¬μš©
76
+ def load_fasttext_model(model_path):
77
+ """μ§€μ •λœ κ²½λ‘œμ—μ„œ FastText λͺ¨λΈμ„ λ‘œλ“œν•©λ‹ˆλ‹€."""
78
+ if not os.path.exists(model_path):
79
+ st.error(f"였λ₯˜: FastText λͺ¨λΈ νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {model_path}")
80
+ st.error("FastText μ›Ήμ‚¬μ΄νŠΈ λ“±μ—μ„œ ν•œκ΅­μ–΄ λͺ¨λΈ(cc.ko.300.bin μΆ”μ²œ)을 λ‹€μš΄λ‘œλ“œν•˜κ³  μ½”λ“œ μƒλ‹¨μ˜ `FASTTEXT_MODEL_PATH` λ³€μˆ˜λ₯Ό μ •ν™•νžˆ μ§€μ •ν•΄μ£Όμ„Έμš”.")
81
+ return None
82
+ try:
83
+ st.info(f"FastText λͺ¨λΈ λ‘œλ”© 쀑... ({os.path.basename(model_path)}) λͺ¨λΈ 크기에 따라 μ‹œκ°„μ΄ 걸릴 수 μžˆμŠ΅λ‹ˆλ‹€.")
84
+ # .bin 파일 λ‘œλ“œλ₯Ό μœ„ν•΄ load_facebook_model μ‚¬μš©
85
+ model = gensim.models.fasttext.load_facebook_model(model_path)
86
+ st.info("FastText λͺ¨λΈ λ‘œλ”© μ™„λ£Œ.")
87
+ return model
88
+ except Exception as e:
89
+ st.error(f"FastText λͺ¨λΈ λ‘œλ”© 쀑 였λ₯˜ λ°œμƒ: {e}")
90
+ return None
91
 
92
  # --- ν•œκΈ€ 폰트 μ„€μ • ν•¨μˆ˜ ---
93
  def set_korean_font():
94
+ """ μš΄μ˜μ²΄μ œμ— λ§žλŠ” ν•œκΈ€ 폰트λ₯Ό μ„€μ •ν•˜κ³  Plotly용 폰트 이름을 λ°˜ν™˜ν•©λ‹ˆλ‹€. """
 
 
 
95
  system_name = platform.system()
96
+ plotly_font_name = 'sans-serif' # κΈ°λ³Έκ°’
97
+
98
+ try:
99
+ if system_name == "Windows":
100
+ font_name = "Malgun Gothic"
101
+ plotly_font_name = "Malgun Gothic"
102
+ elif system_name == "Darwin": # MacOS
103
+ font_name = "AppleGothic"
104
+ plotly_font_name = "AppleGothic"
105
+ elif system_name == "Linux":
106
+ # μ‹œμŠ€ν…œμ—μ„œ Nanum 폰트 μ°ΎκΈ° μ‹œλ„
107
+ font_path = None
108
+ possible_paths = [
109
+ "/usr/share/fonts/truetype/nanum/NanumGothic.ttf",
110
+ "/usr/share/fonts/nanum/NanumGothic.ttf",
111
+ # λ‹€λ₯Έ 경둜 μΆ”κ°€ κ°€λŠ₯
112
+ ]
113
+ for path in possible_paths:
114
+ if os.path.exists(path):
115
+ font_path = path
116
+ break
117
+
118
+ if font_path:
119
+ fm.fontManager.addfont(font_path)
120
+ prop = fm.FontProperties(fname=font_path)
121
+ font_name = prop.get_name()
122
+ plotly_font_name = font_name # PlotlyλŠ” 이름 μ‚¬μš©
123
+ else: # μ‹œμŠ€ν…œ 폰트 λ§€λ‹ˆμ €μ—μ„œ 검색
124
  available_fonts = [f.name for f in fm.fontManager.ttflist]
125
  nanum_fonts = [name for name in available_fonts if 'Nanum' in name]
126
  if nanum_fonts:
127
  font_name = nanum_fonts[0]
128
+ plotly_font_name = font_name
 
129
  else:
130
+ font_name = None # μ°ΎκΈ° μ‹€νŒ¨
131
+ else:
132
+ font_name = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ # Matplotlib μ„€μ • 적용
135
+ if font_name:
 
136
  plt.rc('font', family=font_name)
137
  plt.rc('axes', unicode_minus=False)
138
+ print(f"Matplotlib font set to: {font_name}")
139
+ else:
140
+ print("Suitable Korean font not found for Matplotlib. Using default.")
141
  plt.rcdefaults()
142
  plt.rc('axes', unicode_minus=False)
143
+
144
+ except Exception as e:
145
+ print(f"Error setting Korean font: {e}")
146
  plt.rcdefaults()
147
  plt.rc('axes', unicode_minus=False)
148
 
149
+ print(f"Plotly font name to use: {plotly_font_name}")
150
+ return plotly_font_name
 
 
 
151
 
152
  # --- 데이터 λ‘œλ“œ ν•¨μˆ˜ ---
153
  def load_words_from_json(filepath):
 
155
  try:
156
  with open(filepath, 'r', encoding='utf-8') as f:
157
  data = json.load(f)
 
158
  if isinstance(data, list):
159
+ words = [item.get('word', '') for item in data if isinstance(item, dict) and item.get('word')]
160
+ words = [word for word in words if word] # 빈 λ¬Έμžμ—΄ 제거
161
+ if not words:
162
+ st.warning(f"κ²½κ³ : 파일 '{os.path.basename(filepath)}'μ—μ„œ 'word' ν‚€λ₯Ό κ°€μ§„ μœ νš¨ν•œ 데이터λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
163
+ return None
164
  return words
165
  else:
166
+ st.error(f"였λ₯˜: 파일 '{os.path.basename(filepath)}'의 μ΅œμƒμœ„ ν˜•μ‹μ΄ λ¦¬μŠ€νŠΈκ°€ μ•„λ‹™λ‹ˆλ‹€.")
167
  return None
168
  except FileNotFoundError:
169
  st.error(f"였λ₯˜: 파일 '{filepath}'λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
170
  return None
171
+ except json.JSONDecodeError as e:
172
+ st.error(f"였λ₯˜: 파일 '{os.path.basename(filepath)}'의 JSON ν˜•μ‹μ΄ 잘λͺ»λ˜μ—ˆμŠ΅λ‹ˆλ‹€. 였λ₯˜: {e}")
173
  return None
174
  except Exception as e:
175
+ st.error(f"'{os.path.basename(filepath)}' 데��터 λ‘œλ”© 쀑 였λ₯˜ λ°œμƒ: {e}")
176
  return None
177
 
 
178
  def scan_data_files():
179
+ """데이터 폴더 및 μ—…λ‘œλ“œ ν΄λ”μ—μ„œ μ‚¬μš© κ°€λŠ₯ν•œ JSON νŒŒμΌμ„ μŠ€μΊ”ν•©λ‹ˆλ‹€."""
180
  data_files = {}
181
+ # κΈ°λ³Έ 데이터 폴더
182
+ try:
183
+ for file_path in glob.glob(os.path.join(DATA_FOLDER, '*.json')):
184
+ file_id = f"default_{os.path.basename(file_path)}"
185
+ file_name = os.path.basename(file_path)
186
+ words = load_words_from_json(file_path)
187
+ if words:
188
+ data_files[file_id] = {'path': file_path, 'name': file_name, 'word_count': len(words), 'type': 'default', 'sample_words': words[:5]}
189
+ except Exception as e:
190
+ st.error(f"κΈ°λ³Έ 데이터 폴더 μŠ€μΊ” 쀑 였λ₯˜: {e}")
191
+ # μ—…λ‘œλ“œ 폴더
192
+ try:
193
+ for file_path in glob.glob(os.path.join(UPLOAD_FOLDER, '*.json')):
194
+ file_id = f"uploaded_{os.path.basename(file_path)}"
195
+ file_name = os.path.basename(file_path)
196
+ words = load_words_from_json(file_path)
197
+ if words:
198
+ data_files[file_id] = {'path': file_path, 'name': file_name, 'word_count': len(words), 'type': 'uploaded', 'sample_words': words[:5]}
199
+ except Exception as e:
200
+ st.error(f"μ—…λ‘œλ“œ 폴더 μŠ€μΊ” 쀑 였λ₯˜: {e}")
 
 
 
 
 
 
 
 
 
201
  return data_files
202
 
203
+ def merge_word_lists(file_ids, current_data_files):
 
204
  """μ„ νƒλœ νŒŒμΌλ“€μ—μ„œ 단어λ₯Ό λ‘œλ“œν•˜κ³  쀑볡 μ œκ±°ν•˜μ—¬ λ³‘ν•©ν•©λ‹ˆλ‹€."""
205
+ all_words = set() # 쀑볡 제거λ₯Ό μœ„ν•΄ set μ‚¬μš©
206
+ if not file_ids:
207
+ return []
208
+
209
  for file_id in file_ids:
210
+ if file_id in current_data_files:
211
+ file_path = current_data_files[file_id]['path']
212
  words = load_words_from_json(file_path)
213
  if words:
214
+ all_words.update(words) # set에 μΆ”κ°€
215
+ else:
216
+ st.warning(f"μ„ νƒλœ 파일 ID '{file_id}'λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. λͺ©λ‘μ„ μƒˆλ‘œκ³ μΉ¨ν•©λ‹ˆλ‹€.")
217
+ # 파일 λͺ©λ‘ μž¬μŠ€μΊ” λ‘œμ§μ€ λ³΅μž‘ν•΄μ§ˆ 수 μžˆμœΌλ―€λ‘œ μ—¬κΈ°μ„œλŠ” 경고만 ν‘œμ‹œ
218
+ # μ •λ ¬λœ 리슀트둜 λ°˜ν™˜
219
+ unique_words = sorted(list(all_words))
220
  return unique_words
221
 
222
+ # --- 단어 μž„λ² λ”© ν•¨μˆ˜ (FastText μ‚¬μš©) ---
223
+ def encode_words_fasttext(words, normalize=True):
224
+ """FastText λͺ¨λΈμ„ μ‚¬μš©ν•˜μ—¬ 단어 λͺ©λ‘μ„ 의미 μž„λ² λ”©μœΌλ‘œ λ³€ν™˜ν•©λ‹ˆλ‹€."""
225
+ model = st.session_state.get('fasttext_model')
226
+
227
+ if model is None:
228
+ st.error("FastText λͺ¨λΈμ΄ λ‘œλ“œλ˜μ§€ μ•Šμ•„ μž„λ² λ”©μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
229
+ return None
230
+
231
+ if not words:
232
+ return np.array([])
233
 
 
 
 
234
  embeddings = []
235
+ oov_count = 0
236
+ vector_size = model.vector_size
237
+
238
+ with st.spinner(f"단어 {len(words)}κ°œμ— λŒ€ν•œ 의미 μž„λ² λ”© 생성 쀑 (FastText)..."):
239
+ for word in words:
240
+ try:
241
+ vector = model.wv[word]
242
+ if np.all(vector == 0):
243
+ oov_count += 1
244
+ if normalize:
245
+ norm = np.linalg.norm(vector)
246
+ vector = vector / norm if norm > 0 else np.zeros(vector_size)
247
+ embeddings.append(vector)
248
+ except Exception as e:
249
+ st.warning(f"단어 '{word}' 처리 쀑 였λ₯˜ λ°œμƒ (ν˜Ήμ€ OOV): {e}. 0λ²‘ν„°λ‘œ λŒ€μ²΄ν•©λ‹ˆλ‹€.")
250
+ embeddings.append(np.zeros(vector_size))
251
+ oov_count += 1
252
+
253
+ if oov_count > 0:
254
+ st.warning(f"총 {len(words)}개 단어 쀑 {oov_count}κ°œμ— λŒ€ν•΄ 유효 벑터λ₯Ό μ–»μ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€(OOV λ“±).")
255
+
256
+ result_embeddings = np.array(embeddings)
257
+
258
+ if result_embeddings.size == 0 and len(words) > 0:
259
+ st.error("μž„λ² λ”© 생성 κ²°κ³Όκ°€ λΉ„μ–΄ μžˆμŠ΅λ‹ˆλ‹€.")
260
+ return None
261
+ elif result_embeddings.shape[0] != len(words):
262
+ st.error(f"μž…λ ₯ 단어 수({len(words)})와 μƒμ„±λœ μž„λ² λ”© 수({result_embeddings.shape[0]}) 뢈일치.")
263
+ return None
264
+
265
+ return result_embeddings
266
+
267
# --- Graph generation ---
def generate_graph(file_ids, similarity_threshold, perplexity, learning_rate, n_iter):
    """Build (or fetch from cache) a 3D semantic-similarity graph figure.

    Pipeline: merge the word lists of the selected files -> FastText
    embeddings -> t-SNE (PCA fallback) 3D coordinates -> cosine-similarity
    edges at or above `similarity_threshold` -> NetworkX graph -> Plotly
    3D figure.

    Parameters
    ----------
    file_ids : list of file ids selected in the sidebar.
    similarity_threshold : float, minimum cosine similarity for an edge.
    perplexity, learning_rate, n_iter : t-SNE hyperparameters.

    Returns the Plotly Figure, or None on any failure.
    """
    # Cache key covers the file selection, threshold and t-SNE parameters.
    param_str = f"t{similarity_threshold}_p{perplexity}_lr{learning_rate}_i{n_iter}"
    sorted_fids = "-".join(sorted(file_ids))
    # Keying on file ids (not contents) assumes files are immutable for the
    # session; hashing the merged word list would be exact but slower.
    cache_key = f"{sorted_fids}_{param_str}_fasttext"

    if cache_key in st.session_state.graph_cache:
        st.info("μΊμ‹œλœ κ·Έλž˜ν”„λ₯Ό μ‚¬μš©ν•©λ‹ˆλ‹€.")
        return st.session_state.graph_cache[cache_key]

    # --- Load and validate required inputs ---
    if not file_ids:
        st.error("κ·Έλž˜ν”„λ₯Ό 생성할 파일이 μ„ νƒλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
        return None
    if st.session_state.get('fasttext_model') is None:
        st.error("FastText λͺ¨λΈμ΄ λ‘œλ“œλ˜μ§€ μ•Šμ•„ κ·Έλž˜ν”„ 생성을 μ§„ν–‰ν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
        return None

    plotly_font = set_korean_font()  # Korean-capable font name for Plotly text
    word_list = merge_word_lists(file_ids, st.session_state.data_files)  # dedup-merged words

    if not word_list:
        st.error("μ„ νƒλœ νŒŒμΌμ—μ„œ μœ νš¨ν•œ 단어λ₯Ό λ‘œλ“œν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
        return None
    if len(word_list) < 2:
        st.warning("κ·Έλž˜ν”„λ₯Ό μƒμ„±ν•˜λ €λ©΄ μ΅œμ†Œ 2개 μ΄μƒμ˜ 고유 단어가 ν•„μš”ν•©λ‹ˆλ‹€.")
        return None

    # --- Embedding generation ---
    embeddings = encode_words_fasttext(word_list, normalize=True)
    if embeddings is None or embeddings.shape[0] == 0 or embeddings.shape[1] == 0:
        st.error("μœ νš¨ν•œ 단어 μž„λ² λ”© 생성 μ‹€νŒ¨.")
        return None

    # --- Dimensionality reduction (t-SNE) ---
    embeddings_3d = None
    n_samples = embeddings.shape[0]
    with st.spinner(f'단어 {n_samples}개 μ’Œν‘œ 계산 쀑 (t-SNE)...'):
        # t-SNE requires perplexity < n_samples; clamp (floor of 5) when needed.
        effective_perplexity = min(perplexity, max(5, n_samples - 1))
        if effective_perplexity != perplexity:
            st.warning(f"Perplexityκ°€ μƒ˜ν”Œ μˆ˜μ— 맞게 {effective_perplexity}(으)둜 μ‘°μ •λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
        # Map the 'auto' option to the classic default 200.0 for scikit-learn
        # versions that reject the string value.
        effective_lr = learning_rate if isinstance(learning_rate, (int, float)) else 200.0 if learning_rate == 'auto' else learning_rate
        effective_iter = n_iter

        if n_samples <= 3:
            # Too few points for t-SNE; fall back to PCA and zero-pad to 3D.
            st.warning(f"단어 μˆ˜κ°€ {n_samples}개둜 적어 PCAλ₯Ό μ‚¬μš©ν•©λ‹ˆλ‹€.")
            from sklearn.decomposition import PCA
            pca = PCA(n_components=min(3, n_samples), random_state=42)
            embeddings_3d_pca = pca.fit_transform(embeddings)
            embeddings_3d = np.zeros((n_samples, 3))
            embeddings_3d[:, :embeddings_3d_pca.shape[1]] = embeddings_3d_pca
        else:
            try:
                # NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn
                # 1.5 — this assumes an older sklearn; confirm pinned version.
                tsne = TSNE(n_components=3, random_state=42,
                            perplexity=effective_perplexity,
                            n_iter=effective_iter,
                            init='pca',
                            learning_rate=effective_lr,
                            n_jobs=-1)
                embeddings_3d = tsne.fit_transform(embeddings)
            except Exception as e:
                st.error(f"t-SNE 였λ₯˜: {e}. PCA둜 λŒ€μ²΄ν•©λ‹ˆλ‹€.")
                from sklearn.decomposition import PCA
                pca = PCA(n_components=3, random_state=42)
                embeddings_3d = pca.fit_transform(embeddings)  # retry with PCA

    if embeddings_3d is None or embeddings_3d.shape[0] != len(word_list):
        st.error("단어 3D μ’Œν‘œ 생성 μ‹€νŒ¨.")
        return None

    # --- Similarity computation and edge construction ---
    # Similarities are computed on the full-dimensional (normalized)
    # embeddings, not on the reduced 3D coordinates.
    edges = []
    edge_weights = []
    with st.spinner('단어 κ°„ 의미 μœ μ‚¬λ„ 계산 및 μ—°κ²° 생성 쀑...'):
        try:
            similarity_matrix = cosine_similarity(embeddings)
            for i in range(n_samples):
                for j in range(i + 1, n_samples):  # upper triangle only (undirected)
                    similarity = similarity_matrix[i, j]
                    if not np.isnan(similarity) and similarity >= similarity_threshold:
                        edges.append((word_list[i], word_list[j]))
                        edge_weights.append(similarity)
        except Exception as e:
            st.error(f"μœ μ‚¬λ„ 계산 쀑 였λ₯˜ λ°œμƒ: {e}")
            return None

    # --- NetworkX graph construction ---
    G = nx.Graph()
    valid_nodes_count = 0
    for i, word in enumerate(word_list):
        if i < embeddings_3d.shape[0]:  # only add nodes that received coordinates
            G.add_node(word, pos=(embeddings_3d[i, 0], embeddings_3d[i, 1], embeddings_3d[i, 2]))
            valid_nodes_count += 1
        else:
            st.warning(f"'{word}' 단어 μ’Œν‘œ λˆ„λ½.")  # missing-coordinate warning

    if valid_nodes_count != len(word_list):
        st.warning(f"{len(word_list)-valid_nodes_count}개 단어 λ…Έλ“œ 생성 μ‹€νŒ¨.")

    valid_edges_count = 0
    for edge, weight in zip(edges, edge_weights):
        if G.has_node(edge[0]) and G.has_node(edge[1]):  # both endpoints must exist
            G.add_edge(edge[0], edge[1], weight=weight)
            valid_edges_count += 1

    # --- Plotly trace construction ---
    # Edge segments are encoded as (start, end, None) triples so Plotly draws
    # disconnected line segments in a single trace.
    edge_x, edge_y, edge_z = [], [], []
    if G.number_of_edges() > 0:
        for edge in G.edges():
            try:
                pos0 = G.nodes[edge[0]]['pos']
                pos1 = G.nodes[edge[1]]['pos']
                edge_x.extend([pos0[0], pos1[0], None])
                edge_y.extend([pos0[1], pos1[1], None])
                edge_z.extend([pos0[2], pos1[2], None])
            except KeyError as e:
                st.warning(f"μ—£μ§€ {edge} 생성 쀑 λ…Έλ“œ μœ„μΉ˜ 였λ₯˜: {e}")
                continue

    edge_trace = go.Scatter3d(x=edge_x, y=edge_y, z=edge_z, mode='lines', line=dict(width=1, color='#888'), hoverinfo='none')

    node_x, node_y, node_z, node_text, node_hover_text, node_sizes = [], [], [], [], [], []
    if G.number_of_nodes() > 0:
        degrees = np.array([G.degree(node) for node in G.nodes()])
        # Log scaling keeps hub sizes readable; clip to [5, 20] px.
        raw_sizes = np.log1p(degrees) * 3 + 6
        node_sizes_list = np.clip(raw_sizes, 5, 20).tolist()

        for i, node in enumerate(G.nodes()):
            try:
                pos = G.nodes[node]['pos']
                degree = G.degree(node)
                node_x.append(pos[0])
                node_y.append(pos[1])
                node_z.append(pos[2])
                node_text.append(node)
                node_hover_text.append(f'{node}<br>μ—°κ²° 수: {degree}')
                # node_sizes_list was already computed above for all nodes
            except KeyError:
                st.warning(f"λ…Έλ“œ '{node}' μœ„μΉ˜ 정보 였λ₯˜.")
                continue  # skip this node

    node_trace = go.Scatter3d(
        x=node_x, y=node_y, z=node_z,
        mode='markers+text',
        # NOTE(review): `text=node_text` reconstructed from collapsed diff
        # context (node_text is built and otherwise unused) — confirm.
        text=node_text,
        hovertext=node_hover_text,
        hoverinfo='text',
        textposition='top center',
        textfont=dict(size=10, color='black', family=plotly_font),
        marker=dict(
            size=node_sizes_list if node_sizes_list else 5,  # degree-scaled sizes
            color=node_z,  # color mapped from Z coordinate
            colorscale='Viridis',
            opacity=0.9,
            colorbar=dict(thickness=15, title='Node Depth (Z)', xanchor='left', titleside='right')
        )
    )

    # --- Layout and Figure assembly ---
    current_data_files = st.session_state.get('data_files', {})
    file_names_used = [current_data_files[fid]['name'] for fid in file_ids if fid in current_data_files]
    file_info_str = ", ".join(file_names_used) if file_names_used else "μ•Œ 수 μ—†μŒ"

    layout = go.Layout(
        title=dict(
            text=f'<b>μ–΄νœ˜ 의미 μœ μ‚¬μ„± 기반 3D κ·Έλž˜ν”„ (FastText)</b><br>Threshold: {similarity_threshold:.2f} | 데이터: {file_info_str}',
            font=dict(size=16, family=plotly_font),
            x=0.5, xanchor='center'
        ),
        showlegend=False,
        margin=dict(l=10, r=10, b=10, t=80),
        scene=dict(
            xaxis=dict(title='TSNE-1', showticklabels=False, backgroundcolor="rgb(230, 230, 230)", gridcolor="white", zerolinecolor="white"),
            yaxis=dict(title='TSNE-2', showticklabels=False, backgroundcolor="rgb(230, 230, 230)", gridcolor="white", zerolinecolor="white"),
            zaxis=dict(title='TSNE-3', showticklabels=False, backgroundcolor="rgb(230, 230, 230)", gridcolor="white", zerolinecolor="white"),
            aspectratio=dict(x=1, y=1, z=0.8),
            camera=dict(eye=dict(x=1.2, y=1.2, z=0.8))
        ),
        hovermode='closest'
    )

    fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

    # Store in the session graph cache for reuse on reruns.
    st.session_state.graph_cache[cache_key] = fig

    return fig
462
# --- File handling ---
def handle_uploaded_file(uploaded_file):
    """Persist an uploaded JSON file, validate its word list, and register it.

    The file is written under UPLOAD_FOLDER with a UUID prefix (so repeated
    uploads of the same name never collide), validated via
    load_words_from_json, and on success the data-file registry is refreshed.

    Returns the new file id ("uploaded_<stored name>") on success, otherwise
    None (no file given, validation failed, or an error occurred).
    """
    if uploaded_file is None:
        return None

    stored_name = f"{uuid.uuid4()}_{uploaded_file.name}"
    dest_path = os.path.join(UPLOAD_FOLDER, stored_name)

    try:
        with open(dest_path, 'wb') as out:
            out.write(uploaded_file.getbuffer())
        st.info(f"파일 '{uploaded_file.name}' μ €μž₯ μ™„λ£Œ. λ‚΄μš© 검증 쀑...")

        word_entries = load_words_from_json(dest_path)
        if not word_entries:
            # Invalid or empty payload: remove the file we just wrote.
            os.remove(dest_path)
            st.error(f"μ—…λ‘œλ“œλœ 파일 '{uploaded_file.name}'μ—μ„œ μœ νš¨ν•œ 'word' 데이터λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. μ‚­μ œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
            return None

        st.success(f"파일 '{uploaded_file.name}' 검증 μ™„λ£Œ ({len(word_entries)} 단어).")
        # Refresh the registry immediately so the new file shows up in the UI.
        st.session_state.data_files = scan_data_files()
        return f"uploaded_{stored_name}"

    except Exception as e:
        st.error(f"파일 μ—…λ‘œλ“œ 처리 쀑 였λ₯˜: {e}")
        # Best-effort cleanup of a partially written file.
        if os.path.exists(dest_path):
            os.remove(dest_path)
        return None
 
491
def delete_file(file_id):
    """Delete an uploaded data file and purge every trace of it from state.

    Only files of type 'uploaded' may be deleted; built-in ('default') data
    files are protected. On success the file is removed from disk, dropped
    from st.session_state.data_files and selected_files, and every cached
    graph that was built from it is invalidated.

    Returns True on success, False otherwise.
    """
    current_data_files = st.session_state.get('data_files', {})
    if file_id not in current_data_files:
        st.error('μ‚­μ œν•  νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.')
        return False

    file_info = current_data_files[file_id]
    if file_info.get('type') != 'uploaded':
        st.error('κΈ°λ³Έ 데이터 νŒŒμΌμ€ μ‚­μ œν•  수 μ—†μŠ΅λ‹ˆλ‹€.')
        return False

    file_path = file_info.get('path')
    file_name = file_info.get('name', 'μ•Œ 수 μ—†μŒ')

    try:
        if file_path and os.path.exists(file_path):
            os.remove(file_path)
            st.info(f"파일 '{file_name}' μ‚­μ œ μ™„λ£Œ.")
        else:
            st.warning(f"파일 '{file_name}'({file_path})을 찾을 수 μ—†κ±°λ‚˜ 이미 μ‚­μ œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")

        # Drop session-state references to the file.
        del st.session_state.data_files[file_id]
        if file_id in st.session_state.selected_files:
            st.session_state.selected_files.remove(file_id)

        # Invalidate cached graphs built from this file.
        # BUG FIX: cache keys look like "<fid1>-<fid2>_<params>_fasttext" and
        # file ids themselves contain underscores ("uploaded_<uuid>_<name>"),
        # so the previous test `file_id in k.split('_')[0]` could never match
        # and stale graphs survived deletion. Match the full key instead.
        keys_to_remove = [k for k in st.session_state.graph_cache if file_id in k]
        for key in keys_to_remove:
            del st.session_state.graph_cache[key]
        if keys_to_remove:
            st.info(f"{len(keys_to_remove)}개 κ΄€λ ¨ κ·Έλž˜ν”„ μΊμ‹œ μ‚­μ œ.")

        st.success(f"'{file_name}' κ΄€λ ¨ 정보 및 μΊμ‹œ μ‚­μ œ μ™„λ£Œ.")
        return True

    except Exception as e:
        st.error(f"파일 μ‚­μ œ 쀑 였λ₯˜ λ°œμƒ: {e}")
        return False
530
+
531
+ # --- μΊμ‹œ μ΄ˆκΈ°ν™” ν•¨μˆ˜ ---
532
  def clear_cache():
533
+ """ κ·Έλž˜ν”„ μΊμ‹œλ₯Ό μ΄ˆκΈ°ν™”ν•©λ‹ˆλ‹€. """
534
  st.session_state.graph_cache = {}
535
+ # st.session_state.embeddings_cache = {} # μž„λ² λ”© μΊμ‹œλŠ” ν˜„μž¬ μ‚¬μš© μ•ˆ 함
536
+ st.session_state.fig = None
537
+ st.success('κ·Έλž˜ν”„ μΊμ‹œκ°€ μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€.')
538
+ st.rerun() # UI κ°±μ‹ 
539
+
540
+
541
# ==============================================================================
# --- Streamlit app execution ---
# ==============================================================================

# --- Startup initialization ---
# Load the FastText model once per session.
if 'fasttext_model' not in st.session_state or st.session_state.fasttext_model is None:
    st.session_state.fasttext_model = load_fasttext_model(FASTTEXT_MODEL_PATH)

# Scan available data files (default + uploaded) if the registry is empty.
if 'data_files' not in st.session_state or not st.session_state.data_files:
    st.session_state.data_files = scan_data_files()

# Title and intro
st.title('ν•œκ΅­μ–΄ 단어 의미 λ„€νŠΈμ›Œν¬ μ‹œκ°ν™” (FastText 기반)')
st.markdown("""
이 λ„κ΅¬λŠ” JSON 파일의 단어 λͺ©λ‘μ„ **FastText μž„λ² λ”©**으둜 λ³€ν™˜ν•˜μ—¬ 의미적 μœ μ‚¬μ„±μ„ κ³„μ‚°ν•˜κ³ , κ·Έ 관계λ₯Ό 3D λ„€νŠΈμ›Œν¬ κ·Έλž˜ν”„λ‘œ μ‹œκ°ν™”ν•©λ‹ˆλ‹€.
μœ μ‚¬ν•œ 의미의 단어듀이 μ„œλ‘œ κ°€κΉκ²Œ λ°°μΉ˜λ˜λŠ” κ²½ν–₯을 λ³΄μž…λ‹ˆλ‹€.
""")

# Abort early when the model failed to load — nothing below can work.
if st.session_state.get('fasttext_model') is None:
    st.error("FastText λͺ¨λΈ λ‘œλ”© μ‹€νŒ¨. μ½”λ“œ μƒλ‹¨μ˜ `FASTTEXT_MODEL_PATH` 섀정을 ν™•μΈν•˜κ³  앱을 μž¬μ‹€ν–‰ν•΄μ£Όμ„Έμš”.")
    st.stop()  # halt the script; model is mandatory


# --- Sidebar ---
st.sidebar.title('βš™οΈ μ„€μ • 및 μ œμ–΄')

# 1. Similarity threshold
threshold = st.sidebar.slider(
    'μœ μ‚¬λ„ μž„κ³„κ°’ (Similarity Threshold)', 0.1, 0.95, st.session_state.threshold, 0.05,
    help='이 κ°’ μ΄μƒμœΌλ‘œ μœ μ‚¬ν•œ λ‹¨μ–΄λ§Œ μ—°κ²°ν•©λ‹ˆλ‹€. λ†’μ„μˆ˜λ‘ 연결이 μ—„κ²©ν•΄μ§‘λ‹ˆλ‹€.'
)
if threshold != st.session_state.threshold:
    st.session_state.threshold = threshold
    st.session_state.fig = None  # setting changed -> current graph is stale
    st.session_state.generate_clicked = False

st.sidebar.divider()

# 2. t-SNE parameters (fine-tuning of the visualization)
st.sidebar.header("t-SNE νŒŒλΌλ―Έν„° (κ³ κΈ‰)")
perplexity = st.sidebar.slider(
    "Perplexity", 5, 50, st.session_state.perplexity, 1,
    help="각 점이 κ³ λ €ν•˜λŠ” 이웃 μˆ˜μ™€ κ΄€λ ¨. κ΅°μ§‘ ν˜•νƒœμ— 영ν–₯."
)
learning_rate = st.sidebar.select_slider(
    "Learning Rate", options=[10, 50, 100, 200, 500, 1000, 'auto'], value=st.session_state.learning_rate,
    help="μ΅œμ ν™” ν•™μŠ΅ 속도. κ΅°μ§‘ κ°„ 거리에 영ν–₯."
)
n_iter = st.sidebar.select_slider(
    "Iterations", options=[250, 500, 1000, 2000, 5000], value=st.session_state.n_iter,
    help="μ΅œμ ν™” 반볡 횟수. λ†’μ„μˆ˜λ‘ μ•ˆμ •μ μ΄λ‚˜ 였래 κ±Έλ¦Ό."
)
# Any t-SNE parameter change invalidates the current figure.
if (perplexity != st.session_state.perplexity or
        learning_rate != st.session_state.learning_rate or
        n_iter != st.session_state.n_iter):
    st.session_state.perplexity = perplexity
    st.session_state.learning_rate = learning_rate
    st.session_state.n_iter = n_iter
    st.session_state.fig = None
    st.session_state.generate_clicked = False

st.sidebar.divider()

# 3. File upload
st.sidebar.header('πŸ“„ 파일 μ—…λ‘œλ“œ')
uploaded_file = st.sidebar.file_uploader(
    "JSON 파일 μ—…λ‘œλ“œ (ν˜•μ‹: [{'word': '단어1'}, ...])", type=['json']
)
if uploaded_file is not None:
    with st.spinner("μ—…λ‘œλ“œλœ 파일 처리 쀑..."):
        new_file_id = handle_uploaded_file(uploaded_file)
        if new_file_id:
            st.sidebar.success(f"파일 '{uploaded_file.name}' μ—…λ‘œλ“œ μ™„λ£Œ!")
            # Auto-select the freshly uploaded file.
            if new_file_id not in st.session_state.selected_files:
                st.session_state.selected_files.append(new_file_id)
            st.rerun()  # refresh the UI immediately

st.sidebar.divider()

# 4. Data-file selection
st.sidebar.header('πŸ—‚οΈ 데이터 파일 선택')
current_data_files = st.session_state.get('data_files', {})
if current_data_files:
    st.sidebar.markdown("**μ‚¬μš©ν•  νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”:**")
    selected_files_temp = []
    sorted_file_ids = sorted(current_data_files.keys(), key=lambda fid: current_data_files[fid]['name'])

    for file_id in sorted_file_ids:
        if file_id not in current_data_files: continue  # defensive guard
        file_info = current_data_files[file_id]
        file_label = f"{file_info['name']} ({file_info['word_count']} 단어)"
        file_type_tag = "[κΈ°λ³Έ]" if file_info['type'] == 'default' else "[μ—…λ‘œλ“œ]"
        label_full = f"{file_label} {file_type_tag}"
        is_selected = file_id in st.session_state.selected_files

        # Checkbox reflects (and captures) the selection state for this file.
        if st.sidebar.checkbox(label_full, value=is_selected, key=f"cb_{file_id}"):
            selected_files_temp.append(file_id)
        # Per-file expander with sample words and (for uploads) a delete button.
        # NOTE(review): placed at loop level (shown for every file) — the
        # diff rendering leaves the exact nesting ambiguous; confirm.
        with st.sidebar.expander("파일 정보 보기", expanded=False):
            st.markdown(f"**μƒ˜ν”Œ:** `{'`, `'.join(file_info['sample_words'])}`")
            if file_info['type'] == 'uploaded':
                if st.button('πŸ—‘οΈ 이 파일 μ‚­μ œ', key=f"del_{file_id}", help=f"'{file_info['name']}' μ‚­μ œ"):
                    if delete_file(file_id):
                        st.rerun()  # refresh UI after successful deletion

    # Selection changed -> persist it and invalidate the current graph.
    if sorted(selected_files_temp) != sorted(st.session_state.selected_files):
        st.session_state.selected_files = selected_files_temp
        st.session_state.fig = None
        st.session_state.generate_clicked = False
        st.rerun()  # reflect the new selection immediately

    st.sidebar.divider()

    # 5. Graph generation button
    if st.session_state.selected_files:
        if st.sidebar.button('πŸ“Š κ·Έλž˜ν”„ 생성/μ—…λ°μ΄νŠΈ', key='generate_button', type="primary"):
            st.session_state.generate_clicked = True
            # The click itself triggers a rerun, so only the flag is set here.
    else:
        st.sidebar.warning('κ·Έλž˜ν”„λ₯Ό 생성할 νŒŒμΌμ„ 1개 이상 μ„ νƒν•΄μ£Όμ„Έμš”.')

else:
    st.sidebar.info('μ‚¬μš© κ°€λŠ₯ν•œ 데이터 파일이 μ—†μŠ΅λ‹ˆλ‹€. νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ±°λ‚˜ `data` 폴더에 JSON νŒŒμΌμ„ μΆ”κ°€ν•˜μ„Έμš”.')

st.sidebar.divider()

# 6. Cache-reset button
if st.sidebar.button('πŸ”„ μΊμ‹œ μ΄ˆκΈ°ν™”', key='clear_cache_button'):
    clear_cache()


# --- Main content area ---
st.header("πŸ“ˆ 3D 단어 λ„€νŠΈμ›Œν¬ μ‹œκ°ν™”")

# Graph display logic
if st.session_state.selected_files:
    # Generate when the button was clicked, or when files are selected but
    # no figure exists yet.
    should_generate_graph = st.session_state.generate_clicked or \
                            (st.session_state.fig is None and st.session_state.selected_files)

    if should_generate_graph and st.session_state.get('fasttext_model'):  # model must be loaded
        with st.spinner('의미 기반 κ·Έλž˜ν”„ 생성 쀑... μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ„Έμš”.'):
            try:
                # Call generate_graph with every relevant parameter.
                fig = generate_graph(
                    st.session_state.selected_files,
                    st.session_state.threshold,
                    st.session_state.perplexity,
                    st.session_state.learning_rate,
                    st.session_state.n_iter
                )
                st.session_state.fig = fig  # store on success
            except Exception as e:
                st.error(f"κ·Έλž˜ν”„ 생성 쀑 μ‹¬κ°ν•œ 였λ₯˜ λ°œμƒ: {e}")
                st.session_state.fig = None  # reset on failure
            finally:
                st.session_state.generate_clicked = False  # consume the click flag

    # Display the generated figure if one exists.
    if st.session_state.get('fig') is not None:
        st.plotly_chart(st.session_state.fig, use_container_width=True)

        # Summary panel for the current graph.
        try:
            # data[1] is the node trace; data[0] is the edge trace whose x
            # holds (start, end, None) triples, hence the // 3.
            num_nodes = len(st.session_state.fig.data[1].x) if len(st.session_state.fig.data) > 1 and hasattr(st.session_state.fig.data[1], 'x') else 0
            num_edges = len(st.session_state.fig.data[0].x) // 3 if len(st.session_state.fig.data) > 0 and hasattr(st.session_state.fig.data[0], 'x') and st.session_state.fig.data[0].x else 0

            # Resolve the display names of the files in use.
            current_data_files = st.session_state.get('data_files', {})
            selected_file_names = [current_data_files[fid]['name'] for fid in st.session_state.selected_files if fid in current_data_files]

            st.info(f"""
            **ν˜„μž¬ κ·Έλž˜ν”„ 정보**
            - **데이터 파일:** {', '.join(selected_file_names)}
            - **고유 단어 수 (λ…Έλ“œ):** {num_nodes} 개
            - **μ—°κ²°μ„  수 (μ—£μ§€):** {num_edges} 개 (μœ μ‚¬λ„ β‰₯ {st.session_state.threshold:.2f})
            """)
        except Exception as info_e:
            st.warning(f"κ·Έλž˜ν”„ 정보 ν‘œμ‹œ 쀑 였λ₯˜: {info_e}")

    # Usage help
        with st.expander("πŸ’‘ κ·Έλž˜ν”„ μ‘°μž‘ 방법"):
            st.markdown("""
            - **ν™•λŒ€/μΆ•μ†Œ:** 마우슀 휠 슀크둀
            - **νšŒμ „:** 마우슀 μ™Όμͺ½ λ²„νŠΌ λˆ„λ₯Έ μƒνƒœλ‘œ λ“œλž˜κ·Έ
            - **이동 (Pan):** 마우슀 였λ₯Έμͺ½ λ²„νŠΌ λˆ„λ₯Έ μƒνƒœλ‘œ λ“œλž˜κ·Έ
            - **단어 정보 확인:** 마우슀 μ»€μ„œλ₯Ό 단어(마컀) μœ„μ— 올리면 단어 이름과 μ—°κ²° 수λ₯Ό λ³Ό 수 μžˆμŠ΅λ‹ˆλ‹€.
            - **νˆ΄λ°”:** κ·Έλž˜ν”„ 우츑 상단 νˆ΄λ°” μ•„μ΄μ½˜μœΌλ‘œ λ‹€μ–‘ν•œ κΈ°λŠ₯(λ‹€μš΄λ‘œλ“œ, μ΄ˆκΈ°ν™” λ“±) μ‚¬μš© κ°€λŠ₯.
            """)
    # Generation still pending (or failed): prompt the user to click the button.
    elif not should_generate_graph and st.session_state.fig is None:
        st.info("πŸ‘ˆ μ‚¬μ΄λ“œλ°”μ—μ„œ 'πŸ“Š κ·Έλž˜ν”„ 생성/μ—…λ°μ΄νŠΈ' λ²„νŠΌμ„ ν΄λ¦­ν•˜μ—¬ μ‹œκ°ν™”λ₯Ό μ‹œμž‘ν•˜μ„Έμš”.")

# No data files at all
elif not st.session_state.data_files:
    st.warning("ν‘œμ‹œν•  데이터 파일이 μ—†μŠ΅λ‹ˆλ‹€. νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ±°λ‚˜ `data` 폴더에 μœ νš¨ν•œ JSON νŒŒμΌμ„ μΆ”κ°€ν•˜μ„Έμš”.")
else:  # files exist but none selected
    st.info("πŸ‘ˆ μ‚¬μ΄λ“œλ°”μ—μ„œ 뢄석할 데이터 νŒŒμΌμ„ μ„ νƒν•΄μ£Όμ„Έμš”.")


# --- Footer / about section ---
st.divider()
with st.expander("ℹ️ 이 μ‹œκ°ν™” 도ꡬ에 λŒ€ν•˜μ—¬"):
    st.markdown(f"""
    이 λ„κ΅¬λŠ” λ‹€μŒκ³Ό 같은 과정을 톡해 ν•œκ΅­μ–΄ 단어 λ„€νŠΈμ›Œν¬λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€:

    1. **데이터 λ‘œλ”©:** μ‚¬μš©μžκ°€ μ œκ³΅ν•œ JSON νŒŒμΌμ—μ„œ 'word' ν•„λ“œλ₯Ό κ°€μ§„ 단어 λͺ©λ‘μ„ μΆ”μΆœν•©λ‹ˆλ‹€.
    2. **단어 μž„λ² λ”© (FastText):** 각 단어λ₯Ό **사전 ν•™μŠ΅λœ FastText λͺ¨λΈ**(`{os.path.basename(FASTTEXT_MODEL_PATH)}` μ‚¬μš© 쀑)을 μ‚¬μš©ν•˜μ—¬ κ³ μ°¨μ›μ˜ 의미 λ²‘ν„°λ‘œ λ³€ν™˜ν•©λ‹ˆλ‹€.
    3. **μœ μ‚¬λ„ 계산:** 단어 벑터 κ°„μ˜ **코사인 μœ μ‚¬λ„**λ₯Ό κ³„μ‚°ν•©λ‹ˆλ‹€.
    4. **차원 μΆ•μ†Œ (t-SNE):** 고차원 벑터λ₯Ό 3μ°¨μ›μœΌλ‘œ μΆ•μ†Œν•˜μ—¬ μ‹œκ°ν™”ν•©λ‹ˆλ‹€. t-SNE νŒŒλΌλ―Έν„°(Perplexity: {st.session_state.perplexity}, Learning Rate: {st.session_state.learning_rate}, Iterations: {st.session_state.n_iter})λ₯Ό μ‘°μ ˆν•˜μ—¬ κ΅°μ§‘ ν˜•νƒœλ₯Ό λ―Έμ„Έ μ‘°μ •ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
    5. **κ·Έλž˜ν”„ 생성 및 μ‹œκ°ν™”:** μœ μ‚¬λ„κ°€ μ„€μ •λœ μž„κ³„κ°’(ν˜„μž¬: {st.session_state.threshold:.2f}) 이상인 단어듀을 μ—°κ²°ν•˜μ—¬ 3D λ„€νŠΈμ›Œν¬ κ·Έλž˜ν”„λ₯Ό μƒμ„±ν•˜κ³  ν‘œμ‹œν•©λ‹ˆλ‹€.
    """)