wldud7568 committed on
Commit
f3b0be8
·
verified ·
1 Parent(s): 8e9954b

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +129 -26
src/streamlit_app.py CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path
8
  import logging
9
  from sentence_transformers import SentenceTransformer
10
  import faiss
11
- import pickle
12
  from rank_bm25 import BM25Okapi
13
 
14
  # κΈ°λ³Έ λ‘œκΉ… μ„€μ •
@@ -192,8 +192,104 @@ class HybridMultiCollectionSearcher:
192
 
193
  return boost_score
194
 
195
- def load_collections(self, save_dir: str):
196
- """μ €μž₯된 ν•˜μ΄λΈŒλ¦¬λ“œ μ»¬λ ‰μ…˜λ“€ λ‘œλ“œ (FAISS + BM25)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  save_dir = Path(save_dir)
198
 
199
  if not save_dir.exists():
@@ -215,23 +311,26 @@ class HybridMultiCollectionSearcher:
215
 
216
  faiss_index = faiss.read_index(str(faiss_path))
217
 
218
- # 2. BM25 인덱슀 λ‘œλ“œ
219
- bm25_path = collection_dir / "bm25.pkl"
220
- if not bm25_path.exists():
221
- logger.warning(f"BM25 μΈλ±μŠ€κ°€ μ—†μŠ΅λ‹ˆλ‹€: {bm25_path}")
222
  continue
223
 
224
- with open(bm25_path, 'rb') as f:
225
- bm25_index = pickle.load(f)
226
 
227
- # 3. 메타데이터 λ‘œλ“œ
228
- metadata_path = collection_dir / "metadata.pkl"
 
 
 
229
  if not metadata_path.exists():
230
  logger.warning(f"메타데이터가 μ—†μŠ΅λ‹ˆλ‹€: {metadata_path}")
231
  continue
232
 
233
- with open(metadata_path, 'rb') as f:
234
- save_data = pickle.load(f)
235
 
236
  # μ»¬λ ‰μ…˜ 볡원
237
  self.collections[collection_name] = {
@@ -362,20 +461,24 @@ def main():
362
  # 검색기 μ΄ˆκΈ°ν™”
363
  if 'searcher' not in st.session_state:
364
  with st.spinner('검색 μ‹œμŠ€ν…œ μ΄ˆκΈ°ν™” 쀑...'):
365
- st.session_state.searcher = HybridMultiCollectionSearcher(target_system=target_system)
366
-
367
- # μ €μž₯된 μ»¬λ ‰μ…˜ λ‘œλ“œ μ‹œλ„
368
- collection_path = "./saved_collections" # λ˜λŠ” "./collections"
369
- success = st.session_state.searcher.load_collections(collection_path)
370
-
371
- if success:
372
- available_collections = st.session_state.searcher.list_collections()
373
- st.success(f"βœ… 검색 μ‹œμŠ€ν…œμ΄ μ€€λΉ„λ˜μ—ˆμŠ΅λ‹ˆλ‹€! λ‘œλ“œλœ μ»¬λ ‰μ…˜: {', '.join(available_collections)}")
374
- else:
375
- st.error("❌ μ €μž₯된 μ»¬λ ‰μ…˜μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. μ»¬λ ‰μ…˜ νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
376
- st.info("πŸ’‘ `saved_collections` 폴더에 미리 μƒμ„±λœ μ»¬λ ‰μ…˜ νŒŒμΌλ“€(.pkl, .index)을 μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
 
 
 
 
377
 
378
- # μ‚¬μš© κ°€λŠ₯ν•œ μ»¬λ ‰μ…˜ 확인
379
  if 'searcher' in st.session_state:
380
  available_collections = st.session_state.searcher.list_collections()
381
 
 
8
  import logging
9
  from sentence_transformers import SentenceTransformer
10
  import faiss
11
+ import json
12
  from rank_bm25 import BM25Okapi
13
 
14
  # κΈ°λ³Έ λ‘œκΉ… μ„€μ •
 
192
 
193
  return boost_score
194
 
195
+ def create_sample_collection(self, collection_name: str):
196
+ """μƒ˜ν”Œ λ°μ΄ν„°λ‘œ μ»¬λ ‰μ…˜ 생성"""
197
+ try:
198
+ if self.model is None:
199
+ self.model = self.load_model()
200
+ if self.model is None:
201
+ return False
202
+
203
+ # μƒ˜ν”Œ 데이터
204
+ sample_data = [
205
+ {
206
+ 'chunk_id': 'sample_001',
207
+ 'content': 'μˆ˜λ™λ³€μ†κΈ° νƒˆκ±° μ‹œμ—λŠ” λ¨Όμ € 엔진을 μ •μ§€ν•˜κ³  변속기 μ˜€μΌμ„ λ°°μΆœν•©λ‹ˆλ‹€. 클러치λ₯Ό λΆ„λ¦¬ν•œ ν›„ 변속기λ₯Ό νƒˆκ±°ν•©λ‹ˆλ‹€.',
208
+ 'metadata': {
209
+ 'chunk_id': 'sample_001',
210
+ 'content_type': 'νƒˆκ±°λ°©λ²•',
211
+ 'main_topic': 'μˆ˜λ™λ³€μ†κΈ° νƒˆκ±°',
212
+ 'vehicle_info': {'system': 'μˆ˜λ™λ³€μ†κΈ°', 'model': 'μ—μ–΄λ‘œμ‹œν‹°'},
213
+ 'category_levels': ['변속기', 'μˆ˜λ™λ³€μ†κΈ°', 'νƒˆκ±°λ°©λ²•'],
214
+ 'extracted_components': ['변속기', '클러치']
215
+ }
216
+ },
217
+ {
218
+ 'chunk_id': 'sample_002',
219
+ 'content': 'μˆ˜λ™λ³€μ†κΈ° μž₯착은 νƒˆκ±°μ˜ μ—­μˆœμœΌλ‘œ μ§„ν–‰ν•©λ‹ˆλ‹€. 변속기λ₯Ό μ •ν™•ν•œ μœ„μΉ˜μ— κ³ μ •ν•˜κ³  클러치λ₯Ό μ—°κ²°ν•©λ‹ˆλ‹€.',
220
+ 'metadata': {
221
+ 'chunk_id': 'sample_002',
222
+ 'content_type': 'μž₯착방법',
223
+ 'main_topic': 'μˆ˜λ™λ³€μ†κΈ° μž₯μ°©',
224
+ 'vehicle_info': {'system': 'μˆ˜λ™λ³€μ†κΈ°', 'model': 'μ—μ–΄λ‘œμ‹œν‹°'},
225
+ 'category_levels': ['변속기', 'μˆ˜λ™λ³€μ†κΈ°', 'μž₯착방법'],
226
+ 'extracted_components': ['변속기', '클러치']
227
+ }
228
+ },
229
+ {
230
+ 'chunk_id': 'sample_003',
231
+ 'content': '변속기 였일 점검 μ‹œ 였일 레벨과 였일 μƒνƒœλ₯Ό ν™•μΈν•©λ‹ˆλ‹€. κ·œμ •λŸ‰μ€ 2.5L이며 였일 μ˜¨λ„λŠ” 80Β°Cμ—μ„œ μΈ‘μ •ν•©λ‹ˆλ‹€.',
232
+ 'metadata': {
233
+ 'chunk_id': 'sample_003',
234
+ 'content_type': 'μ κ²€μ ˆμ°¨',
235
+ 'main_topic': '였일 점검',
236
+ 'vehicle_info': {'system': 'μˆ˜λ™λ³€μ†κΈ°', 'model': 'μ—μ–΄λ‘œμ‹œν‹°'},
237
+ 'category_levels': ['변속기', 'μˆ˜λ™λ³€μ†κΈ°', 'μ κ²€μ ˆμ°¨'],
238
+ 'extracted_components': ['였일']
239
+ }
240
+ }
241
+ ]
242
+
243
+ # 검색 ν…μŠ€νŠΈ 생성
244
+ search_texts = []
245
+ metadata_list = []
246
+ content_dict = {}
247
+
248
+ for data in sample_data:
249
+ metadata = data['metadata']
250
+ content = data['content']
251
+
252
+ # κ²€μƒ‰μš© ν…μŠ€νŠΈ ꡬ성
253
+ search_components = [
254
+ metadata.get('content_type', ''),
255
+ metadata.get('main_topic', ''),
256
+ ' '.join(metadata.get('category_levels', [])),
257
+ content
258
+ ]
259
+
260
+ search_text = self._extract_nouns_and_verbs(' '.join(search_components))
261
+ search_texts.append(search_text)
262
+ metadata_list.append(metadata)
263
+ content_dict[metadata['chunk_id']] = content
264
+
265
+ # 벑터 μž„λ² λ”© 생성
266
+ embeddings = self.model.encode(search_texts, show_progress_bar=False)
267
+
268
+ # FAISS 인덱슀 생성
269
+ embedding_dim = embeddings.shape[1]
270
+ faiss.normalize_L2(embeddings)
271
+ faiss_index = faiss.IndexFlatIP(embedding_dim)
272
+ faiss_index.add(embeddings.astype(np.float32))
273
+
274
+ # BM25 인덱슀 생성
275
+ tokenized_docs = [text.split() for text in search_texts]
276
+ bm25_index = BM25Okapi(tokenized_docs)
277
+
278
+ # μ»¬λ ‰μ…˜ μ €μž₯
279
+ self.collections[collection_name] = {
280
+ 'metadata_list': metadata_list,
281
+ 'content_dict': content_dict,
282
+ 'search_texts': search_texts,
283
+ 'faiss_index': faiss_index
284
+ }
285
+
286
+ self.bm25_indexes[collection_name] = bm25_index
287
+ return True
288
+
289
+ except Exception as e:
290
+ logger.error(f"μƒ˜ν”Œ μ»¬λ ‰μ…˜ 생성 μ‹€νŒ¨: {e}")
291
+ return False
292
+ """μ €μž₯된 ν•˜μ΄λΈŒλ¦¬λ“œ μ»¬λ ‰μ…˜λ“€ λ‘œλ“œ (FAISS + BM25) - pickle 없이"""
293
  save_dir = Path(save_dir)
294
 
295
  if not save_dir.exists():
 
311
 
312
  faiss_index = faiss.read_index(str(faiss_path))
313
 
314
+ # 2. BM25 토큰 데이터 λ‘œλ“œ (JSON)
315
+ bm25_tokens_path = collection_dir / "bm25_tokens.json"
316
+ if not bm25_tokens_path.exists():
317
+ logger.warning(f"BM25 토큰 데이터가 μ—†μŠ΅λ‹ˆλ‹€: {bm25_tokens_path}")
318
  continue
319
 
320
+ with open(bm25_tokens_path, 'r', encoding='utf-8') as f:
321
+ tokenized_docs = json.load(f)
322
 
323
+ # BM25 인덱슀 μž¬μƒμ„±
324
+ bm25_index = BM25Okapi(tokenized_docs)
325
+
326
+ # 3. 메타데이터 λ‘œλ“œ (JSON)
327
+ metadata_path = collection_dir / "metadata.json"
328
  if not metadata_path.exists():
329
  logger.warning(f"메타데이터가 μ—†μŠ΅λ‹ˆλ‹€: {metadata_path}")
330
  continue
331
 
332
+ with open(metadata_path, 'r', encoding='utf-8') as f:
333
+ save_data = json.load(f)
334
 
335
  # μ»¬λ ‰μ…˜ 볡원
336
  self.collections[collection_name] = {
 
461
  # 검색기 μ΄ˆκΈ°ν™”
462
  if 'searcher' not in st.session_state:
463
  with st.spinner('검색 μ‹œμŠ€ν…œ μ΄ˆκΈ°ν™” 쀑...'):
464
+ try:
465
+ st.session_state.searcher = HybridMultiCollectionSearcher(target_system=target_system)
466
+
467
+ # λ¨Όμ € μƒ˜ν”Œ λ°μ΄ν„°λ‘œ ν…ŒμŠ€νŠΈ
468
+ st.info("πŸ§ͺ μƒ˜ν”Œ λ°μ΄ν„°λ‘œ ν…ŒμŠ€νŠΈ 쀑...")
469
+ success = st.session_state.searcher.create_sample_collection("ν…ŒμŠ€νŠΈ")
470
+
471
+ if success:
472
+ st.success("βœ… μƒ˜ν”Œ 검색 μ‹œμŠ€ν…œμ΄ μ€€λΉ„λ˜μ—ˆμŠ΅λ‹ˆλ‹€!")
473
+ st.info("πŸ’‘ μ‹€μ œ μ»¬λ ‰μ…˜μ„ μ‚¬μš©ν•˜λ €λ©΄ `saved_collections` 폴더λ₯Ό μ—…λ‘œλ“œν•˜μ„Έμš”.")
474
+ else:
475
+ st.error("❌ μ‹œμŠ€ν…œ μ΄ˆκΈ°ν™”μ— μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€.")
476
+
477
+ except Exception as e:
478
+ st.error(f"❌ μ΄ˆκΈ°ν™” 였λ₯˜: {str(e)}")
479
+ st.info("πŸ”§ 문제λ₯Ό ν•΄κ²°ν•˜λŠ” μ€‘μž…λ‹ˆλ‹€...")
480
 
481
+ # 검색기가 μžˆλŠ” κ²½μš°μ—λ§Œ μ§„ν–‰
482
  if 'searcher' in st.session_state:
483
  available_collections = st.session_state.searcher.list_collections()
484