wldud7568 commited on
Commit
8652256
Β·
verified Β·
1 Parent(s): 61347c5

Upload 9 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ saved_collections/μˆ˜λ™λ³€μ†κΈ°/faiss.index filter=lfs diff=lfs merge=lfs -text
37
+ saved_collections/μ—”μ§„[[:space:]]기계[[:space:]]μ‹œμŠ€ν…œ/faiss.index filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,21 +1,22 @@
1
- FROM python:3.9-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- software-properties-common \
9
- git \
10
- && rm -rf /var/lib/apt/lists/*
11
-
12
- COPY requirements.txt ./
13
- COPY src/ ./src/
14
-
15
- RUN pip3 install -r requirements.txt
16
-
17
- EXPOSE 8501
18
-
19
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
-
21
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /code
4
+
5
+ # μ‹œμŠ€ν…œ νŒ¨ν‚€μ§€ μ„€μΉ˜ (컴파일 ν•„μš”ν•œ νŒ¨ν‚€μ§€λ“€ λ•Œλ¬Έμ—)
6
+ RUN apt-get update && apt-get install -y \
7
+ gcc \
8
+ g++ \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ # requirements.txt 볡사 및 νŒ¨ν‚€μ§€ μ„€μΉ˜
12
+ COPY ./requirements.txt /code/requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
14
+
15
+ # μ•± νŒŒμΌλ“€ 볡사
16
+ COPY . /code
17
+
18
+ # 포트 μ„€μ • (ν—ˆκΉ…νŽ˜μ΄μŠ€λŠ” 7860 포트 μ‚¬μš©)
19
+ EXPOSE 7860
20
+
21
+ # Streamlit μ‹€ν–‰
22
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.headless=true", "--server.enableCORS=false", "--server.enableXsrfProtection=false"]
app.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import os
import json
import re
import numpy as np
from typing import List, Dict, Tuple, Optional
from pathlib import Path
import logging
from sentence_transformers import SentenceTransformer
import faiss
import pickle
from rank_bm25 import BM25Okapi

# Basic logging configuration; module-level logger for this app
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Page configuration — must be the first Streamlit call in the script
st.set_page_config(
    page_title="ν•˜μ΄λΈŒλ¦¬λ“œ μ°¨λŸ‰ μ •λΉ„ 검색 μ‹œμŠ€ν…œ",
    page_icon="πŸ”§",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS classes used by the HTML snippets rendered below
# (main-header, search-container, result-card, score-badge,
# category-badge, content-text, metric-card)
st.markdown("""
<style>
.main-header {
    font-size: 2.5rem;
    color: #1f4e79;
    text-align: center;
    margin-bottom: 2rem;
    font-weight: bold;
}

.search-container {
    background-color: #f8f9fa;
    padding: 2rem;
    border-radius: 10px;
    margin-bottom: 2rem;
    border-left: 5px solid #1f4e79;
}

.result-card {
    background-color: white;
    padding: 1.5rem;
    border-radius: 8px;
    margin-bottom: 1rem;
    border: 1px solid #dee2e6;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

.score-badge {
    background-color: #e3f2fd;
    color: #1565c0;
    padding: 0.25rem 0.75rem;
    border-radius: 15px;
    font-size: 0.8rem;
    font-weight: bold;
}

.category-badge {
    background-color: #f3e5f5;
    color: #7b1fa2;
    padding: 0.25rem 0.75rem;
    border-radius: 15px;
    font-size: 0.8rem;
    margin-right: 0.5rem;
}

.content-text {
    background-color: #f8f9fa;
    padding: 1rem;
    border-radius: 5px;
    border-left: 3px solid #28a745;
    margin-top: 1rem;
    line-height: 1.6;
}

.metric-card {
    background-color: #e8f5e8;
    padding: 1rem;
    border-radius: 5px;
    text-align: center;
    margin: 0.5rem;
}
</style>
""", unsafe_allow_html=True)
90
+
91
# Simple part-name dictionary (used instead of the real vocab.py)
PARTS = [
    "μˆ˜λ™λ³€μ†κΈ°", "클러치", "브레이크", "μ—”μ§„", "타이어", "배터리",
    "였일", "ν•„ν„°", "벨트", "호슀", "νŽŒν”„", "μ„Όμ„œ", "νŠΈλžœμŠ€λ―Έμ…˜",
    "λ””μŠ€ν¬", "νŒ¨λ“œ", "슈", "λ‘œν„°", "캘리퍼", "λ§ˆμŠ€ν„°μ‹€λ¦°λ”"
]

# Simple system-to-parts mapping (used instead of the real parts_config.py)
SYSTEM_PARTS_MAP = {
    "μˆ˜λ™λ³€μ†κΈ°": ["클러치", "변속기", "λ“œλΌμ΄λΈŒμƒ€ν”„νŠΈ", "λ””νΌλ Œμ…œ"],
    "μ—”μ§„": ["ν”ΌμŠ€ν†€", "싀린더", "ν¬λž­ν¬μƒ€ν”„νŠΈ", "μΊ μƒ€ν”„νŠΈ"],
    "브레이크": ["λΈŒλ ˆμ΄ν¬νŒ¨λ“œ", "λΈŒλ ˆμ΄ν¬λ””μŠ€ν¬", "캘리퍼", "λ§ˆμŠ€ν„°μ‹€λ¦°λ”"]
}
104
+
105
def get_specific_parts_for_system(system_name: str) -> list:
    """Return the specific parts mapped to *system_name* (empty list if unknown)."""
    if system_name in SYSTEM_PARTS_MAP:
        return SYSTEM_PARTS_MAP[system_name]
    return []
107
+
108
def get_all_specific_parts() -> list:
    """Return every specific part name across all systems, de-duplicated.

    Fix: the original used ``list(set(all_parts))``, whose ordering varies
    between interpreter runs because of string hash randomization.
    ``dict.fromkeys`` de-duplicates while keeping deterministic,
    first-seen order, so the result is stable across processes.
    """
    all_parts = []
    for parts in SYSTEM_PARTS_MAP.values():
        all_parts.extend(parts)
    # dict preserves insertion order (Python 3.7+), unlike set iteration
    return list(dict.fromkeys(all_parts))
113
+
114
class SimpleMecab:
    """Lightweight stand-in for a MeCab POS tagger."""

    def pos(self, text):
        """Tokenize *text* on whitespace and tag each token longer than one
        character as a noun ('NN'). A real deployment would use MeCab here.
        """
        tagged = []
        for token in text.split():
            if len(token) > 1:
                tagged.append((token, 'NN'))
        return tagged
120
+
121
class HybridMultiCollectionSearcher:
    def __init__(self, model_name: str = "upskyy/bge-m3-korean", target_system: str = None):
        """
        Hybrid multi-collection searcher combining dense vector retrieval
        (FAISS) with sparse keyword retrieval (BM25).
        """
        self.model = None  # SentenceTransformer; loaded lazily via load_model()
        self.collections = {}   # collection name -> {'faiss_index': ..., plus pickled payload}
        self.bm25_indexes = {}  # collection name -> BM25 index
        self.target_system = target_system
        self.mecab = SimpleMecab()  # simple analyzer used in place of real MeCab
        self.model_name = model_name

    @st.cache_resource
    def load_model(_self):
        """Load the embedding model, cached across Streamlit reruns.

        The parameter is named ``_self`` (leading underscore) so Streamlit's
        cache does not attempt to hash the instance.
        """
        try:
            return SentenceTransformer(_self.model_name)
        except Exception as e:
            st.error(f"λͺ¨λΈ λ‘œλ“œ μ‹€νŒ¨: {e}")
            return None

    def _extract_nouns_and_verbs(self, text: str) -> str:
        """Crude noun/verb extraction: pad known part names with spaces, then
        keep whitespace tokens longer than one character. Falls back to the
        raw text on any error."""
        try:
            # Surround known part names with spaces so they tokenize cleanly
            for part in PARTS:
                if part in text:
                    text = text.replace(part, f" {part} ")

            # Simple noun extraction (a real environment would use MeCab)
            morphs = self.mecab.pos(text)
            meaningful_words = []

            for word, pos in morphs:
                if len(word) > 1 and not word.isspace():
                    meaningful_words.append(word)

            return ' '.join(meaningful_words)
        except Exception as e:
            # Best-effort: return the input unchanged rather than failing
            return text

    def _normalize_text_for_matching(self, text: str) -> str:
        # Lowercase and strip periods for loose string matching
        normalized = text.lower()
        normalized = re.sub(r'[.]', '', normalized)
        return normalized

    def _normalize_scores(self, scores: np.ndarray) -> np.ndarray:
        """Min-max normalize scores into the 0-1 range.

        Degenerate inputs (empty, or all values equal) map to a constant 0.5
        so downstream weighting still produces finite numbers.
        """
        scores = np.array(scores)
        if len(scores) == 0 or scores.max() == scores.min():
            return np.ones_like(scores) * 0.5
        return (scores - scores.min()) / (scores.max() - scores.min())

    def _calculate_boost_score(self, original_query: str, processed_query: str, metadata: Dict, content: str) -> float:
        """Heuristic boost: +0.5 per matching work-type keyword
        (removal / installation / inspection) between query and content_type,
        +0.3 when any query word appears in the metadata system name."""
        boost_score = 0
        query_lower = original_query.lower()

        # Content-type matching (νƒˆκ±°=removal, μž₯μ°©=installation, 점검=inspection)
        content_type = metadata.get('content_type', '')
        if 'νƒˆκ±°' in query_lower and 'νƒˆκ±°' in content_type:
            boost_score += 0.5
        if 'μž₯μ°©' in query_lower and 'μž₯μ°©' in content_type:
            boost_score += 0.5
        if '점검' in query_lower and '점검' in content_type:
            boost_score += 0.5

        # System-name matching
        system = metadata.get('vehicle_info', {}).get('system', '')
        if system and any(word in system.lower() for word in query_lower.split()):
            boost_score += 0.3

        return boost_score

    def load_collections(self, save_dir: str):
        """Load saved hybrid collections (FAISS + BM25 + metadata pickles).

        Each subdirectory of *save_dir* is treated as one collection and must
        contain faiss.index, bm25.pkl and metadata.pkl; incomplete or broken
        collections are skipped with a warning. Returns True if at least one
        collection loaded, else False.
        """
        save_dir = Path(save_dir)

        if not save_dir.exists():
            logger.warning(f"μ»¬λ ‰μ…˜ 디렉토리가 μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€: {save_dir}")
            return False

        loaded_collections = []

        for collection_dir in save_dir.iterdir():
            if collection_dir.is_dir():
                collection_name = collection_dir.name

                try:
                    # 1. Load the FAISS index
                    faiss_path = collection_dir / "faiss.index"
                    if not faiss_path.exists():
                        logger.warning(f"FAISS μΈλ±μŠ€κ°€ μ—†μŠ΅λ‹ˆλ‹€: {faiss_path}")
                        continue

                    faiss_index = faiss.read_index(str(faiss_path))

                    # 2. Load the BM25 index
                    # NOTE(review): pickle.load on uploaded files executes
                    # arbitrary code — only load trusted collection files
                    bm25_path = collection_dir / "bm25.pkl"
                    if not bm25_path.exists():
                        logger.warning(f"BM25 μΈλ±μŠ€κ°€ μ—†μŠ΅λ‹ˆλ‹€: {bm25_path}")
                        continue

                    with open(bm25_path, 'rb') as f:
                        bm25_index = pickle.load(f)

                    # 3. Load the metadata payload
                    metadata_path = collection_dir / "metadata.pkl"
                    if not metadata_path.exists():
                        logger.warning(f"메타데이터가 μ—†μŠ΅λ‹ˆλ‹€: {metadata_path}")
                        continue

                    with open(metadata_path, 'rb') as f:
                        save_data = pickle.load(f)

                    # Restore the collection: FAISS index plus the pickled
                    # payload (presumably metadata_list / content_dict —
                    # search_collection relies on those keys)
                    self.collections[collection_name] = {
                        'faiss_index': faiss_index,
                        **save_data
                    }

                    self.bm25_indexes[collection_name] = bm25_index
                    loaded_collections.append(collection_name)

                    logger.info(f"μ»¬λ ‰μ…˜ '{collection_name}' λ‘œλ“œ μ™„λ£Œ")

                except Exception as e:
                    logger.error(f"μ»¬λ ‰μ…˜ '{collection_name}' λ‘œλ“œ μ‹€νŒ¨: {e}")
                    continue

        if loaded_collections:
            logger.info(f"ν•˜μ΄λΈŒλ¦¬λ“œ μ»¬λ ‰μ…˜ λ‘œλ“œ μ™„λ£Œ: {loaded_collections}")
            return True
        else:
            logger.error("λ‘œλ“œλœ μ»¬λ ‰μ…˜μ΄ μ—†μŠ΅λ‹ˆλ‹€.")
            return False

    def list_collections(self) -> List[str]:
        """Return the names of all loaded collections."""
        return list(self.collections.keys())

    def search_collection(self, collection_name: str, query: str, top_k: int = 5, alpha: float = 0.7) -> List[Dict]:
        """Run a hybrid search against one collection.

        The final ranking score is
        ``alpha * dense + (1 - alpha) * sparse + boost``, where dense and
        sparse scores are min-max normalized within this query's candidates.
        Returns at most *top_k* result dicts sorted by hybrid score; returns
        an empty list for unknown collections or if the model fails to load.
        """
        if collection_name not in self.collections:
            return []

        # Lazy model load on first search
        if self.model is None:
            self.model = self.load_model()
            if self.model is None:
                return []

        collection = self.collections[collection_name]
        faiss_index = collection['faiss_index']
        metadata_list = collection['metadata_list']
        content_dict = collection['content_dict']
        bm25_index = self.bm25_indexes[collection_name]

        # Query preprocessing (keyword extraction)
        processed_query = self._extract_nouns_and_verbs(query)

        # Dense (vector) search — normalize so inner product = cosine sim
        query_embedding = self.model.encode([processed_query])
        faiss.normalize_L2(query_embedding)

        # Over-fetch 3x candidates so boosting/re-ranking has room to reorder
        search_k = min(len(metadata_list), top_k * 3)
        dense_similarities, dense_indices = faiss_index.search(
            query_embedding.astype(np.float32), search_k
        )

        # Sparse (keyword) search over the whole corpus
        query_tokens = processed_query.split()
        sparse_scores = bm25_index.get_scores(query_tokens)

        # Normalize both score sets to 0-1 before mixing
        dense_scores_norm = self._normalize_scores(dense_similarities[0])
        sparse_scores_norm = self._normalize_scores(sparse_scores)

        # Build result records for the dense candidates
        results = []
        for i, (similarity, idx) in enumerate(zip(dense_similarities[0], dense_indices[0])):
            if idx == -1:
                # FAISS pads with -1 when fewer than search_k hits exist
                continue

            metadata = metadata_list[idx]
            chunk_id = metadata['chunk_id']
            content = content_dict.get(chunk_id, '')

            dense_score = dense_scores_norm[i]
            sparse_score = sparse_scores_norm[idx] if idx < len(sparse_scores_norm) else 0
            boost_score = self._calculate_boost_score(query, processed_query, metadata, content)

            hybrid_score = (alpha * dense_score + (1 - alpha) * sparse_score + boost_score)

            category_levels = metadata.get('category_levels', [])
            category_path = ' > '.join(category_levels)

            result = {
                'chunk_id': chunk_id,
                'content': content,
                'metadata': metadata,
                'dense_similarity': float(similarity),
                'dense_score': dense_score,
                'sparse_score': sparse_score,
                'boost_score': boost_score,
                'hybrid_score': hybrid_score,
                'vehicle_info': metadata.get('vehicle_info', {}),
                'content_type': metadata.get('content_type', ''),
                'main_topic': metadata.get('main_topic', ''),
                'category_path': category_path,
                'processed_query': processed_query,
            }
            results.append(result)

        # Re-rank by the combined hybrid score and truncate
        results.sort(key=lambda x: x['hybrid_score'], reverse=True)
        return results[:top_k]
336
+
337
# Streamlit app entry point
def main():
    """Render the hybrid vehicle-maintenance search UI.

    Fix: the text-input label contained mojibake
    ("μž…λ ₯οΏ½οΏ½οΏ½μ„Έμš”") — reconstructed to the intended "μž…λ ₯ν•˜μ„Έμš”".
    """
    # Page title
    st.markdown('<h1 class="main-header">πŸ”§ ν•˜μ΄λΈŒλ¦¬λ“œ μ°¨λŸ‰ μ •λΉ„ 검색 μ‹œμŠ€ν…œ</h1>', unsafe_allow_html=True)

    # Sidebar: search parameters and target-system selection
    with st.sidebar:
        st.header("βš™οΈ μ„€μ •")

        st.subheader("검색 μ„€μ •")
        top_k = st.slider("결과 개수", min_value=1, max_value=10, value=5)
        alpha = st.slider("벑터 검색 κ°€μ€‘μΉ˜", min_value=0.0, max_value=1.0, value=0.7, step=0.1)

        st.info(f"벑터 검색: {alpha:.1f}, ν‚€μ›Œλ“œ 검색: {1-alpha:.1f}")

        st.subheader("λŒ€μƒ μ‹œμŠ€ν…œ")
        # NOTE(review): this value only affects the very first searcher
        # construction below; changing it later has no effect because the
        # searcher is cached in session_state
        target_system = st.selectbox(
            "μ‹œμŠ€ν…œ 선택",
            ["μˆ˜λ™λ³€μ†κΈ°", "μ—”μ§„", "브레이크"],
            index=0
        )

    # Main area: initialize the searcher once per browser session
    if 'searcher' not in st.session_state:
        with st.spinner('검색 μ‹œμŠ€ν…œ μ΄ˆκΈ°ν™” 쀑...'):
            st.session_state.searcher = HybridMultiCollectionSearcher(target_system=target_system)

            # Try to load the pre-built collections from disk
            collection_path = "./saved_collections"
            success = st.session_state.searcher.load_collections(collection_path)

            if success:
                available_collections = st.session_state.searcher.list_collections()
                st.success(f"βœ… 검색 μ‹œμŠ€ν…œμ΄ μ€€λΉ„λ˜μ—ˆμŠ΅λ‹ˆλ‹€! λ‘œλ“œλœ μ»¬λ ‰μ…˜: {', '.join(available_collections)}")
            else:
                st.error("❌ μ €μž₯된 μ»¬λ ‰μ…˜μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. μ»¬λ ‰μ…˜ νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
                st.info("πŸ’‘ `saved_collections` 폴더에 미리 μƒμ„±λœ μ»¬λ ‰μ…˜ νŒŒμΌλ“€(.pkl, .index)을 μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")

    # Check which collections are available
    if 'searcher' in st.session_state:
        available_collections = st.session_state.searcher.list_collections()

        # Show the search interface only when collections exist
        if available_collections:
            # Collection selection
            st.subheader("πŸ“š 검색 λŒ€μƒ μ»¬λ ‰μ…˜")
            selected_collection = st.selectbox(
                "μ»¬λ ‰μ…˜ 선택",
                available_collections,
                help="검색할 μ»¬λ ‰μ…˜μ„ μ„ νƒν•˜μ„Έμš”"
            )

            # Search interface
            with st.container():
                st.markdown('<div class="search-container">', unsafe_allow_html=True)

                # Query input (label mojibake fixed here)
                query = st.text_input(
                    "πŸ” μ§ˆλ¬Έμ„ μž…λ ₯ν•˜μ„Έμš”",
                    placeholder="예: μˆ˜λ™λ³€μ†κΈ° νƒˆκ±°λŠ” μ–΄λ–»κ²Œ ν•˜λ‚˜μš”?",
                    help="μ°¨λŸ‰ 정비에 κ΄€ν•œ μ§ˆλ¬Έμ„ 자유둭게 μž…λ ₯ν•˜μ„Έμš”."
                )

                # Centered search button
                col1, col2, col3 = st.columns([1, 2, 1])
                with col2:
                    search_button = st.button("πŸ” κ²€μƒ‰ν•˜κΈ°", type="primary", use_container_width=True)

                st.markdown('</div>', unsafe_allow_html=True)

            # Run the search
            if search_button and query:
                with st.spinner('검색 쀑...'):
                    results = st.session_state.searcher.search_collection(
                        selected_collection,
                        query,
                        top_k=top_k,
                        alpha=alpha
                    )

                if results:
                    st.success(f"βœ… {len(results)}개의 검색 κ²°κ³Όλ₯Ό μ°Ύμ•˜μŠ΅λ‹ˆλ‹€.")

                    # Search statistics row
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.markdown('<div class="metric-card"><strong>검색 κ²°κ³Ό</strong><br>' + f'{len(results)}개</div>', unsafe_allow_html=True)
                    with col2:
                        avg_score = np.mean([r['hybrid_score'] for r in results])
                        st.markdown('<div class="metric-card"><strong>평균 점수</strong><br>' + f'{avg_score:.3f}</div>', unsafe_allow_html=True)
                    with col3:
                        max_score = max([r['hybrid_score'] for r in results])
                        st.markdown('<div class="metric-card"><strong>졜고 점수</strong><br>' + f'{max_score:.3f}</div>', unsafe_allow_html=True)
                    with col4:
                        st.markdown('<div class="metric-card"><strong>μ»¬λ ‰μ…˜</strong><br>' + f'{selected_collection}</div>', unsafe_allow_html=True)

                    st.markdown("---")

                    # Render each search result card
                    for i, result in enumerate(results, 1):
                        st.markdown('<div class="result-card">', unsafe_allow_html=True)

                        # Header: topic + hybrid score badge
                        col1, col2 = st.columns([3, 1])
                        with col1:
                            st.markdown(f"### πŸ“„ κ²°κ³Ό {i}: {result['main_topic']}")
                        with col2:
                            st.markdown(f'<span class="score-badge">점수: {result["hybrid_score"]:.3f}</span>', unsafe_allow_html=True)

                        # Metadata: content type / category path / vehicle info
                        col1, col2 = st.columns(2)
                        with col1:
                            st.markdown(f'<span class="category-badge">{result["content_type"]}</span>', unsafe_allow_html=True)
                            st.markdown(f"**경둜:** {result['category_path']}")

                        with col2:
                            if result['vehicle_info']:
                                vehicle = result['vehicle_info']
                                st.markdown(f"**μ°¨λŸ‰:** {vehicle.get('model', 'N/A')}")
                                st.markdown(f"**μ‹œμŠ€ν…œ:** {vehicle.get('system', 'N/A')}")

                        # Document content
                        st.markdown('<div class="content-text">', unsafe_allow_html=True)
                        st.markdown(f"**πŸ“‹ λ‚΄μš©:**\n\n{result['content']}")
                        st.markdown('</div>', unsafe_allow_html=True)

                        # Detailed score breakdown (expandable)
                        with st.expander("πŸ” 상세 점수 보기"):
                            score_col1, score_col2, score_col3 = st.columns(3)
                            with score_col1:
                                st.metric("벑터 점수", f"{result['dense_score']:.3f}")
                            with score_col2:
                                st.metric("ν‚€μ›Œλ“œ 점수", f"{result['sparse_score']:.3f}")
                            with score_col3:
                                st.metric("λΆ€μŠ€νŒ… 점수", f"{result['boost_score']:.3f}")

                            st.markdown(f"**처리된 쿼리:** `{result['processed_query']}`")
                            st.markdown(f"**청크 ID:** `{result['chunk_id']}`")

                        st.markdown('</div>', unsafe_allow_html=True)
                        st.markdown("---")

                else:
                    st.warning("πŸ€” 검색 κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€. λ‹€λ₯Έ ν‚€μ›Œλ“œλ‘œ κ²€μƒ‰ν•΄λ³΄μ„Έμš”.")

            elif search_button and not query:
                st.warning("⚠️ 검색어λ₯Ό μž…λ ₯ν•΄μ£Όμ„Έμš”.")

        else:
            # No collections loaded: show upload instructions
            st.warning("⚠️ λ‘œλ“œλœ μ»¬λ ‰μ…˜μ΄ μ—†μŠ΅λ‹ˆλ‹€.")
            st.markdown("""
            ### πŸ“ μ»¬λ ‰μ…˜ 파일 μ—…λ‘œλ“œ 방법

            1. **λ‘œμ»¬μ—μ„œ μ»¬λ ‰μ…˜ 생성**:
            ```python
            # 원본 μ½”λ“œ μ‚¬μš©
            searcher = HybridMultiCollectionSearcher()
            searcher.add_collection("μˆ˜λ™λ³€μ†κΈ°", metadata_dir, chunks_dir)
            searcher.save_collections("./saved_collections")
            ```

            2. **μƒμ„±λœ νŒŒμΌλ“€μ„ ν—ˆκΉ…νŽ˜μ΄μŠ€ Space에 μ—…λ‘œλ“œ**:
            - `saved_collections/` 폴더 전체λ₯Ό μ—…λ‘œλ“œ
            - 각 μ»¬λ ‰μ…˜λ³„λ‘œ `.pkl`, `.index` νŒŒμΌλ“€μ΄ 포함됨

            3. **μ•± μž¬μ‹œμž‘** ν›„ 검색 κ°€λŠ₯
            """)

    # Usage guide (only shown when collections exist and no query was entered;
    # `query` is defined because the short-circuit guarantees the text input ran)
    if 'searcher' in st.session_state and st.session_state.searcher.list_collections() and not query:
        st.markdown("### πŸ’‘ μ‚¬μš© κ°€μ΄λ“œ")

        col1, col2 = st.columns(2)
        with col1:
            st.markdown("""
            **πŸ”§ μ •λΉ„ μž‘μ—… 질문:**
            - "μˆ˜λ™λ³€μ†κΈ° νƒˆκ±°λŠ” μ–΄λ–»κ²Œ ν•˜λ‚˜μš”?"
            - "클러치 점검 방법을 μ•Œλ €μ£Όμ„Έμš”"
            - "변속기 였일 κ΅ν™˜ μ ˆμ°¨λŠ”?"
            """)

        with col2:
            st.markdown("""
            **βš™οΈ λΆ€ν’ˆ 정보 질문:**
            - "브레이크 νŒ¨λ“œ 사양은?"
            - "μ—”μ§„ 였일 μš©λŸ‰μ€ μ–Όλ§ˆμΈκ°€μš”?"
            - "타이어 곡기압 κΈ°μ€€μΉ˜λŠ”?"
            """)


if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
1
+ streamlit==1.28.1
2
+ sentence-transformers==2.2.2
3
+ faiss-cpu==1.7.4
4
+ numpy==1.24.3
5
+ pandas==2.0.3
6
+ torch==2.0.1
7
+ transformers==4.33.2
8
+ rank-bm25==0.2.2
9
+ scikit-learn==1.3.0
saved_collections/μˆ˜λ™λ³€μ†κΈ°/bm25.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1414e1588cf69e42b9d22254641dcea6c7d0e425bd4b3f7c3c3555f0963f2817
3
+ size 41734
saved_collections/μˆ˜λ™λ³€μ†κΈ°/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:845b86924ba0b695d37f1264ff67db01b16bd1f60c73a67d5f6d364faee462e2
3
+ size 204845
saved_collections/μˆ˜λ™λ³€μ†κΈ°/metadata.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b7b1bff89410720e269c322e49464b077162fe9396a9600dfa6946570f88222
3
+ size 123296
saved_collections/μ—”μ§„ 기계 μ‹œμŠ€ν…œ/bm25.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65844c84c094f82dc68d7bb0cd9afcf4df2d9b2f9ac8bf398ee20330716c3c7
3
+ size 100903
saved_collections/μ—”μ§„ 기계 μ‹œμŠ€ν…œ/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1461e04386997b0aadbde77529004029833b5e79a6bcf0d9bea221819450c44f
3
+ size 479277
saved_collections/μ—”μ§„ 기계 μ‹œμŠ€ν…œ/metadata.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07de308e2f3ea9c540cea6da9d8a9764086e9df4c1b764c79f951cc6842b507b
3
+ size 369562