Spaces:
Runtime error
Runtime error
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +129 -26
src/streamlit_app.py
CHANGED
|
@@ -8,7 +8,7 @@ from pathlib import Path
|
|
| 8 |
import logging
|
| 9 |
from sentence_transformers import SentenceTransformer
|
| 10 |
import faiss
|
| 11 |
-
import
|
| 12 |
from rank_bm25 import BM25Okapi
|
| 13 |
|
| 14 |
# 기본 로깅 설정
|
|
@@ -192,8 +192,104 @@ class HybridMultiCollectionSearcher:
|
|
| 192 |
|
| 193 |
return boost_score
|
| 194 |
|
| 195 |
-
def
|
| 196 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
save_dir = Path(save_dir)
|
| 198 |
|
| 199 |
if not save_dir.exists():
|
|
@@ -215,23 +311,26 @@ class HybridMultiCollectionSearcher:
|
|
| 215 |
|
| 216 |
faiss_index = faiss.read_index(str(faiss_path))
|
| 217 |
|
| 218 |
-
# 2. BM25
|
| 219 |
-
|
| 220 |
-
if not
|
| 221 |
-
logger.warning(f"BM25
|
| 222 |
continue
|
| 223 |
|
| 224 |
-
with open(
|
| 225 |
-
|
| 226 |
|
| 227 |
-
#
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
| 229 |
if not metadata_path.exists():
|
| 230 |
logger.warning(f"λ©νλ°μ΄ν°κ° μμ΅λλ€: {metadata_path}")
|
| 231 |
continue
|
| 232 |
|
| 233 |
-
with open(metadata_path, '
|
| 234 |
-
save_data =
|
| 235 |
|
| 236 |
# 컬렉션 복원
|
| 237 |
self.collections[collection_name] = {
|
|
@@ -362,20 +461,24 @@ def main():
|
|
| 362 |
# 검색기 초기화
|
| 363 |
if 'searcher' not in st.session_state:
|
| 364 |
with st.spinner('κ²μ μμ€ν
μ΄κΈ°ν μ€...'):
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
-
#
|
| 379 |
if 'searcher' in st.session_state:
|
| 380 |
available_collections = st.session_state.searcher.list_collections()
|
| 381 |
|
|
|
|
| 8 |
import logging
|
| 9 |
from sentence_transformers import SentenceTransformer
|
| 10 |
import faiss
|
| 11 |
+
import json
|
| 12 |
from rank_bm25 import BM25Okapi
|
| 13 |
|
| 14 |
# 기본 로깅 설정
|
|
|
|
| 192 |
|
| 193 |
return boost_score
|
| 194 |
|
| 195 |
+
def create_sample_collection(self, collection_name: str):
|
| 196 |
+
"""샘플 데이터로 컬렉션 생성"""
|
| 197 |
+
try:
|
| 198 |
+
if self.model is None:
|
| 199 |
+
self.model = self.load_model()
|
| 200 |
+
if self.model is None:
|
| 201 |
+
return False
|
| 202 |
+
|
| 203 |
+
# 샘플 데이터
|
| 204 |
+
sample_data = [
|
| 205 |
+
{
|
| 206 |
+
'chunk_id': 'sample_001',
|
| 207 |
+
'content': 'μλλ³μκΈ° νκ±° μμλ λ¨Όμ μμ§μ μ μ§νκ³ λ³μκΈ° μ€μΌμ λ°°μΆν©λλ€. ν΄λ¬μΉλ₯Ό λΆλ¦¬ν ν λ³μκΈ°λ₯Ό νκ±°ν©λλ€.',
|
| 208 |
+
'metadata': {
|
| 209 |
+
'chunk_id': 'sample_001',
|
| 210 |
+
'content_type': 'νκ±°λ°©λ²',
|
| 211 |
+
'main_topic': 'μλλ³μκΈ° νκ±°',
|
| 212 |
+
'vehicle_info': {'system': 'μλλ³μκΈ°', 'model': 'μμ΄λ‘μν°'},
|
| 213 |
+
'category_levels': ['λ³μκΈ°', 'μλλ³μκΈ°', 'νκ±°λ°©λ²'],
|
| 214 |
+
'extracted_components': ['λ³μκΈ°', 'ν΄λ¬μΉ']
|
| 215 |
+
}
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
'chunk_id': 'sample_002',
|
| 219 |
+
'content': 'μλλ³μκΈ° μ₯μ°©μ νκ±°μ μμμΌλ‘ μ§νν©λλ€. λ³μκΈ°λ₯Ό μ νν μμΉμ κ³ μ νκ³ ν΄λ¬μΉλ₯Ό μ°κ²°ν©λλ€.',
|
| 220 |
+
'metadata': {
|
| 221 |
+
'chunk_id': 'sample_002',
|
| 222 |
+
'content_type': 'μ₯μ°©λ°©λ²',
|
| 223 |
+
'main_topic': 'μλλ³μκΈ° μ₯μ°©',
|
| 224 |
+
'vehicle_info': {'system': 'μλλ³μκΈ°', 'model': 'μμ΄λ‘μν°'},
|
| 225 |
+
'category_levels': ['λ³μκΈ°', 'μλλ³μκΈ°', 'μ₯μ°©λ°©λ²'],
|
| 226 |
+
'extracted_components': ['λ³μκΈ°', 'ν΄λ¬μΉ']
|
| 227 |
+
}
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
'chunk_id': 'sample_003',
|
| 231 |
+
'content': 'λ³μκΈ° μ€μΌ μ κ² μ μ€μΌ λ 벨과 μ€μΌ μνλ₯Ό νμΈν©λλ€. κ·μ λμ 2.5Lμ΄λ©° μ€μΌ μ¨λλ 80Β°Cμμ μΈ‘μ ν©λλ€.',
|
| 232 |
+
'metadata': {
|
| 233 |
+
'chunk_id': 'sample_003',
|
| 234 |
+
'content_type': 'μ κ²μ μ°¨',
|
| 235 |
+
'main_topic': 'μ€μΌ μ κ²',
|
| 236 |
+
'vehicle_info': {'system': 'μλλ³μκΈ°', 'model': 'μμ΄λ‘μν°'},
|
| 237 |
+
'category_levels': ['λ³μκΈ°', 'μλλ³μκΈ°', 'μ κ²μ μ°¨'],
|
| 238 |
+
'extracted_components': ['μ€μΌ']
|
| 239 |
+
}
|
| 240 |
+
}
|
| 241 |
+
]
|
| 242 |
+
|
| 243 |
+
# 검색 텍스트 생성
|
| 244 |
+
search_texts = []
|
| 245 |
+
metadata_list = []
|
| 246 |
+
content_dict = {}
|
| 247 |
+
|
| 248 |
+
for data in sample_data:
|
| 249 |
+
metadata = data['metadata']
|
| 250 |
+
content = data['content']
|
| 251 |
+
|
| 252 |
+
# 검색용 텍스트 구성
|
| 253 |
+
search_components = [
|
| 254 |
+
metadata.get('content_type', ''),
|
| 255 |
+
metadata.get('main_topic', ''),
|
| 256 |
+
' '.join(metadata.get('category_levels', [])),
|
| 257 |
+
content
|
| 258 |
+
]
|
| 259 |
+
|
| 260 |
+
search_text = self._extract_nouns_and_verbs(' '.join(search_components))
|
| 261 |
+
search_texts.append(search_text)
|
| 262 |
+
metadata_list.append(metadata)
|
| 263 |
+
content_dict[metadata['chunk_id']] = content
|
| 264 |
+
|
| 265 |
+
# 벡터 임베딩 생성
|
| 266 |
+
embeddings = self.model.encode(search_texts, show_progress_bar=False)
|
| 267 |
+
|
| 268 |
+
# FAISS 인덱스 생성
|
| 269 |
+
embedding_dim = embeddings.shape[1]
|
| 270 |
+
faiss.normalize_L2(embeddings)
|
| 271 |
+
faiss_index = faiss.IndexFlatIP(embedding_dim)
|
| 272 |
+
faiss_index.add(embeddings.astype(np.float32))
|
| 273 |
+
|
| 274 |
+
# BM25 인덱스 생성
|
| 275 |
+
tokenized_docs = [text.split() for text in search_texts]
|
| 276 |
+
bm25_index = BM25Okapi(tokenized_docs)
|
| 277 |
+
|
| 278 |
+
# 컬렉션 저장
|
| 279 |
+
self.collections[collection_name] = {
|
| 280 |
+
'metadata_list': metadata_list,
|
| 281 |
+
'content_dict': content_dict,
|
| 282 |
+
'search_texts': search_texts,
|
| 283 |
+
'faiss_index': faiss_index
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
self.bm25_indexes[collection_name] = bm25_index
|
| 287 |
+
return True
|
| 288 |
+
|
| 289 |
+
except Exception as e:
|
| 290 |
+
logger.error(f"μν 컬λ μ
μμ± μ€ν¨: {e}")
|
| 291 |
+
return False
|
| 292 |
+
"""저장된 하이브리드 컬렉션들 로드 (FAISS + BM25) - pickle 없이"""
|
| 293 |
save_dir = Path(save_dir)
|
| 294 |
|
| 295 |
if not save_dir.exists():
|
|
|
|
| 311 |
|
| 312 |
faiss_index = faiss.read_index(str(faiss_path))
|
| 313 |
|
| 314 |
+
# 2. BM25 토큰 데이터 로드 (JSON)
|
| 315 |
+
bm25_tokens_path = collection_dir / "bm25_tokens.json"
|
| 316 |
+
if not bm25_tokens_path.exists():
|
| 317 |
+
logger.warning(f"BM25 ν ν° λ°μ΄ν°κ° μμ΅λλ€: {bm25_tokens_path}")
|
| 318 |
continue
|
| 319 |
|
| 320 |
+
with open(bm25_tokens_path, 'r', encoding='utf-8') as f:
|
| 321 |
+
tokenized_docs = json.load(f)
|
| 322 |
|
| 323 |
+
# BM25 인덱스 재생성
|
| 324 |
+
bm25_index = BM25Okapi(tokenized_docs)
|
| 325 |
+
|
| 326 |
+
# 3. 메타데이터 로드 (JSON)
|
| 327 |
+
metadata_path = collection_dir / "metadata.json"
|
| 328 |
if not metadata_path.exists():
|
| 329 |
logger.warning(f"λ©νλ°μ΄ν°κ° μμ΅λλ€: {metadata_path}")
|
| 330 |
continue
|
| 331 |
|
| 332 |
+
with open(metadata_path, 'r', encoding='utf-8') as f:
|
| 333 |
+
save_data = json.load(f)
|
| 334 |
|
| 335 |
# 컬렉션 복원
|
| 336 |
self.collections[collection_name] = {
|
|
|
|
| 461 |
# 검색기 초기화
|
| 462 |
if 'searcher' not in st.session_state:
|
| 463 |
with st.spinner('κ²μ μμ€ν
μ΄κΈ°ν μ€...'):
|
| 464 |
+
try:
|
| 465 |
+
st.session_state.searcher = HybridMultiCollectionSearcher(target_system=target_system)
|
| 466 |
+
|
| 467 |
+
# 먼저 샘플 데이터로 테스트
|
| 468 |
+
st.info("π§ͺ μν λ°μ΄ν°λ‘ ν
μ€νΈ μ€...")
|
| 469 |
+
success = st.session_state.searcher.create_sample_collection("ν
μ€νΈ")
|
| 470 |
+
|
| 471 |
+
if success:
|
| 472 |
+
st.success("β
μν κ²μ μμ€ν
μ΄ μ€λΉλμμ΅λλ€!")
|
| 473 |
+
st.info("π‘ μ€μ 컬λ μ
μ μ¬μ©νλ €λ©΄ `saved_collections` ν΄λλ₯Ό μ
λ‘λνμΈμ.")
|
| 474 |
+
else:
|
| 475 |
+
st.error("β μμ€ν
μ΄κΈ°νμ μ€ν¨νμ΅λλ€.")
|
| 476 |
+
|
| 477 |
+
except Exception as e:
|
| 478 |
+
st.error(f"β μ΄κΈ°ν μ€λ₯: {str(e)}")
|
| 479 |
+
st.info("π§ λ¬Έμ λ₯Ό ν΄κ²°νλ μ€μ
λλ€...")
|
| 480 |
|
| 481 |
+
# 검색기가 있는 경우에만 진행
|
| 482 |
if 'searcher' in st.session_state:
|
| 483 |
available_collections = st.session_state.searcher.list_collections()
|
| 484 |
|