Spaces:
Sleeping
Sleeping
| """ | |
| Quick vector DB creation script for testing. | |
| Builds the vector DB from a few simple finance-related sample documents (takes under 1 minute). | |
| """ | |
| import sys | |
| import os | |
| from pathlib import Path | |
| # Add the project root (the parent of this script's directory) to sys.path | |
| # so that the `services` imports below can be resolved when run as a script. | |
| project_root = Path(__file__).parent.parent | |
| sys.path.insert(0, str(project_root)) | |
| from services.vector_store import VectorStore | |
| from services.embedder import Embedder | |
| from loguru import logger | |
| # Sample finance/economics documents (Korean text). | |
| # Each entry has a "text" passage and a "metadata" dict with "source", "page" and "title" keys. | |
| # NOTE(review): the Korean literals look mis-encoded (mojibake) in this copy - verify the file is stored as UTF-8. | |
| SAMPLE_DOCUMENTS = [ | |
| { | |
| "text": "ν¬νΈν΄λ¦¬μ€ λ€κ°νλ ν¬μ μνμ λΆμ°μν€λ μ€μν μ λ΅μ λλ€. μλ‘ λ€λ₯Έ μμ° ν΄λμ€μ ν¬μν¨μΌλ‘μ¨ νΉμ μμ°μ νλ½μ΄ μ 체 ν¬νΈν΄λ¦¬μ€μ λ―ΈμΉλ μν₯μ μ΅μνν μ μμ΅λλ€.", | |
| "metadata": {"source": "portfolio_theory.pdf", "page": 1, "title": "ν¬νΈν΄λ¦¬μ€ μ΄λ‘ "} | |
| }, | |
| { | |
| "text": "νλ ν¬νΈν΄λ¦¬μ€ μ΄λ‘ (MPT)μ λ°λ₯΄λ©΄, ν¨μ¨μ ν¬μμ μ μ£Όμ΄μ§ μν μμ€μμ μ΅λ μμ΅μ μ 곡νλ ν¬νΈν΄λ¦¬μ€λ€μ μ§ν©μ λλ€. λ§μ½μμΈ λ μ΄ μ΄λ‘ μΌλ‘ λ Έλ²¨ κ²½μ νμμ μμνμ΅λλ€.", | |
| "metadata": {"source": "portfolio_theory.pdf", "page": 2, "title": "ν¬νΈν΄λ¦¬μ€ μ΄λ‘ "} | |
| }, | |
| { | |
| "text": "κΈμ΅μκΈ°λ λμ²΄λ‘ κ³Όλν λ λ²λ¦¬μ§, μμ° κ°κ²© κ±°ν, μμ€ν μ 리μ€ν¬μ μΆμ μΌλ‘ μΈν΄ λ°μν©λλ€. 2008λ κΈλ‘λ² κΈμ΅μκΈ°λ μλΈνλΌμ λͺ¨κΈ°μ§ μμ₯μ λΆκ΄΄μμ μμλμμ΅λλ€.", | |
| "metadata": {"source": "financial_crisis.pdf", "page": 1, "title": "κΈμ΅μκΈ°μ μμΈ"} | |
| }, | |
| { | |
| "text": "κΈμ΅μκΈ° μλ°©μ μν΄μλ 건μ ν κ·μ , μ μ ν μλ³Έ μꡬμ¬ν, κ·Έλ¦¬κ³ μμ€ν μ 리μ€ν¬ λͺ¨λν°λ§μ΄ νμμ μ λλ€. λ°μ €IIIλ μ΄λ¬ν κ·μ κ°νμ λνμ μΈ μμ λλ€.", | |
| "metadata": {"source": "financial_crisis.pdf", "page": 2, "title": "κΈμ΅μκΈ°μ μμΈ"} | |
| }, | |
| { | |
| "text": "ν¨μ¨μ μμ₯ κ°μ€(EMH)μ μ£Όκ°κ° λͺ¨λ κ°μ© μ 보λ₯Ό λ°μνλ€κ³ μ£Όμ₯ν©λλ€. μ½ν, μ€κ°ν, κ°ν ν¨μ¨μ±μ μΈ κ°μ§ ννλ‘ κ΅¬λΆλ©λλ€.", | |
| "metadata": {"source": "market_efficiency.pdf", "page": 1, "title": "μμ₯ ν¨μ¨μ±"} | |
| }, | |
| { | |
| "text": "νλμ¬λ¬΄νμ ν¬μμλ€μ λΉν©λ¦¬μ νλμ΄ μμ₯μ λ―ΈμΉλ μν₯μ μ°κ΅¬ν©λλ€. κ³Όμλ°μ, κ³Όμλ°μ, κ΅°μ§νλ λ±μ΄ λνμ μΈ νμμ λλ€.", | |
| "metadata": {"source": "behavioral_finance.pdf", "page": 1, "title": "νλμ¬λ¬΄ν"} | |
| }, | |
| { | |
| "text": "μμ°κ°κ²©κ²°μ λͺ¨ν(CAPM)μ κΈ°λμμ΅λ₯ μ΄ μμ₯ μν ν리미μκ³Ό λ² νμ κ³±μ 무μν μμ΅λ₯ μ λν κ°μ΄λΌκ³ μ€λͺ ν©λλ€. λ² νλ μμ₯ λλΉ μμ°μ λ³λμ±μ λνλ λλ€.", | |
| "metadata": {"source": "asset_pricing.pdf", "page": 1, "title": "μμ°κ°κ²©κ²°μ "} | |
| }, | |
| { | |
| "text": "μ΅μ κ°κ²©κ²°μ μλ λΈλ-μμ¦ λͺ¨νμ΄ λ리 μ¬μ©λ©λλ€. μ΄ λͺ¨νμ μ£Όκ°μ λ‘κ·Έμ κ·λΆν¬ κ°μ νμ μ λ½ν μ΅μ μ μ΄λ‘ κ°κ²©μ κ³μ°ν©λλ€.", | |
| "metadata": {"source": "derivatives.pdf", "page": 1, "title": "νμμν"} | |
| }, | |
| { | |
| "text": "μν κ΄λ¦¬μμ VaR(Value at Risk)λ νΉμ μ λ’°μμ€κ³Ό κΈ°κ°μμ μμλλ μ΅λ μμ€μ μΈ‘μ ν©λλ€. κΈμ΅κΈ°κ΄μ 리μ€ν¬ κ΄λ¦¬μ λ리 μ¬μ©λ©λλ€.", | |
| "metadata": {"source": "risk_management.pdf", "page": 1, "title": "μν κ΄λ¦¬"} | |
| }, | |
| { | |
| "text": "μ€μμνμ ν΅νμ μ± μ κΈλ¦¬ μ‘°μ , 곡κ°μμ₯μ΄μ, μ§κΈμ€λΉμ¨ λ³κ²½ λ±μ ν΅ν΄ κ²½μ μμ νλ₯Ό μΆκ΅¬ν©λλ€. μΈνλ μ΄μ νκ²ν μ΄ μ£Όμ μ μ± νλ μμν¬μ λλ€.", | |
| "metadata": {"source": "monetary_policy.pdf", "page": 1, "title": "ν΅νμ μ± "} | |
| }, | |
| { | |
| "text": "ESG ν¬μλ νκ²½(Environmental), μ¬ν(Social), μ§λ°°κ΅¬μ‘°(Governance) μμλ₯Ό κ³ λ €νλ ν¬μ μ λ΅μ λλ€. μ§μκ°λ₯ν ν¬μμ μ€μμ±μ΄ μ¦κ°νκ³ μμ΅λλ€.", | |
| "metadata": {"source": "esg_investing.pdf", "page": 1, "title": "ESG ν¬μ"} | |
| }, | |
| { | |
| "text": "μνΈννλ λΈλ‘μ²΄μΈ κΈ°μ μ κΈ°λ°μΌλ‘ ν λμ§νΈ μμ°μ λλ€. λΉνΈμ½μΈ, μ΄λ리μ λ±μ΄ λνμ μ΄λ©°, νμ€μν κΈμ΅(DeFi)μ κΈ°λ°μ΄ λκ³ μμ΅λλ€.", | |
| "metadata": {"source": "cryptocurrency.pdf", "page": 1, "title": "μνΈνν"} | |
| }, | |
| { | |
| "text": "μ μ©νκ°λ μ±λ¬΄μμ μ±λ¬΄ μ΄ν λ₯λ ₯μ νκ°ν©λλ€. μ μ©λ±κΈμ ν¬μ μμ¬κ²°μ κ³Ό κΈλ¦¬ κ²°μ μ μ€μν μν₯μ λ―ΈμΉ©λλ€.", | |
| "metadata": {"source": "credit_rating.pdf", "page": 1, "title": "μ μ©νκ°"} | |
| }, | |
| { | |
| "text": "μ£Όμμμ₯μ μ΄μνμ(anomaly)μλ κ·λͺ¨ν¨κ³Ό, κ°μΉν¨κ³Ό, λͺ¨λ©ν ν¨κ³Ό λ±μ΄ μμ΅λλ€. μ΄λ ν¨μ¨μ μμ₯ κ°μ€μ λν λ°λ‘λ‘ μ μλ©λλ€.", | |
| "metadata": {"source": "market_anomalies.pdf", "page": 1, "title": "μμ₯ μ΄μνμ"} | |
| }, | |
| { | |
| "text": "ꡬ쑰ν κΈμ΅μ μμ°μ λνμ¦κΆ(ABS), λ΄λ³΄λΆμ±κΆ(CDO) λ± λ³΅μ‘ν κΈμ΅μνμ ν¬ν¨ν©λλ€. 2008λ κΈμ΅μκΈ°μμ μ€μν μν μ νμ΅λλ€.", | |
| "metadata": {"source": "structured_finance.pdf", "page": 1, "title": "ꡬ쑰ν κΈμ΅"} | |
| }, | |
| ] | |
def _build_chunks(documents):
    """Convert sample documents into the chunk dicts given to the vector store.

    Args:
        documents: Sequence of dicts shaped like the ``SAMPLE_DOCUMENTS``
            entries: a ``"text"`` value plus a ``"metadata"`` dict that
            contains at least a ``"source"`` key.

    Returns:
        A list with one chunk dict per document, in input order.
    """
    return [
        {
            'text': doc['text'],
            'source_filename': doc['metadata']['source'],
            'source_filepath': f"test_data/{doc['metadata']['source']}",
            'chunk_id': index,  # the list position serves as the unique ID
            'total_chunks': 1,
            'metadata': doc['metadata'],
            'page_count': doc['metadata'].get('page', 1),
        }
        for index, doc in enumerate(documents)
    ]


def _run_test_search(embedder, vector_store):
    """Run one sample query against the store and log the top matches.

    Args:
        embedder: Object providing ``embed_batch(list_of_texts)``.
        vector_store: Object providing ``search(query_embedding=..., top_k=...)``.

    Raises:
        Whatever the embedder or the store raises; the caller treats this
        whole step as optional.
    """
    logger.info("\nπ§ͺ ν μ€νΈ κ²μ μ€ν μ€...")
    test_query = "ν¬νΈν΄λ¦¬μ€ λ€κ°νλ 무μμΈκ°μ?"
    query_embedding = embedder.embed_batch([test_query])[0]
    results = vector_store.search(
        query_embedding=query_embedding,
        top_k=3,
    )
    logger.info(f"μ§λ¬Έ: {test_query}")
    logger.info(f"κ²μ κ²°κ³Ό: {len(results['documents'])}κ° λ¬Έμ λ°κ²¬")

    # NOTE(review): assumes 'documents', 'distances' and 'metadatas' are flat,
    # parallel sequences - confirm against VectorStore.search.
    metadatas = results['metadatas']
    pairs = zip(results['documents'], results['distances'])
    for rank, (_, distance) in enumerate(pairs, 1):
        # Fall back to an empty dict when fewer metadata entries came back.
        metadata = metadatas[rank - 1] if rank - 1 < len(metadatas) else {}
        title = metadata.get('title', 'Unknown')
        logger.info(f" [{rank}] {title} (거리: {distance:.4f})")


def main() -> None:
    """Quickly build the test vector DB from ``SAMPLE_DOCUMENTS``.

    Steps: create the embedder, create a ``VectorStore`` persisted under
    ``<project_root>/data/chroma_db`` (collection ``financial_papers``), embed
    the sample texts, add them to the store, log the resulting document
    count, and finally run one best-effort test search.

    A failure of the optional test search is only logged as a warning. Any
    other exception is logged together with its traceback and the process
    exits with status 1.
    """
    banner = "=" * 80
    logger.info(banner)
    logger.info("ν μ€νΈμ© λ²‘ν° DB μμ± μμ...")
    logger.info(banner)
    try:
        # 1. Initialize the embedder (uses the free sentence-transformers model).
        logger.info("1οΈβ£ Embedder μ΄κΈ°ν μ€...")
        embedder = Embedder(
            model_type="sentence-transformers",
            model_name="all-MiniLM-L6-v2",
        )
        logger.info(f"β Embedder μ€λΉ μλ£ ({embedder.get_embedding_dimension()}μ°¨μ)")

        # 2. Initialize the vector store.
        logger.info("2οΈβ£ Vector Store μ΄κΈ°ν μ€...")
        persist_dir = project_root / "data" / "chroma_db"
        persist_dir.mkdir(parents=True, exist_ok=True)
        vector_store = VectorStore(
            persist_directory=str(persist_dir),
            collection_name="financial_papers",
        )
        logger.info("β Vector Store μ€λΉ μλ£")

        # 3. Create embeddings for the sample documents.
        logger.info(f"3οΈβ£ {len(SAMPLE_DOCUMENTS)}κ° μν λ¬Έμ μλ² λ© μ€...")
        texts = [doc["text"] for doc in SAMPLE_DOCUMENTS]
        embeddings = embedder.embed_batch(texts)
        logger.info(f"β μλ² λ© μμ± μλ£ ({len(embeddings)}κ°)")
        # Fail fast instead of storing chunks paired with the wrong vectors.
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Expected {len(texts)} embeddings, got {len(embeddings)}"
            )

        # 4. Add to the vector store (converted to the chunks format).
        logger.info("4οΈβ£ Vector Storeμ λ¬Έμ μΆκ° μ€...")
        vector_store.add_documents(
            chunks=_build_chunks(SAMPLE_DOCUMENTS),
            embeddings=embeddings,
        )
        logger.info("β λ¬Έμ μΆκ° μλ£")

        # 5. Verify the created DB.
        logger.info("5οΈβ£ μμ±λ DB κ²μ¦ μ€...")
        count = vector_store.collection.count()
        logger.info(banner)
        logger.info(f"β¨ ν μ€νΈ λ²‘ν° DB μμ± μλ£!")
        logger.info(f"π μ΄ λ¬Έμ: {count}κ°")
        logger.info(f"π μμΉ: {persist_dir}")
        logger.info(f"π 컬λ μ : financial_papers")
        logger.info(banner)

        # 6. Simple test search (optional): a failure here must not abort.
        try:
            _run_test_search(embedder, vector_store)
        except Exception as e:
            logger.warning(f"ν μ€νΈ κ²μ μ€ν¨ (무μ κ°λ₯): {str(e)}")

        logger.info("\nβ DB μμ± μλ£! λ°±μλ μλ²λ₯Ό μ€νν μ€λΉκ° λμμ΅λλ€.")
    except Exception as e:
        # logger.exception attaches the active traceback to the log record, so
        # it goes through the logger rather than a separate print to stderr.
        logger.exception(f"β μ€λ₯ λ°μ: {str(e)}")
        sys.exit(1)
| # Script entry point: run main() only when this file is executed directly. | |
| if __name__ == "__main__": | |
| main() | |