Spaces:
Paused
Paused
| import os | |
| import json | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| from langchain_core.documents import Document | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.embeddings import FakeEmbeddings | |
# Project root: two directory levels above this file's resolved location.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Pull environment variables (e.g. OPENAI_API_KEY) from the project's .env file.
load_dotenv(ROOT_DIR.joinpath(".env"))

# Raw JSON inputs and the output directory for the FAISS index.
RAW_LAWS_PATH = ROOT_DIR.joinpath("data", "raw_laws.json")
RAW_CASES_PATH = ROOT_DIR.joinpath("data", "raw_cases.json")
FAISS_INDEX_DIR = ROOT_DIR.joinpath("data", "faiss_index")
def load_data():
    """Read and parse the raw law and case JSON files.

    Returns:
        A ``(laws, cases)`` tuple with the parsed contents of
        ``RAW_LAWS_PATH`` and ``RAW_CASES_PATH``, in that order.
    """
    # Both files are decoded as UTF-8; laws are read before cases.
    laws, cases = (
        json.loads(source.read_text(encoding="utf-8"))
        for source in (RAW_LAWS_PATH, RAW_CASES_PATH)
    )
    return laws, cases
def create_documents(laws, cases):
    """Convert raw law and case records into ``Document`` objects.

    Each record becomes one document. Its ``page_content`` is a labelled,
    newline-separated text block used for retrieval; its ``metadata`` keeps
    the identifying fields. Field labels are written in Korean (English
    glosses are given in the comments below).

    Args:
        laws: Iterable of dicts with keys ``law_name``, ``article_no``,
            ``title``, ``content``, ``keywords`` (iterable of strings) and,
            optionally, ``paragraph_no``.
        cases: Iterable of dicts with keys ``case_title``, ``violated_law``,
            ``sanction_details``, ``facts``, ``reasoning`` and ``keywords``
            (iterable of strings).

    Returns:
        A list of ``Document`` objects: one per law, followed by one per case.

    Raises:
        KeyError: If a record is missing one of the required keys.
    """
    documents = []

    # 1. Build one passage per law record.
    for law in laws:
        # paragraph_no is the only optional field; it defaults to "".
        paragraph_no = law.get("paragraph_no", "")

        # Search text: article content plus related keywords.
        page_content = "\n".join(
            [
                f"법령명: {law['law_name']}",  # law name
                f"조항: {law['article_no']} {paragraph_no}",  # article / paragraph
                f"제목: {law['title']}",  # title
                f"내용: {law['content']}",  # content
                f"키워드: {', '.join(law['keywords'])}",  # keywords
            ]
        )
        metadata = {
            "type": "law",
            "law_name": law["law_name"],
            "article_no": law["article_no"],
            "paragraph_no": paragraph_no,
            "title": law["title"],
        }
        documents.append(Document(page_content=page_content, metadata=metadata))

    # 2. Build one passage per similar-case record.
    for case in cases:
        # Search text: case title, facts, reasoning and keywords.
        page_content = "\n".join(
            [
                f"공정위 사례: {case['case_title']}",  # fair-trade-commission case
                f"위반 법령: {case['violated_law']}",  # violated law
                f"조치 내용: {case['sanction_details']}",  # sanction details
                f"사실 관계: {case['facts']}",  # facts
                f"판단 이유: {case['reasoning']}",  # reasoning
                f"키워드: {', '.join(case['keywords'])}",  # keywords
            ]
        )
        metadata = {
            "type": "case",
            "case_title": case["case_title"],
            "violated_law": case["violated_law"],
            "sanction_details": case["sanction_details"],
        }
        documents.append(Document(page_content=page_content, metadata=metadata))

    return documents
def build_db():
    """Build the FAISS vector index from the raw data and save it to disk.

    Loads the laws and cases, converts them to documents, embeds them and
    writes the resulting index to ``FAISS_INDEX_DIR``. ``FakeEmbeddings`` is
    used instead of OpenAI embeddings when ``OPENAI_API_KEY`` is unset,
    blank, or still contains the ``your_openai_api_key`` placeholder.
    """
    print("[Info] Loading data...")
    laws, cases = load_data()
    documents = create_documents(laws, cases)
    print(f"[Info] Loaded {len(documents)} documents (Laws: {len(laws)}, Cases: {len(cases)})")

    api_key = os.getenv("OPENAI_API_KEY")
    # A key is usable only if it is present, non-blank and not the placeholder.
    has_usable_key = (
        bool(api_key)
        and "your_openai_api_key" not in api_key
        and api_key.strip() != ""
    )

    if has_usable_key:
        print("[Info] Using OpenAI text-embedding-3-small model...")
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    else:
        # Fall back to fake 1536-dimensional embeddings so the index can
        # still be built for testing without a real key.
        print("[Warning] OPENAI_API_KEY is not set or is using placeholder.")
        print("[Warning] Using FakeEmbeddings for testing to build FAISS index.")
        embeddings = FakeEmbeddings(size=1536)

    print("[Info] Building FAISS Vector DB...")
    vector_store = FAISS.from_documents(documents, embeddings)

    # Make sure the target directory exists, then save the index into it.
    FAISS_INDEX_DIR.mkdir(parents=True, exist_ok=True)
    vector_store.save_local(str(FAISS_INDEX_DIR))
    print(f"[Success] FAISS index saved successfully to: {FAISS_INDEX_DIR}")
# Script entry point: build the index only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    build_db()