AdSafeLight / backend /scripts /build_vector_db.py
Gae Zang
feat: setup Docker and CI/CD deploy pipeline for Hugging Face Space
b0fb923
Raw
History Blame Contribute Delete
3.41 kB
import os
import json
from pathlib import Path
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import FakeEmbeddings
# Load environment variables from the .env file located two directory levels
# above this script file. This runs at import time, so the variables are
# available before build_db() reads OPENAI_API_KEY.
ROOT_DIR = Path(__file__).resolve().parent.parent
load_dotenv(ROOT_DIR / ".env")
# Input JSON files and the output directory for the FAISS index; all of them
# live under ROOT_DIR / "data".
RAW_LAWS_PATH = ROOT_DIR / "data" / "raw_laws.json"
RAW_CASES_PATH = ROOT_DIR / "data" / "raw_cases.json"
FAISS_INDEX_DIR = ROOT_DIR / "data" / "faiss_index"
def load_data():
    """Read the raw law and case JSON files from disk.

    Returns:
        A ``(laws, cases)`` tuple holding the parsed contents of
        ``RAW_LAWS_PATH`` and ``RAW_CASES_PATH`` respectively. The laws file
        is read first.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """

    def _read_json(path):
        # Both inputs are decoded as UTF-8 text.
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    return _read_json(RAW_LAWS_PATH), _read_json(RAW_CASES_PATH)
def create_documents(laws, cases):
    """Turn raw law and case records into ``Document`` objects for indexing.

    Every record becomes one ``Document`` whose ``page_content`` is a
    newline-separated block of labelled fields ending with the record's
    comma-joined keywords, and whose ``metadata`` carries a ``"type"`` tag
    (``"law"`` or ``"case"``) plus the record's identifying fields.

    Args:
        laws: Iterable of mappings with the keys ``law_name``, ``article_no``,
            ``title``, ``content`` and ``keywords``; ``paragraph_no`` is
            optional and falls back to an empty string.
        cases: Iterable of mappings with the keys ``case_title``,
            ``violated_law``, ``sanction_details``, ``facts``, ``reasoning``
            and ``keywords``.

    Returns:
        A list with all law documents (in input order) followed by all case
        documents (in input order).

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # NOTE(review): the label text inside the f-strings below looks
    # mis-encoded in this copy of the file; it is reproduced unchanged here.
    # Confirm the on-disk encoding before editing these literals.

    # 1. Law records: searchable text is the article fields plus keywords.
    law_docs = [
        Document(
            page_content=(
                f"๋ฒ•๋ น๋ช…: {law['law_name']}\n"
                f"์กฐํ•ญ: {law['article_no']} {law.get('paragraph_no', '')}\n"
                f"์ œ๋ชฉ: {law['title']}\n"
                f"๋‚ด์šฉ: {law['content']}\n"
                f"ํ‚ค์›Œ๋“œ: {', '.join(law['keywords'])}"
            ),
            metadata={
                "type": "law",
                "law_name": law["law_name"],
                "article_no": law["article_no"],
                "paragraph_no": law.get("paragraph_no", ""),
                "title": law["title"],
            },
        )
        for law in laws
    ]

    # 2. Case records: searchable text is the title, violated law, sanction,
    #    facts, reasoning and keywords.
    case_docs = [
        Document(
            page_content=(
                f"๊ณต์ •์œ„ ์‚ฌ๋ก€: {case['case_title']}\n"
                f"์œ„๋ฐ˜ ๋ฒ•๋ น: {case['violated_law']}\n"
                f"์กฐ์น˜ ๋‚ด์šฉ: {case['sanction_details']}\n"
                f"์‚ฌ์‹ค ๊ด€๊ณ„: {case['facts']}\n"
                f"ํŒ๋‹จ ์ด์œ : {case['reasoning']}\n"
                f"ํ‚ค์›Œ๋“œ: {', '.join(case['keywords'])}"
            ),
            metadata={
                "type": "case",
                "case_title": case["case_title"],
                "violated_law": case["violated_law"],
                "sanction_details": case["sanction_details"],
            },
        )
        for case in cases
    ]

    return law_docs + case_docs
def build_db():
    """Build a FAISS index from the raw law/case data and save it locally.

    Steps:
      1. Load the raw records with ``load_data`` and convert them with
         ``create_documents``.
      2. Choose the embedding backend: ``OpenAIEmbeddings`` with the
         ``text-embedding-3-small`` model when ``OPENAI_API_KEY`` holds a
         usable value, otherwise ``FakeEmbeddings(size=1536)`` as a testing
         stand-in (a warning is printed in that case).
      3. Build the index with ``FAISS.from_documents`` and write it to
         ``FAISS_INDEX_DIR``, creating the directory if needed.

    Progress is reported with ``print``; nothing is returned.
    """
    print("[Info] Loading data...")
    laws, cases = load_data()
    docs = create_documents(laws, cases)
    print(f"[Info] Loaded {len(docs)} documents (Laws: {len(laws)}, Cases: {len(cases)})")

    api_key = os.getenv("OPENAI_API_KEY")
    # A usable key is set, is not whitespace-only, and is not the template
    # placeholder value; anything else falls back to fake embeddings.
    has_usable_key = bool(
        api_key
        and "your_openai_api_key" not in api_key
        and api_key.strip() != ""
    )

    if has_usable_key:
        print("[Info] Using OpenAI text-embedding-3-small model...")
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    else:
        print("[Warning] OPENAI_API_KEY is not set or is using placeholder.")
        print("[Warning] Using FakeEmbeddings for testing to build FAISS index.")
        embeddings = FakeEmbeddings(size=1536)

    print("[Info] Building FAISS Vector DB...")
    db = FAISS.from_documents(docs, embeddings)

    # Make sure the target directory exists, then persist the index into it.
    FAISS_INDEX_DIR.mkdir(parents=True, exist_ok=True)
    db.save_local(str(FAISS_INDEX_DIR))
    print(f"[Success] FAISS index saved successfully to: {FAISS_INDEX_DIR}")
# Build the index only when this file is executed as a script; importing the
# module does not trigger a build.
if __name__ == "__main__":
    build_db()