nexus-ai-api / scripts /seed_data.py
github-actions[bot]
Auto Deploy from Monorepo: Merge pull request #92 from human13th2team/dev
1e1f1bf
import os, uuid, json, time
import redis as redis_lib
import pandas as pd
import anthropic
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
from pathlib import Path
load_dotenv(Path(__file__).parent.parent / ".env")
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
AUTOCOMPLETE_KEY = "equipment:autocomplete"
BASE_DIR = Path(__file__).parent
EXCEL_EQUIPMENT = BASE_DIR / "μŒμ‹_μ„€λΉ„.xlsx"
EXCEL_TYPES = BASE_DIR / "μŒμ‹μ—…μ’…_λΆ„λ₯˜.xlsx"
CSV_SURVIVAL = BASE_DIR / "μ˜μ„Έ_μžμ˜μ—…_μ‹ μƒκΈ°μ—…μƒμ‘΄μœ¨_20260511154402.csv"
DB_URL = os.environ["DATABASE_URL"].replace("postgresql+asyncpg://", "postgresql://")
API_KEY = os.environ["ANTHROPIC_API_KEY"]
KOSIS_CATEGORIES = [
"ν•œμ‹ 일반 μŒμ‹μ μ—…", "ν•œμ‹ λ©΄ μš”λ¦¬ 전문점", "ν•œμ‹ 윑λ₯˜ μš”λ¦¬ 전문점",
"ν•œμ‹ ν•΄μ‚°λ¬Ό μš”λ¦¬ 전문점", "쀑식 μŒμ‹μ μ—…", "일식 μŒμ‹μ μ—…",
"μ„œμ–‘μ‹ μŒμ‹μ μ—…", "기타 외ꡭ식 μŒμ‹μ μ—…", "μ œκ³Όμ μ—…",
"ν”Όμž 햄버거 μƒŒλ“œμœ„μΉ˜ 및 μœ μ‚¬ μŒμ‹μ μ—…", "μΉ˜ν‚¨ 전문점",
"κΉ€λ°₯ 및 기타 간이 μŒμ‹μ μ—…", "간이 μŒμ‹ 포μž₯ 판맀 전문점",
"생λ§₯μ£Ό 전문점", "기타 주점업", "컀피 전문점", "기타 λΉ„μ•Œμ½”μ˜¬ μŒλ£Œμ μ—…",
]
# λ‹¨λž€μ£Όμ μ€ μ„œλΉ„μŠ€ 제곡 μ•ˆν•¨ -> μœ„λ½ λΆ„λ₯˜ 제거
BUILDING_USE = {
"1μ’…κ·Όλ¦°": ["제과점", "카페", "λ–‘μ§‘", "λ””μ €νŠΈ 전문점", "베이컀리 카페", "브런치 카페",
"ν† μŠ€νŠΈκ°€κ²Œ", "포μž₯마차", "μ£½μ§‘"],
"2μ’…κ·Όλ¦°": ["고기ꡬ이 전문점", "μƒ€λΈŒμƒ€λΈŒμ§‘", "ν•œμ •μ‹μ§‘", "쀑ꡭ집", "일식집",
"λ™λ‚¨μ•„μ‹œμ•„ 식당", "인도 식당", "νšŸμ§‘", "λ·”νŽ˜", "뢄식집", "νŒ¨μŠ€νŠΈν‘Έλ“œ",
"μΉ˜ν‚¨ 전문점", "ν”Όμž 전문점", "햄버거 전문점", "κ΅­λ°₯μ§‘ (λŒ€ν˜•μ†₯, μž₯μ‹œκ°„ 윑수λ₯˜)",
"κ΅­μˆ˜μ§‘", "λ§Œλ‘ 찐빡집", "κ΅­λ¬Όμš”λ¦¬ 전문점", "μ°œλ‹­ 전문점", "μ–‘κΌ¬μΉ˜ 전문점",
"λ§ˆλΌνƒ•κ°€κ²Œ", "포케 전문점", "νŒŒμŠ€νƒ€ 전문점", "λ„μ‹œλ½ 전문점",
"쑱발, 보쌈 전문점", "κ³±μ°½, 막창 전문점", "λ‹­λ°œ 전문점", "λ¬΄ν•œλ¦¬ν•„ κ³ κΈ°μ§‘",
"꼬치 전문점 ", "μ΄μžμΉ΄μ•Ό", "νšŒμ „μ΄ˆλ°₯ μ§‘", "μŠ€ν…Œμ΄ν¬ 전문점",
"μƒλŸ¬λ“œ 전문점", "라멘 전문점", "μ˜€λ§ˆμΉ΄μ„Έ", "κΉ€λ°₯ 전문점", "μƒŒλ“œμœ„μΉ˜ 전문점","ν˜Έν”„μ§‘", "고기주점", "와인바", "μΉ΅ν…ŒμΌλ°”"]
}
def load_equipment():
df = pd.read_excel(EXCEL_EQUIPMENT)
rows = []
for category in df.columns:
for name in df[category].dropna():
name = str(name).strip()
if name:
rows.append({"id": str(uuid.uuid4()), "name": name, "category": category})
return rows
def load_restaurant_types():
df = pd.read_excel(EXCEL_TYPES, header=None)
rows = []
for name in df[0].dropna():
name = str(name).strip()
if name:
building_use = next(
(code for code, names in BUILDING_USE.items() if name in names), "2μ’…κ·Όλ¦°"
)
rows.append({
"id": str(uuid.uuid4()),
"name": name,
"building_use_code": building_use,
})
return rows
def load_survival_rates():
df = pd.read_csv(CSV_SURVIVAL, encoding="CP949")
df = df.iloc[1:].reset_index(drop=True)
df.columns = ["μ—…μ’…","1y_19","2y_19","3y_19","4y_19","5y_19",
"1y_20","2y_20","3y_20","4y_20","5y_20",
"1y_21","2y_21","3y_21","4y_21","5y_21"]
num_cols = df.columns[1:]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
df["avg_1y"] = ((df["1y_19"]+df["1y_20"]+df["1y_21"])/3).round(2)
df["avg_3y"] = ((df["3y_19"]+df["3y_20"]+df["3y_21"])/3).round(2)
df["avg_5y"] = ((df["5y_19"]+df["5y_20"]+df["5y_21"])/3).round(2)
return df[["μ—…μ’…","avg_1y","avg_3y","avg_5y"]].set_index("μ—…μ’…").to_dict("index")
def llm_map_kosis(client, restaurant_name):
res = client.messages.create(
model="claude-opus-4-5",
max_tokens=50,
messages=[{"role": "user", "content":
f"μŒμ‹μ  μ—…μ’… '{restaurant_name}'은 μ•„λž˜ KOSIS 톡계 μΉ΄ν…Œκ³ λ¦¬ 쀑 어디에 ν•΄λ‹Ήν•΄?\n"
f"κ°€μž₯ μœ μ‚¬ν•œ μΉ΄ν…Œκ³ λ¦¬ ν•˜λ‚˜λ§Œ κ³¨λΌμ„œ μ •ν™•νžˆ κ·Έ μ΄λ¦„λ§Œ 응닡해. λ‹€λ₯Έ 말 ν•˜μ§€ 마.\n\n"
f"{json.dumps(KOSIS_CATEGORIES, ensure_ascii=False)}"
}]
)
return res.content[0].text.strip()
def llm_map_equipment(client, restaurant_name, equipment_list):
res = client.messages.create(
model="claude-opus-4-5",
max_tokens=4096,
messages=[{"role": "user", "content":
f"μŒμ‹μ  μ—…μ’…: {restaurant_name}\n\n"
f"μ•„λž˜ μ„€λΉ„ λͺ©λ‘μ—μ„œ 이 μ—…μ’… 창업에 ν•„μš”ν•œ μ„€λΉ„λ₯Ό μ„ νƒν•΄μ€˜.\n"
f"- required: μ—†μœΌλ©΄ μ˜μ—… λΆˆκ°€λŠ₯ν•œ ν•„μˆ˜ μ„€λΉ„\n"
f"- optional: 있으면 μ’‹μ§€λ§Œ 없어도 λ˜λŠ” μ„€λΉ„\n"
f"- weight: ν•΄λ‹Ή μ„€λΉ„κ°€ μ—…μ’… νŠΉμ„±μ„ μ–Όλ§ˆλ‚˜ λŒ€ν‘œν•˜λŠ”μ§€ (0.1 ~ 1.0)\n\n"
f"μ„€λΉ„ λͺ©λ‘:\n{json.dumps(equipment_list, ensure_ascii=False)}\n\n"
f"λ°˜λ“œμ‹œ μ•„λž˜ JSON ν˜•μ‹μœΌλ‘œλ§Œ 응닡해. λ‹€λ₯Έ 말 ν•˜μ§€ 마.\n"
f'[{{"name": "μ„€λΉ„λͺ…", "is_required": true, "weight": 0.9}}]'
}]
)
raw = res.content[0].text.strip().replace("```json","").replace("```","").strip()
print(f" RAW: {raw[:200]}")
return json.loads(raw)
def insert_equipment(engine, rows):
with engine.begin() as conn:
for r in rows:
conn.execute(text(
"INSERT INTO equipment (id, name, category) VALUES (:id, :name, :category) ON CONFLICT (name) DO NOTHING"
), r)
print(f"[equipment] {len(rows)}개 μ™„λ£Œ")
def insert_restaurant_types(engine, rows):
with engine.begin() as conn:
for r in rows:
conn.execute(text(
"INSERT INTO restaurant_types (id, name, building_use_code) VALUES (:id, :name, :building_use_code) ON CONFLICT (name) DO NOTHING"
), r)
print(f"[restaurant_types] {len(rows)}개 μ™„λ£Œ")
def update_kosis_and_survival(engine, rt_rows, survival_rates, client):
with engine.begin() as conn:
for r in rt_rows:
kosis_cat = llm_map_kosis(client, r["name"])
sv = survival_rates.get(kosis_cat, {})
conn.execute(text("""
UPDATE restaurant_types
SET kosis_category=:kosis, survival_rate_1y=:s1, survival_rate_3y=:s3, survival_rate_5y=:s5
WHERE id=:id
"""), {"id": r["id"], "kosis": kosis_cat,
"s1": sv.get("avg_1y"), "s3": sv.get("avg_3y"), "s5": sv.get("avg_5y")})
print(f" {r['name']} β†’ {kosis_cat}")
time.sleep(0.3)
print("[KOSIS λ§€ν•‘ + μƒμ‘΄μœ¨ μ™„λ£Œ]")
def insert_equipment_map(engine, rt_rows, eq_rows, client):
eq_names = [e["name"] for e in eq_rows]
eq_by_name = {e["name"]: e["id"] for e in eq_rows}
with engine.begin() as conn:
for r in rt_rows:
print(f" λ§€ν•‘ 쀑: {r['name']}")
try:
items = llm_map_equipment(client, r["name"], eq_names)
except Exception as e:
print(f" [ERROR] {r['name']}: {e}")
import traceback; traceback.print_exc()
continue
for item in items:
eq_id = eq_by_name.get(item["name"])
if not eq_id:
continue
conn.execute(text("""
INSERT INTO restaurant_equipment_map (restaurant_type_id, equipment_id, is_required, weight)
VALUES (:rt_id, :eq_id, :req, :w)
ON CONFLICT DO NOTHING
"""), {"rt_id": r["id"], "eq_id": eq_id,
"req": item.get("is_required", True), "w": item.get("weight", 1.0)})
time.sleep(0.5)
print("[restaurant_equipment_map μ™„λ£Œ]")
def cache_equipment_to_redis(eq_rows):
r = redis_lib.from_url(REDIS_URL, decode_responses=True)
pipe = r.pipeline()
pipe.delete("equipment:autocomplete")
for eq in eq_rows:
name = eq["name"]
pipe.zadd("equipment:autocomplete", {name: 0})
pipe.hset(f"equipment:detail:{name}", mapping={
"id": eq["id"],
"category": eq["category"],
})
pipe.execute()
r.close()
print(f"[Redis] equipment:autocomplete {len(eq_rows)}개 캐싱 μ™„λ£Œ")
def main():
print("=== seed_data.py μ‹œμž‘ ===")
client = anthropic.Anthropic(api_key=API_KEY)
engine = create_engine(DB_URL)
eq_rows = load_equipment()
insert_equipment(engine, eq_rows)
cache_equipment_to_redis(eq_rows)
rt_rows = load_restaurant_types()
insert_restaurant_types(engine, rt_rows)
survival_rates = load_survival_rates()
update_kosis_and_survival(engine, rt_rows, survival_rates, client)
with engine.connect() as conn:
result = conn.execute(text("SELECT id, name FROM equipment"))
eq_rows = [{"id": str(r.id), "name": r.name} for r in result.fetchall()]
insert_equipment_map(engine, rt_rows, eq_rows, client)
print("=== μ™„λ£Œ ===")
if __name__ == "__main__":
main()