# Deployed from Team Script (commit 41cc6f7, verified)
import os
import pickle
import numpy as np
# import lancedb # 🦁 Lite: Not used
# import torch
# from sentence_transformers import SentenceTransformer
# from kiwipiepy import Kiwi
from sqlalchemy import create_engine, text # 🦁 SQL Support
class NyangRagEngine:
    """Lightweight (SQL-only) retrieval engine for the shop chatbot backend.

    Heavy RAG resources (vector DB, embedding model, Korean tokenizer) are
    served from Google Cloud Run; this process only performs weighted keyword
    search against a SQLite ``product`` table downloaded from a Hugging Face
    dataset repo. All vector-search methods are kept as no-op stubs so
    existing callers of the full engine continue to work.
    """

    # Pet-type keywords that receive a targeting boost during scoring.
    _PET_TARGETS = {"고양이", "κ°•μ•„μ§€", "cat", "dog", "관상어", "μ†Œλ™λ¬Ό", "μ‘°λ₯˜"}

    def __init__(self, base_dir=None):
        """Locate (and, if configured, download) the SQLite product DB.

        Args:
            base_dir: unused; kept for interface compatibility with the
                full engine's constructor.
        """
        # Resolve backend root from this file's location:
        # <back_dir>/<chatbot_dir>/<core_dir>/this_file
        curr_file = os.path.abspath(__file__)
        core_dir = os.path.dirname(curr_file)
        chatbot_dir = os.path.dirname(core_dir)
        back_dir = os.path.dirname(chatbot_dir)
        from huggingface_hub import snapshot_download

        # Force update: delete any cached DB file so the latest version is
        # always re-downloaded from the Hugging Face dataset repo.
        db_download_path = os.path.join(back_dir, "hf_db_storage")
        old_db_file = os.path.join(db_download_path, "petshop.db")
        if os.path.exists(old_db_file):
            try:
                os.remove(old_db_file)
                print(f"🧹 [Engine-Lite] Deleted old DB file: {old_db_file}")
            except Exception as e:
                # Best-effort cleanup; a stale file only means stale data.
                print(f"⚠️ Failed to delete old DB: {e}")

        # Vector DB download intentionally removed (handled by Cloud Run).
        # SQL data download kept for product search.
        hf_db_repo = os.getenv("HF_DB_REPO")
        if hf_db_repo:
            try:
                print(f"πŸš€ [Engine-Lite] Downloading SQL Data from HF: {hf_db_repo}")
                os.makedirs(db_download_path, exist_ok=True)
                snapshot_download(
                    repo_id=hf_db_repo,
                    repo_type="dataset",
                    local_dir=db_download_path,
                    token=os.getenv("HF_TOKEN")
                )
                print("βœ… SQL Data Download Complete!")
            except Exception as e:
                # Non-fatal: a local copy may still be found below.
                print(f"❌ SQL Data Download Failed: {e}")

        # LanceDB / cache paths are unused in the backend.
        self.data_dir = None
        self.cache_path = None

        # SQL database path strategy: first existing candidate wins.
        potential_paths = [
            os.path.join(db_download_path, "petshop.db"),      # HF download
            os.path.join(back_dir, "instance", "petshop.db"),  # default local
            os.path.join(back_dir, "petshop.db")               # root local
        ]
        self.sql_path = None
        for p in potential_paths:
            if os.path.exists(p):
                self.sql_path = p
                print(f"🦁 [Engine-Lite] Selected SQL DB: {self.sql_path}")
                break
        if not self.sql_path:
            print("❌ Error: petshop.db NOT FOUND!")

        # Lazily-created SQLAlchemy engine, reused across search_sql calls
        # (the original created and leaked a new engine per call).
        self._sql_engine = None

        # Attributes kept for interface compatibility; unused in Lite mode.
        self.db = None
        self.embed_model = None
        self.kiwi = None
        self.coords_cache = None
        self.meta_cache = []
        self.id_to_idx = {}

    def load_resources(self):
        """No-op resource loader: heavy RAG assets live on Cloud Run.

        Only reports whether the SQLite DB selected in ``__init__`` exists.
        Always returns True so startup never blocks on this engine.
        """
        print("\n" + "="*50)
        print("πŸš€ [Engine-Lite] Skipping Heavy Resources (RAG is on Cloud Run)")
        print(f" - SQL Path: {self.sql_path}")
        if self.sql_path and os.path.exists(self.sql_path):
            print("βœ… SQL DB is ready.")
        else:
            print("⚠️ SQL DB not found.")
        print("πŸŽ‰ [Engine-Lite] READY (Lightweight Mode)")
        print("="*50 + "\n")
        return True

    def _get_engine(self):
        """Return the cached SQLAlchemy engine, creating it on first use."""
        if getattr(self, "_sql_engine", None) is None:
            self._sql_engine = create_engine(f"sqlite:///{self.sql_path}")
        return self._sql_engine

    def _score_row(self, row, valid_kws):
        """Compute the field-weighted relevance score for one candidate row.

        Weights: title (15 + 2/occurrence) > category/sub_category (8) >
        content (3 + 0.5/occurrence); pet-type keywords add a flat 10.
        Matching several distinct keywords multiplies the score by
        1.2 per extra keyword (co-occurrence bonus).
        """
        title = (row[1] or "").lower()
        content = (row[4] or "").lower()
        category = (row[3] or "").lower()
        sub_cat = (row[5] or "").lower()
        score = 0
        for kw in valid_kws:
            kw_lower = kw.lower()
            # Title match (highest weight, frequency-boosted)
            if kw_lower in title:
                score += 15.0 + (title.count(kw_lower) * 2.0)
            # Category / sub-category match (medium weight)
            if kw_lower in category or kw_lower in sub_cat:
                score += 8.0
            # Content match (lower weight, small frequency boost)
            if kw_lower in content:
                score += 3.0 + (content.count(kw_lower) * 0.5)
            # Pet-type targeting boost
            if kw_lower in self._PET_TARGETS:
                score += 10.0
        # Co-occurrence bonus. Fix: include sub_category in the haystack —
        # it is scored above but was previously omitted from this check.
        haystack = title + content + category + sub_cat
        matches = sum(1 for kw in valid_kws if kw.lower() in haystack)
        if matches > 1:
            score *= (1.2 ** (matches - 1))
        return score

    def search_sql(self, keywords, limit=15):
        """Keyword product search over the SQLite ``product`` table.

        Candidates are fetched with a broad multi-field LIKE query (bound
        parameters, capped at 500 rows), then re-ranked in Python with
        ``_score_row``'s weighted scoring.

        Args:
            keywords: iterable of keyword strings; one-character keywords
                are dropped.
            limit: maximum number of result dicts to return (default 15).

        Returns:
            List of product dicts sorted by descending score, or [] when
            there is nothing to search or any failure occurs.
        """
        if not keywords or not self.sql_path:
            print(f"⚠️ SQL Search Skipped: path={self.sql_path}")
            return []
        try:
            valid_kws = [k for k in keywords if len(k) > 1]
            if not valid_kws:
                return []

            # 1. Fetch candidates: any keyword LIKE-matching any field.
            fields = ["title", "content", "category", "sub_category"]
            conditions = []
            params = {}
            for i, kw in enumerate(valid_kws):
                key = f"kw{i}"
                field_queries = [f"({f} LIKE :{key})" for f in fields]
                conditions.append(f"({' OR '.join(field_queries)})")
                params[key] = f"%{kw}%"
            where_clause = " OR ".join(conditions)
            query = text(f"SELECT id, title, price, category, content, sub_category, pet_type, stock, review_count, img_url FROM product WHERE {where_clause} LIMIT 500")
            with self._get_engine().connect() as conn:
                rows = conn.execute(query, params).fetchall()

            # 2. Re-score and rank (stable sort preserves row order on ties).
            scored_rows = sorted(
                ((r, self._score_row(r, valid_kws)) for r in rows),
                key=lambda pair: pair[1],
                reverse=True,
            )

            # 3. Format top results.
            results = []
            for r, score in scored_rows[:limit]:
                results.append({
                    "id": f"shop_{r[0]}",
                    "title": r[1],
                    "price": r[2],
                    "category": r[3],
                    "content": r[4],
                    "sub_category": r[5],
                    "pet_type": r[6],
                    "stock": r[7],
                    "review_count": r[8],
                    "img_url": r[9],
                    "link": f"/product/{r[0]}",
                    "score": score,  # weighted score from _score_row
                    "source": "homepage",
                    "type": "product"
                })
            return results
        except Exception as e:
            print(f"❌ SQL Search Failed: {e}")
            return []

    # ------------------------------------------------------------------
    # Stubs: full-RAG methods are disabled in Lite mode. They keep the
    # public interface intact so existing callers work without branching.
    # ------------------------------------------------------------------
    def extract_keywords(self, text):
        """Lite mode: keyword extraction disabled; always returns []."""
        return []

    def cluster_and_analyze(self, s1_data, request_id="unknown"):
        """Lite mode: clustering disabled; always returns ({}, {})."""
        return {}, {}

    def search_hybrid(self, query, top_k=50, request_id="unknown"):
        """Lite mode: hybrid vector search disabled."""
        return [], [], {}, []

    def search_refined(self, queries, request_id="unknown"):
        """Lite mode: refined search disabled; always returns []."""
        return []

    def rrf_merge(self, list1, list2_group, k=60):
        """Lite mode: RRF merge disabled; always returns []."""
        return []