# NOTE: Hugging Face Spaces page chrome ("Spaces / Sleeping / File size: 7,892 Bytes")
# was extraction residue from the file viewer, not part of this module.
import os
import pickle
import numpy as np
# import lancedb # π¦ Lite: Not used
# import torch
# from sentence_transformers import SentenceTransformer
# from kiwipiepy import Kiwi
from sqlalchemy import create_engine, text # π¦ SQL Support
class NyangRagEngine:
    """Lightweight (SQL-only) retrieval engine for the pet-shop chatbot.

    Vector search, embeddings, and tokenizers are intentionally NOT loaded
    here — per the comments, that heavy RAG work runs on a separate Google
    Cloud Run service. This class only locates (and, when ``HF_DB_REPO`` is
    set, re-downloads) the ``petshop.db`` SQLite database and provides a
    weighted multi-field keyword search over the ``product`` table.
    """

    def __init__(self, base_dir=None):
        """Locate/refresh the SQLite DB and initialize lite-mode attributes.

        Args:
            base_dir: accepted for interface compatibility but unused; paths
                are derived from this file's location instead.
        """
        # [Engine Init] Lightweight mode (SQL only).
        curr_file = os.path.abspath(__file__)
        core_dir = os.path.dirname(curr_file)
        chatbot_dir = os.path.dirname(core_dir)
        back_dir = os.path.dirname(chatbot_dir)

        # Local import keeps the heavy hub client off the module import path.
        from huggingface_hub import snapshot_download

        # [Force Update] Delete any previously downloaded DB file so the
        # snapshot download below always fetches the latest version.
        db_download_path = os.path.join(back_dir, "hf_db_storage")
        old_db_file = os.path.join(db_download_path, "petshop.db")
        if os.path.exists(old_db_file):
            try:
                os.remove(old_db_file)
                print(f"π§Ή [Engine-Lite] Deleted old DB file: {old_db_file}")
            except Exception as e:
                # Best-effort cleanup: a stale file is tolerable, so log only.
                print(f"β οΈ Failed to delete old DB: {e}")

        # [Lite] Vector DB download removed (handled by Google Cloud Run).
        # [Lite] SQL data download is kept for product search.
        hf_db_repo = os.getenv("HF_DB_REPO")
        if hf_db_repo:
            try:
                print(f"π [Engine-Lite] Downloading SQL Data from HF: {hf_db_repo}")
                os.makedirs(db_download_path, exist_ok=True)
                snapshot_download(
                    repo_id=hf_db_repo,
                    repo_type="dataset",
                    local_dir=db_download_path,
                    token=os.getenv("HF_TOKEN"),
                )
                # BUGFIX: this literal was broken across two lines in the
                # original (unterminated string -> SyntaxError); rejoined.
                print("β SQL Data Download Complete!")
            except Exception as e:
                # Download is optional: fall back to any local DB copy below.
                print(f"β SQL Data Download Failed: {e}")

        # 1. LanceDB & cache paths (not used in the lite backend).
        self.data_dir = None
        self.cache_path = None

        # 2. SQL database path strategy: first existing candidate wins.
        potential_paths = [
            os.path.join(db_download_path, "petshop.db"),      # HF download
            os.path.join(back_dir, "instance", "petshop.db"),  # default local
            os.path.join(back_dir, "petshop.db"),              # root local
        ]
        self.sql_path = None
        for p in potential_paths:
            if os.path.exists(p):
                self.sql_path = p
                print(f"π¦ [Engine-Lite] Selected SQL DB: {self.sql_path}")
                break
        if not self.sql_path:
            print("β Error: petshop.db NOT FOUND!")

        # Heavy RAG resources are never loaded in lite mode; keep the
        # attributes so callers that probe them see harmless defaults.
        self.db = None
        self.embed_model = None
        self.kiwi = None
        self.coords_cache = None
        self.meta_cache = []
        self.id_to_idx = {}

    def load_resources(self):
        """Verify the SQLite DB exists; no heavy resources are loaded.

        Returns:
            bool: always ``True`` — the lite engine is considered ready even
            if the DB is missing (``search_sql`` guards against that itself).
        """
        print("\n" + "=" * 50)
        print("π [Engine-Lite] Skipping Heavy Resources (RAG is on Cloud Run)")
        print(f" - SQL Path: {self.sql_path}")
        if self.sql_path and os.path.exists(self.sql_path):
            # BUGFIX: rejoined a string literal broken across two lines.
            print("β SQL DB is ready.")
        else:
            print("β οΈ SQL DB not found.")
        print("π [Engine-Lite] READY (Lightweight Mode)")
        print("=" * 50 + "\n")
        return True

    # SQL keyword search (multi-field weighted scoring).
    def search_sql(self, keywords, limit=15):
        """Search the ``product`` table by keywords with weighted scoring.

        Candidates are fetched with a broad parameterized ``LIKE`` over
        title/content/category/sub_category, then re-scored in Python:
        title hits weigh most, then category, then content, with a boost
        for pet-type keywords and a multiplicative multi-keyword bonus.

        Args:
            keywords: iterable of keyword strings; entries of length <= 1
                are ignored.
            limit: maximum number of results to return (default 15).

        Returns:
            list[dict]: product dicts sorted by descending score; empty list
            on empty input, missing DB, or any SQL error.
        """
        if not keywords or not self.sql_path:
            print(f"β οΈ SQL Search Skipped: path={self.sql_path}")
            return []
        try:
            sql_engine = create_engine(f"sqlite:///{self.sql_path}")
            results = []

            # 1. Fetch candidates: OR across keywords and fields, with bound
            #    parameters so user keywords cannot inject SQL.
            valid_kws = [k for k in keywords if len(k) > 1]
            if not valid_kws:
                return []
            fields = ["title", "content", "category", "sub_category"]
            conditions = []
            params = {}
            for i, kw in enumerate(valid_kws):
                key = f"kw{i}"
                field_queries = [f"({f} LIKE :{key})" for f in fields]
                conditions.append(f"({' OR '.join(field_queries)})")
                params[key] = f"%{kw}%"
            where_clause = " OR ".join(conditions)
            query = text(f"SELECT id, title, price, category, content, sub_category, pet_type, stock, review_count, img_url FROM product WHERE {where_clause} LIMIT 500")
            with sql_engine.connect() as conn:
                rows = conn.execute(query, params).fetchall()

            # 2. Advanced weighting & scoring (case-insensitive substring
            #    matching; counts reward repeated occurrences).
            scored_rows = []
            targets = ["κ³ μμ΄", "κ°μμ§", "cat", "dog", "κ΄μμ΄", "μλλ¬Ό", "μ‘°λ₯"]
            for r in rows:
                score = 0
                title = (r[1] or "").lower()
                content = (r[4] or "").lower()
                category = (r[3] or "").lower()
                sub_cat = (r[5] or "").lower()
                for kw in valid_kws:
                    kw_lower = kw.lower()
                    # Title match (highest weight).
                    if kw_lower in title:
                        score += 15.0 + (title.count(kw_lower) * 2.0)
                    # Category match (medium weight).
                    if kw_lower in category or kw_lower in sub_cat:
                        score += 8.0
                    # Content match (lower weight).
                    if kw_lower in content:
                        score += 3.0 + (content.count(kw_lower) * 0.5)
                    # Pet-type keyword (targeting boost).
                    if kw_lower in targets:
                        score += 10.0
                # Multi-keyword co-occurrence bonus.
                # NOTE(review): sub_cat is deliberately kept out of this
                # concatenation to match the original behavior — confirm
                # whether it should be included.
                matches = sum(1 for kw in valid_kws if kw.lower() in (title + content + category))
                if matches > 1:
                    score *= (1.2 ** (matches - 1))
                scored_rows.append((r, score))

            # Sort by refined score, best first.
            scored_rows.sort(key=lambda x: x[1], reverse=True)

            # 3. Format top results for the chatbot layer.
            for r, score in scored_rows[:limit]:
                results.append({
                    "id": f"shop_{r[0]}",
                    "title": r[1],
                    "price": r[2],
                    "category": r[3],
                    "content": r[4],
                    "sub_category": r[5],
                    "pet_type": r[6],
                    "stock": r[7],
                    "review_count": r[8],
                    "img_url": r[9],
                    "link": f"/product/{r[0]}",
                    "score": score,  # weighted score, not a DB column
                    "source": "homepage",
                    "type": "product",
                })
            return results
        except Exception as e:
            # Any failure (bad schema, locked DB, ...) degrades to "no hits".
            print(f"β SQL Search Failed: {e}")
            return []

    # Stub methods: no-ops in lite mode, kept so the full-engine interface
    # (keyword extraction, clustering, hybrid/refined search, RRF merge)
    # remains callable. BUGFIX: the original rrf_merge line carried a
    # trailing "|" extraction artifact (SyntaxError); removed.
    def extract_keywords(self, text): return []
    def cluster_and_analyze(self, s1_data, request_id="unknown"): return {}, {}
    def search_hybrid(self, query, top_k=50, request_id="unknown"): return [], [], {}, []
    def search_refined(self, queries, request_id="unknown"): return []
    def rrf_merge(self, list1, list2_group, k=60): return []