Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| import numpy as np | |
| # import lancedb # π¦ Lite: Not used | |
| # import torch | |
| # from sentence_transformers import SentenceTransformer | |
| # from kiwipiepy import Kiwi | |
| from sqlalchemy import create_engine, text # π¦ SQL Support | |
class NyangRagEngine:
    """Lightweight (SQL-only) retrieval engine.

    Vector / RAG search is served by an external service (Google Cloud Run),
    so this backend variant only:
      1. downloads the product SQL database (``petshop.db``) from a
         Hugging Face dataset repo, and
      2. answers multi-field weighted keyword searches against it.

    All vector-related attributes (``db``, ``embed_model``, ``kiwi``,
    ``coords_cache``, ...) are kept for interface compatibility with the
    full engine but remain ``None``/empty in this Lite mode.
    """

    def __init__(self, base_dir=None):
        # [Engine Init] Lightweight Mode (SQL only).
        # Resolve the backend root by walking up from this file:
        # core/ -> chatbot/ -> backend root.
        curr_file = os.path.abspath(__file__)
        core_dir = os.path.dirname(curr_file)
        chatbot_dir = os.path.dirname(core_dir)
        back_dir = os.path.dirname(chatbot_dir)

        # Local import: huggingface_hub is only needed at startup and may
        # be absent in environments that never instantiate the engine.
        from huggingface_hub import snapshot_download

        # [Force Update] Delete any previously downloaded DB file so the
        # latest version is always fetched from Hugging Face.
        db_download_path = os.path.join(back_dir, "hf_db_storage")
        old_db_file = os.path.join(db_download_path, "petshop.db")
        if os.path.exists(old_db_file):
            try:
                os.remove(old_db_file)
                print(f"π§Ή [Engine-Lite] Deleted old DB file: {old_db_file}")
            except OSError as e:
                # Best-effort cleanup: a stale file is tolerable, a crash
                # at startup is not.
                print(f"β οΈ Failed to delete old DB: {e}")

        # [Lite] Vector DB download removed (handled by Google Cloud Run).
        # [Lite] SQL data download kept — required for product search.
        hf_db_repo = os.getenv("HF_DB_REPO")
        if hf_db_repo:
            try:
                print(f"π [Engine-Lite] Downloading SQL Data from HF: {hf_db_repo}")
                os.makedirs(db_download_path, exist_ok=True)
                snapshot_download(
                    repo_id=hf_db_repo,
                    repo_type="dataset",
                    local_dir=db_download_path,
                    token=os.getenv("HF_TOKEN"),
                )
                print("β SQL Data Download Complete!")
            except Exception as e:
                # Network/auth failures are non-fatal: a local DB copy may
                # still be found below.
                print(f"β SQL Data Download Failed: {e}")

        # 1. LanceDB & cache paths — not used in the backend Lite mode.
        self.data_dir = None
        self.cache_path = None

        # 2. SQL database path strategy: first existing candidate wins.
        potential_paths = [
            os.path.join(db_download_path, "petshop.db"),       # HF download
            os.path.join(back_dir, "instance", "petshop.db"),   # default local
            os.path.join(back_dir, "petshop.db"),               # root local
        ]
        self.sql_path = None
        for p in potential_paths:
            if os.path.exists(p):
                self.sql_path = p
                print(f"π¦ [Engine-Lite] Selected SQL DB: {self.sql_path}")
                break
        if not self.sql_path:
            print("β Error: petshop.db NOT FOUND!")

        # Attributes of the full engine, initialized to inert values so
        # callers can probe them safely in Lite mode.
        self.db = None
        self.embed_model = None
        self.kiwi = None
        self.coords_cache = None
        self.meta_cache = []
        self.id_to_idx = {}

    def load_resources(self):
        """Report readiness; no heavy models are loaded in Lite mode.

        Returns:
            bool: always ``True`` (the engine is usable even without the
            SQL DB — searches will simply return empty results).
        """
        print("\n" + "=" * 50)
        print("π [Engine-Lite] Skipping Heavy Resources (RAG is on Cloud Run)")
        print(f" - SQL Path: {self.sql_path}")
        # No heavy loading here — only a sanity check on the DB file.
        if self.sql_path and os.path.exists(self.sql_path):
            print("β SQL DB is ready.")
        else:
            print("β οΈ SQL DB not found.")
        print("π [Engine-Lite] READY (Lightweight Mode)")
        print("=" * 50 + "\n")
        return True

    # SQL keyword search (multi-field weighted scoring).
    def search_sql(self, keywords, limit=15):
        """Search the product table by keywords with weighted scoring.

        Args:
            keywords: iterable of keyword strings; single-character
                keywords are ignored as too noisy.
            limit: maximum number of results to return.

        Returns:
            list[dict]: top-scored product rows formatted for the chatbot,
            or ``[]`` when there is nothing to search / on any failure.
        """
        if not keywords or not self.sql_path:
            print(f"β οΈ SQL Search Skipped: path={self.sql_path}")
            return []
        try:
            sql_engine = create_engine(f"sqlite:///{self.sql_path}")
            results = []

            # 1. Fetch candidates with a broad multi-field LIKE search.
            conditions = []
            params = {}
            valid_kws = [k for k in keywords if len(k) > 1]
            if not valid_kws:
                return []
            fields = ["title", "content", "category", "sub_category"]
            for i, kw in enumerate(valid_kws):
                key = f"kw{i}"
                field_queries = [f"({f} LIKE :{key})" for f in fields]
                conditions.append(f"({' OR '.join(field_queries)})")
                params[key] = f"%{kw}%"
            # OR-combine keyword conditions to cast a wide candidate net;
            # precise ranking happens in Python below.
            where_clause = " OR ".join(conditions)
            query = text(f"SELECT id, title, price, category, content, sub_category, pet_type, stock, review_count, img_url FROM product WHERE {where_clause} LIMIT 500")
            with sql_engine.connect() as conn:
                rows = conn.execute(query, params).fetchall()

            # 2. Advanced weighting & scoring.
            scored_rows = []
            # Pet-type keywords get an extra targeting boost.
            targets = ["κ³ μμ΄", "κ°μμ§", "cat", "dog", "κ΄μμ΄", "μλλ¬Ό", "μ‘°λ₯"]
            for r in rows:
                score = 0
                title = (r[1] or "").lower()
                content = (r[4] or "").lower()
                category = (r[3] or "").lower()
                sub_cat = (r[5] or "").lower()
                for kw in valid_kws:
                    kw_lower = kw.lower()
                    # Title match (highest weight, plus per-occurrence bonus).
                    if kw_lower in title:
                        score += 15.0 + (title.count(kw_lower) * 2.0)
                    # Category match (medium weight).
                    if kw_lower in category or kw_lower in sub_cat:
                        score += 8.0
                    # Content match (lower weight, small per-occurrence bonus).
                    if kw_lower in content:
                        score += 3.0 + (content.count(kw_lower) * 0.5)
                    # Pet-type match (targeting boost).
                    if kw_lower in targets:
                        score += 10.0
                # Bonus for matching multiple DIFFERENT keywords
                # (co-occurrence). FIX: include sub_cat in the haystack so
                # sub-category-only matches count, consistent with the
                # per-field scoring above.
                haystack = title + content + category + sub_cat
                matches = sum(1 for kw in valid_kws if kw.lower() in haystack)
                if matches > 1:
                    score *= (1.2 ** (matches - 1))  # multi-keyword bonus
                scored_rows.append((r, score))

            # 3. Sort by refined score and format the top results.
            scored_rows.sort(key=lambda x: x[1], reverse=True)
            for r, score in scored_rows[:limit]:
                results.append({
                    "id": f"shop_{r[0]}",
                    "title": r[1],
                    "price": r[2],
                    "category": r[3],
                    "content": r[4],
                    "sub_category": r[5],
                    "pet_type": r[6],
                    "stock": r[7],
                    "review_count": r[8],
                    "img_url": r[9],
                    "link": f"/product/{r[0]}",
                    "score": score,  # weighted score, not raw match count
                    "source": "homepage",
                    "type": "product",
                })
            return results
        except Exception as e:
            # Search failures must never crash the chatbot pipeline.
            print(f"β SQL Search Failed: {e}")
            return []

    # Stub methods — no-ops in Lite mode, kept for interface compatibility.
    def extract_keywords(self, text): return []
    def cluster_and_analyze(self, s1_data, request_id="unknown"): return {}, {}
    def search_hybrid(self, query, top_k=50, request_id="unknown"): return [], [], {}, []
    def search_refined(self, queries, request_id="unknown"): return []
    def rrf_merge(self, list1, list2_group, k=60): return []