# NOTE: Hugging Face Spaces page chrome ("Spaces / Sleeping / File size: 7,892 Bytes")
# was extraction residue from the file viewer, not part of this module.
import os
import pickle
import numpy as np
# import lancedb # π¦ Lite: Not used
# import torch
# from sentence_transformers import SentenceTransformer
# from kiwipiepy import Kiwi
from sqlalchemy import create_engine, text # π¦ SQL Support
class NyangRagEngine:
    """Lightweight (SQL-only) retrieval engine for the pet-shop chatbot.

    Vector search, embeddings, and tokenizers are intentionally NOT loaded
    here — per the comments, that heavy RAG work runs on a separate Google
    Cloud Run service. This class only locates (and, when ``HF_DB_REPO`` is
    set, re-downloads) the ``petshop.db`` SQLite database and provides a
    weighted multi-field keyword search over the ``product`` table.
    """

    def __init__(self, base_dir=None):
        """Locate/refresh the SQLite DB and initialize lite-mode attributes.

        Args:
            base_dir: accepted for interface compatibility but unused; paths
                are derived from this file's location instead.
        """
        # [Engine Init] Lightweight mode (SQL only).
        curr_file = os.path.abspath(__file__)
        core_dir = os.path.dirname(curr_file)
        chatbot_dir = os.path.dirname(core_dir)
        back_dir = os.path.dirname(chatbot_dir)

        # Local import keeps the heavy hub client off the module import path.
        from huggingface_hub import snapshot_download

        # [Force Update] Delete any previously downloaded DB file so the
        # snapshot download below always fetches the latest version.
        db_download_path = os.path.join(back_dir, "hf_db_storage")
        old_db_file = os.path.join(db_download_path, "petshop.db")
        if os.path.exists(old_db_file):
            try:
                os.remove(old_db_file)
                print(f"π§Ή [Engine-Lite] Deleted old DB file: {old_db_file}")
            except Exception as e:
                # Best-effort cleanup: a stale file is tolerable, so log only.
                print(f"β οΈ Failed to delete old DB: {e}")

        # [Lite] Vector DB download removed (handled by Google Cloud Run).
        # [Lite] SQL data download is kept for product search.
        hf_db_repo = os.getenv("HF_DB_REPO")
        if hf_db_repo:
            try:
                print(f"π [Engine-Lite] Downloading SQL Data from HF: {hf_db_repo}")
                os.makedirs(db_download_path, exist_ok=True)
                snapshot_download(
                    repo_id=hf_db_repo,
                    repo_type="dataset",
                    local_dir=db_download_path,
                    token=os.getenv("HF_TOKEN"),
                )
                # BUGFIX: this literal was broken across two lines in the
                # original (unterminated string -> SyntaxError); rejoined.
                print("β SQL Data Download Complete!")
            except Exception as e:
                # Download is optional: fall back to any local DB copy below.
                print(f"β SQL Data Download Failed: {e}")

        # 1. LanceDB & cache paths (not used in the lite backend).
        self.data_dir = None
        self.cache_path = None

        # 2. SQL database path strategy: first existing candidate wins.
        potential_paths = [
            os.path.join(db_download_path, "petshop.db"),      # HF download
            os.path.join(back_dir, "instance", "petshop.db"),  # default local
            os.path.join(back_dir, "petshop.db"),              # root local
        ]
        self.sql_path = None
        for p in potential_paths:
            if os.path.exists(p):
                self.sql_path = p
                print(f"π¦ [Engine-Lite] Selected SQL DB: {self.sql_path}")
                break
        if not self.sql_path:
            print("β Error: petshop.db NOT FOUND!")

        # Heavy RAG resources are never loaded in lite mode; keep the
        # attributes so callers that probe them see harmless defaults.
        self.db = None
        self.embed_model = None
        self.kiwi = None
        self.coords_cache = None
        self.meta_cache = []
        self.id_to_idx = {}

    def load_resources(self):
        """Verify the SQLite DB exists; no heavy resources are loaded.

        Returns:
            bool: always ``True`` — the lite engine is considered ready even
            if the DB is missing (``search_sql`` guards against that itself).
        """
        print("\n" + "=" * 50)
        print("π [Engine-Lite] Skipping Heavy Resources (RAG is on Cloud Run)")
        print(f" - SQL Path: {self.sql_path}")
        if self.sql_path and os.path.exists(self.sql_path):
            # BUGFIX: rejoined a string literal broken across two lines.
            print("β SQL DB is ready.")
        else:
            print("β οΈ SQL DB not found.")
        print("π [Engine-Lite] READY (Lightweight Mode)")
        print("=" * 50 + "\n")
        return True

    # SQL keyword search (multi-field weighted scoring).
    def search_sql(self, keywords, limit=15):
        """Search the ``product`` table by keywords with weighted scoring.

        Candidates are fetched with a broad parameterized ``LIKE`` over
        title/content/category/sub_category, then re-scored in Python:
        title hits weigh most, then category, then content, with a boost
        for pet-type keywords and a multiplicative multi-keyword bonus.

        Args:
            keywords: iterable of keyword strings; entries of length <= 1
                are ignored.
            limit: maximum number of results to return (default 15).

        Returns:
            list[dict]: product dicts sorted by descending score; empty list
            on empty input, missing DB, or any SQL error.
        """
        if not keywords or not self.sql_path:
            print(f"β οΈ SQL Search Skipped: path={self.sql_path}")
            return []
        try:
            sql_engine = create_engine(f"sqlite:///{self.sql_path}")
            results = []

            # 1. Fetch candidates: OR across keywords and fields, with bound
            #    parameters so user keywords cannot inject SQL.
            valid_kws = [k for k in keywords if len(k) > 1]
            if not valid_kws:
                return []
            fields = ["title", "content", "category", "sub_category"]
            conditions = []
            params = {}
            for i, kw in enumerate(valid_kws):
                key = f"kw{i}"
                field_queries = [f"({f} LIKE :{key})" for f in fields]
                conditions.append(f"({' OR '.join(field_queries)})")
                params[key] = f"%{kw}%"
            where_clause = " OR ".join(conditions)
            query = text(f"SELECT id, title, price, category, content, sub_category, pet_type, stock, review_count, img_url FROM product WHERE {where_clause} LIMIT 500")
            with sql_engine.connect() as conn:
                rows = conn.execute(query, params).fetchall()

            # 2. Advanced weighting & scoring (case-insensitive substring
            #    matching; counts reward repeated occurrences).
            scored_rows = []
            targets = ["κ³ μμ΄", "κ°μμ§", "cat", "dog", "κ΄μμ΄", "μλλ¬Ό", "μ‘°λ₯"]
            for r in rows:
                score = 0
                title = (r[1] or "").lower()
                content = (r[4] or "").lower()
                category = (r[3] or "").lower()
                sub_cat = (r[5] or "").lower()
                for kw in valid_kws:
                    kw_lower = kw.lower()
                    # Title match (highest weight).
                    if kw_lower in title:
                        score += 15.0 + (title.count(kw_lower) * 2.0)
                    # Category match (medium weight).
                    if kw_lower in category or kw_lower in sub_cat:
                        score += 8.0
                    # Content match (lower weight).
                    if kw_lower in content:
                        score += 3.0 + (content.count(kw_lower) * 0.5)
                    # Pet-type keyword (targeting boost).
                    if kw_lower in targets:
                        score += 10.0
                # Multi-keyword co-occurrence bonus.
                # NOTE(review): sub_cat is deliberately kept out of this
                # concatenation to match the original behavior — confirm
                # whether it should be included.
                matches = sum(1 for kw in valid_kws if kw.lower() in (title + content + category))
                if matches > 1:
                    score *= (1.2 ** (matches - 1))
                scored_rows.append((r, score))

            # Sort by refined score, best first.
            scored_rows.sort(key=lambda x: x[1], reverse=True)

            # 3. Format top results for the chatbot layer.
            for r, score in scored_rows[:limit]:
                results.append({
                    "id": f"shop_{r[0]}",
                    "title": r[1],
                    "price": r[2],
                    "category": r[3],
                    "content": r[4],
                    "sub_category": r[5],
                    "pet_type": r[6],
                    "stock": r[7],
                    "review_count": r[8],
                    "img_url": r[9],
                    "link": f"/product/{r[0]}",
                    "score": score,  # weighted score, not a DB column
                    "source": "homepage",
                    "type": "product",
                })
            return results
        except Exception as e:
            # Any failure (bad schema, locked DB, ...) degrades to "no hits".
            print(f"β SQL Search Failed: {e}")
            return []

    # Stub methods: no-ops in lite mode, kept so the full-engine interface
    # (keyword extraction, clustering, hybrid/refined search, RRF merge)
    # remains callable. BUGFIX: the original rrf_merge line carried a
    # trailing "|" extraction artifact (SyntaxError); removed.
    def extract_keywords(self, text): return []
    def cluster_and_analyze(self, s1_data, request_id="unknown"): return {}, {}
    def search_hybrid(self, query, top_k=50, request_id="unknown"): return [], [], {}, []
    def search_refined(self, queries, request_id="unknown"): return []
    def rrf_merge(self, list1, list2_group, k=60): return []