update match_documents
Browse files- app/supabase_db.py +38 -24
app/supabase_db.py
CHANGED
|
@@ -3,6 +3,8 @@ from postgrest.types import CountMethod
|
|
| 3 |
from supabase.client import create_client, Client
|
| 4 |
from loguru import logger
|
| 5 |
import re
|
|
|
|
|
|
|
| 6 |
|
| 7 |
from .utils import timing_decorator_sync
|
| 8 |
from .constants import VEHICLE_KEYWORD_TO_COLUMN, VIETNAMESE_STOP_WORDS, VIETNAMESE_STOP_PHRASES
|
|
@@ -44,7 +46,7 @@ class SupabaseClient:
|
|
| 44 |
return None
|
| 45 |
|
| 46 |
@timing_decorator_sync
|
| 47 |
-
def match_documents(self, embedding: List[float], match_count: Optional[int] = None, vehicle_keywords: Optional[List[str]] = None, user_question: str = '',
|
| 48 |
"""
|
| 49 |
Truy vấn vector similarity search qua RPC match_documents.
|
| 50 |
Input: embedding (list[float]), match_count (int), vehicle_keywords (list[str] hoặc None)
|
|
@@ -74,30 +76,42 @@ class SupabaseClient:
|
|
| 74 |
# logger.info(f"[DEBUG][RPC]: embedding: {embedding[:5]}...{embedding[-5:]}")
|
| 75 |
logger.info(f"[DEBUG][RPC]: embedding: {embedding}")
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
response = self.client.rpc(
|
| 91 |
-
'match_documents',
|
| 92 |
-
payload
|
| 93 |
-
).execute()
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
@timing_decorator_sync
|
| 103 |
def store_embedding(self, text: str, embedding: List[float], metadata: Dict[str, Any]):
|
|
|
|
| 3 |
from supabase.client import create_client, Client
|
| 4 |
from loguru import logger
|
| 5 |
import re
|
| 6 |
+
import time
|
| 7 |
+
import httpx
|
| 8 |
|
| 9 |
from .utils import timing_decorator_sync
|
| 10 |
from .constants import VEHICLE_KEYWORD_TO_COLUMN, VIETNAMESE_STOP_WORDS, VIETNAMESE_STOP_PHRASES
|
|
|
|
| 46 |
return None
|
| 47 |
|
| 48 |
@timing_decorator_sync
|
| 49 |
+
def match_documents(self, embedding: List[float], match_count: Optional[int] = None, vehicle_keywords: Optional[List[str]] = None, user_question: str = '', keyword_threshold: float = 0.01, vector_threshold: float = 0.3, rrf_k: int = 60):
|
| 50 |
"""
|
| 51 |
Truy vấn vector similarity search qua RPC match_documents.
|
| 52 |
Input: embedding (list[float]), match_count (int), vehicle_keywords (list[str] hoặc None)
|
|
|
|
| 76 |
# logger.info(f"[DEBUG][RPC]: embedding: {embedding[:5]}...{embedding[-5:]}")
|
| 77 |
logger.info(f"[DEBUG][RPC]: embedding: {embedding}")
|
| 78 |
|
| 79 |
+
payload = {
|
| 80 |
+
'query_text': or_query_tsquery,
|
| 81 |
+
'query_embedding': embedding,
|
| 82 |
+
'match_count': match_count,
|
| 83 |
+
'keyword_threshold': keyword_threshold,
|
| 84 |
+
'vector_threshold': vector_threshold,
|
| 85 |
+
'vehicle_filters': None,
|
| 86 |
+
'rrf_k': rrf_k
|
| 87 |
+
}
|
| 88 |
+
if vehicle_keywords:
|
| 89 |
+
vehicle_columns = [VEHICLE_KEYWORD_TO_COLUMN[k] for k in vehicle_keywords if k in VEHICLE_KEYWORD_TO_COLUMN]
|
| 90 |
+
if vehicle_columns:
|
| 91 |
+
payload['vehicle_filters'] = vehicle_columns
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
+
max_retries = 3
|
| 94 |
+
for attempt in range(max_retries):
|
| 95 |
+
try:
|
| 96 |
+
response = self.client.rpc(
|
| 97 |
+
'match_documents',
|
| 98 |
+
payload
|
| 99 |
+
).execute()
|
| 100 |
+
|
| 101 |
+
if response.data:
|
| 102 |
+
return response.data
|
| 103 |
+
return []
|
| 104 |
+
except httpx.TimeoutException:
|
| 105 |
+
logger.warning(f"Supabase RPC 'match_documents' timeout on attempt {attempt + 1}/{max_retries}. Retrying...")
|
| 106 |
+
if attempt == max_retries - 1:
|
| 107 |
+
logger.error(f"Supabase RPC failed after {max_retries} attempts due to timeout.")
|
| 108 |
+
return []
|
| 109 |
+
time.sleep(1 * (2 ** attempt)) # Exponential backoff between attempts: 1s, then 2s (the final attempt returns before sleeping)
|
| 110 |
+
except Exception as e:
|
| 111 |
+
logger.error(f"Error matching documents: {e}")
|
| 112 |
+
return []
|
| 113 |
+
|
| 114 |
+
return [] # Fallback in case loop finishes without returning
|
| 115 |
|
| 116 |
@timing_decorator_sync
|
| 117 |
def store_embedding(self, text: str, embedding: List[float], metadata: Dict[str, Any]):
|