Spaces:

TradaAI
/

Chatopus

Running

App Files Files Community

Chatopus / app /supabase_db.py

VietCat

update log to debug level

96d59ac 5 months ago

raw

history blame contribute delete

16.2 kB

	from typing import Any, Dict, List, Optional
	from postgrest.types import CountMethod
	from supabase.client import create_client, Client, ClientOptions
	from postgrest.exceptions import APIError
	from loguru import logger
	import re
	import httpx
	from tenacity import (
	retry,
	stop_after_attempt,
	wait_exponential,
	retry_if_exception_type,
	)

	from .utils import timing_decorator_sync
	from .constants import (
	VEHICLE_KEYWORD_TO_COLUMN,
	VIETNAMESE_STOP_WORDS,
	VIETNAMESE_STOP_PHRASES,
	)
	from .config import get_settings

	# --- Robust, reusable retry policy for Supabase calls ---
	def _log_supabase_retry(retry_state):
	    """Log a warning with the attempt number and error before the next retry."""
	    logger.warning(
	        f"Supabase call failed, retrying... Attempt: {retry_state.attempt_number}, Error: {retry_state.outcome.exception()}"
	    )


	retry_on_supabase_error = retry(
	    # Only transport-level (httpx) and PostgREST API errors trigger a retry.
	    retry=retry_if_exception_type((httpx.HTTPError, APIError)),
	    stop=stop_after_attempt(4),  # 1 initial call + up to 3 retries
	    # Exponential backoff clamped between 10 and 60. The original note gives
	    # the schedule as 10s, 20s, 40s -- TODO confirm against the installed
	    # tenacity version, since the exponent base attempt differs by release.
	    wait=wait_exponential(multiplier=5, min=10, max=60),
	    before_sleep=_log_supabase_retry,
	)


	def remove_stop_phrases(text, stop_phrases):
	    """
	    Replace every whole-word occurrence of each stop phrase with one space.

	    Args:
	        text: The string to clean.
	        stop_phrases: Iterable of literal phrases (possibly multi-word).

	    Returns:
	        ``text`` with each matched phrase replaced by a single space.
	        Surrounding whitespace is not collapsed.
	    """
	    for phrase in stop_phrases:
	        # An empty phrase would turn the pattern into r"\b\b", which matches
	        # at every word boundary and injects spaces around each word.
	        if not phrase:
	            continue
	        # Treat the phrase as literal text: escape regex metacharacters so
	        # that e.g. "." or "(" cannot change the match or raise re.error.
	        # An escaped space still matches a plain space.
	        pattern = rf"\b{re.escape(phrase)}\b"
	        text = re.sub(pattern, " ", text)
	    return text


	class SupabaseClient:
	def __init__(self, url: str, key: str):
	    """
	    Create the Supabase client and remember the default match count.

	    Args:
	        url: Supabase URL handed to ``create_client``.
	        key: Supabase key handed to ``create_client``.
	    """
	    # Raise the PostgREST client timeout (60.0, presumably seconds) so that
	    # heavy queries have time to complete.
	    client_options = ClientOptions(postgrest_client_timeout=60.0)
	    self.client: Client = create_client(url, key, options=client_options)

	    # Default match count comes from the application settings.
	    self.default_match_count = get_settings().match_count

	@timing_decorator_sync
	@retry_on_supabase_error
	def get_page_token(self, page_id: str):
	    """
	    Fetch the access token of a Facebook page from the "PageToken" table.

	    Args:
	        page_id: Value matched against the table's "id" column.

	    Returns:
	        The "token" value of the first matching row, or None when no row
	        matches or an unexpected (non-retried) error occurs.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    try:
	        token_query = (
	            self.client.table("PageToken").select("token").eq("id", page_id)
	        )
	        rows = token_query.execute().data
	        # Truthiness covers both a missing payload and an empty result list.
	        return rows[0]["token"] if rows else None
	    except (httpx.HTTPError, APIError) as e:
	        # NOTE(review): this runs on every failed attempt, not only once the
	        # retries are exhausted, despite the wording of the message.
	        logger.error(f"Error getting page token after retries: {e}")
	        raise  # Re-raise so tenacity can catch the error and retry
	    except Exception as e:  # Any other, unexpected failure
	        logger.exception(
	            f"An unexpected error occurred while getting page token: {e}"
	        )
	        return None  # Unexpected errors are not retried

	@timing_decorator_sync
	@retry_on_supabase_error
	def match_documents(
	    self,
	    embedding: List[float],
	    match_count: Optional[int] = None,
	    vehicle_keywords: Optional[List[str]] = None,
	    user_question: str = "",
	    keyword_threshold: float = 0.01,
	    vector_threshold: float = 0.3,
	    rrf_k: int = 60,
	):
	    """
	    Run the "match_documents" RPC with a cleaned query text and an embedding.

	    Args:
	        embedding: Query embedding sent as "query_embedding".
	        match_count: Falls back to ``self.default_match_count`` when None.
	        vehicle_keywords: Keywords mapped through VEHICLE_KEYWORD_TO_COLUMN;
	            unknown keywords are ignored.
	        user_question: Raw question used to build "query_text".
	        keyword_threshold, vector_threshold, rrf_k: accepted but not used
	            by this method body.

	    Returns:
	        The RPC result rows, or an empty list when there is no data or an
	        unexpected (non-retried) error occurs.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    # Use the configured default when the caller did not pass a count.
	    # NOTE(review): match_count, keyword_threshold, vector_threshold and
	    # rrf_k are never added to the RPC payload below -- confirm whether the
	    # RPC is expected to rely on its own defaults.
	    if match_count is None:
	        match_count = self.default_match_count

	    # Turn the raw question into a clean, space-separated keyword string:
	    # 1. lowercase it and strip multi-word stop phrases,
	    lowered_question = user_question.lower()
	    without_phrases = remove_stop_phrases(
	        lowered_question, VIETNAMESE_STOP_PHRASES
	    )
	    # 2. replace everything that is not a word character or whitespace,
	    normalized = re.sub(r"[^\w\s]", " ", without_phrases)
	    # 3. split into tokens and drop single stop words.
	    or_query_tsquery = " ".join(
	        token
	        for token in normalized.split()
	        if token not in VIETNAMESE_STOP_WORDS
	    )
	    logger.debug(f"[DEBUG][RPC]: or_query_tsquery: {or_query_tsquery}")
	    logger.debug(f"[DEBUG][RPC]: embedding: {embedding}")

	    # Map the known vehicle keywords to their column names; the filter stays
	    # None when nothing maps.
	    vehicle_columns = []
	    if vehicle_keywords:
	        vehicle_columns = [
	            VEHICLE_KEYWORD_TO_COLUMN[keyword]
	            for keyword in vehicle_keywords
	            if keyword in VEHICLE_KEYWORD_TO_COLUMN
	        ]
	    payload = {
	        "query_text": or_query_tsquery,
	        "query_embedding": embedding,
	        "vehicle_filters": vehicle_columns or None,
	    }

	    try:
	        rpc_result = self.client.rpc("match_documents", payload).execute()
	        return rpc_result.data or []
	    except (httpx.HTTPError, APIError) as e:
	        logger.error(f"Error matching documents after retries: {e}")
	        raise
	    except Exception as e:
	        logger.exception(f"An unexpected error occurred in match_documents: {e}")
	        return []

	@timing_decorator_sync
	@retry_on_supabase_error
	def store_embedding(
	    self, text: str, embedding: List[float], metadata: Dict[str, Any]
	):
	    """
	    Insert one row into the "embeddings" table.

	    Args:
	        text: Stored in the "content" column.
	        embedding: Stored in the "embedding" column.
	        metadata: Stored in the "metadata" column.

	    Returns:
	        True when the insert response carries data, otherwise False. Also
	        False on an unexpected (non-retried) error.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    try:
	        row = {"content": text, "embedding": embedding, "metadata": metadata}
	        insert_result = self.client.table("embeddings").insert(row).execute()
	        return bool(insert_result.data)
	    except (httpx.HTTPError, APIError) as e:
	        logger.error(f"Error storing embedding after retries: {e}")
	        raise
	    except Exception as e:
	        logger.exception(
	            f"An unexpected error occurred while storing embedding: {e}"
	        )
	        return False

	@timing_decorator_sync
	@retry_on_supabase_error
	def store_document_chunk(self, chunk_data: Dict[str, Any]) -> bool:
	    """
	    Normalize a chunk dict and insert it into the "document_chunks" table.

	    Args:
	        chunk_data: All fields of the chunk; the dict itself is not mutated.

	    Returns:
	        True when the insert response carries data. False when it does not,
	        when "vanbanid" is a string that cannot be parsed as an integer, or
	        when an unexpected (non-retried) error occurs.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    try:
	        # Work on a shallow copy; any "embedding" entry is kept as-is.
	        record = chunk_data.copy()

	        # article_number: None/"" become None, strings are parsed as int and
	        # unparsable strings become None; other types pass through unchanged.
	        if "article_number" in record:
	            raw_article = record["article_number"]
	            if raw_article is None or raw_article == "":
	                record["article_number"] = None
	            elif isinstance(raw_article, str):
	                try:
	                    record["article_number"] = int(raw_article)
	                except (ValueError, TypeError):
	                    record["article_number"] = None

	        # vanbanid: a string value must parse as an integer, else give up.
	        raw_vanbanid = record.get("vanbanid")
	        if isinstance(raw_vanbanid, str):
	            try:
	                record["vanbanid"] = int(raw_vanbanid)
	            except (ValueError, TypeError):
	                logger.error(f"Invalid vanbanid: {raw_vanbanid}")
	                return False

	        # Empty strings in these text fields (including "cha") become None.
	        nullable_text_fields = (
	            "document_title",
	            "article_title",
	            "clause_number",
	            "sub_clause_letter",
	            "context_summary",
	            "cha",
	        )
	        for field in nullable_text_fields:
	            if record.get(field) == "":
	                record[field] = None

	        insert_result = (
	            self.client.table("document_chunks").insert(record).execute()
	        )

	        if not insert_result.data:
	            logger.error(
	                f"Failed to store chunk {record.get('id', 'unknown')}"
	            )
	            return False

	        logger.debug(
	            f"Successfully stored chunk {record.get('id', 'unknown')}"
	        )
	        return True

	    except (httpx.HTTPError, APIError) as e:
	        logger.error(f"Error storing document chunk after retries: {e}")
	        raise
	    except Exception as e:
	        logger.exception(
	            f"An unexpected error occurred while storing document chunk: {e}"
	        )
	        return False

	@timing_decorator_sync
	@retry_on_supabase_error
	def delete_all_document_chunks(self) -> bool:
	    """
	    Delete every row of the "document_chunks" table.

	    Returns:
	        True when the delete request completes without raising, False on an
	        unexpected (non-retried) error.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    try:
	        # Unfiltered delete: targets all records in the table.
	        # NOTE(review): some Supabase setups reject DELETE requests that
	        # carry no filter -- confirm this works against the deployed API.
	        self.client.table("document_chunks").delete().execute()
	        logger.info("Successfully deleted all document chunks")
	        return True
	    except (httpx.HTTPError, APIError) as e:
	        logger.error(f"Error deleting all document chunks after retries: {e}")
	        raise
	    except Exception as e:
	        logger.exception(
	            f"An unexpected error occurred while deleting all document chunks: {e}"
	        )
	        return False

	@timing_decorator_sync
	@retry_on_supabase_error
	def get_document_chunks_by_vanbanid(self, vanbanid: int) -> List[Dict[str, Any]]:
	    """
	    Fetch every chunk whose "vanbanid" column equals the given value.

	    Args:
	        vanbanid: Document identifier to filter on.

	    Returns:
	        The matching rows, or an empty list when nothing matches or an
	        unexpected (non-retried) error occurs.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    try:
	        chunk_query = (
	            self.client.table("document_chunks")
	            .select("*")
	            .eq("vanbanid", vanbanid)
	        )
	        rows = chunk_query.execute().data
	        if not rows:
	            return []
	        logger.debug(f"Found {len(rows)} chunks for vanbanid {vanbanid}")
	        return rows
	    except (httpx.HTTPError, APIError) as e:
	        logger.error(
	            f"Error getting document chunks for vanbanid {vanbanid} after retries: {e}"
	        )
	        raise
	    except Exception as e:
	        logger.exception(
	            f"An unexpected error occurred while getting document chunks for vanbanid {vanbanid}: {e}"
	        )
	        return []

	@timing_decorator_sync
	@retry_on_supabase_error
	def delete_document_chunks_by_vanbanid(self, vanbanid: int) -> bool:
	    """
	    Delete every chunk whose "vanbanid" column equals the given value.

	    Args:
	        vanbanid: Document identifier to filter on.

	    Returns:
	        True when the delete request completes without raising, False on an
	        unexpected (non-retried) error.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    try:
	        delete_query = (
	            self.client.table("document_chunks").delete().eq("vanbanid", vanbanid)
	        )
	        delete_query.execute()
	        logger.debug(f"Successfully deleted all chunks for vanbanid {vanbanid}")
	        return True
	    except (httpx.HTTPError, APIError) as e:
	        logger.error(
	            f"Error deleting chunks for vanbanid {vanbanid} after retries: {e}"
	        )
	        raise
	    except Exception as e:
	        logger.exception(
	            f"An unexpected error occurred while deleting chunks for vanbanid {vanbanid}: {e}"
	        )
	        return False

	@timing_decorator_sync
	@retry_on_supabase_error
	def get_all_document_chunks(self) -> List[Dict[str, Any]]:
	    """
	    Fetch every row of the "document_chunks" table, page by page.

	    Rows are read in ascending "id" order, 1000 at a time, using the largest
	    "id" of the previous page as the cursor for the next one.

	    Returns:
	        All fetched rows, or an empty list on an unexpected (non-retried)
	        error.

	    Raises:
	        httpx.HTTPError, APIError: re-raised so the retry decorator can act.
	    """
	    try:
	        logger.info("[SUPABASE] Fetching all document chunks")

	        # The total is only used for logging, so ask for an exact count while
	        # restricting the response body to a single "id" instead of pulling
	        # full rows just to read the count.
	        count_response = (
	            self.client.table("document_chunks")
	            .select("id", count=CountMethod.exact)
	            .limit(1)
	            .execute()
	        )
	        total_count = getattr(count_response, "count", "unknown")
	        logger.info(f"[SUPABASE] Total records in table: {total_count}")

	        all_chunks: List[Dict[str, Any]] = []
	        page_size = 1000
	        # None means "no cursor yet". Using 0 as the sentinel would make a
	        # page whose largest id is 0 restart from the first page forever.
	        last_id = None
	        page_count = 0

	        while True:
	            page_count += 1

	            # Cursor-based pagination on id: the first page starts from the
	            # beginning, later pages continue after the last seen id.
	            query = self.client.table("document_chunks").select("*").order("id")
	            if last_id is not None:
	                query = query.gt("id", last_id)
	            response = query.limit(page_size).execute()

	            rows = response.data or []
	            actual_count = len(rows)
	            logger.debug(
	                f"[SUPABASE] Page {page_count}: last_id={last_id}, requested={page_size}, actual={actual_count}"
	            )

	            if not rows:
	                logger.debug(f"[SUPABASE] No more data after id {last_id}")
	                break

	            all_chunks.extend(rows)

	            # Advance the cursor for the next page.
	            last_id = max(chunk.get("id", 0) for chunk in rows)

	            # A short page means the table is exhausted.
	            if actual_count < page_size:
	                logger.debug(f"[SUPABASE] Last page with {actual_count} records")
	                break

	        logger.info(
	            f"[SUPABASE] Cursor-based pagination fetched {len(all_chunks)} document chunks (expected: {total_count})"
	        )
	        logger.info(
	            f"[SUPABASE] Fetched {page_count} pages with page_size={page_size}"
	        )
	        return all_chunks

	    except (httpx.HTTPError, APIError) as e:
	        logger.error(
	            f"[SUPABASE] Error fetching document chunks after retries: {e}"
	        )
	        raise
	    except Exception as e:
	        logger.exception(
	            f"An unexpected error occurred while fetching document chunks: {e}"
	        )
	        return []