Spaces:

Vedang2004
/

prediction_api

Running

App Files Files Community

prediction_api / solar_api /services /rag_shared.py

Vedang2004

Upload folder using huggingface_hub

4847e7d verified 2 months ago

raw

history blame contribute delete

2.16 kB

	import hashlib
	import os
	import re
	from urllib.parse import urlparse

	import psycopg2
	from dotenv import load_dotenv
	from sentence_transformers import SentenceTransformer

	# =====================================================
	# LOAD ENV
	# =====================================================
	# Read variables from a .env file (when one is present) into the process
	# environment, so the os.getenv() lookups in the CONFIG section can see them.
	# This runs at import time, before DB_CONFIG is built.
	load_dotenv()

	# =====================================================
	# CONFIG
	# =====================================================
	# Not referenced in this part of the file; presumably the chunk size (in
	# words) that callers hand to chunk_text() -- confirm against the callers.
	CHUNK_SIZE = 220

	# Keyword arguments for psycopg2.connect(), read from the environment once
	# at import time. Any variable that is unset comes back as None; only the
	# port has a fallback ("5432"). sslmode is fixed to "require".
	DB_CONFIG = dict(
	    host=os.getenv("SQL_DATABASE_HOST"),
	    dbname=os.getenv("SQL_DATABASE"),
	    user=os.getenv("SQL_USER"),
	    password=os.getenv("SQL_PASSWORD"),
	    port=os.getenv("SQL_DATABASE_PORT", "5432"),
	    sslmode="require",
	)

	# =====================================================
	# GLOBALS
	# =====================================================
	# Module-level cache for the embedding model; stays None until first use.
	_EMBEDDER = None


	def get_embedder():
	    """Return the shared SentenceTransformer, building it on the first call.

	    The "nomic-ai/nomic-embed-text-v1" model is constructed at most once and
	    kept in the module-level ``_EMBEDDER``; every later call returns that
	    same object without constructing anything.
	    """
	    global _EMBEDDER
	    if _EMBEDDER is not None:
	        return _EMBEDDER
	    # NOTE(review): trust_remote_code=True allows Python code shipped with
	    # the model repository to run while loading -- confirm this is intended.
	    _EMBEDDER = SentenceTransformer(
	        "nomic-ai/nomic-embed-text-v1",
	        trust_remote_code=True,
	    )
	    return _EMBEDDER

	# =====================================================
	# DB SETUP
	# =====================================================
	def get_db_connection():
	    """Open and return a new psycopg2 connection built from ``DB_CONFIG``.

	    Nothing is cached or pooled here: each call performs a fresh
	    ``psycopg2.connect`` and the function never closes what it returns, so
	    the caller is responsible for closing it. Any exception raised by
	    ``psycopg2.connect`` propagates unchanged.
	    """
	    connection = psycopg2.connect(**DB_CONFIG)
	    return connection

	# =====================================================
	# UTILS
	# =====================================================
	def normalize_url(url):
	    """Reduce *url* to ``scheme://netloc/path`` with trailing slashes removed.

	    Only the scheme, network location and path survive; path parameters,
	    query string and fragment are discarded. The "://" separator is always
	    emitted, so an input without a scheme comes back starting with "://".
	    Trailing "/" characters are stripped from the assembled string as a
	    whole, not just from the path.
	    """
	    scheme, netloc, path = urlparse(url)[:3]
	    assembled = "{}://{}{}".format(scheme, netloc, path)
	    return assembled.rstrip("/")

	def clean_text(text):
	    """Return *text* without NUL characters and without surrounding whitespace.

	    NULs are removed before trimming, so whitespace that sat behind a
	    leading or trailing NUL is trimmed as well.
	    """
	    # Presumably needed because the text is later stored in PostgreSQL,
	    # which rejects NUL in text values -- confirm against the callers.
	    without_nuls = text.replace("\x00", "")
	    return without_nuls.strip()

	def page_hash(text):
	    """Return the SHA-256 hex digest of *text* encoded as UTF-8.

	    The result is a 64-character lowercase hexadecimal string; equal input
	    always produces an equal digest.
	    """
	    digest = hashlib.sha256()
	    digest.update(text.encode("utf-8"))
	    return digest.hexdigest()

	def chunk_hash(text):
	    """Return the SHA-256 hex digest of *text* encoded as UTF-8.

	    The result is a 64-character lowercase hexadecimal string, computed
	    the same way for every call with the same input.
	    """
	    encoded = text.encode("utf-8")
	    hasher = hashlib.sha256(encoded)
	    return hasher.hexdigest()

	def chunk_text(text, size=200, overlap=50):
	    """Yield overlapping word-window chunks of *text*.

	    The text is split on whitespace and windows of up to ``size`` words are
	    emitted, each starting ``size - overlap`` words after the previous one.
	    Words inside a chunk are re-joined with single spaces.

	    Args:
	        text: String to split.
	        size: Maximum number of words per chunk; must be at least 1.
	        overlap: Number of words shared by consecutive chunks; must be
	            smaller than ``size``. A negative value leaves a gap between
	            chunks instead of an overlap.

	    Yields:
	        Each chunk as a string. Empty or whitespace-only text yields
	        nothing.

	    Raises:
	        ValueError: If ``size < 1`` or ``overlap >= size``. Because this is
	            a generator, the error surfaces on the first iteration.
	    """
	    # A non-positive step would either break range() with an opaque message
	    # (step == 0) or silently produce no chunks at all (step < 0).
	    if size < 1 or overlap >= size:
	        raise ValueError(
	            f"chunk_text requires size >= 1 and overlap < size "
	            f"(got size={size}, overlap={overlap})"
	        )
	    words = text.split()
	    total = len(words)
	    step = size - overlap
	    for start in range(0, total, step):
	        yield " ".join(words[start:start + size])
	        # Once a window reaches the last word, any later window would hold
	        # only overlap words already emitted, so stop here.
	        if start + size >= total:
	            break

	def extract_keywords(question):
	    """Return the distinct lower-case words of three or more letters in *question*.

	    The question is lower-cased, then every run of at least three ASCII
	    letters that stands as a whole word (bounded by ``\\b`` on both sides)
	    is collected. Duplicates are dropped.

	    Args:
	        question: Text to scan.

	    Returns:
	        A list of unique keywords in order of first appearance; an empty
	        list when nothing matches.
	    """
	    words = re.findall(r'\b[a-zA-Z]{3,}\b', question.lower())
	    # dict.fromkeys de-duplicates while keeping first-occurrence order.
	    # list(set(...)) returned the same words in an order that changes from
	    # run to run because of string hash randomisation.
	    return list(dict.fromkeys(words))