Spaces:

UNIQ-DEV
/

chat-faq

Runtime error

App Files Files Community

chat-faq / embed.py

AnnasBlackHat

fix time updated

89f6f6b over 1 year ago

raw

history blame contribute delete

4.86 kB

	import pymongo
	from datetime import datetime
	import os
	from dependency import load_db
	import requests
	import time

	# MongoDB handles. load_db() is imported from the project-local `dependency`
	# module; presumably it returns a connected pymongo client — TODO confirm.
	client = load_db()
	db = client["chat_support"]
	# Source FAQ documents (filtered by "time_updated" further down in this file).
	faq_collection = db["faq"]
	# Key/value bookkeeping documents, e.g. last-run / last-sync markers.
	log_collection = db['log']
	# One document per FAQ question holding its embedding vector.
	question_vector_collection = db['faq-vector']


	# Hugging Face Inference API settings. Indexing os.environ directly means a
	# missing HF_TOKEN raises KeyError as soon as this module is loaded.
	hf_token = os.environ['HF_TOKEN']
	embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/mixedbread-ai/mxbai-embed-large-v1"

	def embed(text, max_wait=90, retry_delay=1, request_timeout=30):
	    """Request an embedding for ``text`` from the Hugging Face inference endpoint.

	    The endpoint is retried while it answers HTTP 503, until ``max_wait``
	    seconds have elapsed since the first attempt.

	    Args:
	        text: Value sent as the ``inputs`` field of the JSON payload.
	        max_wait: Seconds to keep retrying on HTTP 503 before giving up.
	        retry_delay: Seconds to sleep between retries.
	        request_timeout: Per-request timeout in seconds handed to
	            ``requests.post`` so a stalled connection cannot block forever.

	    Returns:
	        The decoded JSON body of the successful response (presumably the
	        embedding vector for ``text`` — confirm against the endpoint).

	    Raises:
	        ValueError: If the endpoint keeps answering 503 past ``max_wait``,
	            or answers with any status other than 200/503.
	        requests.exceptions.RequestException: On connection failure or when
	            a single request exceeds ``request_timeout``.
	    """
	    print('get embed: ', text)
	    # Monotonic clock: elapsed time is immune to wall-clock adjustments.
	    start_time = time.monotonic()
	    while True:
	        response = requests.post(
	            embedding_url,
	            headers={"Authorization": f"Bearer {hf_token}"},
	            json={"inputs": text},
	            timeout=request_timeout,
	        )

	        if response.status_code == 200:
	            return response.json()
	        if response.status_code != 503:
	            raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")

	        # 503: retry until the retry window is used up.
	        elapsed = time.monotonic() - start_time
	        if elapsed > max_wait:
	            raise ValueError("Request failed with status code 503 after multiple retries")
	        print("Retrying... after ", elapsed)
	        time.sleep(retry_delay)


	# Load the "last_text_embedding" marker from the log collection; its value is
	# coerced to int, and 0 is used when no marker document exists.
	last_sync_doc = log_collection.find_one({"key": "last_text_embedding"})
	last_sync_timestamp = int(last_sync_doc["value"]) if last_sync_doc else 0

	# Look up when the FAQ vectorization job last ran
	def get_last_run_time():
	    """Return the ``last_run`` value stored under ``vectorize_faq_last_run``.

	    Returns:
	        The stored ``last_run`` value, or 0 when the log collection has no
	        such entry.
	    """
	    entry = log_collection.find_one({"key": "vectorize_faq_last_run"})
	    return entry["last_run"] if entry else 0

	# Function to save or update the last run time in the log collection
	def update_last_run_time():
	    """Store the current time as the last run of the FAQ vectorization job.

	    The value is written as integer milliseconds since the Unix epoch under
	    the ``vectorize_faq_last_run`` key, creating the entry if it does not
	    exist yet (presumably the same unit as the FAQ ``time_updated`` field it
	    is later compared with — confirm).
	    """
	    current_timestamp = int(round(time.time() * 1000))
	    log_collection.replace_one(
	        {"key": "vectorize_faq_last_run"},
	        # Previously referenced the undefined name `now` (NameError).
	        {"key": "vectorize_faq_last_run", "last_run": current_timestamp},
	        upsert=True,
	    )

	# Turn every question of an FAQ into a question/vector pair
	def vectorize_questions(faq):
	    """Embed each entry of ``faq["questions"]``.

	    Args:
	        faq: Mapping with a ``"questions"`` iterable.

	    Returns:
	        A list of ``{"question": ..., "vector": ...}`` dicts in the same
	        order as the questions, where the vector is the result of
	        ``embed(question)``.
	    """
	    return [
	        {"question": text, "vector": embed(text)}
	        for text in faq["questions"]
	    ]

	# Persist the question vectors of one FAQ
	def save_question_vectors(faq_id, vectors):
	    """Upsert one document per entry of ``vectors`` into the vector collection.

	    Args:
	        faq_id: Identifier used as the prefix of each document ``_id``.
	        vectors: Sequence of dicts carrying ``"question"`` and ``"vector"``.

	    Each document is keyed ``"<faq_id>_<position>"``, so saving the same FAQ
	    again overwrites the entries at the same positions.
	    """
	    for position, entry in enumerate(vectors):
	        doc_filter = {"_id": f"{faq_id}_{position}"}
	        new_fields = {"question": entry["question"], "vector": entry["vector"]}
	        question_vector_collection.update_one(
	            doc_filter,
	            {"$set": new_fields},
	            upsert=True,
	        )

	# Get the last run time (0 when no run has been logged yet)
	last_run_time = get_last_run_time()

	# Fetch FAQs whose "time_updated" is greater than the last run time
	faqs = faq_collection.find({"time_updated": {"$gt": last_run_time}})

	# NOTE: vectorization is currently disabled. The two calls below are commented
	# out, so the loop leaves on the first FAQ without embedding or saving anything.
	for faq in faqs:
	    # vectors = vectorize_questions(faq)
	    # save_question_vectors(str(faq["_id"]), vectors)
	    break

	# Recording the new last run time is disabled as well.
	# update_last_run_time()

	# Ad-hoc retrieval check: embed a sample query, fetch the 4 nearest question
	# vectors and print the answer of the FAQ each one was generated from.
	query = "cara menambah bank"

	results = question_vector_collection.aggregate([
	    {"$vectorSearch": {
	        "queryVector": embed(query),
	        "path": "vector",
	        "numCandidates": 100,
	        "limit": 4,
	        "index": "vector_index",
	    }},
	    # Vector ids have the form "<faq_id>_<index>" (see save_question_vectors);
	    # faq_id is the part of _id before the first "_".
	    {"$project": {
	        "_id": 1,
	        "faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
	        "question": 1,
	        "vector": 1
	    }},
	    # faq_id is a string while faq._id is an ObjectId, so the join converts
	    # the string first; on a failed conversion it yields '' and matches nothing.
	    {"$lookup": {
	        "from": "faq",
	        "let": {"faq_id_str": "$faq_id"},
	        "pipeline": [
	            {"$match": {"$expr": {"$eq": [{"$convert": {"input": "$$faq_id_str", "to": "objectId", "onError": ''}}, "$_id"]}}},
	        ],
	        "as": "faq_document"
	    }}
	])

	print('result: ')
	for document in results:
	    print(f'DocId: {document["_id"]}')
	    print(document.keys())
	    matched_faqs = document["faq_document"]
	    if matched_faqs:
	        print(f'FAQ Document: {matched_faqs[0]["answer"]}')
	    else:
	        # The lookup found no FAQ for this vector (e.g. the FAQ was removed);
	        # report it instead of failing with IndexError on [0].
	        print('FAQ Document: <not found>')
	    print('-------------------')