# chat-faq / embed.py
# Last commit: "fix time updated" (89f6f6b) by AnnasBlackHat
import pymongo
from datetime import datetime
import os
from dependency import load_db
import requests
import time
# Database handles. load_db() comes from the project-local `dependency` module
# (presumably it returns a pymongo client -- confirm against that module).
client = load_db()
db = client["chat_support"]
faq_collection = db["faq"]  # queried below by its "time_updated" field
log_collection = db['log']  # key-addressed bookkeeping documents (e.g. last run time)
question_vector_collection = db['faq-vector']  # one document per embedded FAQ question
# Credentials and endpoint used by embed(). A missing HF_TOKEN environment
# variable raises KeyError as soon as this module is loaded.
hf_token = os.environ['HF_TOKEN']
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/mixedbread-ai/mxbai-embed-large-v1"
def embed(text):
    """Return the embedding of *text* from the Hugging Face inference endpoint.

    A 503 response is retried once per second until 90 seconds have passed
    since the first attempt (NOTE(review): presumably 503 means the model is
    still loading on the server -- confirm against the API documentation).

    Args:
        text: Value sent as the "inputs" field of the JSON request body.

    Returns:
        The decoded JSON body of the first response with status 200.

    Raises:
        ValueError: If the endpoint answers with a status other than 200 or
            503, or keeps answering 503 for more than 90 seconds.
        requests.exceptions.RequestException: If the connection fails or a
            single request exceeds the timeout below.
    """
    print('get embed: ', text)
    # A monotonic clock keeps the retry deadline correct even if the system
    # wall clock is adjusted while we are waiting.
    start_time = time.monotonic()
    while True:
        response = requests.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
            # (connect, read) seconds. Without a timeout a stalled connection
            # would block this loop forever and the deadline would never fire.
            timeout=(10, 60),
        )
        if response.status_code == 200:
            return response.json()
        if response.status_code != 503:
            raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")

        elapsed = time.monotonic() - start_time
        if elapsed > 90:  # Stop retrying after 90 seconds
            raise ValueError("Request failed with status code 503 after multiple retries")
        print("Retrying... after ", elapsed)
        time.sleep(1)  # Wait for 1 second before retrying
# Value of the "last_text_embedding" log entry as an integer; falls back to 0
# when the log collection holds no such entry.
last_sync_doc = log_collection.find_one({"key": "last_text_embedding"})
last_sync_timestamp = int(last_sync_doc["value"]) if last_sync_doc else 0
def get_last_run_time():
    """Return the stored "last_run" value of the vectorization job.

    Looks up the log document whose key is "vectorize_faq_last_run" and
    returns its "last_run" field, or 0 when no such document exists.
    """
    entry = log_collection.find_one({"key": "vectorize_faq_last_run"})
    return entry["last_run"] if entry else 0
def update_last_run_time():
    """Record the current time as the last run of the vectorization job.

    Upserts the log document whose key is "vectorize_faq_last_run", storing
    the current Unix time in milliseconds under "last_run".
    """
    # Milliseconds since the epoch. NOTE(review): presumably the same unit as
    # the FAQ "time_updated" field this value is later compared with -- confirm.
    current_timestamp = int(round(time.time() * 1000))
    log_collection.replace_one(
        {"key": "vectorize_faq_last_run"},
        # Store the timestamp computed above; the previous code referenced an
        # undefined name here and raised NameError on every call.
        {"key": "vectorize_faq_last_run", "last_run": current_timestamp},
        upsert=True,
    )
def vectorize_questions(faq):
    """Embed every question of an FAQ document.

    Args:
        faq: Mapping with a "questions" entry holding the question texts.

    Returns:
        A list with one ``{"question": ..., "vector": ...}`` dict per
        question, in the original order; each vector is what embed() returns.
    """
    return [
        {"question": text, "vector": embed(text)}
        for text in faq["questions"]
    ]
def save_question_vectors(faq_id, vectors):
    """Upsert one vector document per question of an FAQ.

    Args:
        faq_id: Identifier used as the prefix of every document ``_id``.
        vectors: Sequence of dicts with "question" and "vector" entries.

    Each entry is written under the ``_id`` "<faq_id>_<position>", so saving
    the same FAQ again overwrites the documents at the same positions.
    """
    for position, entry in enumerate(vectors):
        selector = {"_id": f"{faq_id}_{position}"}
        changes = {"$set": {"question": entry["question"], "vector": entry["vector"]}}
        question_vector_collection.update_one(selector, changes, upsert=True)
# Resume point: the "last_run" value stored by a previous run (0 if none).
last_run_time = get_last_run_time()

# Select only the FAQs whose "time_updated" is greater than that value.
faqs = faq_collection.find({"time_updated": {"$gt": last_run_time}})

# Vectorization is currently switched off: the loop leaves on its first
# iteration and the two calls that would do the work are commented out.
for faq in faqs:
    # vectors = vectorize_questions(faq)
    # save_question_vectors(str(faq["_id"]), vectors)
    break

# Persisting the new run time is disabled as well.
# update_last_run_time()
# Demo: vector search over the stored question vectors for a sample query,
# printing the answer of the FAQ each hit belongs to.
query = "cara menambah bank"

# Pipeline stages:
#   1. $vectorSearch - the 4 nearest stored vectors to the query embedding,
#      using the "vector_index" index on the "vector" field.
#   2. $project      - derive "faq_id": the part of the vector document's
#      "_id" that precedes the first "_".
#   3. $lookup       - join the matching "faq" document. "faq_id" is a string,
#      so it is converted to an ObjectId before being compared with the FAQ's
#      "_id"; on a failed conversion "onError" substitutes '', which is then
#      compared with "_id" in place of an ObjectId.
results = question_vector_collection.aggregate([
    {"$vectorSearch": {
        "queryVector": embed(query),
        "path": "vector",
        "numCandidates": 100,
        "limit": 4,
        "index": "vector_index",
    }},
    {"$project": {
        "_id": 1,
        "faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
        "question": 1,
        "vector": 1
    }},
    {"$lookup": {
        "from": "faq",
        "let": {"faq_id_str": "$faq_id"},
        "pipeline": [
            {"$match": {"$expr": {"$eq": [{"$convert": {"input": "$$faq_id_str", "to": "objectId", "onError": ''}}, "$_id"]}}},
        ],
        "as": "faq_document"
    }}
])

print('result: ')
for document in results:
    print(f'DocId: {document["_id"]}')
    print(document.keys())
    matched_faqs = document["faq_document"]
    if matched_faqs:
        print(f'FAQ Document: {matched_faqs[0]["answer"]}')
    else:
        # The $lookup returns an empty list when no FAQ matches (e.g. the FAQ
        # was deleted); indexing [0] unconditionally would raise IndexError.
        print('FAQ Document: <not found>')
    print('-------------------')