# (extraction residue removed here: file-size header, blob hashes, and a
#  line-number gutter that were not part of the original source)
import pymongo
from datetime import datetime
import os
from dependency import load_db
import requests
import time
client = load_db()
db = client["chat_support"]
faq_collection = db["faq"]
log_collection = db['log']
question_vector_collection = db['faq-vector']
hf_token = os.environ['HF_TOKEN']
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/mixedbread-ai/mxbai-embed-large-v1"
def embed(text):
print('get embed: ', text)
start_time = time.time()
while True:
response = requests.post(
embedding_url,
headers={"Authorization": f"Bearer {hf_token}"},
json={"inputs": text})
if response.status_code == 200:
return response.json()
elif response.status_code == 503:
if time.time() - start_time > 90: # Stop retrying after 30 seconds
raise ValueError("Request failed with status code 503 after multiple retries")
else:
print("Retrying... after ", time.time() - start_time)
time.sleep(1) # Wait for 1 second before retrying
else:
raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
last_sync_doc = log_collection.find_one({"key": "last_text_embedding"})
if last_sync_doc:
last_sync_timestamp = int(last_sync_doc["value"])
else:
last_sync_timestamp = 0
# Function to get the last run time from the log collection
def get_last_run_time():
log = log_collection.find_one({"key": "vectorize_faq_last_run"})
if log:
return log["last_run"]
return 0
# Function to save or update the last run time in the log collection
def update_last_run_time():
# now = datetime.datetime.utcnow()
current_timestamp = int(round(time.time() * 1000)) #datetime.now().timestamp() * 1000
log_collection.replace_one(
{"key": "vectorize_faq_last_run"},
{"key": "vectorize_faq_last_run", "last_run": now},
upsert=True
)
# Function to vectorize questions using your embedding function
def vectorize_questions(faq):
vectors = []
for i, question in enumerate(faq["questions"]):
vector = embed(question)
vectors.append({"question": question, "vector": vector})
return vectors
# Function to save question vectors to the question_vectors collection
def save_question_vectors(faq_id, vectors):
for i, vector in enumerate(vectors):
question_vector_collection.update_one(
{"_id": f"{faq_id}_{i}"},
{"$set": {"question": vector["question"], "vector": vector["vector"]}},
upsert=True
)
# Get the last run time
last_run_time = get_last_run_time()
# Fetch FAQs that have been updated since the last run time
faqs = faq_collection.find({"time_updated": {"$gt": last_run_time}})
# Vectorize and save question vectors for each FAQ
for faq in faqs:
# vectors = vectorize_questions(faq)
# save_question_vectors(str(faq["_id"]), vectors)
break
# Update the last run time in the log collection
# update_last_run_time()
query = "cara menambah bank"
# results = question_vector_collection.aggregate([
# {"$vectorSearch": {
# "queryVector": embed(query),
# "path": "vector",
# "numCandidates": 100,
# "limit": 4,
# "index": "vector_index",
# }}
# ])
# print('result: ')
# for document in results:
# print(f'DocId: {document["_id"]}\n')
# results = question_vector_collection.aggregate([
# {"$vectorSearch": {
# "queryVector": embed(query),
# "path": "vector",
# "numCandidates": 100,
# "limit": 4,
# "index": "vector_index",
# }},
# {"$project": {
# "_id": 1,
# "faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
# "question": 1,
# "vector": 1
# }},
# {"$lookup": {
# "from": "faq",
# "localField": "faq_id",
# "foreignField": "_id",
# "as": "faq_document"
# }}
# ])
results = question_vector_collection.aggregate([
{"$vectorSearch": {
"queryVector": embed(query),
"path": "vector",
"numCandidates": 100,
"limit": 4,
"index": "vector_index",
}},
{"$project": {
"_id": 1,
"faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
"question": 1,
"vector": 1
}},
{"$lookup": {
"from": "faq",
"let": {"faq_id_str": "$faq_id"},
"pipeline": [
{"$match": {"$expr": {"$eq": [{"$convert": {"input": "$$faq_id_str", "to": "objectId", "onError": ''}}, "$_id"]}}},
],
"as": "faq_document"
}}
])
print('result: ')
for document in results:
print(f'DocId: {document["_id"]}')
print(document.keys())
print(f'FAQ Document: {document["faq_document"][0]["answer"]}')
print('-------------------') |