|
|
import pymongo |
|
|
from datetime import datetime |
|
|
import os |
|
|
from dependency import load_db |
|
|
import requests |
|
|
import time |
|
|
|
|
|
# Mongo client comes from the project-level dependency helper.
client = load_db()


db = client["chat_support"]


# Source FAQ documents (expected to carry "questions", "answer", "time_updated").
faq_collection = db["faq"]


# Key/value bookkeeping documents (sync timestamps etc.).
log_collection = db['log']


# One document per FAQ question, holding its embedding vector
# (ids of the form "<faq_id>_<question_index>", see save_question_vectors).
question_vector_collection = db['faq-vector']
|
|
|
|
|
|
|
|
# Hugging Face API token; raises KeyError at import time if HF_TOKEN is unset.
hf_token = os.environ['HF_TOKEN']


# Serverless inference endpoint for the mxbai-embed-large-v1 embedding model.
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/mixedbread-ai/mxbai-embed-large-v1"
|
|
|
|
|
def embed(text, max_wait=90):
    """Return the embedding for *text* from the HF feature-extraction API.

    Retries while the model is cold (HTTP 503), sleeping 1s between
    attempts, for up to ``max_wait`` seconds. Any other non-200 status
    fails immediately.

    Args:
        text: Input string (or list of strings) to embed.
        max_wait: Seconds to keep retrying 503 responses (default 90,
            matching the original hard-coded deadline).

    Returns:
        The decoded JSON response body (the raw embedding payload).

    Raises:
        ValueError: On persistent 503 past ``max_wait``, or on any other
            non-200 response.
        requests.exceptions.RequestException: On connection errors or if a
            single request exceeds the per-request timeout.
    """
    print('get embed: ', text)
    start_time = time.time()
    while True:
        response = requests.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
            # Fix: the original had no timeout, so a stalled connection
            # could hang this loop forever.
            timeout=30)

        if response.status_code == 200:
            return response.json()
        elif response.status_code == 503:
            # 503 means the model is still loading server-side; keep polling
            # until the overall deadline expires.
            if time.time() - start_time > max_wait:
                raise ValueError("Request failed with status code 503 after multiple retries")
            else:
                print("Retrying... after ", time.time() - start_time)
                time.sleep(1)
        else:
            raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
|
|
|
|
|
|
|
|
# Epoch-ms timestamp of the last text-embedding sync; 0 means "never ran".
# NOTE(review): last_sync_timestamp is not referenced later in this file —
# confirm it is used elsewhere before removing.
last_sync_doc = log_collection.find_one({"key": "last_text_embedding"})
last_sync_timestamp = int(last_sync_doc["value"]) if last_sync_doc else 0
|
|
|
|
|
|
|
|
def get_last_run_time():
    """Return the epoch-ms timestamp of the last vectorization run, or 0.

    Reads the ``vectorize_faq_last_run`` bookkeeping document from the
    log collection; a missing document means the job has never run.
    """
    entry = log_collection.find_one({"key": "vectorize_faq_last_run"})
    return entry["last_run"] if entry else 0
|
|
|
|
|
|
|
|
def update_last_run_time():
    """Record the current time (epoch ms) as the last vectorization run.

    Upserts the ``vectorize_faq_last_run`` log document so the next sync
    only picks up FAQs updated after this point.
    """
    current_timestamp = int(round(time.time() * 1000))
    log_collection.replace_one(
        {"key": "vectorize_faq_last_run"},
        # Bug fix: the original wrote the undefined name ``now`` here,
        # which raised NameError on every call.
        {"key": "vectorize_faq_last_run", "last_run": current_timestamp},
        upsert=True
    )
|
|
|
|
|
|
|
|
def vectorize_questions(faq):
    """Embed every question of one FAQ document.

    Args:
        faq: A FAQ document with a ``questions`` list of strings.

    Returns:
        A list of ``{"question": <str>, "vector": <embedding>}`` dicts, in
        the same order as ``faq["questions"]``.
    """
    # Comprehension replaces the manual append loop; the original also
    # enumerated an index it never used.
    return [{"question": q, "vector": embed(q)} for q in faq["questions"]]
|
|
|
|
|
|
|
|
def save_question_vectors(faq_id, vectors):
    """Upsert one vector document per question, keyed ``<faq_id>_<index>``.

    Args:
        faq_id: Identifier of the parent FAQ document.
        vectors: Sequence of ``{"question": ..., "vector": ...}`` dicts as
            produced by ``vectorize_questions``.
    """
    for index, entry in enumerate(vectors):
        doc_id = f"{faq_id}_{index}"
        payload = {"question": entry["question"], "vector": entry["vector"]}
        question_vector_collection.update_one(
            {"_id": doc_id},
            {"$set": payload},
            upsert=True
        )
|
|
|
|
|
|
|
|
# Re-vectorize every FAQ modified since the last recorded run.
last_run_time = get_last_run_time()


faqs = faq_collection.find({"time_updated": {"$gt": last_run_time}})


# Fix: the original loop body was only ``break`` (a no-op placeholder that
# consumed at most one document). Process each changed FAQ: embed its
# questions and persist the vectors keyed by the FAQ's _id.
# TODO(review): once update_last_run_time's undefined-name bug is fixed,
# call it here after a successful pass so FAQs aren't reprocessed.
for faq in faqs:
    save_question_vectors(faq["_id"], vectorize_questions(faq))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Demo search query (Indonesian: "how to add a bank") used to exercise
# the vector-search pipeline below.
query = "cara menambah bank"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Vector-search the question embeddings for the demo query, then join each
# hit back to its parent FAQ document. Vector-doc ids have the form
# "<faq_objectid>_<question_index>" (see save_question_vectors), so the FAQ
# id is everything before the first "_".
results = question_vector_collection.aggregate([
    {"$vectorSearch": {
        "queryVector": embed(query),
        "path": "vector",
        "numCandidates": 100,
        "limit": 4,
        "index": "vector_index",  # Atlas Search vector index on "vector"
    }},
    {"$project": {
        "_id": 1,
        # Prefix of _id up to the first "_". The original wrapped this in a
        # redundant $substrCP/$strLenCP round-trip that yields the same
        # value; this is the equivalent single expression.
        "faq_id": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]},
        "question": 1,
        "vector": 1
    }},
    {"$lookup": {
        "from": "faq",
        "let": {"faq_id_str": "$faq_id"},
        "pipeline": [
            # onError: '' makes an unparsable faq_id compare unequal (empty
            # faq_document) instead of failing the whole aggregation.
            {"$match": {"$expr": {"$eq": [{"$convert": {"input": "$$faq_id_str", "to": "objectId", "onError": ''}}, "$_id"]}}},
        ],
        "as": "faq_document"
    }}
])
|
|
|
|
|
# Print each search hit and the answer from its joined FAQ document.
print('result: ')
for document in results:
    print(f'DocId: {document["_id"]}')
    print(document.keys())
    # Fix: $lookup yields an empty "faq_document" array when faq_id did not
    # resolve; the original indexed [0] unconditionally and raised
    # IndexError on such hits.
    faq_docs = document.get("faq_document") or []
    if faq_docs:
        print(f'FAQ Document: {faq_docs[0]["answer"]}')
    else:
        print('FAQ Document: <no matching faq document>')
    print('-------------------')