# (extraction residue removed here: file-size header, blob hashes, and a
#  line-number gutter that were not part of the original source)
import pymongo
from datetime import datetime
import os
from dependency import load_db
import requests
import time
client = load_db()
db = client["chat_support"]
faq_collection = db["faq"]
log_collection = db['log']
question_vector_collection = db['faq-vector']
hf_token = os.environ['HF_TOKEN']
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/mixedbread-ai/mxbai-embed-large-v1"
def embed(text):
print('get embed: ', text)
start_time = time.time()
while True:
response = requests.post(
embedding_url,
headers={"Authorization": f"Bearer {hf_token}"},
json={"inputs": text})
if response.status_code == 200:
return response.json()
elif response.status_code == 503:
if time.time() - start_time > 90: # Stop retrying after 30 seconds
raise ValueError("Request failed with status code 503 after multiple retries")
else:
print("Retrying... after ", time.time() - start_time)
time.sleep(1) # Wait for 1 second before retrying
else:
raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
last_sync_doc = log_collection.find_one({"key": "last_text_embedding"})
if last_sync_doc:
last_sync_timestamp = int(last_sync_doc["value"])
else:
last_sync_timestamp = 0
# Function to get the last run time from the log collection
def get_last_run_time():
log = log_collection.find_one({"key": "vectorize_faq_last_run"})
if log:
return log["last_run"]
return 0
# Function to save or update the last run time in the log collection
def update_last_run_time():
# now = datetime.datetime.utcnow()
current_timestamp = int(round(time.time() * 1000)) #datetime.now().timestamp() * 1000
log_collection.replace_one(
{"key": "vectorize_faq_last_run"},
{"key": "vectorize_faq_last_run", "last_run": now},
upsert=True
)
# Function to vectorize questions using your embedding function
def vectorize_questions(faq):
vectors = []
for i, question in enumerate(faq["questions"]):
vector = embed(question)
vectors.append({"question": question, "vector": vector})
return vectors
# Function to save question vectors to the question_vectors collection
def save_question_vectors(faq_id, vectors):
for i, vector in enumerate(vectors):
question_vector_collection.update_one(
{"_id": f"{faq_id}_{i}"},
{"$set": {"question": vector["question"], "vector": vector["vector"]}},
upsert=True
)
# Get the last run time
last_run_time = get_last_run_time()
# Fetch FAQs that have been updated since the last run time
faqs = faq_collection.find({"time_updated": {"$gt": last_run_time}})
# Vectorize and save question vectors for each FAQ
for faq in faqs:
# vectors = vectorize_questions(faq)
# save_question_vectors(str(faq["_id"]), vectors)
break
# Update the last run time in the log collection
# update_last_run_time()
query = "cara menambah bank"
# results = question_vector_collection.aggregate([
# {"$vectorSearch": {
# "queryVector": embed(query),
# "path": "vector",
# "numCandidates": 100,
# "limit": 4,
# "index": "vector_index",
# }}
# ])
# print('result: ')
# for document in results:
# print(f'DocId: {document["_id"]}\n')
# results = question_vector_collection.aggregate([
# {"$vectorSearch": {
# "queryVector": embed(query),
# "path": "vector",
# "numCandidates": 100,
# "limit": 4,
# "index": "vector_index",
# }},
# {"$project": {
# "_id": 1,
# "faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
# "question": 1,
# "vector": 1
# }},
# {"$lookup": {
# "from": "faq",
# "localField": "faq_id",
# "foreignField": "_id",
# "as": "faq_document"
# }}
# ])
results = question_vector_collection.aggregate([
{"$vectorSearch": {
"queryVector": embed(query),
"path": "vector",
"numCandidates": 100,
"limit": 4,
"index": "vector_index",
}},
{"$project": {
"_id": 1,
"faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
"question": 1,
"vector": 1
}},
{"$lookup": {
"from": "faq",
"let": {"faq_id_str": "$faq_id"},
"pipeline": [
{"$match": {"$expr": {"$eq": [{"$convert": {"input": "$$faq_id_str", "to": "objectId", "onError": ''}}, "$_id"]}}},
],
"as": "faq_document"
}}
])
print('result: ')
for document in results:
print(f'DocId: {document["_id"]}')
print(document.keys())
print(f'FAQ Document: {document["faq_document"][0]["answer"]}')
print('-------------------') |