"""Embed FAQ questions and run a vector-search smoke test against MongoDB.

The script defines helpers to embed text through the Hugging Face inference
endpoint and to store per-question vectors in the ``faq-vector`` collection,
then runs one ``$vectorSearch`` query and prints the matched FAQ answers.
"""

import os
import time
from datetime import datetime

import pymongo
import requests

from dependency import load_db

client = load_db()
db = client["chat_support"]
faq_collection = db["faq"]
log_collection = db['log']
question_vector_collection = db['faq-vector']

hf_token = os.environ['HF_TOKEN']
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/mixedbread-ai/mxbai-embed-large-v1"

# How long embed() keeps retrying on HTTP 503, and the pause between attempts.
EMBED_RETRY_WINDOW_SECONDS = 90
EMBED_RETRY_DELAY_SECONDS = 1


def embed(text):
    """Return the embedding service's JSON response for ``text``.

    The request is retried once per second while the service answers with
    HTTP 503 (presumably the model is still loading -- confirm against the
    service docs), for up to ``EMBED_RETRY_WINDOW_SECONDS`` seconds.

    Args:
        text: The value sent as the ``inputs`` field of the request body.

    Returns:
        The decoded JSON body of the first HTTP 200 response.

    Raises:
        ValueError: If the service keeps returning 503 past the retry window,
            or returns any status other than 200 or 503.
    """
    print('get embed: ', text)
    start_time = time.time()
    while True:
        # NOTE(review): no request timeout is set, so a hung connection can
        # block longer than the retry window.
        response = requests.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
        )
        if response.status_code == 200:
            return response.json()
        if response.status_code != 503:
            raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")

        elapsed = time.time() - start_time
        if elapsed > EMBED_RETRY_WINDOW_SECONDS:
            # Stop retrying once the retry window (90 seconds) has passed.
            raise ValueError("Request failed with status code 503 after multiple retries")
        print("Retrying... after ", time.time() - start_time)
        time.sleep(EMBED_RETRY_DELAY_SECONDS)  # Wait before retrying


# Timestamp stored under the "last_text_embedding" log key, or 0 when absent.
last_sync_doc = log_collection.find_one({"key": "last_text_embedding"})
last_sync_timestamp = int(last_sync_doc["value"]) if last_sync_doc else 0


def get_last_run_time():
    """Return the ``last_run`` value stored in the log collection, or 0.

    Returns:
        The ``last_run`` field of the ``vectorize_faq_last_run`` log document,
        or ``0`` when no such document exists.
    """
    log = log_collection.find_one({"key": "vectorize_faq_last_run"})
    if log:
        return log["last_run"]
    return 0


def update_last_run_time():
    """Save the current time as the last run time in the log collection.

    The value is stored as integer milliseconds since the Unix epoch
    (assumes ``time_updated`` on FAQ documents uses the same unit -- TODO
    confirm). The log document is created when it does not exist yet.
    """
    current_timestamp = int(round(time.time() * 1000))
    log_collection.replace_one(
        {"key": "vectorize_faq_last_run"},
        # Previously this stored the undefined name `now`, raising NameError.
        {"key": "vectorize_faq_last_run", "last_run": current_timestamp},
        upsert=True,
    )


def vectorize_questions(faq):
    """Embed every question of an FAQ document.

    Args:
        faq: A mapping with a ``"questions"`` iterable of question texts.

    Returns:
        A list of ``{"question": ..., "vector": ...}`` dicts, one per
        question, in the original order.
    """
    return [
        {"question": question, "vector": embed(question)}
        for question in faq["questions"]
    ]


def save_question_vectors(faq_id, vectors):
    """Upsert question vectors into the question vector collection.

    Each entry is stored under the id ``"<faq_id>_<index>"`` so the FAQ id
    can be recovered from the part of ``_id`` before the first underscore.

    Args:
        faq_id: Identifier of the owning FAQ, used as the ``_id`` prefix.
        vectors: Entries as produced by ``vectorize_questions``.
    """
    for i, vector in enumerate(vectors):
        question_vector_collection.update_one(
            {"_id": f"{faq_id}_{i}"},
            {"$set": {"question": vector["question"], "vector": vector["vector"]}},
            upsert=True,
        )


# Get the last run time
last_run_time = get_last_run_time()

# Fetch FAQs that have been updated since the last run time
faqs = faq_collection.find({"time_updated": {"$gt": last_run_time}})

# Vectorize and save question vectors for each FAQ.
# NOTE: the sync step is currently disabled; the loop exits on the first FAQ
# without writing anything.
for faq in faqs:
    # vectors = vectorize_questions(faq)
    # save_question_vectors(str(faq["_id"]), vectors)
    break

# Update the last run time in the log collection (disabled with the sync).
# update_last_run_time()

query = "cara menambah bank"

results = question_vector_collection.aggregate([
    # Nearest-neighbour search over the stored question vectors.
    {"$vectorSearch": {
        "queryVector": embed(query),
        "path": "vector",
        "numCandidates": 100,
        "limit": 4,
        "index": "vector_index",
    }},
    # Recover the FAQ id: the part of `_id` before the first underscore.
    {"$project": {
        "_id": 1,
        "faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
        "question": 1,
        "vector": 1,
    }},
    # Join the owning FAQ document; `faq_id` is a string, so it is converted
    # to an ObjectId before being compared with the FAQ `_id`.
    {"$lookup": {
        "from": "faq",
        "let": {"faq_id_str": "$faq_id"},
        "pipeline": [
            {"$match": {"$expr": {"$eq": [
                {"$convert": {"input": "$$faq_id_str", "to": "objectId", "onError": ''}},
                "$_id",
            ]}}},
        ],
        "as": "faq_document",
    }},
])

print('result: ')
for document in results:
    print(f'DocId: {document["_id"]}')
    print(document.keys())
    matched_faqs = document["faq_document"]
    if matched_faqs:
        print(f'FAQ Document: {matched_faqs[0]["answer"]}')
    else:
        # The lookup found no FAQ for this vector; report it instead of
        # failing with IndexError on the empty list.
        print('FAQ Document: <no matching FAQ found>')
    print('-------------------')