File size: 4,855 Bytes
58f8884
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89f6f6b
58f8884
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import pymongo
from datetime import datetime
import os
from dependency import load_db
import requests
import time

# Mongo connection and the collections this script reads/writes.
client = load_db()
db = client["chat_support"]
faq_collection = db["faq"]  # source FAQ documents (fields used below: questions, time_updated, answer)
log_collection = db['log']  # key/value job-state records ("last_text_embedding", "vectorize_faq_last_run")
question_vector_collection = db['faq-vector']  # one doc per question embedding, _id = "<faq_id>_<index>"


# Hugging Face inference endpoint for the embedding model; HF_TOKEN must be
# set in the environment (os.environ[...] raises KeyError if it is missing).
hf_token = os.environ['HF_TOKEN']
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/mixedbread-ai/mxbai-embed-large-v1"

def embed(text):
    """Return the embedding for ``text`` from the HF inference API.

    Retries on HTTP 503 (model still loading) for up to 90 seconds with a
    1-second pause between attempts; any other non-200 status fails fast.

    Args:
        text: the string (or list of strings) to embed.

    Returns:
        The JSON-decoded response body (the embedding vector(s)).

    Raises:
        ValueError: on persistent 503 past the deadline, or any other
            non-200 HTTP status.
    """
    print('get embed: ', text)
    retry_deadline_s = 90  # original comment said 30s but the code enforced 90s; 90s kept
    start_time = time.time()
    while True:
        response = requests.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text})

        if response.status_code == 200:
            return response.json()
        elif response.status_code == 503:
            # 503 from HF inference means the model is spinning up; retry
            # until the deadline instead of failing immediately.
            if time.time() - start_time > retry_deadline_s:
                raise ValueError("Request failed with status code 503 after multiple retries")
            else:
                print("Retrying... after ", time.time() - start_time)
                time.sleep(1)  # brief pause before the next attempt
        else:
            raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")


# Timestamp of the last text-embedding sync (0 = marker not found).
# NOTE(review): last_sync_timestamp is never read later in this script —
# confirm whether this lookup is still needed.
last_sync_doc = log_collection.find_one({"key": "last_text_embedding"})
last_sync_timestamp = int(last_sync_doc["value"]) if last_sync_doc else 0

# Function to get the last run time from the log collection
def get_last_run_time():
    """Return this job's last-run marker from the log collection, or 0 if absent."""
    record = log_collection.find_one({"key": "vectorize_faq_last_run"})
    return record["last_run"] if record else 0

# Function to save or update the last run time in the log collection
def update_last_run_time():
    """Upsert the current time as this job's last-run marker.

    Stores epoch milliseconds under key "vectorize_faq_last_run" so the
    next run can filter FAQs by ``time_updated``.

    Bug fix: the original stored an undefined name ``now`` (NameError at
    call time) instead of the ``current_timestamp`` it had just computed.
    """
    current_timestamp = int(round(time.time() * 1000))  # epoch milliseconds
    log_collection.replace_one(
        {"key": "vectorize_faq_last_run"},
        {"key": "vectorize_faq_last_run", "last_run": current_timestamp},
        upsert=True
    )

# Function to vectorize questions using your embedding function
def vectorize_questions(faq):
    """Embed every question of one FAQ document.

    Args:
        faq: dict with a "questions" list of strings.

    Returns:
        list of ``{"question": str, "vector": <embedding>}`` dicts,
        in the same order as ``faq["questions"]``.
    """
    # The original enumerate() index was never used; a comprehension is
    # the idiomatic form for pure list construction.
    return [{"question": question, "vector": embed(question)}
            for question in faq["questions"]]

# Function to save question vectors to the question_vectors collection
def save_question_vectors(faq_id, vectors):
    """Upsert one document per question vector, keyed ``"<faq_id>_<index>"``."""
    for index, entry in enumerate(vectors):
        doc_id = f"{faq_id}_{index}"
        payload = {"question": entry["question"], "vector": entry["vector"]}
        question_vector_collection.update_one(
            {"_id": doc_id},
            {"$set": payload},
            upsert=True,
        )

# Get the last run time
last_run_time = get_last_run_time()

# Fetch FAQs that have been updated since the last run time
faqs = faq_collection.find({"time_updated": {"$gt": last_run_time}})

# Vectorize and save question vectors for each FAQ.
# NOTE(review): the loop body is commented out and `break` exits on the
# first iteration, so no FAQ is actually vectorized and the cursor is only
# touched once — confirm whether this sync step is intentionally disabled.
for faq in faqs:
    # vectors = vectorize_questions(faq)
    # save_question_vectors(str(faq["_id"]), vectors)
    break

# Update the last run time in the log collection
# (left disabled along with the sync loop above)
# update_last_run_time()

# Sample query (Indonesian: "how to add a bank") used to exercise the
# vector-search pipeline below.
query = "cara menambah bank"
# First attempt: plain $vectorSearch without the FAQ join (kept for reference).
# results = question_vector_collection.aggregate([
#   {"$vectorSearch": {
#     "queryVector": embed(query),
#     "path": "vector",
#     "numCandidates": 100,
#     "limit": 4,
#     "index": "vector_index",
#       }}
# ])

# print('result: ')
# for document in results:
#     print(f'DocId: {document["_id"]}\n')

# Second attempt: $lookup with localField/foreignField — does not work here
# because faq_id is a string while faq._id is an ObjectId (kept for reference).
# results = question_vector_collection.aggregate([
#   {"$vectorSearch": {
#     "queryVector": embed(query),
#     "path": "vector",
#     "numCandidates": 100,
#     "limit": 4,
#     "index": "vector_index",
#   }},
#   {"$project": {
#     "_id": 1,
#     "faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
#     "question": 1,
#     "vector": 1
#   }},
#   {"$lookup": {
#     "from": "faq",
#     "localField": "faq_id",
#     "foreignField": "_id",
#     "as": "faq_document"
#   }}
# ])

# Vector search over per-question embeddings, then join each hit back to
# its parent FAQ document.
results = question_vector_collection.aggregate([
  {"$vectorSearch": {
    "queryVector": embed(query),
    "path": "vector",
    "numCandidates": 100,
    "limit": 4,
    "index": "vector_index",
  }},
  # Derive faq_id from the "_id" string ("<faq_id>_<index>"): everything
  # before the first "_". The outer $substrCP/$strLenCP wrapper re-takes
  # the same prefix and is redundant but harmless.
  {"$project": {
    "_id": 1,
    "faq_id": {"$substrCP": ["$_id", 0, {"$strLenCP": {"$substrCP": ["$_id", 0, {"$indexOfCP": ["$_id", "_"]}]}}]},
    "question": 1,
    "vector": 1
  }},
  # Join to faq by converting the faq_id string to ObjectId; onError ''
  # makes unconvertible ids match nothing instead of aborting the pipeline.
  {"$lookup": {
    "from": "faq",
    "let": {"faq_id_str": "$faq_id"},
    "pipeline": [
      {"$match": {"$expr": {"$eq": [{"$convert": {"input": "$$faq_id_str", "to": "objectId", "onError": ''}}, "$_id"]}}},
    ],
    "as": "faq_document"
  }}
])

print('result: ')
for document in results:
    print(f'DocId: {document["_id"]}')
    print(document.keys())
    # NOTE(review): raises IndexError when $lookup found no parent FAQ —
    # confirm every vector doc's prefix maps to an existing faq _id.
    print(f'FAQ Document: {document["faq_document"][0]["answer"]}')
    print('-------------------')