UPDATE: trainChatbot

Files changed: app.py (+46, -33), functions.py (+12, -9)

app.py
@@ -271,7 +271,7 @@ async def loadPDF(vectorstore: str, pdf: UploadFile = File(...)):
         .insert({"username": username,
                  "chatbotName": chatbotName,
                  "dataSourceName": fileName,
-                 "sourceEndpoint": "
+                 "sourceEndpoint": "/loadPDF",
                  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
         .execute()
     )
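This hunk, and the matching ones for /loadImagePDF, /loadText and /loadWebURLs below, stamp each ConversAI_ChatbotDataSources row with the route that created it; the new /trainChatbot endpoint at the end of this file dispatches on that column. A row written by /loadPDF would then look roughly like this sketch (all values are made up):

    row = {
        "username": "alice",
        "chatbotName": "supportBot",
        "dataSourceName": "manual_3f9c",
        "sourceEndpoint": "/loadPDF",  # dispatch key consumed by /trainChatbot
        "sourceContentURL": "https://example.supabase.co/storage/v1/object/public/manual_3f9c_data.json",
    }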
@@ -299,7 +299,7 @@ async def loadImagePDF(vectorstore: str, pdf: UploadFile = File(...)):
         .insert({"username": username,
                  "chatbotName": chatbotName,
                  "dataSourceName": fileName,
-                 "sourceEndpoint": "
+                 "sourceEndpoint": "/loadImagePDF",
                  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
         .execute()
     )
@@ -330,7 +330,7 @@ async def loadText(addTextConfig: AddText):
         .insert({"username": username,
                  "chatbotName": chatbotName,
                  "dataSourceName": fileName,
-                 "sourceEndpoint": "
+                 "sourceEndpoint": "/loadText",
                  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
         .execute()
     )
@@ -339,28 +339,6 @@ async def loadText(addTextConfig: AddText):
     }
 
 
-
-@app.post("/addText")
-async def addText(addTextConfig: AddText):
-    vectorstore, text, source = addTextConfig.vectorstore, addTextConfig.text, addTextConfig.source
-    text = base64.b64decode(text.encode("utf-8")).decode("utf-8")
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    newCount = currentCount + len(text)
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        return output
-    else:
-        return {
-            "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
-
-
 class AddQAPair(BaseModel):
     vectorstore: str
     question: str
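As far as this diff shows, removing /addText also removes the only quota enforcement: the deleted handler was the one place that compared ConversAI_ChatbotInfo.charactercount against ConversAI_UserConfig.tokenLimit before ingesting. The /loadText hunk above inserts its data-source row without any such check, so oversized inputs are no longer rejected anywhere visible in this change.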
@@ -410,7 +388,7 @@ async def loadWebURLs(loadWebsite: LoadWebsite):
         .insert({"username": username,
                  "chatbotName": chatbotName,
                  "dataSourceName": fileName,
-                 "sourceEndpoint": "
+                 "sourceEndpoint": "/loadWebURLs",
                  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
         .execute()
     )
@@ -467,8 +445,8 @@ class YtTranscript(BaseModel):
     urls: list[str]
 
 
-@app.post("/getYoutubeTranscript")
-async def getYoutubeTranscript(ytTranscript: YtTranscript):
+@app.post("/loadYoutubeTranscript")
+async def loadYoutubeTranscript(ytTranscript: YtTranscript):
     vectorstore, urls = ytTranscript.vectorstore, ytTranscript.urls
     username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
     text = getTranscript(urls=urls)
@@ -484,7 +462,7 @@ async def getYoutubeTranscript(ytTranscript: YtTranscript):
         .insert({"username": username,
                  "chatbotName": chatbotName,
                  "dataSourceName": fileName,
-                 "sourceEndpoint": "
+                 "sourceEndpoint": "/getYoutubeTranscript",
                  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
         .execute()
     )
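The rename leaves the training dispatch inconsistent: the route is now /loadYoutubeTranscript, but the hunk above still stamps rows with sourceEndpoint "/getYoutubeTranscript", while the new /trainChatbot below matches on "/loadYoutubeTranscript". Rows written by this loader would therefore fall through to the else: pass branch and be silently skipped during training; stamping the renamed route here would keep the two in sync.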
@@ -523,7 +501,42 @@ async def chatHistory(vectorstore: str):
     return response
 
 
-
-
-
-
+@app.post("/listChatbotSources")
+async def listChatbotSources(vectorstore: str):
+    username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
+    result = supabase.table("ConversAI_ChatbotDataSources").select("*").eq("username", username).eq("chatbotName", chatbotName).execute().data
+    return result
+
+
+
+@app.post("/trainChatbot")
+async def trainChatbot(vectorstore: str):
+    texts = []
+    sources = []
+    fileTypes = [supabase.table("ConversAI_ChatbotDataSources").select("sourceEndpoint").eq("sourceContentURL", x).execute().data[0]["sourceEndpoint"] for x in sources]
+    for source, fileType in zip(sources, fileTypes):
+        if ((fileType == "/loadPDF") | (fileType == "/loadImagePDF")):
+            r = requests.get(source)
+            file = eval(r.content.decode("utf-8"))
+            content = file["output"]
+            fileSource = file["source"]
+            texts.append(".".join([base64.b64decode(content[key].encode("utf-8")).decode("utf-8") for key in content.keys()]).replace("\n", " "))
+            sources.append(fileSource)
+        elif fileType == "/loadText":
+            r = requests.get(source)
+            file = eval(r.content.decode("utf-8"))
+            content = file["output"]
+            fileSource = file["source"]
+            texts.append(content.replace("\n", " "))
+            sources.append(fileSource)
+        elif ((fileType == "/loadWebURLs") | (fileType == "/loadYoutubeTranscript")):
+            r = requests.get(source)
+            file = eval(r.content.decode("utf-8"))
+            content = file["output"]
+            fileSource = file["source"]
+            texts.append(".".join([base64.b64decode(content[key].encode("utf-8")).decode("utf-8") for key in content.keys()]).replace("\n", " "))
+            sources.append(fileSource)
+        else:
+            pass
+    texts = [(text, source) for text, source in zip(texts, sources)]
+    return addDocuments(texts=texts, vectorstore=vectorstore)
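As committed, /trainChatbot can never ingest anything: sources starts empty, so the fileTypes comprehension and the zip loop both iterate over zero items, and the loop then reuses sources to accumulate decoded file["source"] values, so inputs and outputs share one list. The three branches also eval() fetched bodies even though the stored files are plain JSON. Below is a minimal sketch of the presumably intended flow, assuming the URLs come from the same ConversAI_ChatbotDataSources rows that /listChatbotSources reads; fetchSources and decodeSource are hypothetical names, and json.loads stands in for eval:

    import base64
    import json

    import requests

    def fetchSources(vectorstore: str) -> list[tuple[str, str]]:
        # Hypothetical helper: (sourceContentURL, sourceEndpoint) pairs for one chatbot.
        # Assumes the module-level supabase client that app.py already uses.
        username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
        rows = (supabase.table("ConversAI_ChatbotDataSources")
                .select("sourceContentURL, sourceEndpoint")
                .eq("username", username)
                .eq("chatbotName", chatbotName)
                .execute().data)
        return [(row["sourceContentURL"], row["sourceEndpoint"]) for row in rows]

    def decodeSource(url: str, endpoint: str) -> tuple[str, str]:
        # The stored files are JSON; json.loads avoids executing fetched content.
        file = json.loads(requests.get(url).content.decode("utf-8"))
        content, fileSource = file["output"], file["source"]
        if endpoint == "/loadText":
            text = content
        else:
            # PDF, web-URL and transcript payloads are dicts of base64-encoded chunks.
            text = ".".join(base64.b64decode(chunk.encode("utf-8")).decode("utf-8")
                            for chunk in content.values())
        return text.replace("\n", " "), fileSource

/trainChatbot would then build texts = [decodeSource(url, endpoint) for url, endpoint in fetchSources(vectorstore)] and pass that list straight to addDocuments, matching the new tuple signature.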
functions.py
@@ -113,6 +113,7 @@ def createTable(tablename: str):
         prefer_grpc=True,
         api_key=os.environ["QDRANT_API_KEY"],
         collection_name=tablename,
+        force_recreate=True,
         retrieval_mode=RetrievalMode.HYBRID
     )
     return {
@@ -120,7 +121,7 @@ def createTable(tablename: str):
     }
 
 
-def addDocuments(text: str, source: str, vectorstore: str):
+def addDocuments(texts: list[tuple[str]], vectorstore: str):
     global vectorEmbeddings
     global sparseEmbeddings
     splitter = RecursiveCharacterTextSplitter(
@@ -128,20 +129,22 @@ def addDocuments(text: str, source: str, vectorstore: str):
         chunk_overlap=250,
         add_start_index=True
     )
-
-
-    texts = [
-    texts =
-
-    vectorstore = QdrantVectorStore.
+    sources = [textTuple[1] for textTuple in texts]
+    texts = [textTuple[0].replace("\n", " ") for textTuple in texts]
+    texts = [text.translate(str.maketrans('', '', string.punctuation.replace(".", ""))) for text in texts]
+    texts = [Document(page_content=text, metadata={"source": source}) for text, source in zip(texts, sources)]
+    documents = splitter.split_documents(texts)
+    vectorstore = QdrantVectorStore.from_documents(
+        documents=documents,
         embedding=vectorEmbeddings,
         sparse_embedding=sparseEmbeddings,
-        collection_name=vectorstore,
         url=os.environ["QDRANT_URL"],
+        prefer_grpc=True,
         api_key=os.environ["QDRANT_API_KEY"],
+        collection_name=vectorstore,
+        force_recreate=True,
         retrieval_mode=RetrievalMode.HYBRID
     )
-    vectorstore.add_documents(documents=texts, ids=ids)
     return {
         "output": "SUCCESS"
     }
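The rewritten addDocuments consumes the (text, source) tuples assembled by /trainChatbot (the annotation list[tuple[str]] would more precisely be list[tuple[str, str]]), strips punctuation other than periods, wraps each text in a Document carrying its source, and replaces add_documents on an existing collection with QdrantVectorStore.from_documents plus force_recreate=True, which drops and recreates the collection. Every call is therefore a full rebuild rather than an incremental append, consistent with /trainChatbot re-reading every stored source. Note that the new body uses string.punctuation and Document; the import block is not shown in this diff, so string and langchain_core.documents.Document may still need importing. A minimal usage sketch with made-up values:

    # The vectorstore string follows the "prefix$username$chatbotName" convention
    # that the endpoints split on "$".
    texts = [
        ("Refunds are processed within 14 days.", "manual.pdf"),
        ("Opening hours are 9 to 5 on weekdays.", "https://example.com/faq"),
    ]
    addDocuments(texts=texts, vectorstore="conv$alice$supportBot")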