Spaces:
Sleeping
Sleeping
UPDATE: functions
Browse files- app.py +20 -99
- functions.py +17 -10
app.py
CHANGED
|
@@ -13,8 +13,7 @@ from src.api.speech_api import speech_translator_router
|
|
| 13 |
from functions import client as supabase
|
| 14 |
from urllib.parse import urlparse
|
| 15 |
import nltk
|
| 16 |
-
|
| 17 |
-
import uuid
|
| 18 |
|
| 19 |
nltk.download('punkt_tab')
|
| 20 |
|
|
@@ -236,67 +235,34 @@ async def newChatbot(chatbotName: str, username: str):
|
|
| 236 |
return createTable(tablename=chatbotName)
|
| 237 |
|
| 238 |
|
| 239 |
-
@app.post("/
|
| 240 |
async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
| 241 |
source = pdf.filename
|
| 242 |
pdf = await pdf.read()
|
| 243 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
| 244 |
temp_file.write(pdf)
|
| 245 |
temp_file_path = temp_file.name
|
| 246 |
-
start = time.time()
|
| 247 |
text = extractTextFromPdf(temp_file_path)
|
| 248 |
-
textExtraction = time.time()
|
| 249 |
os.remove(temp_file_path)
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
"tokenLimit"]
|
| 255 |
-
newCount = currentCount + len(text)
|
| 256 |
-
if newCount < int(limit):
|
| 257 |
-
supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
| 258 |
-
"chatbotname", chatbotname).execute()
|
| 259 |
-
uploadStart = time.time()
|
| 260 |
-
output = addDocuments(text=text, source=source, vectorstore=vectorstore)
|
| 261 |
-
uploadEnd = time.time()
|
| 262 |
-
uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
|
| 263 |
-
timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
|
| 264 |
-
tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
|
| 265 |
-
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
|
| 266 |
-
wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
|
| 267 |
-
newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
|
| 268 |
-
fileId = str(uuid.uuid4())
|
| 269 |
-
with open(f"{fileId}.txt", "w") as file:
|
| 270 |
-
file.write(newText)
|
| 271 |
-
with open(f"{fileId}.txt", "rb") as f:
|
| 272 |
-
supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
|
| 273 |
-
file_options={"content-type": "text/plain"})
|
| 274 |
-
os.remove(f"{fileId}.txt")
|
| 275 |
-
output["supabaseFileName"] = f"{fileId}.txt"
|
| 276 |
-
return output
|
| 277 |
-
else:
|
| 278 |
-
return {
|
| 279 |
-
"output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
| 280 |
-
}
|
| 281 |
|
| 282 |
|
| 283 |
-
@app.post("/
|
| 284 |
async def returnText(pdf: UploadFile = File(...)):
|
| 285 |
source = pdf.filename
|
| 286 |
pdf = await pdf.read()
|
| 287 |
-
start = time.time()
|
| 288 |
text = getTextFromImagePDF(pdfBytes=pdf)
|
| 289 |
-
end = time.time()
|
| 290 |
-
timeTaken = f"{end - start}s"
|
| 291 |
return {
|
| 292 |
-
"
|
| 293 |
-
"
|
| 294 |
-
"output": text
|
| 295 |
}
|
| 296 |
|
| 297 |
|
| 298 |
@app.post("/addText")
|
| 299 |
-
async def addText(vectorstore: str, text: str, source: str
|
| 300 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 301 |
df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
| 302 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
|
@@ -306,22 +272,7 @@ async def addText(vectorstore: str, text: str, source: str | None = None):
|
|
| 306 |
if newCount < int(limit):
|
| 307 |
supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
| 308 |
"chatbotname", chatbotname).execute()
|
| 309 |
-
uploadStart = time.time()
|
| 310 |
output = addDocuments(text=text, source=source, vectorstore=vectorstore)
|
| 311 |
-
uploadEnd = time.time()
|
| 312 |
-
uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
|
| 313 |
-
tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
|
| 314 |
-
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
|
| 315 |
-
wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
|
| 316 |
-
newText = ("=" * 75 + "\n").join([uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
|
| 317 |
-
fileId = str(uuid.uuid4())
|
| 318 |
-
with open(f"{fileId}.txt", "w") as file:
|
| 319 |
-
file.write(newText)
|
| 320 |
-
with open(f"{fileId}.txt", "rb") as f:
|
| 321 |
-
supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
|
| 322 |
-
file_options={"content-type": "text/plain"})
|
| 323 |
-
os.remove(f"{fileId}.txt")
|
| 324 |
-
output["supabaseFileName"] = f"{fileId}.txt"
|
| 325 |
return output
|
| 326 |
else:
|
| 327 |
return {
|
|
@@ -354,44 +305,12 @@ async def addQAPairData(addQaPair: AddQAPair):
|
|
| 354 |
}
|
| 355 |
|
| 356 |
|
| 357 |
-
@app.post("/
|
| 358 |
async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
| 364 |
-
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
| 365 |
-
newCount = currentCount + len(text)
|
| 366 |
-
limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
|
| 367 |
-
"tokenLimit"]
|
| 368 |
-
if newCount < int(limit):
|
| 369 |
-
supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
| 370 |
-
"chatbotname", chatbotname).execute()
|
| 371 |
-
uploadStart = time.time()
|
| 372 |
-
output = addDocuments(text=text, source=urlparse(websiteUrls[0]).netloc, vectorstore=vectorstore)
|
| 373 |
-
uploadEnd = time.time()
|
| 374 |
-
uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
|
| 375 |
-
timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
|
| 376 |
-
tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
|
| 377 |
-
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
|
| 378 |
-
wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
|
| 379 |
-
links = "LINKS:\n" + "\n".join(websiteUrls) + "\n"
|
| 380 |
-
newText = ("=" * 75 + "\n").join(
|
| 381 |
-
[timeTaken, uploadTime, wordCount, tokenCount, links, "TEXT: \n" + text + "\n"])
|
| 382 |
-
fileId = str(uuid.uuid4())
|
| 383 |
-
with open(f"{fileId}.txt", "w") as file:
|
| 384 |
-
file.write(newText)
|
| 385 |
-
with open(f"{fileId}.txt", "rb") as f:
|
| 386 |
-
supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
|
| 387 |
-
file_options={"content-type": "text/plain"})
|
| 388 |
-
os.remove(f"{fileId}.txt")
|
| 389 |
-
output["supabaseFileName"] = f"{fileId}.txt"
|
| 390 |
-
return output
|
| 391 |
-
else:
|
| 392 |
-
return {
|
| 393 |
-
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
| 394 |
-
}
|
| 395 |
|
| 396 |
|
| 397 |
@app.post("/answerQuery")
|
|
@@ -422,7 +341,8 @@ async def delete(username: str):
|
|
| 422 |
@app.post("/getLinks")
|
| 423 |
async def crawlUrl(baseUrl: str):
|
| 424 |
return {
|
| 425 |
-
"urls": getLinks(url=baseUrl, timeout=30)
|
|
|
|
| 426 |
}
|
| 427 |
|
| 428 |
|
|
@@ -436,9 +356,10 @@ async def getCount(vectorstore: str):
|
|
| 436 |
|
| 437 |
|
| 438 |
@app.post("/getYoutubeTranscript")
|
| 439 |
-
async def getYTTranscript(urls: str):
|
| 440 |
return {
|
| 441 |
-
"
|
|
|
|
| 442 |
}
|
| 443 |
|
| 444 |
|
|
|
|
| 13 |
from functions import client as supabase
|
| 14 |
from urllib.parse import urlparse
|
| 15 |
import nltk
|
| 16 |
+
|
|
|
|
| 17 |
|
| 18 |
nltk.download('punkt_tab')
|
| 19 |
|
|
|
|
| 235 |
return createTable(tablename=chatbotName)
|
| 236 |
|
| 237 |
|
| 238 |
+
@app.post("/loadPDF")
|
| 239 |
async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
| 240 |
source = pdf.filename
|
| 241 |
pdf = await pdf.read()
|
| 242 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
| 243 |
temp_file.write(pdf)
|
| 244 |
temp_file_path = temp_file.name
|
|
|
|
| 245 |
text = extractTextFromPdf(temp_file_path)
|
|
|
|
| 246 |
os.remove(temp_file_path)
|
| 247 |
+
return {
|
| 248 |
+
"output": text,
|
| 249 |
+
"source": source
|
| 250 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
|
| 253 |
+
@app.post("/loadImagePDF")
|
| 254 |
async def returnText(pdf: UploadFile = File(...)):
|
| 255 |
source = pdf.filename
|
| 256 |
pdf = await pdf.read()
|
|
|
|
| 257 |
text = getTextFromImagePDF(pdfBytes=pdf)
|
|
|
|
|
|
|
| 258 |
return {
|
| 259 |
+
"output": text,
|
| 260 |
+
"source": source
|
|
|
|
| 261 |
}
|
| 262 |
|
| 263 |
|
| 264 |
@app.post("/addText")
|
| 265 |
+
async def addText(vectorstore: str, text: str, source: str = "Text"):
|
| 266 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 267 |
df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
| 268 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
|
|
|
| 272 |
if newCount < int(limit):
|
| 273 |
supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
| 274 |
"chatbotname", chatbotname).execute()
|
|
|
|
| 275 |
output = addDocuments(text=text, source=source, vectorstore=vectorstore)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
return output
|
| 277 |
else:
|
| 278 |
return {
|
|
|
|
| 305 |
}
|
| 306 |
|
| 307 |
|
| 308 |
+
@app.post("/loadWebURLs")
|
| 309 |
async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
| 310 |
+
text = extractTextFromUrlList(urls=websiteUrls)
|
| 311 |
+
return {
|
| 312 |
+
"output": text
|
| 313 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
|
| 316 |
@app.post("/answerQuery")
|
|
|
|
| 341 |
@app.post("/getLinks")
|
| 342 |
async def crawlUrl(baseUrl: str):
|
| 343 |
return {
|
| 344 |
+
"urls": getLinks(url=baseUrl, timeout=30),
|
| 345 |
+
"source": urlparse(baseUrl).netloc
|
| 346 |
}
|
| 347 |
|
| 348 |
|
|
|
|
| 356 |
|
| 357 |
|
| 358 |
@app.post("/getYoutubeTranscript")
|
| 359 |
+
async def getYTTranscript(urls: list[str]):
|
| 360 |
return {
|
| 361 |
+
"output": getTranscript(urls=urls),
|
| 362 |
+
"source": "www.youtube.com"
|
| 363 |
}
|
| 364 |
|
| 365 |
|
functions.py
CHANGED
|
@@ -56,7 +56,7 @@ INSTRUCTIONS:
|
|
| 56 |
2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
|
| 57 |
3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
|
| 58 |
4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
|
| 59 |
-
Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words.
|
| 60 |
CONTEXT:
|
| 61 |
=====================================
|
| 62 |
{context}
|
|
@@ -139,14 +139,19 @@ def addDocuments(text: str, source: str, vectorstore: str):
|
|
| 139 |
|
| 140 |
|
| 141 |
def format_docs(docs: str):
|
|
|
|
|
|
|
| 142 |
context = ""
|
| 143 |
for doc in docs:
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
| 146 |
if context == "":
|
| 147 |
context = "No context found"
|
| 148 |
else:
|
| 149 |
pass
|
|
|
|
| 150 |
return context
|
| 151 |
|
| 152 |
|
|
@@ -171,6 +176,7 @@ def trimMessages(chain_input):
|
|
| 171 |
def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192") -> str:
|
| 172 |
global prompt
|
| 173 |
global client
|
|
|
|
| 174 |
global vectorEmbeddings
|
| 175 |
global sparseEmbeddings
|
| 176 |
vectorStoreName = vectorstore
|
|
@@ -201,7 +207,8 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
|
|
| 201 |
"output": chain.invoke(
|
| 202 |
{"question": query},
|
| 203 |
{"configurable": {"session_id": vectorStoreName}}
|
| 204 |
-
)
|
|
|
|
| 205 |
}
|
| 206 |
|
| 207 |
|
|
@@ -271,13 +278,12 @@ def getTextFromImagePDF(pdfBytes):
|
|
| 271 |
return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
|
| 272 |
allImages = convert_from_bytes(pdfBytes)
|
| 273 |
texts = [getText(image) for image in allImages]
|
| 274 |
-
return
|
| 275 |
|
| 276 |
|
| 277 |
def getTranscript(urls: str):
|
| 278 |
-
urls = urls.split(",")
|
| 279 |
texts = []
|
| 280 |
-
for url in urls:
|
| 281 |
try:
|
| 282 |
loader = YoutubeLoader.from_youtube_url(
|
| 283 |
url, add_video_info=False
|
|
@@ -287,10 +293,11 @@ def getTranscript(urls: str):
|
|
| 287 |
except:
|
| 288 |
doc = ""
|
| 289 |
texts.append(doc)
|
| 290 |
-
return
|
| 291 |
|
| 292 |
|
| 293 |
def analyzeData(query, dataframe):
|
|
|
|
| 294 |
llm = ChatGroq(name="llama-3.1-8b-instant")
|
| 295 |
df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
|
| 296 |
response = df.chat(query)
|
|
@@ -312,7 +319,7 @@ def extractTextFromPdf(pdf_path):
|
|
| 312 |
with ThreadPoolExecutor() as executor:
|
| 313 |
texts = list(executor.map(extractTextFromPage, pages))
|
| 314 |
doc.close()
|
| 315 |
-
return
|
| 316 |
|
| 317 |
|
| 318 |
def extractTextFromUrl(url):
|
|
@@ -326,4 +333,4 @@ def extractTextFromUrl(url):
|
|
| 326 |
def extractTextFromUrlList(urls):
|
| 327 |
with ThreadPoolExecutor() as executor:
|
| 328 |
texts = list(executor.map(extractTextFromUrl, urls))
|
| 329 |
-
return
|
|
|
|
| 56 |
2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
|
| 57 |
3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
|
| 58 |
4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
|
| 59 |
+
Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
|
| 60 |
CONTEXT:
|
| 61 |
=====================================
|
| 62 |
{context}
|
|
|
|
| 139 |
|
| 140 |
|
| 141 |
def format_docs(docs: str):
|
| 142 |
+
global sources
|
| 143 |
+
sources = []
|
| 144 |
context = ""
|
| 145 |
for doc in docs:
|
| 146 |
+
context += f"{doc.page_content}\n\n\n"
|
| 147 |
+
source = doc.metadata
|
| 148 |
+
source = source["source"]
|
| 149 |
+
sources.append(source)
|
| 150 |
if context == "":
|
| 151 |
context = "No context found"
|
| 152 |
else:
|
| 153 |
pass
|
| 154 |
+
sources = list(set(sources))
|
| 155 |
return context
|
| 156 |
|
| 157 |
|
|
|
|
| 176 |
def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192") -> str:
|
| 177 |
global prompt
|
| 178 |
global client
|
| 179 |
+
global sources
|
| 180 |
global vectorEmbeddings
|
| 181 |
global sparseEmbeddings
|
| 182 |
vectorStoreName = vectorstore
|
|
|
|
| 207 |
"output": chain.invoke(
|
| 208 |
{"question": query},
|
| 209 |
{"configurable": {"session_id": vectorStoreName}}
|
| 210 |
+
),
|
| 211 |
+
"sources": sources
|
| 212 |
}
|
| 213 |
|
| 214 |
|
|
|
|
| 278 |
return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
|
| 279 |
allImages = convert_from_bytes(pdfBytes)
|
| 280 |
texts = [getText(image) for image in allImages]
|
| 281 |
+
return {x + 1: y for x, y in enumerate(texts)}
|
| 282 |
|
| 283 |
|
| 284 |
def getTranscript(urls: str):
|
|
|
|
| 285 |
texts = []
|
| 286 |
+
for url in set(urls):
|
| 287 |
try:
|
| 288 |
loader = YoutubeLoader.from_youtube_url(
|
| 289 |
url, add_video_info=False
|
|
|
|
| 293 |
except:
|
| 294 |
doc = ""
|
| 295 |
texts.append(doc)
|
| 296 |
+
return {x: y for x, y in zip(urls, texts)}
|
| 297 |
|
| 298 |
|
| 299 |
def analyzeData(query, dataframe):
|
| 300 |
+
query += ". In case, you are to plot a chart, make sure the x-axis labels are 90 degree rotated"
|
| 301 |
llm = ChatGroq(name="llama-3.1-8b-instant")
|
| 302 |
df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
|
| 303 |
response = df.chat(query)
|
|
|
|
| 319 |
with ThreadPoolExecutor() as executor:
|
| 320 |
texts = list(executor.map(extractTextFromPage, pages))
|
| 321 |
doc.close()
|
| 322 |
+
return {x + 1: y for x, y in enumerate(texts)}
|
| 323 |
|
| 324 |
|
| 325 |
def extractTextFromUrl(url):
|
|
|
|
| 333 |
def extractTextFromUrlList(urls):
|
| 334 |
with ThreadPoolExecutor() as executor:
|
| 335 |
texts = list(executor.map(extractTextFromUrl, urls))
|
| 336 |
+
return {x: y for x, y in zip(urls, texts)}
|