Spaces:

techconspartners
/

ConversAI

Sleeping

App Files Files Community

Rauhan committed on Aug 22, 2024

Commit

d3176f4

1 Parent(s): c5522cd

UPDATE: functions

Browse files

Files changed (2) hide show

app.py +20 -99
functions.py +17 -10

app.py CHANGED Viewed

@@ -13,8 +13,7 @@ from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
 import nltk
-import time
-import uuid
 nltk.download('punkt_tab')
@@ -236,67 +235,34 @@ async def newChatbot(chatbotName: str, username: str):
     return createTable(tablename=chatbotName)
-@app.post("/addPDF")
 async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
-    start = time.time()
     text = extractTextFromPdf(temp_file_path)
-    textExtraction = time.time()
     os.remove(temp_file_path)
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    newCount = currentCount + len(text)
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
-@app.post("/scanAndReturnText")
 async def returnText(pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
-    start = time.time()
     text = getTextFromImagePDF(pdfBytes=pdf)
-    end = time.time()
-    timeTaken = f"{end - start}s"
     return {
-        "source": source,
-        "extractionTime": timeTaken,
-        "output": text
     }
 @app.post("/addText")
-async def addText(vectorstore: str, text: str, source: str | None = None):
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
@@ -306,22 +272,7 @@ async def addText(vectorstore: str, text: str, source: str | None = None):
     if newCount < int(limit):
         supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
             "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
         output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
         return output
     else:
         return {
@@ -354,44 +305,12 @@ async def addQAPairData(addQaPair: AddQAPair):
         }
-@app.post("/addWebsite")
 async def addWebsite(vectorstore: str, websiteUrls: list[str]):
-    start = time.time()
-    text = extractTextFromUrlList(urls=websiteUrls)
-    textExtraction = time.time()
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    newCount = currentCount + len(text)
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=urlparse(websiteUrls[0]).netloc, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        links = "LINKS:\n" + "\n".join(websiteUrls) + "\n"
-        newText = ("=" * 75 + "\n").join(
-            [timeTaken, uploadTime, wordCount, tokenCount, links, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
 @app.post("/answerQuery")
@@ -422,7 +341,8 @@ async def delete(username: str):
 @app.post("/getLinks")
 async def crawlUrl(baseUrl: str):
     return {
-        "urls": getLinks(url=baseUrl, timeout=30)
     }
@@ -436,9 +356,10 @@ async def getCount(vectorstore: str):
 @app.post("/getYoutubeTranscript")
-async def getYTTranscript(urls: str):
     return {
-        "transcript": getTranscript(urls=urls)
     }

 from functions import client as supabase
 from urllib.parse import urlparse
 import nltk
 nltk.download('punkt_tab')
     return createTable(tablename=chatbotName)
+@app.post("/loadPDF")
 async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
     text = extractTextFromPdf(temp_file_path)
     os.remove(temp_file_path)
+    return {
+        "output": text,
+        "source": source
+    }
+@app.post("/loadImagePDF")
 async def returnText(pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
     text = getTextFromImagePDF(pdfBytes=pdf)
     return {
+        "output": text,
+        "source": source
     }
 @app.post("/addText")
+async def addText(vectorstore: str, text: str, source: str = "Text"):
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
     if newCount < int(limit):
         supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
             "chatbotname", chatbotname).execute()
         output = addDocuments(text=text, source=source, vectorstore=vectorstore)
         return output
     else:
         return {
         }
+@app.post("/loadWebURLs")
 async def addWebsite(vectorstore: str, websiteUrls: list[str]):
+    text = extractTextFromUrlList(urls=websiteUrls)
+    return {
+        "output": text
+    }
 @app.post("/answerQuery")
 @app.post("/getLinks")
 async def crawlUrl(baseUrl: str):
     return {
+        "urls": getLinks(url=baseUrl, timeout=30),
+        "source": urlparse(baseUrl).netloc
     }
 @app.post("/getYoutubeTranscript")
+async def getYTTranscript(urls: list[str]):
     return {
+        "output": getTranscript(urls=urls),
+        "source": "www.youtube.com"
     }

functions.py CHANGED Viewed

@@ -56,7 +56,7 @@ INSTRUCTIONS:
 2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
 3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
 4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
-Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Include sources to support your answers when possible.
 CONTEXT:
 =====================================
 {context}
@@ -139,14 +139,19 @@ def addDocuments(text: str, source: str, vectorstore: str):
 def format_docs(docs: str):
     context = ""
     for doc in docs:
-        print("METADATA ::: ", type(doc.metadata))
-        context += f"CONTENT: {doc.page_content}\nSOURCE: {doc.metadata} \n\n\n"
     if context == "":
         context = "No context found"
     else:
         pass
     return context
@@ -171,6 +176,7 @@ def trimMessages(chain_input):
 def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192") -> str:
     global prompt
     global client
     global vectorEmbeddings
     global sparseEmbeddings
     vectorStoreName = vectorstore
@@ -201,7 +207,8 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
         "output": chain.invoke(
             {"question": query},
             {"configurable": {"session_id": vectorStoreName}}
-        )
     }
@@ -271,13 +278,12 @@ def getTextFromImagePDF(pdfBytes):
         return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
-    return "\n\n\n".join(texts)
 def getTranscript(urls: str):
-    urls = urls.split(",")
     texts = []
-    for url in urls:
         try:
             loader = YoutubeLoader.from_youtube_url(
                 url, add_video_info=False
@@ -287,10 +293,11 @@ def getTranscript(urls: str):
         except:
             doc = ""
             texts.append(doc)
-    return "\n\n".join(texts)
 def analyzeData(query, dataframe):
     llm = ChatGroq(name="llama-3.1-8b-instant")
     df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
     response = df.chat(query)
@@ -312,7 +319,7 @@ def extractTextFromPdf(pdf_path):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromPage, pages))
     doc.close()
-    return '.'.join(texts)
 def extractTextFromUrl(url):
@@ -326,4 +333,4 @@ def extractTextFromUrl(url):
 def extractTextFromUrlList(urls):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromUrl, urls))
-    return '.'.join(texts)

 2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
 3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
 4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
+Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
 CONTEXT:
 =====================================
 {context}
 def format_docs(docs: str):
+    global sources
+    sources = []
     context = ""
     for doc in docs:
+        context += f"{doc.page_content}\n\n\n"
+        source = doc.metadata
+        source = source["source"]
+        sources.append(source)
     if context == "":
         context = "No context found"
     else:
         pass
+    sources = list(set(sources))
     return context
 def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192") -> str:
     global prompt
     global client
+    global sources
     global vectorEmbeddings
     global sparseEmbeddings
     vectorStoreName = vectorstore
         "output": chain.invoke(
             {"question": query},
             {"configurable": {"session_id": vectorStoreName}}
+        ),
+        "sources": sources
     }
         return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
+    return {x + 1: y for x, y in enumerate(texts)}
 def getTranscript(urls: str):
     texts = []
+    for url in set(urls):
         try:
             loader = YoutubeLoader.from_youtube_url(
                 url, add_video_info=False
         except:
             doc = ""
             texts.append(doc)
+    return {x: y for x, y in zip(urls, texts)}
 def analyzeData(query, dataframe):
+    query += ". In case, you are to plot a chart, make sure the x-axis labels are 90 degree rotated"
     llm = ChatGroq(name="llama-3.1-8b-instant")
     df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
     response = df.chat(query)
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromPage, pages))
     doc.close()
+    return {x + 1: y for x, y in enumerate(texts)}
 def extractTextFromUrl(url):
 def extractTextFromUrlList(urls):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromUrl, urls))
+    return {x: y for x, y in zip(urls, texts)}