Spaces:
Sleeping
Sleeping
UPDATE: supabase
Browse files
- app.py +26 -2
- functions.py +0 -1
app.py
CHANGED
|
@@ -233,7 +233,9 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
| 233 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
| 234 |
temp_file.write(pdf)
|
| 235 |
temp_file_path = temp_file.name
|
|
|
|
| 236 |
text = extractTextFromPdf(temp_file_path)
|
|
|
|
| 237 |
os.remove(temp_file_path)
|
| 238 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 239 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
|
@@ -244,7 +246,23 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
| 244 |
if newCount < int(limit):
|
| 245 |
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
| 246 |
"chatbotname", chatbotname).execute()
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
else:
|
| 249 |
return {
|
| 250 |
"output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
|
@@ -254,8 +272,14 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
| 254 |
@app.post("/scanAndReturnText")
|
| 255 |
async def returnText(pdf: UploadFile = File(...)):
|
| 256 |
pdf = await pdf.read()
|
|
|
|
| 257 |
text = getTextFromImagePDF(pdfBytes=pdf)
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
|
| 261 |
@app.post("/addText")
|
|
|
|
| 233 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
| 234 |
temp_file.write(pdf)
|
| 235 |
temp_file_path = temp_file.name
|
| 236 |
+
start = time.time()
|
| 237 |
text = extractTextFromPdf(temp_file_path)
|
| 238 |
+
textExtraction = time.time()
|
| 239 |
os.remove(temp_file_path)
|
| 240 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 241 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
|
|
|
| 246 |
if newCount < int(limit):
|
| 247 |
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
| 248 |
"chatbotname", chatbotname).execute()
|
| 249 |
+
uploadStart = time.time()
|
| 250 |
+
output = addDocuments(text=text, source=source, vectorstore=vectorstore)
|
| 251 |
+
uploadEnd = time.time()
|
| 252 |
+
uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
|
| 253 |
+
timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
|
| 254 |
+
tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
|
| 255 |
+
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
|
| 256 |
+
wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
|
| 257 |
+
newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
|
| 258 |
+
fileId = str(uuid.uuid4())
|
| 259 |
+
with open(f"{fileId}.txt", "w") as file:
|
| 260 |
+
file.write(newText)
|
| 261 |
+
with open(f"{fileId}.txt", "rb") as f:
|
| 262 |
+
supabase.storage.from_("ConversAI").upload(file = f, path = os.path.join("/", f.name), file_options={"content-type": "text/plain"})
|
| 263 |
+
os.remove(f"{fileId}.txt")
|
| 264 |
+
output["supabaseFileName"] = f"{fileId}.txt"
|
| 265 |
+
return output
|
| 266 |
else:
|
| 267 |
return {
|
| 268 |
"output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
|
|
|
| 272 |
@app.post("/scanAndReturnText")
|
| 273 |
async def returnText(pdf: UploadFile = File(...)):
|
| 274 |
pdf = await pdf.read()
|
| 275 |
+
start = time.time()
|
| 276 |
text = getTextFromImagePDF(pdfBytes=pdf)
|
| 277 |
+
end = time.time()
|
| 278 |
+
timeTaken = f"{end - start}s"
|
| 279 |
+
return {
|
| 280 |
+
"extractionTime": timeTaken,
|
| 281 |
+
"output": text
|
| 282 |
+
}
|
| 283 |
|
| 284 |
|
| 285 |
@app.post("/addText")
|
functions.py
CHANGED
|
@@ -293,7 +293,6 @@ def getTextFromImagePDF(pdfBytes):
|
|
| 293 |
def getText(image):
|
| 294 |
global reader
|
| 295 |
return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
|
| 296 |
-
|
| 297 |
allImages = convert_from_bytes(pdfBytes)
|
| 298 |
texts = [getText(image) for image in allImages]
|
| 299 |
return "\n\n\n".join(texts)
|
|
|
|
| 293 |
def getText(image):
|
| 294 |
global reader
|
| 295 |
return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
|
|
|
|
| 296 |
allImages = convert_from_bytes(pdfBytes)
|
| 297 |
texts = [getText(image) for image in allImages]
|
| 298 |
return "\n\n\n".join(texts)
|