Spaces:
Sleeping
Sleeping
UPDATE: New Endpoints
Browse files- Dockerfile +3 -1
- app.py +18 -1
- functions.py +20 -1
- requirements.txt +4 -0
Dockerfile
CHANGED
|
@@ -10,7 +10,9 @@ RUN apt-get update && apt-get install -y \
|
|
| 10 |
build-essential \
|
| 11 |
cmake \
|
| 12 |
&& apt-get clean \
|
| 13 |
-
&& rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
| 14 |
|
| 15 |
RUN mkdir -p /app/nltk_data && chmod -R 777 /app/nltk_data
|
| 16 |
|
|
|
|
| 10 |
build-essential \
|
| 11 |
cmake \
|
| 12 |
&& apt-get clean \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
RUN apt-get install poppler-utils -y
|
| 16 |
|
| 17 |
RUN mkdir -p /app/nltk_data && chmod -R 777 /app/nltk_data
|
| 18 |
|
app.py
CHANGED
|
@@ -8,7 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 8 |
from langchain_community.document_loaders import UnstructuredURLLoader
|
| 9 |
|
| 10 |
|
| 11 |
-
|
| 12 |
app = FastAPI(title = "ConversAI", root_path = "/api/v1")
|
| 13 |
app.add_middleware(
|
| 14 |
CORSMiddleware,
|
|
@@ -64,6 +63,24 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
| 64 |
}
|
| 65 |
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
@app.post("/addText")
|
| 68 |
async def addText(vectorstore: str, text: str):
|
| 69 |
username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
|
|
|
|
| 8 |
from langchain_community.document_loaders import UnstructuredURLLoader
|
| 9 |
|
| 10 |
|
|
|
|
| 11 |
app = FastAPI(title = "ConversAI", root_path = "/api/v1")
|
| 12 |
app.add_middleware(
|
| 13 |
CORSMiddleware,
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
|
| 66 |
+
@app.post("/addImagePDF")
async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
    """Extract text from a scanned (image-based) PDF via OCR and add it to a vectorstore.

    Parameters:
        vectorstore: Store identifier of the form "<prefix>-<username>-<chatbotname>";
                     the 2nd and 3rd dash-separated fields are used for accounting.
        pdf:         Uploaded PDF file; its raw bytes are OCR'd page by page.

    Returns:
        The result of addDocuments(...) on success, or a dict with an "output"
        message when the user's character limit would be exceeded.

    NOTE(review): this handler reuses the name `addPDFData` already bound to the
    /addPDF endpoint, shadowing it at module level. FastAPI routing still works
    (the decorator captured the earlier function), but the module attribute now
    points here — consider renaming to `addImagePDFData`.
    """
    pdfBytes = await pdf.read()
    text = getTextFromImagePDF(pdfBytes = pdfBytes)
    username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
    df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
    currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
    limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0]["tokenLimit"]
    # charactercount is persisted as a string (see the str(newCount) update below),
    # so coerce before arithmetic to avoid a TypeError on str + int.
    newCount = int(currentCount) + len(text)
    if newCount < int(limit):
        client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq("chatbotname", chatbotname).execute()
        return addDocuments(text = text, vectorstore = vectorstore)
    else:
        return {
            "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
        }
|
| 82 |
+
|
| 83 |
+
|
| 84 |
@app.post("/addText")
|
| 85 |
async def addText(vectorstore: str, text: str):
|
| 86 |
username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
|
functions.py
CHANGED
|
@@ -18,6 +18,9 @@ from langchain.retrievers.document_compressors import FlashrankRerank
|
|
| 18 |
from supabase.client import create_client
|
| 19 |
from qdrant_client import QdrantClient
|
| 20 |
from langchain_groq import ChatGroq
|
|
|
|
|
|
|
|
|
|
| 21 |
from bs4 import BeautifulSoup
|
| 22 |
from urllib.parse import urlparse, urljoin
|
| 23 |
from supabase import create_client
|
|
@@ -37,6 +40,7 @@ vectorEmbeddings = HuggingFaceEmbeddings(
|
|
| 37 |
model_kwargs = model_kwargs,
|
| 38 |
encode_kwargs = encode_kwargs
|
| 39 |
)
|
|
|
|
| 40 |
sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
|
| 41 |
prompt = """
|
| 42 |
INSTRUCTIONS:
|
|
@@ -282,4 +286,19 @@ def getLinks(url: str, timeout = 30):
|
|
| 282 |
break
|
| 283 |
else:
|
| 284 |
uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
|
| 285 |
-
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
from supabase.client import create_client
|
| 19 |
from qdrant_client import QdrantClient
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
+
from pdf2image import convert_from_bytes
|
| 22 |
+
import numpy as np
|
| 23 |
+
from paddleocr import PaddleOCR
|
| 24 |
from bs4 import BeautifulSoup
|
| 25 |
from urllib.parse import urlparse, urljoin
|
| 26 |
from supabase import create_client
|
|
|
|
| 40 |
model_kwargs = model_kwargs,
|
| 41 |
encode_kwargs = encode_kwargs
|
| 42 |
)
|
| 43 |
+
ocr = PaddleOCR(use_angle_cls=True, lang='en')
|
| 44 |
sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
|
| 45 |
prompt = """
|
| 46 |
INSTRUCTIONS:
|
|
|
|
| 286 |
break
|
| 287 |
else:
|
| 288 |
uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
|
| 289 |
+
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def getTextFromImagePDF(pdfBytes):
    """OCR every page of a scanned PDF and return the concatenated text.

    Parameters:
        pdfBytes: Raw bytes of a PDF file (e.g. from UploadFile.read()).

    Returns:
        A single string with per-page OCR text joined by three newlines;
        pages where OCR finds nothing contribute an empty string.

    Uses the module-level `ocr` (PaddleOCR) instance and pdf2image's
    convert_from_bytes; no `global` declaration is needed to read a global.
    """
    pageWiseText = []
    for image in convert_from_bytes(pdfBytes):
        # PaddleOCR expects an ndarray; result is a per-page list where each
        # entry looks like [box, (text, confidence)].
        result = ocr.ocr(np.array(image))
        if result[0]:
            retrievedText = "\n".join(line[1][0] for line in result[0])
        else:
            retrievedText = ""
        pageWiseText.append(retrievedText)
    return "\n\n\n".join(pageWiseText)
|
requirements.txt
CHANGED
|
@@ -12,10 +12,14 @@ langchain-qdrant
|
|
| 12 |
langchain-groq
|
| 13 |
langsmith
|
| 14 |
lxml
|
|
|
|
| 15 |
PyPDF2
|
| 16 |
python-dotenv
|
| 17 |
pydantic
|
| 18 |
pandas
|
|
|
|
|
|
|
|
|
|
| 19 |
sentence-transformers
|
| 20 |
supabase
|
| 21 |
unstructured
|
|
|
|
| 12 |
langchain-groq
|
| 13 |
langsmith
|
| 14 |
lxml
|
| 15 |
+
numpy
|
| 16 |
PyPDF2
|
| 17 |
python-dotenv
|
| 18 |
pydantic
|
| 19 |
pandas
|
| 20 |
+
paddlepaddle-gpu
|
| 21 |
+
paddleocr
|
| 22 |
+
pdf2image
|
| 23 |
sentence-transformers
|
| 24 |
supabase
|
| 25 |
unstructured
|