Spaces:
Sleeping
Sleeping
UPDATE: speed-ups
Browse files- app.py +2 -9
- functions.py +31 -4
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -2,12 +2,10 @@ import io
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
from functions import *
|
| 5 |
-
from langchain_community.document_loaders import PDFMinerLoader
|
| 6 |
import pandas as pd
|
| 7 |
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 8 |
from pydantic import BaseModel
|
| 9 |
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
-
from langchain_community.document_loaders import UnstructuredURLLoader
|
| 11 |
from src.api.speech_api import speech_translator_router
|
| 12 |
from functions import client as supabase
|
| 13 |
from urllib.parse import urlparse
|
|
@@ -158,8 +156,7 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
| 158 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
| 159 |
temp_file.write(pdf)
|
| 160 |
temp_file_path = temp_file.name
|
| 161 |
-
|
| 162 |
-
text = loader.load()[0].page_content
|
| 163 |
os.remove(temp_file_path)
|
| 164 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 165 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
|
@@ -229,11 +226,7 @@ async def addText(addQaPair: AddQAPair):
|
|
| 229 |
|
| 230 |
@app.post("/addWebsite")
|
| 231 |
async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
| 232 |
-
|
| 233 |
-
docs = loader.load()
|
| 234 |
-
text = "\n\n".join(
|
| 235 |
-
[f"{docs[doc].page_content}" for doc in range(len(docs))]
|
| 236 |
-
)
|
| 237 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 238 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
| 239 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
|
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
from functions import *
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 7 |
from pydantic import BaseModel
|
| 8 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 9 |
from src.api.speech_api import speech_translator_router
|
| 10 |
from functions import client as supabase
|
| 11 |
from urllib.parse import urlparse
|
|
|
|
| 156 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
| 157 |
temp_file.write(pdf)
|
| 158 |
temp_file_path = temp_file.name
|
| 159 |
+
text = extractTextFromPdf(temp_file_path)
|
|
|
|
| 160 |
os.remove(temp_file_path)
|
| 161 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 162 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
|
|
|
| 226 |
|
| 227 |
@app.post("/addWebsite")
|
| 228 |
async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
| 229 |
+
text = extractTextFromUrlList(urls = websiteUrls)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
| 231 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
| 232 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
functions.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
|
| 2 |
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
| 3 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
@@ -38,12 +40,12 @@ qdrantClient = QdrantClient(url=os.environ["QDRANT_URL"], api_key=os.environ["QD
|
|
| 38 |
model_kwargs = {"device": "cuda"}
|
| 39 |
encode_kwargs = {"normalize_embeddings": True}
|
| 40 |
vectorEmbeddings = HuggingFaceEmbeddings(
|
| 41 |
-
model_name="
|
| 42 |
model_kwargs=model_kwargs,
|
| 43 |
encode_kwargs=encode_kwargs
|
| 44 |
)
|
| 45 |
reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
|
| 46 |
-
sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25")
|
| 47 |
prompt = """
|
| 48 |
INSTRUCTIONS:
|
| 49 |
=====================================
|
|
@@ -123,11 +125,11 @@ def addDocuments(text: str, source: str, vectorstore: str):
|
|
| 123 |
global sparseEmbeddings
|
| 124 |
global store
|
| 125 |
parentSplitter = RecursiveCharacterTextSplitter(
|
| 126 |
-
chunk_size=
|
| 127 |
add_start_index=True
|
| 128 |
)
|
| 129 |
childSplitter = RecursiveCharacterTextSplitter(
|
| 130 |
-
chunk_size=
|
| 131 |
add_start_index=True
|
| 132 |
)
|
| 133 |
texts = [Document(page_content=text, metadata={"source": source})]
|
|
@@ -323,3 +325,28 @@ def analyzeData(query, dataframe):
|
|
| 323 |
return f"data:image/png;base64,{b64string}"
|
| 324 |
else:
|
| 325 |
return response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pymupdf
|
| 2 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 3 |
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
|
| 4 |
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
| 5 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
|
| 40 |
model_kwargs = {"device": "cuda"}
|
| 41 |
encode_kwargs = {"normalize_embeddings": True}
|
| 42 |
vectorEmbeddings = HuggingFaceEmbeddings(
|
| 43 |
+
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
| 44 |
model_kwargs=model_kwargs,
|
| 45 |
encode_kwargs=encode_kwargs
|
| 46 |
)
|
| 47 |
reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
|
| 48 |
+
sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25", threads = 20 , parallel = 0)
|
| 49 |
prompt = """
|
| 50 |
INSTRUCTIONS:
|
| 51 |
=====================================
|
|
|
|
| 125 |
global sparseEmbeddings
|
| 126 |
global store
|
| 127 |
parentSplitter = RecursiveCharacterTextSplitter(
|
| 128 |
+
chunk_size=2000,
|
| 129 |
add_start_index=True
|
| 130 |
)
|
| 131 |
childSplitter = RecursiveCharacterTextSplitter(
|
| 132 |
+
chunk_size=400,
|
| 133 |
add_start_index=True
|
| 134 |
)
|
| 135 |
texts = [Document(page_content=text, metadata={"source": source})]
|
|
|
|
| 325 |
return f"data:image/png;base64,{b64string}"
|
| 326 |
else:
|
| 327 |
return response
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def extractTextFromPage(page):
    """Return the plain-text content of a single PDF page object."""
    page_text = page.get_text()
    return page_text
|
| 333 |
+
|
| 334 |
+
def extractTextFromPdf(pdf_path):
    """Extract the text of every page of a PDF document.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The text of all pages, in page order, joined with '.' (same
        output format as before).
    """
    # NOTE(review): PyMuPDF documents/pages are not thread-safe, so pages of
    # a single Document must not be read from multiple threads; a plain loop
    # is safe and — because get_text() holds the GIL anyway — not slower.
    # The context manager guarantees the document is closed even if a page
    # raises, where the previous explicit doc.close() could be skipped.
    with pymupdf.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]  # iteration preserves page order
    return '.'.join(texts)
|
| 341 |
+
|
| 342 |
+
def extractTextFromUrl(url, timeout=30):
    """Download a web page and return its visible text.

    Args:
        url: URL to fetch.
        timeout: Seconds to wait for the server before aborting. New,
            backward-compatible parameter — without it requests.get()
            can block indefinitely on an unresponsive host.

    Returns:
        The page text with markup stripped, tokens separated by single
        spaces (BeautifulSoup get_text with separator=' ', strip=True).

    Raises:
        requests.HTTPError: On a 4xx/5xx response (raise_for_status).
        requests.Timeout: If no response arrives within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)  # fail fast on dead hosts
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.get_text(separator=' ', strip=True)
|
| 348 |
+
|
| 349 |
+
def extractTextFromUrlList(urls):
    """Fetch every URL concurrently and return their texts joined with '.'.

    Uses a thread pool because the work is network-bound; executor.map
    yields results in the order of `urls`, so the output is deterministic.
    """
    with ThreadPoolExecutor() as pool:
        extracted = pool.map(extractTextFromUrl, urls)
        # Consume the lazy map inside the pool's context.
        return '.'.join(extracted)
|
requirements.txt
CHANGED
|
@@ -73,6 +73,7 @@ fastembed-gpu
|
|
| 73 |
nest_asyncio
|
| 74 |
beautifulsoup4
|
| 75 |
flashrank
|
|
|
|
| 76 |
langchain
|
| 77 |
langchain-community
|
| 78 |
langchain-cohere
|
|
@@ -80,7 +81,6 @@ langchain-huggingface
|
|
| 80 |
langchain-qdrant
|
| 81 |
langchain-groq
|
| 82 |
lxml
|
| 83 |
-
pdfminer.six
|
| 84 |
python-dotenv
|
| 85 |
pillow
|
| 86 |
pandas
|
|
|
|
| 73 |
nest_asyncio
|
| 74 |
beautifulsoup4
|
| 75 |
flashrank
|
| 76 |
+
PyMuPDF
|
| 77 |
langchain
|
| 78 |
langchain-community
|
| 79 |
langchain-cohere
|
|
|
|
| 81 |
langchain-qdrant
|
| 82 |
langchain-groq
|
| 83 |
lxml
|
|
|
|
| 84 |
python-dotenv
|
| 85 |
pillow
|
| 86 |
pandas
|