Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -180,7 +180,7 @@
|
|
| 180 |
|
| 181 |
|
| 182 |
# v2
|
| 183 |
-
import re
|
| 184 |
import PyPDF2
|
| 185 |
from langchain_community.embeddings import OllamaEmbeddings
|
| 186 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
@@ -195,14 +195,13 @@ import logging
|
|
| 195 |
import pypandoc
|
| 196 |
import pdfkit
|
| 197 |
from paddleocr import PaddleOCR
|
| 198 |
-
import fitz
|
| 199 |
import asyncio
|
| 200 |
from langchain_nomic.embeddings import NomicEmbeddings
|
| 201 |
-
import os
|
| 202 |
|
| 203 |
llm_groq = ChatGroq(
|
| 204 |
-
|
| 205 |
-
)
|
| 206 |
|
| 207 |
# Initialize anonymizer: analyzed_fields lists the entity types passed to
# PresidioReversibleAnonymizer; faker_seed is pinned to 18
# (presumably for reproducible replacement values - confirm).
anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=[
        'PERSON',
        'EMAIL_ADDRESS',
        'PHONE_NUMBER',
        'IBAN_CODE',
        'CREDIT_CARD',
        'CRYPTO',
        'IP_ADDRESS',
        'LOCATION',
        'DATE_TIME',
        'NRP',
        'MEDICAL_LICENSE',
        'URL',
    ],
    faker_seed=18,
)
|
|
@@ -276,11 +275,21 @@ async def extract_text_from_mixed_pdf(file_path):
|
|
| 276 |
pdf_text += text
|
| 277 |
return pdf_text
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
@cl.on_chat_start
|
| 280 |
async def on_chat_start():
|
| 281 |
|
| 282 |
files = None # Initialize variable to store uploaded files
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
# Wait for the user to upload a file
|
| 285 |
while files is None:
|
| 286 |
files = await cl.AskFileMessage(
|
|
@@ -308,14 +317,7 @@ async def on_chat_start():
|
|
| 308 |
)
|
| 309 |
|
| 310 |
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
|
| 311 |
-
|
| 312 |
-
# Clear the existing Chroma vector store
|
| 313 |
-
docsearch = await cl.make_async(Chroma.from_texts)(
|
| 314 |
-
[], embeddings, metadatas=[]
|
| 315 |
-
)
|
| 316 |
-
docsearch.delete()
|
| 317 |
|
| 318 |
-
# Create a new Chroma vector store
|
| 319 |
docsearch = await cl.make_async(Chroma.from_texts)(
|
| 320 |
[anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
|
| 321 |
)
|
|
@@ -345,8 +347,6 @@ async def on_chat_start():
|
|
| 345 |
await msg.update()
|
| 346 |
# Store the chain in user session
|
| 347 |
cl.user_session.set("chain", chain)
|
| 348 |
-
cl.user_session.set("docsearch", docsearch) # Store the docsearch in session
|
| 349 |
-
cl.user_session.set("file_path", file.path) # Store the file path in session
|
| 350 |
|
| 351 |
|
| 352 |
@cl.on_message
|
|
@@ -366,21 +366,3 @@ async def main(message: cl.Message):
|
|
| 366 |
|
| 367 |
# Return results
|
| 368 |
await cl.Message(content=answer, elements=text_elements).send()
|
| 369 |
-
|
| 370 |
-
@cl.on_chat_end
async def on_chat_end():
    """Release per-session resources when the chat ends.

    Reads the "docsearch" and "file_path" entries from the user session,
    calls ``delete()`` on the stored vector store when one is present,
    removes the uploaded file when it still exists on disk, clears the user
    session and logs that the cleanup happened.
    """
    session = cl.user_session
    vector_store = session.get("docsearch")
    uploaded_path = session.get("file_path")

    # Clear the vector store
    if vector_store:
        vector_store.delete()

    # Remove the uploaded file (only when a path was stored and it exists)
    upload_on_disk = bool(uploaded_path) and os.path.exists(uploaded_path)
    if upload_on_disk:
        os.remove(uploaded_path)

    # Clear the user session data
    session.clear()

    logging.info("User session ended, data cleared.")
|
|
|
|
| 180 |
|
| 181 |
|
| 182 |
# v2
|
| 183 |
+
import re
|
| 184 |
import PyPDF2
|
| 185 |
from langchain_community.embeddings import OllamaEmbeddings
|
| 186 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 195 |
import pypandoc
|
| 196 |
import pdfkit
|
| 197 |
from paddleocr import PaddleOCR
|
| 198 |
+
import fitz
|
| 199 |
import asyncio
|
| 200 |
from langchain_nomic.embeddings import NomicEmbeddings
|
|
|
|
| 201 |
|
| 202 |
# Chat model client (ChatGroq) configured with the llama3-70b-8192 model name
llm_groq = ChatGroq(model_name='llama3-70b-8192')

# Initialize anonymizer: analyzed_fields lists the entity types passed to
# PresidioReversibleAnonymizer; faker_seed is pinned to 18
# (presumably for reproducible replacement values - confirm).
anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=[
        'PERSON',
        'EMAIL_ADDRESS',
        'PHONE_NUMBER',
        'IBAN_CODE',
        'CREDIT_CARD',
        'CRYPTO',
        'IP_ADDRESS',
        'LOCATION',
        'DATE_TIME',
        'NRP',
        'MEDICAL_LICENSE',
        'URL',
    ],
    faker_seed=18,
)
|
|
|
|
| 275 |
pdf_text += text
|
| 276 |
return pdf_text
|
| 277 |
|
| 278 |
+
# Function to clear the ChromaDB
|
| 279 |
+
async def clear_chroma_db(chroma_instance):
    """Clear the given Chroma vector store.

    Calls ``chroma_instance.delete()`` and awaits the result only when the
    call actually returned an awaitable. Awaiting the return value
    unconditionally raises ``TypeError`` whenever ``delete()`` is a plain
    synchronous method that returns ``None``.

    Args:
        chroma_instance: Object exposing a ``delete()`` method; the caller
            in this file passes a ``Chroma`` instance.

    Returns:
        None.
    """
    import inspect  # local import keeps this fix self-contained

    # NOTE(review): delete() is still called without ids, exactly as before;
    # confirm against the installed Chroma version that this clears the store
    # (rather than raising) or switch to the collection-level reset.
    outcome = chroma_instance.delete()
    if inspect.isawaitable(outcome):
        await outcome
|
| 281 |
+
|
| 282 |
@cl.on_chat_start
|
| 283 |
async def on_chat_start():
|
| 284 |
|
| 285 |
files = None # Initialize variable to store uploaded files
|
| 286 |
|
| 287 |
+
# Initialize ChromaDB
|
| 288 |
+
chroma_instance = await cl.make_async(Chroma)()
|
| 289 |
+
|
| 290 |
+
# Clear any existing data in ChromaDB
|
| 291 |
+
await clear_chroma_db(chroma_instance)
|
| 292 |
+
|
| 293 |
# Wait for the user to upload a file
|
| 294 |
while files is None:
|
| 295 |
files = await cl.AskFileMessage(
|
|
|
|
| 317 |
)
|
| 318 |
|
| 319 |
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
|
|
|
| 321 |
docsearch = await cl.make_async(Chroma.from_texts)(
|
| 322 |
[anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
|
| 323 |
)
|
|
|
|
| 347 |
await msg.update()
|
| 348 |
# Store the chain in user session
|
| 349 |
cl.user_session.set("chain", chain)
|
|
|
|
|
|
|
| 350 |
|
| 351 |
|
| 352 |
@cl.on_message
|
|
|
|
| 366 |
|
| 367 |
# Return results
|
| 368 |
await cl.Message(content=answer, elements=text_elements).send()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|