Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import time
|
|
| 5 |
import tempfile
|
| 6 |
import faiss
|
| 7 |
import numpy as np
|
|
|
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
@@ -12,7 +13,7 @@ from langchain.vectorstores import FAISS
|
|
| 12 |
from langchain.docstore.document import Document
|
| 13 |
from keybert import KeyBERT
|
| 14 |
from textblob import TextBlob
|
| 15 |
-
from groq import Groq
|
| 16 |
|
| 17 |
# Load environment
|
| 18 |
load_dotenv()
|
|
@@ -25,7 +26,7 @@ st.title("π¬ Smart PDF ChatBot")
|
|
| 25 |
st.markdown("""
|
| 26 |
Upload one or more PDFs. Get summaries, insights, and interact with AI about the content using a persistent memory chat.
|
| 27 |
""")
|
| 28 |
-
|
| 29 |
uploaded_files = st.file_uploader("π Upload PDF files", type=["pdf"], accept_multiple_files=True)
|
| 30 |
|
| 31 |
# Utilities
|
|
@@ -37,7 +38,7 @@ def extract_text_from_pdf(file):
|
|
| 37 |
return text
|
| 38 |
|
| 39 |
def split_text(text):
|
| 40 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=
|
| 41 |
return splitter.split_text(text)
|
| 42 |
|
| 43 |
def create_vector_store(chunks):
|
|
@@ -48,30 +49,47 @@ def create_vector_store(chunks):
|
|
| 48 |
def summarize_chunks(chunks):
|
| 49 |
chunk_summaries = []
|
| 50 |
for i, chunk in enumerate(chunks):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
try:
|
| 52 |
response = client.chat.completions.create(
|
| 53 |
model=GROQ_MODEL,
|
| 54 |
messages=[
|
| 55 |
-
{"role": "system", "content": "You
|
| 56 |
-
{"role": "user", "content": f"
|
| 57 |
]
|
| 58 |
)
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
except Exception as e:
|
| 61 |
-
|
| 62 |
-
return "\n".join(chunk_summaries)
|
| 63 |
-
|
| 64 |
-
def ask_question(vectorstore, question):
|
| 65 |
-
docs = vectorstore.similarity_search(question, k=3)
|
| 66 |
-
context = "\n".join([d.page_content for d in docs])
|
| 67 |
-
response = client.chat.completions.create(
|
| 68 |
-
model=GROQ_MODEL,
|
| 69 |
-
messages=[
|
| 70 |
-
{"role": "system", "content": "You answer questions based on document context."},
|
| 71 |
-
{"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
|
| 72 |
-
]
|
| 73 |
-
)
|
| 74 |
-
return response.choices[0].message.content
|
| 75 |
|
| 76 |
def extract_keywords(text, top_n=10):
|
| 77 |
kw_model = KeyBERT()
|
|
@@ -131,4 +149,4 @@ if uploaded_files:
|
|
| 131 |
st.markdown(f"**You:** {q}")
|
| 132 |
st.markdown(f"**AI:** {a}")
|
| 133 |
else:
|
| 134 |
-
st.info("π₯ Upload one or more PDF files to get started.")
|
|
|
|
| 5 |
import tempfile
|
| 6 |
import faiss
|
| 7 |
import numpy as np
|
| 8 |
+
import json
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 11 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
|
|
| 13 |
from langchain.docstore.document import Document
|
| 14 |
from keybert import KeyBERT
|
| 15 |
from textblob import TextBlob
|
| 16 |
+
from groq import Groq, RateLimitError
|
| 17 |
|
| 18 |
# Load environment
|
| 19 |
load_dotenv()
|
|
|
|
| 26 |
st.markdown("""
|
| 27 |
Upload one or more PDFs. Get summaries, insights, and interact with AI about the content using a persistent memory chat.
|
| 28 |
""")
|
| 29 |
+
|
| 30 |
uploaded_files = st.file_uploader("π Upload PDF files", type=["pdf"], accept_multiple_files=True)
|
| 31 |
|
| 32 |
# Utilities
|
|
|
|
| 38 |
return text
|
| 39 |
|
| 40 |
def split_text(text):
    """Split raw document text into 800-character chunks with 150-character overlap."""
    return RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
    ).split_text(text)
|
| 43 |
|
| 44 |
def create_vector_store(chunks):
|
|
|
|
| 49 |
def summarize_chunks(chunks):
    """Summarize each text chunk with the Groq chat model and join the results.

    Rate-limit errors are retried a bounded number of times (the original
    ``while True`` could spin forever on a persistent limit); any other
    failure records a placeholder line so one bad chunk cannot abort the
    whole summary.

    Args:
        chunks: iterable of text chunks to summarize.

    Returns:
        All chunk summaries (or error placeholders) joined by newlines.
    """
    max_retries = 5      # bound the retry loop instead of looping forever
    default_wait = 10.0  # fallback when the server's wait hint can't be parsed

    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        for _attempt in range(max_retries):
            try:
                response = client.chat.completions.create(
                    model=GROQ_MODEL,
                    messages=[
                        {"role": "system", "content": "You are an AI that summarizes documents."},
                        {"role": "user", "content": f"Summarize this chunk:\n{chunk}"}
                    ]
                )
                chunk_summaries.append(response.choices[0].message.content)
                break
            except RateLimitError as e:
                # Best-effort parse of the suggested wait from the error text.
                # The message shape ('... try again in <x>s ...') is not a
                # stable API contract, so guard the parse — the original let a
                # parse failure escape the handler and crash the app.
                try:
                    error_data = json.loads(str(e).split(" - ", 1)[-1])
                    wait_time = float(error_data["error"]["message"].split("in ")[-1].split("s")[0])
                except (ValueError, KeyError, TypeError, IndexError):
                    wait_time = default_wait
                st.warning(f"Rate limit hit while summarizing. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            except Exception as e:
                chunk_summaries.append(f"[Error summarizing chunk {i}]: {str(e)}")
                break
        else:
            # for/else: every retry hit the rate limit — record and move on.
            chunk_summaries.append(f"[Error summarizing chunk {i}]: rate limit retries exhausted")
    return "\n".join(chunk_summaries)
|
| 72 |
+
|
| 73 |
+
def ask_question(vectorstore, question):
    """Answer *question* from the top-3 most similar chunks in *vectorstore*.

    Rate-limit errors are retried a bounded number of times (the original
    ``while True`` could spin forever on a persistent limit).

    Args:
        vectorstore: FAISS store supporting ``similarity_search``.
        question: the user's question string.

    Returns:
        The model's answer, or an ``[Error ...]`` string on failure.
    """
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n".join([d.page_content for d in docs])
    max_retries = 5  # bound the retry loop instead of looping forever
    for _attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=GROQ_MODEL,
                messages=[
                    {"role": "system", "content": "You answer questions based on document context."},
                    {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
                ]
            )
            return response.choices[0].message.content
        except RateLimitError as e:
            # Best-effort parse of the suggested wait from the error text.
            # The message shape is not a stable API contract, so guard the
            # parse — the original let a parse failure escape the handler.
            try:
                error_data = json.loads(str(e).split(" - ", 1)[-1])
                wait_time = float(error_data["error"]["message"].split("in ")[-1].split("s")[0])
            except (ValueError, KeyError, TypeError, IndexError):
                wait_time = 10.0
            st.warning(f"Rate limit hit. Retrying in {wait_time:.2f} seconds...")
            time.sleep(wait_time)
        except Exception as e:
            return f"[Error answering question]: {str(e)}"
    return "[Error answering question]: rate limit retries exhausted"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
def extract_keywords(text, top_n=10):
|
| 95 |
kw_model = KeyBERT()
|
|
|
|
| 149 |
st.markdown(f"**You:** {q}")
|
| 150 |
st.markdown(f"**AI:** {a}")
|
| 151 |
else:
|
| 152 |
+
st.info("π₯ Upload one or more PDF files to get started.")
|