Update app.py
Browse files
app.py
CHANGED
|
@@ -88,15 +88,13 @@ def get_chunks(text, chunk_size=500):
|
|
| 88 |
|
| 89 |
return chunks
|
| 90 |
|
| 91 |
-
# Initialize FAISS index with cosine similarity
|
| 92 |
-
|
| 93 |
-
embedding_dim = 768 # NASA Bi-Encoder outputs 768-dimensional embeddings
|
| 94 |
-
index = faiss.IndexFlatIP(embedding_dim) # FAISS inner product (cosine similarity)
|
| 95 |
-
|
| 96 |
def load_and_process_uploaded_pdfs(pdf_files):
|
| 97 |
-
|
| 98 |
"""Extracts text from PDFs, splits into chunks, generates embeddings, and stores in FAISS."""
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
pdf_chunks = [] # Store extracted chunks
|
| 101 |
chunk_embeddings = [] # Store embeddings
|
| 102 |
|
|
@@ -106,21 +104,22 @@ def load_and_process_uploaded_pdfs(pdf_files):
|
|
| 106 |
for page in reader.pages:
|
| 107 |
pdf_text += page.extract_text() + "\n"
|
| 108 |
|
| 109 |
-
#
|
| 110 |
-
chunks = get_chunks(pdf_text, chunk_size=500)
|
| 111 |
-
pdf_chunks.extend(chunks) # Store
|
| 112 |
|
| 113 |
# Generate embeddings for each chunk
|
| 114 |
for chunk in chunks:
|
| 115 |
chunk_embedding = encode_text(chunk).reshape(1, -1)
|
| 116 |
|
| 117 |
-
# Normalize
|
| 118 |
chunk_embedding = chunk_embedding / np.linalg.norm(chunk_embedding)
|
| 119 |
|
| 120 |
-
index.add(chunk_embedding) #
|
| 121 |
chunk_embeddings.append(chunk_embedding)
|
| 122 |
|
| 123 |
-
return pdf_chunks, chunk_embeddings # Return
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def retrieve_relevant_context(user_input, context_text, science_objectives="", index=None, pdf_chunks=None, k=3):
|
|
@@ -413,7 +412,7 @@ def gpt_response_to_dataframe(gpt_response):
|
|
| 413 |
def chatbot(user_input, science_objectives="", context="", subdomain="", uploaded_pdfs=None, max_tokens=150, temperature=0.7, top_p=0.9, frequency_penalty=0.5, presence_penalty=0.0):
|
| 414 |
# Load and process uploaded PDFs (if provided)
|
| 415 |
if uploaded_pdfs:
|
| 416 |
-
pdf_chunks, chunk_embeddings = load_and_process_uploaded_pdfs(uploaded_pdfs)
|
| 417 |
else:
|
| 418 |
pdf_chunks, chunk_embeddings = [], [] # Ensure empty list if no PDFs provided
|
| 419 |
|
|
|
|
| 88 |
|
| 89 |
return chunks
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def load_and_process_uploaded_pdfs(pdf_files):
|
|
|
|
| 92 |
"""Extracts text from PDFs, splits into chunks, generates embeddings, and stores in FAISS."""
|
| 93 |
|
| 94 |
+
# **RESET FAISS INDEX on every function call**
|
| 95 |
+
embedding_dim = 768 # NASA Bi-Encoder embedding size
|
| 96 |
+
index = faiss.IndexFlatIP(embedding_dim) # Fresh FAISS index
|
| 97 |
+
|
| 98 |
pdf_chunks = [] # Store extracted chunks
|
| 99 |
chunk_embeddings = [] # Store embeddings
|
| 100 |
|
|
|
|
| 104 |
for page in reader.pages:
|
| 105 |
pdf_text += page.extract_text() + "\n"
|
| 106 |
|
| 107 |
+
# **Reduce Chunk Size for Faster Processing**
|
| 108 |
+
chunks = get_chunks(pdf_text, chunk_size=300)
|
| 109 |
+
pdf_chunks.extend(chunks) # Store for retrieval
|
| 110 |
|
| 111 |
# Generate embeddings for each chunk
|
| 112 |
for chunk in chunks:
|
| 113 |
chunk_embedding = encode_text(chunk).reshape(1, -1)
|
| 114 |
|
| 115 |
+
# Normalize for cosine similarity
|
| 116 |
chunk_embedding = chunk_embedding / np.linalg.norm(chunk_embedding)
|
| 117 |
|
| 118 |
+
index.add(chunk_embedding) # **Now adding to fresh FAISS index**
|
| 119 |
chunk_embeddings.append(chunk_embedding)
|
| 120 |
|
| 121 |
+
return index, pdf_chunks, chunk_embeddings # Return fresh FAISS index and chunk data
|
| 122 |
+
|
| 123 |
|
| 124 |
|
| 125 |
def retrieve_relevant_context(user_input, context_text, science_objectives="", index=None, pdf_chunks=None, k=3):
|
|
|
|
| 412 |
def chatbot(user_input, science_objectives="", context="", subdomain="", uploaded_pdfs=None, max_tokens=150, temperature=0.7, top_p=0.9, frequency_penalty=0.5, presence_penalty=0.0):
|
| 413 |
# Load and process uploaded PDFs (if provided)
|
| 414 |
if uploaded_pdfs:
|
| 415 |
+
index, pdf_chunks, chunk_embeddings = load_and_process_uploaded_pdfs(uploaded_pdfs)
|
| 416 |
else:
|
| 417 |
pdf_chunks, chunk_embeddings = [], [] # Ensure empty list if no PDFs provided
|
| 418 |
|