Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,16 +22,12 @@ class PDFChatbot:
|
|
| 22 |
self.azure_client = openai.OpenAI()
|
| 23 |
self.conversation_history = []
|
| 24 |
self.pdf_content = ""
|
|
|
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
"""
|
| 28 |
-
# db = FAISS.load_local('mbaldb', HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'), allow_dangerous_deserialization = True )
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
pdf_directory = "data"
|
| 32 |
all_text = ""
|
| 33 |
-
|
| 34 |
-
# Step 1: Read and extract text from all PDFs
|
| 35 |
for filename in os.listdir(pdf_directory):
|
| 36 |
if filename.lower().endswith(".pdf"):
|
| 37 |
pdf_path = os.path.join(pdf_directory, filename)
|
|
@@ -41,15 +37,15 @@ class PDFChatbot:
|
|
| 41 |
page_text = page.extract_text()
|
| 42 |
if page_text:
|
| 43 |
all_text += page_text + "\n"
|
| 44 |
-
|
| 45 |
-
#
|
| 46 |
words = all_text.split()
|
| 47 |
chunks = []
|
| 48 |
current_chunk = []
|
| 49 |
current_length = 0
|
| 50 |
-
|
| 51 |
for word in words:
|
| 52 |
-
if current_length + len(word) + 1 >
|
| 53 |
if current_chunk:
|
| 54 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 55 |
current_chunk = [word]
|
|
@@ -57,20 +53,20 @@ class PDFChatbot:
|
|
| 57 |
else:
|
| 58 |
current_chunk.append(word)
|
| 59 |
current_length += len(word) + 1
|
| 60 |
-
|
| 61 |
if current_chunk:
|
| 62 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 63 |
-
|
| 64 |
-
#
|
| 65 |
embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
# Step 4: Perform similarity search
|
| 69 |
-
relevant_chunks = db.similarity_search(user_question, k=3)
|
| 70 |
-
|
| 71 |
-
# Step 5: Return the content of the top relevant chunks
|
| 72 |
-
return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
|
| 75 |
"""Generate response using Azure OpenAI based on PDF content and user question."""
|
| 76 |
# Split PDF content into chunks
|
|
|
|
| 22 |
self.azure_client = openai.OpenAI()
|
| 23 |
self.conversation_history = []
|
| 24 |
self.pdf_content = ""
|
| 25 |
+
self.faiss_index = self.build_faiss_index("data")
|
| 26 |
|
| 27 |
+
def build_faiss_index(self, pdf_directory: str, chunk_size: int = 3000) -> FAISS:
|
| 28 |
+
"""Read PDFs, split into chunks, and build FAISS index."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
all_text = ""
|
| 30 |
+
|
|
|
|
| 31 |
for filename in os.listdir(pdf_directory):
|
| 32 |
if filename.lower().endswith(".pdf"):
|
| 33 |
pdf_path = os.path.join(pdf_directory, filename)
|
|
|
|
| 37 |
page_text = page.extract_text()
|
| 38 |
if page_text:
|
| 39 |
all_text += page_text + "\n"
|
| 40 |
+
|
| 41 |
+
# Split text into ~chunk_size character chunks
|
| 42 |
words = all_text.split()
|
| 43 |
chunks = []
|
| 44 |
current_chunk = []
|
| 45 |
current_length = 0
|
| 46 |
+
|
| 47 |
for word in words:
|
| 48 |
+
if current_length + len(word) + 1 > chunk_size:
|
| 49 |
if current_chunk:
|
| 50 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 51 |
current_chunk = [word]
|
|
|
|
| 53 |
else:
|
| 54 |
current_chunk.append(word)
|
| 55 |
current_length += len(word) + 1
|
| 56 |
+
|
| 57 |
if current_chunk:
|
| 58 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 59 |
+
|
| 60 |
+
# Embed and index
|
| 61 |
embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
|
| 62 |
+
faiss_index = FAISS.from_documents(chunks, embedding_model)
|
| 63 |
+
return faiss_index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
def get_relevant_context(self, user_question: str) -> str:
    """Query the FAISS index for the chunks most relevant to the question.

    Runs a similarity search against the prebuilt index stored on
    ``self.faiss_index`` and joins the best-matching chunk texts into a
    single context string suitable for prompting the chat model.

    Args:
        user_question: The user's natural-language question.

    Returns:
        The ``page_content`` of the top 3 matching chunks, separated by
        blank lines. (The original annotation said ``List[str]``, but the
        body joins the chunks into one ``str`` — annotation corrected.)
    """
    # k=3: retrieve the three nearest chunks from the vector index.
    relevant_chunks = self.faiss_index.similarity_search(user_question, k=3)
    return "\n\n".join(doc.page_content for doc in relevant_chunks)
|
| 69 |
+
|
| 70 |
def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
|
| 71 |
"""Generate response using Azure OpenAI based on PDF content and user question."""
|
| 72 |
# Split PDF content into chunks
|