Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,16 +22,12 @@ class PDFChatbot:
|
|
| 22 |
self.azure_client = openai.OpenAI()
|
| 23 |
self.conversation_history = []
|
| 24 |
self.pdf_content = ""
|
|
|
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
"""
|
| 28 |
-
# db = FAISS.load_local('mbaldb', HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'), allow_dangerous_deserialization = True )
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
pdf_directory = "data"
|
| 32 |
all_text = ""
|
| 33 |
-
|
| 34 |
-
# Step 1: Read and extract text from all PDFs
|
| 35 |
for filename in os.listdir(pdf_directory):
|
| 36 |
if filename.lower().endswith(".pdf"):
|
| 37 |
pdf_path = os.path.join(pdf_directory, filename)
|
|
@@ -41,15 +37,15 @@ class PDFChatbot:
|
|
| 41 |
page_text = page.extract_text()
|
| 42 |
if page_text:
|
| 43 |
all_text += page_text + "\n"
|
| 44 |
-
|
| 45 |
-
#
|
| 46 |
words = all_text.split()
|
| 47 |
chunks = []
|
| 48 |
current_chunk = []
|
| 49 |
current_length = 0
|
| 50 |
-
|
| 51 |
for word in words:
|
| 52 |
-
if current_length + len(word) + 1 >
|
| 53 |
if current_chunk:
|
| 54 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 55 |
current_chunk = [word]
|
|
@@ -57,20 +53,20 @@ class PDFChatbot:
|
|
| 57 |
else:
|
| 58 |
current_chunk.append(word)
|
| 59 |
current_length += len(word) + 1
|
| 60 |
-
|
| 61 |
if current_chunk:
|
| 62 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 63 |
-
|
| 64 |
-
#
|
| 65 |
embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
# Step 4: Perform similarity search
|
| 69 |
-
relevant_chunks = db.similarity_search(user_question, k=3)
|
| 70 |
-
|
| 71 |
-
# Step 5: Return the content of the top relevant chunks
|
| 72 |
-
return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
|
| 75 |
"""Generate response using Azure OpenAI based on PDF content and user question."""
|
| 76 |
# Split PDF content into chunks
|
|
|
|
| 22 |
self.azure_client = openai.OpenAI()
|
| 23 |
self.conversation_history = []
|
| 24 |
self.pdf_content = ""
|
| 25 |
+
self.faiss_index = self.build_faiss_index("data")
|
| 26 |
|
| 27 |
+
def build_faiss_index(self, pdf_directory: str, chunk_size: int = 3000) -> FAISS:
|
| 28 |
+
"""Read PDFs, split into chunks, and build FAISS index."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
all_text = ""
|
| 30 |
+
|
|
|
|
| 31 |
for filename in os.listdir(pdf_directory):
|
| 32 |
if filename.lower().endswith(".pdf"):
|
| 33 |
pdf_path = os.path.join(pdf_directory, filename)
|
|
|
|
| 37 |
page_text = page.extract_text()
|
| 38 |
if page_text:
|
| 39 |
all_text += page_text + "\n"
|
| 40 |
+
|
| 41 |
+
# Split text into ~chunk_size character chunks
|
| 42 |
words = all_text.split()
|
| 43 |
chunks = []
|
| 44 |
current_chunk = []
|
| 45 |
current_length = 0
|
| 46 |
+
|
| 47 |
for word in words:
|
| 48 |
+
if current_length + len(word) + 1 > chunk_size:
|
| 49 |
if current_chunk:
|
| 50 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 51 |
current_chunk = [word]
|
|
|
|
| 53 |
else:
|
| 54 |
current_chunk.append(word)
|
| 55 |
current_length += len(word) + 1
|
| 56 |
+
|
| 57 |
if current_chunk:
|
| 58 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
| 59 |
+
|
| 60 |
+
# Embed and index
|
| 61 |
embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
|
| 62 |
+
faiss_index = FAISS.from_documents(chunks, embedding_model)
|
| 63 |
+
return faiss_index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
def get_relevant_context(self, user_question: str) -> str:
    """Query the FAISS index for the chunks most relevant to the question.

    Runs a similarity search against the prebuilt index stored on
    ``self.faiss_index`` and joins the best-matching chunk texts into a
    single context string suitable for prompting the chat model.

    Args:
        user_question: The user's natural-language question.

    Returns:
        The ``page_content`` of the top 3 matching chunks, separated by
        blank lines. (The original annotation said ``List[str]``, but the
        body joins the chunks into one ``str`` — annotation corrected.)
    """
    # k=3: retrieve the three nearest chunks from the vector index.
    relevant_chunks = self.faiss_index.similarity_search(user_question, k=3)
    return "\n\n".join(doc.page_content for doc in relevant_chunks)
|
| 69 |
+
|
| 70 |
def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
|
| 71 |
"""Generate response using Azure OpenAI based on PDF content and user question."""
|
| 72 |
# Split PDF content into chunks
|