MBAL_chatbot

Running

App Files Community

ngcanh committed on Jul 21

Commit

bfb4152

verified ·

1 Parent(s): a219d6b

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -21

app.py CHANGED Viewed

@@ -30,39 +30,58 @@ class PDFChatbot:
         pdf_directory = "data"
-        # Duyệt qua các file trong thư mục và đọc từng file PDF
-        for filename in os.listdir(pdf_directory):
-            if filename.lower().endswith(".pdf"):
-                pdf_path = os.path.join(pdf_directory, filename)
-                with open(pdf_path, "rb") as pdf_file:
-                    pdf_reader = PyPDF2.PdfReader(pdf_file)
-                    text = ""
-                    for page_num in range(len(pdf_reader.pages)):
-                        page = pdf_reader.pages[page_num]
-                        text += page.extract_text() + "\n"
-                    # Optional: split into words
-                    words = text.split()
         chunks = []
         current_chunk = []
         current_length = 0
         for word in words:
             if current_length + len(word) + 1 > 3000:
                 if current_chunk:
-                    chunks.append(" ".join(current_chunk))
-                    current_chunk = [word]
-                    current_length = len(word)
             else:
                 current_chunk.append(word)
                 current_length += len(word) + 1
         if current_chunk:
-            chunks.append(" ".join(current_chunk))
-        db = FAISS.from_documents(chunks, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
         relevant_chunks = db.similarity_search(user_question, k=3)
-        return "\n\n".join(relevant_chunks)
     def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
         """Generate response using Azure OpenAI based on PDF content and user question."""
         # Split PDF content into chunks

         pdf_directory = "data"
+import os
+import PyPDF2
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.docstore.document import Document
+pdf_directory = "path_to_your_pdf_folder"
+user_question = "your query here"
+all_text = ""
+# Step 1: Read and extract text from all PDFs
+for filename in os.listdir(pdf_directory):
+    if filename.lower().endswith(".pdf"):
+        pdf_path = os.path.join(pdf_directory, filename)
+        with open(pdf_path, "rb") as pdf_file:
+            pdf_reader = PyPDF2.PdfReader(pdf_file)
+            for page in pdf_reader.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    all_text += page_text + "\n"
+        # Step 2: Split text into chunks of ~3000 characters
+        words = all_text.split()
         chunks = []
         current_chunk = []
         current_length = 0
         for word in words:
             if current_length + len(word) + 1 > 3000:
                 if current_chunk:
+                    chunks.append(Document(page_content=" ".join(current_chunk)))
+                current_chunk = [word]
+                current_length = len(word)
             else:
                 current_chunk.append(word)
                 current_length += len(word) + 1
         if current_chunk:
+            chunks.append(Document(page_content=" ".join(current_chunk)))
+        # Step 3: Build the FAISS index
+        embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
+        db = FAISS.from_documents(chunks, embedding_model)
+        # Step 4: Perform similarity search
         relevant_chunks = db.similarity_search(user_question, k=3)
+        # Step 5: Return the content of the top relevant chunks
+        return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
+    print(return_text)  # Or return from a function if used inside one
     def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
         """Generate response using Azure OpenAI based on PDF content and user question."""
         # Split PDF content into chunks