ngcanh committed on
Commit
bfb4152
·
verified ·
1 Parent(s): a219d6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -21
app.py CHANGED
@@ -30,39 +30,58 @@ class PDFChatbot:
30
 
31
  pdf_directory = "data"
32
 
33
- # Duyệt qua các file trong thư mục và đọc từng file PDF
34
- for filename in os.listdir(pdf_directory):
35
- if filename.lower().endswith(".pdf"):
36
- pdf_path = os.path.join(pdf_directory, filename)
37
- with open(pdf_path, "rb") as pdf_file:
38
- pdf_reader = PyPDF2.PdfReader(pdf_file)
39
- text = ""
40
- for page_num in range(len(pdf_reader.pages)):
41
- page = pdf_reader.pages[page_num]
42
- text += page.extract_text() + "\n"
43
-
44
- # Optional: split into words
45
- words = text.split()
 
 
 
 
 
 
 
 
 
 
 
46
  chunks = []
47
  current_chunk = []
48
  current_length = 0
 
49
  for word in words:
50
  if current_length + len(word) + 1 > 3000:
51
  if current_chunk:
52
- chunks.append(" ".join(current_chunk))
53
- current_chunk = [word]
54
- current_length = len(word)
55
  else:
56
  current_chunk.append(word)
57
  current_length += len(word) + 1
 
58
  if current_chunk:
59
- chunks.append(" ".join(current_chunk))
60
 
61
- db = FAISS.from_documents(chunks, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
62
-
 
 
 
63
  relevant_chunks = db.similarity_search(user_question, k=3)
64
-
65
- return "\n\n".join(relevant_chunks)
 
 
 
66
  def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
67
  """Generate response using Azure OpenAI based on PDF content and user question."""
68
  # Split PDF content into chunks
 
30
 
31
  pdf_directory = "data"
32
 
33
+ import os
34
+ import PyPDF2
35
+ from langchain.vectorstores import FAISS
36
+ from langchain.embeddings import HuggingFaceEmbeddings
37
+ from langchain.docstore.document import Document
38
+
39
+ pdf_directory = "path_to_your_pdf_folder"
40
+ user_question = "your query here"
41
+
42
+ all_text = ""
43
+
44
+ # Step 1: Read and extract text from all PDFs
45
+ for filename in os.listdir(pdf_directory):
46
+ if filename.lower().endswith(".pdf"):
47
+ pdf_path = os.path.join(pdf_directory, filename)
48
+ with open(pdf_path, "rb") as pdf_file:
49
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
50
+ for page in pdf_reader.pages:
51
+ page_text = page.extract_text()
52
+ if page_text:
53
+ all_text += page_text + "\n"
54
+
55
+ # Step 2: Split text into chunks of ~3000 characters
56
+ words = all_text.split()
57
  chunks = []
58
  current_chunk = []
59
  current_length = 0
60
+
61
  for word in words:
62
  if current_length + len(word) + 1 > 3000:
63
  if current_chunk:
64
+ chunks.append(Document(page_content=" ".join(current_chunk)))
65
+ current_chunk = [word]
66
+ current_length = len(word)
67
  else:
68
  current_chunk.append(word)
69
  current_length += len(word) + 1
70
+
71
  if current_chunk:
72
+ chunks.append(Document(page_content=" ".join(current_chunk)))
73
 
74
+ # Step 3: Build the FAISS index
75
+ embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
76
+ db = FAISS.from_documents(chunks, embedding_model)
77
+
78
+ # Step 4: Perform similarity search
79
  relevant_chunks = db.similarity_search(user_question, k=3)
80
+
81
+ # Step 5: Return the content of the top relevant chunks
82
+ return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
83
+ print(return_text) # Or return from a function if used inside one
84
+
85
  def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
86
  """Generate response using Azure OpenAI based on PDF content and user question."""
87
  # Split PDF content into chunks