Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -31,12 +31,18 @@ class PDFChatbot:
|
|
| 31 |
pdf_directory = "data"
|
| 32 |
|
| 33 |
# Duyệt qua các file trong thư mục và đọc từng file PDF
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
chunks = []
|
| 41 |
current_chunk = []
|
| 42 |
current_length = 0
|
|
|
|
| 31 |
pdf_directory = "data"
|
| 32 |
|
| 33 |
# Duyệt qua các file trong thư mục và đọc từng file PDF
|
| 34 |
+
for filename in os.listdir(pdf_directory):
|
| 35 |
+
if filename.lower().endswith(".pdf"):
|
| 36 |
+
pdf_path = os.path.join(pdf_directory, filename)
|
| 37 |
+
with open(pdf_path, "rb") as pdf_file:
|
| 38 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 39 |
+
text = ""
|
| 40 |
+
for page_num in range(len(pdf_reader.pages)):
|
| 41 |
+
page = pdf_reader.pages[page_num]
|
| 42 |
+
text += page.extract_text() + "\n"
|
| 43 |
+
|
| 44 |
+
# Optional: split into words
|
| 45 |
+
words = text.split()
|
| 46 |
chunks = []
|
| 47 |
current_chunk = []
|
| 48 |
current_length = 0
|