Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import pickle
|
|
| 5 |
from PyPDF2 import PdfReader
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from huggingface_hub import InferenceClient, HfApi
|
|
|
|
| 8 |
|
| 9 |
# Hugging Face Space persistence
|
| 10 |
HF_REPO_ID = "MoslemBot/kajibuku" # e.g., "username/your-space-name"
|
|
@@ -37,8 +38,14 @@ def save_pdf(file, title):
|
|
| 37 |
os.makedirs(folder, exist_ok=True)
|
| 38 |
|
| 39 |
# Extract text
|
| 40 |
-
reader = PdfReader(file.name)
|
| 41 |
-
full_text = "\n".join(p.extract_text() for p in reader.pages if p.extract_text())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
print(full_text)
|
| 43 |
|
| 44 |
# Chunk text
|
|
|
|
| 5 |
from PyPDF2 import PdfReader
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from huggingface_hub import InferenceClient, HfApi
|
| 8 |
+
import pdfplumber
|
| 9 |
|
| 10 |
# Hugging Face Space persistence
|
| 11 |
HF_REPO_ID = "MoslemBot/kajibuku" # e.g., "username/your-space-name"
|
|
|
|
| 38 |
os.makedirs(folder, exist_ok=True)
|
| 39 |
|
| 40 |
# Extract text
|
| 41 |
+
# reader = PdfReader(file.name)
|
| 42 |
+
# full_text = "\n".join(p.extract_text() for p in reader.pages if p.extract_text())
|
| 43 |
+
|
| 44 |
+
with pdfplumber.open(file.name) as pdf:
|
| 45 |
+
full_text = ""
|
| 46 |
+
for page in pdf.pages:
|
| 47 |
+
# extract_text() may return None for a page with no extractable text (e.g. a scanned image); fall back to "" so the concatenation cannot raise TypeError.
full_text += (page.extract_text() or "") + "\n"
|
| 48 |
+
|
| 49 |
print(full_text)
|
| 50 |
|
| 51 |
# Chunk text
|