Spaces:
Sleeping
Sleeping
File size: 1,125 Bytes
1f8cd6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import fitz
from docx import Document
import os
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if the document could not be opened or read (the error is
        printed rather than raised).
    """
    try:
        # The context manager closes the document on every path, including
        # when extracting a page raises part-way through.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # carry on with an empty result.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, read with python-docx.

    The text of each paragraph in the document is joined with newline
    characters. If anything goes wrong while opening or reading the file,
    the error is printed and an empty string is returned instead.
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        lines = (paragraph.text for paragraph in paragraphs)
        result = "\n".join(lines)
    except Exception as err:
        print(f"Error reading DOCX {docx_path}: {err}")
        result = ""
    return result
def load_documents(folder="data"):
    """Load the text of all supported documents found directly in a folder.

    Entries whose name ends in ``.pdf`` or ``.docx`` (compared
    case-insensitively, so ``REPORT.PDF`` is accepted too) are passed to
    the matching extractor; every other entry is reported and skipped.

    Args:
        folder: Directory to scan. Sub-directories are not descended into.

    Returns:
        A list with one extracted text per supported entry, ordered by
        file name. An entry whose extraction failed contributes the empty
        string returned by its extractor.

    Raises:
        OSError: If ``folder`` cannot be listed (e.g. it does not exist).
    """
    texts = []
    # os.listdir returns names in arbitrary order; sorting makes the result
    # reproducible from run to run and across platforms.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Compare against the lower-cased name so upper- or mixed-case
        # extensions are not mistaken for unsupported files.
        lowered = name.lower()
        if lowered.endswith(".pdf"):
            texts.append(extract_text_from_pdf(path))
        elif lowered.endswith(".docx"):
            texts.append(extract_text_from_docx(path))
        else:
            print(f"⚠ Skipped unsupported file: {name}")
    return texts
|