Spaces:
Configuration error
Configuration error
File size: 5,335 Bytes
bec06d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import os
from typing import List, Dict, Any
import PyPDF2
import docx2txt
from bs4 import BeautifulSoup
import markdown
import logging
from preprocessor import TextPreprocessor
# Module-level logger; the loader methods in this file report failures through it.
logger = logging.getLogger(__name__)
class DocumentLoader:
    """
    A utility class to load documents from various formats.

    Supports PDF, DOCX, TXT, HTML, and Markdown files. Every ``load_*``
    method is best-effort: on any failure the error is logged and an empty
    string is returned instead of an exception being raised.
    """

    # Maps a lower-cased file extension to the NAME of the loader method that
    # handles it. Names (rather than function objects) are stored so the
    # lookup goes through ``cls`` and subclass overrides are honoured. This
    # is the single source of truth for which extensions are supported.
    _LOADER_NAMES = {
        '.pdf': 'load_pdf',
        '.docx': 'load_docx',
        '.txt': 'load_txt',
        '.html': 'load_html',
        '.htm': 'load_html',
        '.md': 'load_md',
    }

    @staticmethod
    def load_pdf(file_path: str) -> str:
        """Load and extract text from a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The text of every page, each followed by a newline, or ``""``
            if the file could not be read or parsed.
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                # Defensive: if a page yields no text (None), treat it as
                # empty rather than letting the concatenation raise and
                # discard the text of every other page.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in reader.pages
                )
        except Exception as e:
            # Deliberate best-effort boundary: log and fall back to "".
            logger.error("Error loading PDF %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_docx(file_path: str) -> str:
        """Load and extract text from a DOCX file.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            Whatever ``docx2txt.process`` extracts, or ``""`` on failure.
        """
        try:
            return docx2txt.process(file_path)
        except Exception as e:
            logger.error("Error loading DOCX {}: {}".format(file_path, e))
            return ""

    @staticmethod
    def load_txt(file_path: str) -> str:
        """Load and extract text from a TXT file.

        Args:
            file_path: Path of the text file to read.

        Returns:
            The decoded file content, or ``""`` if the file could not be
            opened or is not valid UTF-8.
        """
        try:
            # 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a
            # leading byte-order mark, so it never leaks into the text.
            with open(file_path, 'r', encoding='utf-8-sig') as txt_file:
                return txt_file.read()
        except Exception as e:
            logger.error("Error loading TXT %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_html(file_path: str) -> str:
        """Load and extract text from an HTML file.

        ``<script>`` and ``<style>`` elements are removed before the text
        is extracted.

        Args:
            file_path: Path of the HTML file to read.

        Returns:
            The document text with nodes separated by newlines, or ``""``
            on failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as html_file:
                soup = BeautifulSoup(html_file, 'html.parser')
                # Remove script and style elements
                for element in soup(["script", "style"]):
                    element.decompose()
                return soup.get_text(separator="\n")
        except Exception as e:
            logger.error("Error loading HTML %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_md(file_path: str) -> str:
        """Load and extract text from a Markdown file.

        Args:
            file_path: Path of the Markdown file to read.

        Returns:
            The text of the rendered document with nodes separated by
            newlines, or ``""`` on failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as md_file:
                md_content = md_file.read()
            # Convert Markdown to HTML first, then extract text
            html_content = markdown.markdown(md_content)
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator="\n")
        except Exception as e:
            logger.error("Error loading MD %s: %s", file_path, e)
            return ""

    @classmethod
    def load_document(cls, file_path: str) -> str:
        """Load a document based on its extension and preprocess it.

        The extension is matched case-insensitively.

        Args:
            file_path: Path of the document to load.

        Returns:
            The loaded text after it has been passed through
            ``TextPreprocessor.clean_text``.

        Raises:
            ValueError: If the file extension is not a supported format.
        """
        _, ext = os.path.splitext(file_path.lower())
        loader_name = cls._LOADER_NAMES.get(ext)
        if loader_name is None:
            raise ValueError(f"Unsupported file format: {ext}")
        raw_text = getattr(cls, loader_name)(file_path)
        # Preprocess the text
        return TextPreprocessor.clean_text(raw_text)

    @classmethod
    def load_documents_from_directory(cls, directory_path: str, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
        """Load all supported documents from a directory, with optional chunking.

        The directory is walked recursively. Files with an unsupported
        extension and documents whose cleaned text is blank are skipped.

        Args:
            directory_path: Root directory to search.
            chunk_size: Documents whose cleaned text is longer than this many
                characters are split with ``TextPreprocessor.chunk_text``.
            overlap: Forwarded to ``TextPreprocessor.chunk_text``
                (presumably the overlap between consecutive chunks; confirm
                against that helper).

        Returns:
            A list of dicts with ``'content'``, ``'source'`` and
            ``'metadata'`` keys. Chunked documents contribute one entry per
            chunk, whose metadata additionally carries ``'chunk_id'`` and
            ``'total_chunks'``.
        """
        documents = []
        for root, _dirs, files in os.walk(directory_path):
            for file_name in files:
                _, ext = os.path.splitext(file_name.lower())
                if ext not in cls._LOADER_NAMES:
                    continue

                file_path = os.path.join(root, file_name)
                content = cls.load_document(file_path)
                if not content.strip():
                    # Only add non-empty documents
                    continue

                if len(content) <= chunk_size:
                    documents.append({
                        'content': content,
                        'source': file_path,
                        'metadata': {'file_name': file_name, 'file_path': file_path},
                    })
                    continue

                # The content is too long for a single entry: chunk it.
                chunks = TextPreprocessor.chunk_text(content, chunk_size, overlap)
                total_chunks = len(chunks)
                for chunk_id, chunk in enumerate(chunks):
                    documents.append({
                        'content': chunk,
                        'source': file_path,
                        'metadata': {
                            'file_name': file_name,
                            'file_path': file_path,
                            'chunk_id': chunk_id,
                            'total_chunks': total_chunks,
                        },
                    })
        return documents