# Document loading utilities: extract plain text from PDF, DOCX, TXT, HTML, and Markdown files.
| import os | |
| from typing import List, Dict, Any | |
| import PyPDF2 | |
| import docx2txt | |
| from bs4 import BeautifulSoup | |
| import markdown | |
| import logging | |
| from preprocessor import TextPreprocessor | |
| logger = logging.getLogger(__name__) | |
class DocumentLoader:
    """
    A utility class to load documents from various formats.

    Supports PDF, DOCX, TXT, HTML/HTM, and Markdown files. Each loader
    returns the extracted plain text, or an empty string on failure —
    errors are logged rather than raised, so one unreadable file does
    not abort a batch load.
    """

    # Extensions accepted by load_document / load_documents_from_directory.
    SUPPORTED_EXTENSIONS = ('.pdf', '.docx', '.txt', '.html', '.htm', '.md')

    @staticmethod
    def load_pdf(file_path: str) -> str:
        """Load and extract text from a PDF file.

        Returns the concatenated text of all pages (newline-separated),
        or "" on error.
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                # extract_text() may return None for pages with no
                # extractable text; coerce to "" so concatenation never
                # raises and one blank page doesn't discard the document.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            logger.error(f"Error loading PDF {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_docx(file_path: str) -> str:
        """Load and extract text from a DOCX file, or "" on error."""
        try:
            return docx2txt.process(file_path)
        except Exception as e:
            logger.error(f"Error loading DOCX {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_txt(file_path: str) -> str:
        """Load and return the contents of a UTF-8 text file, or "" on error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as txt_file:
                return txt_file.read()
        except Exception as e:
            logger.error(f"Error loading TXT {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_html(file_path: str) -> str:
        """Load an HTML file and return its visible text, or "" on error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as html_file:
                soup = BeautifulSoup(html_file, 'html.parser')
                # Script/style bodies are code, not content — drop them
                # before extracting text.
                for element in soup(["script", "style"]):
                    element.decompose()
                return soup.get_text(separator="\n")
        except Exception as e:
            logger.error(f"Error loading HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_md(file_path: str) -> str:
        """Load a Markdown file and return its plain text, or "" on error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as md_file:
                md_content = md_file.read()
            # Convert Markdown to HTML first, then strip the markup so
            # formatting characters (#, *, etc.) do not leak into the text.
            html_content = markdown.markdown(md_content)
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator="\n")
        except Exception as e:
            logger.error(f"Error loading MD {file_path}: {str(e)}")
            return ""

    @classmethod
    def load_document(cls, file_path: str) -> str:
        """Load a document based on its extension and preprocess it.

        Args:
            file_path: Path to the document; the loader is chosen from
                its (case-insensitive) extension.

        Returns:
            The extracted text after TextPreprocessor.clean_text().

        Raises:
            ValueError: If the file extension is not supported.
        """
        _, ext = os.path.splitext(file_path.lower())
        # Dispatch table instead of an if/elif chain; .html and .htm
        # share a loader.
        loaders = {
            '.pdf': cls.load_pdf,
            '.docx': cls.load_docx,
            '.txt': cls.load_txt,
            '.html': cls.load_html,
            '.htm': cls.load_html,
            '.md': cls.load_md,
        }
        if ext not in loaders:
            raise ValueError(f"Unsupported file format: {ext}")
        raw_text = loaders[ext](file_path)
        # Normalize the raw extraction output before returning it.
        return TextPreprocessor.clean_text(raw_text)

    @classmethod
    def load_documents_from_directory(cls, directory_path: str,
                                      chunk_size: int = 512,
                                      overlap: int = 50) -> List[Dict[str, Any]]:
        """Recursively load all supported documents under a directory.

        Documents longer than ``chunk_size`` characters are split via
        TextPreprocessor.chunk_text() into overlapping chunks, each
        emitted as its own entry with chunk_id / total_chunks metadata.

        Args:
            directory_path: Root directory to walk.
            chunk_size: Character threshold above which a document is chunked.
            overlap: Character overlap between consecutive chunks.

        Returns:
            A list of dicts with 'content', 'source', and 'metadata' keys;
            empty/unreadable documents are skipped.
        """
        documents: List[Dict[str, Any]] = []
        for root, _dirs, files in os.walk(directory_path):
            for file_name in files:
                _, ext = os.path.splitext(file_name.lower())
                if ext not in cls.SUPPORTED_EXTENSIONS:
                    continue
                file_path = os.path.join(root, file_name)
                content = cls.load_document(file_path)
                if not content.strip():
                    # Loader failed or the file had no text — skip it.
                    continue
                if len(content) > chunk_size:
                    chunks = TextPreprocessor.chunk_text(content, chunk_size, overlap)
                    for i, chunk in enumerate(chunks):
                        documents.append({
                            'content': chunk,
                            'source': file_path,
                            'metadata': {
                                'file_name': file_name,
                                'file_path': file_path,
                                'chunk_id': i,
                                'total_chunks': len(chunks),
                            },
                        })
                else:
                    documents.append({
                        'content': content,
                        'source': file_path,
                        'metadata': {'file_name': file_name, 'file_path': file_path},
                    })
        return documents