Spaces:
Paused
Paused
File size: 2,742 Bytes
# filename: document_processor.py
"""
Module for extracting text from various document formats.
"""
import io
import os

import docx
from PyPDF2 import PdfReader

from log_config import get_logger
# Module-level logger; the extraction functions below use it to report failures before re-raising.
logger = get_logger('DocumentProcessor')
def extract_text_from_document(file_path: str) -> str:
    """
    Extracts text from a document based on its file extension.

    The extension is read from the final path component only, compared
    case-insensitively, and validated before the file is opened, so an
    unsupported format is reported as such even if the file is missing.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The extracted text from the document.

    Raises:
        ValueError: If the file format is not supported.
        OSError: If the file cannot be opened.
    """
    # Inspect the file name alone: a dot in a parent directory
    # (e.g. "reports.v2/notes") must not be mistaken for an extension.
    file_name = os.path.basename(file_path)
    _, dot, suffix = file_name.rpartition(".")
    file_extension = suffix.lower() if dot else ""

    # Supported extensions mapped to the function that reads that format.
    extractors = {
        "txt": extract_text_from_txt,
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
    }
    try:
        extractor = extractors.get(file_extension)
        if extractor is None:
            # Raised inside the try so it is logged like every other failure.
            raise ValueError(f"Unsupported file format: {file_extension}")
        with open(file_path, 'rb') as file_obj:
            return extractor(file_obj)
    except Exception as e:
        logger.error(f"Failed to extract text from {file_path}: {str(e)}")
        raise
def extract_text_from_txt(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a text file.

    The whole stream is read and decoded as UTF-8. A leading UTF-8 byte
    order mark, if present, is dropped so that it does not appear as a
    stray U+FEFF character at the start of the returned text.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The decoded text.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    try:
        content = file_obj.read()
        # 'utf-8-sig' decodes exactly like 'utf-8' but skips an initial BOM.
        return content.decode('utf-8-sig')
    except UnicodeDecodeError as e:
        logger.error(f"Unicode decode error: {str(e)}")
        raise
def extract_text_from_pdf(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a PDF file.

    Pages that yield no text are skipped. The remaining pages are joined
    with a newline so that the last word of one page is not fused with
    the first word of the next.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The text of all pages, newline-separated, with leading and
            trailing whitespace removed.

    Raises:
        Exception: Any error raised while reading the PDF is logged and
            re-raised unchanged.
    """
    try:
        reader = PdfReader(file_obj)
        # extract_text() may return None or an empty string; normalise to ''.
        page_texts = (page.extract_text() or '' for page in reader.pages)
        text = '\n'.join(page_text for page_text in page_texts if page_text)
        return text.strip()
    except Exception as e:
        logger.error(f"Failed to extract text from PDF: {str(e)}")
        raise
def extract_text_from_docx(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a DOCX file.

    The stream is read fully into an in-memory buffer, which is then
    handed to the DOCX parser. Only paragraphs with non-empty text are kept.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The non-empty paragraph texts joined by newlines, with leading
            and trailing whitespace removed.

    Raises:
        Exception: Any error raised while reading the document is logged
            and re-raised unchanged.
    """
    try:
        raw_bytes = file_obj.read()
        buffer = io.BytesIO(raw_bytes)
        document = docx.Document(buffer)
        non_empty_lines = [para.text for para in document.paragraphs if para.text]
        joined = '\n'.join(non_empty_lines)
        return joined.strip()
    except Exception as e:
        logger.error(f"Failed to extract text from DOCX: {str(e)}")
        raise
# file: document_processor.py (end)