File size: 2,742 Bytes
cb1a5c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# filename: document_processor.py

"""
Module for extracting text from various document formats.
"""

import io
import os

import docx
from PyPDF2 import PdfReader

from log_config import get_logger

logger = get_logger('DocumentProcessor')

def extract_text_from_document(file_path: str) -> str:
    """
    Extracts text from a document based on its file extension.

    The extension is read from the final path component only (the text after
    its last dot) and is compared case-insensitively. Supported extensions
    are ``txt``, ``pdf`` and ``docx``. The format is validated before the
    file is opened, so an unsupported format is always reported as a
    ``ValueError``, even when the path cannot be opened.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The extracted text from the document.

    Raises:
        ValueError: If the file format is not supported (including a file
            name that has no extension at all).
        OSError: If the file cannot be opened.
        Exception: Any error raised by the format-specific extractor; it is
            logged here and re-raised unchanged.
    """
    # Look at the file name only: splitting the whole path on "." would pick
    # up dots in directory names (e.g. "/data.v2/README" -> "v2/readme").
    base_name = os.path.basename(file_path)
    _, dot, suffix = base_name.rpartition(".")
    # No dot in the name means no extension; do not treat the name itself as one.
    file_extension = suffix.lower() if dot else ""
    try:
        # Pick the extractor first so an unsupported format fails fast,
        # without touching the file system.
        if file_extension == "txt":
            extractor = extract_text_from_txt
        elif file_extension == "pdf":
            extractor = extract_text_from_pdf
        elif file_extension == "docx":
            extractor = extract_text_from_docx
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

        with open(file_path, 'rb') as file_obj:
            return extractor(file_obj)
    except Exception as e:
        logger.error(f"Failed to extract text from {file_path}: {str(e)}")
        raise

def extract_text_from_txt(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a text file.

    The whole stream is read and decoded as UTF-8. A leading UTF-8 byte
    order mark, if present, is dropped so it does not appear as a stray
    ``\\ufeff`` character at the start of the returned text.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The decoded text.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8; the error is
            logged and re-raised.
    """
    try:
        content = file_obj.read()
        # 'utf-8-sig' decodes BOM-less UTF-8 exactly like 'utf-8', but also
        # strips a leading BOM that some editors write.
        return content.decode('utf-8-sig')
    except UnicodeDecodeError as e:
        logger.error(f"Unicode decode error: {str(e)}")
        raise

def extract_text_from_pdf(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a PDF file.

    Text is extracted page by page. Pages that yield no text are skipped and
    the remaining page texts are joined with a newline, so the last word of
    one page is not fused with the first word of the next.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The text of all pages, newline-separated, with leading and
            trailing whitespace removed.

    Raises:
        Exception: Any error raised while reading the PDF or extracting
            text; it is logged and re-raised unchanged.
    """
    try:
        reader = PdfReader(file_obj)
        # extract_text() may return None/empty for a page; normalise to ''.
        page_texts = (page.extract_text() or '' for page in reader.pages)
        # Join with a newline (as the DOCX extractor does for paragraphs)
        # rather than '' so page boundaries do not glue words together.
        text = '\n'.join(chunk for chunk in page_texts if chunk)
        return text.strip()
    except Exception as e:
        logger.error(f"Failed to extract text from PDF: {str(e)}")
        raise

def extract_text_from_docx(file_obj: io.BufferedReader) -> str:
    """
    Pulls the paragraph text out of a DOCX file.

    The stream is read fully into memory and handed to ``docx.Document``
    through an in-memory buffer. Paragraphs whose text is empty are skipped;
    the rest are joined with newlines.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The non-empty paragraph texts joined by newlines, with leading
            and trailing whitespace removed.

    Raises:
        Exception: Any error raised while loading or reading the document;
            it is logged and re-raised unchanged.
    """
    try:
        raw_bytes = file_obj.read()
        document = docx.Document(io.BytesIO(raw_bytes))

        kept_lines = []
        for para in document.paragraphs:
            # Skip paragraphs that carry no text.
            if para.text:
                kept_lines.append(para.text)

        return '\n'.join(kept_lines).strip()
    except Exception as e:
        logger.error(f"Failed to extract text from DOCX: {str(e)}")
        raise

# file: document_processor.py (end)