File size: 2,929 Bytes
f1b19d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""

PDF Reader Tool - Extract text and metadata from PDF files

"""
import logging
from typing import Dict, Any
from pathlib import Path

logger = logging.getLogger(__name__)


def read_pdf(file_path: str) -> Dict[str, Any]:
    """
    Read a PDF file and extract its text, page count, and metadata.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with three keys:
            "text": Text of every page that produced any, each prefixed with
                a "--- Page N ---" header and joined by blank lines. Pages
                with no extractable text are omitted; a page whose
                extraction raised is kept as "[Extraction failed]".
            "pages": Total number of pages in the document.
            "metadata": author, creator, producer, subject, title and
                creation_date read from the document metadata ("Unknown"
                for absent entries), or an empty dict when the reader
                reports no metadata.

    Raises:
        FileNotFoundError: If file_path does not exist. This is checked
            before PyPDF2 is imported, so a bad path is reported as such
            even when the library is not installed.
        ImportError: If PyPDF2 is not installed.
        Exception: Any other error raised while reading the PDF is logged
            and re-raised unchanged.
    """
    # Fail fast on a bad path: this needs nothing from PyPDF2, and the
    # exception message already names the offending file.
    if not Path(file_path).exists():
        raise FileNotFoundError(f"PDF file not found: {file_path}")

    try:
        # Imported lazily so the module can be loaded without PyPDF2.
        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)

        # Extract text page by page; one bad page must not abort the rest.
        text_parts = []
        for page_num, page in enumerate(reader.pages, 1):
            try:
                text = page.extract_text()
                if text:
                    text_parts.append(f"--- Page {page_num} ---\n{text}")
            except Exception as e:
                logger.warning(
                    "Failed to extract text from page %s: %s", page_num, e
                )
                text_parts.append(f"--- Page {page_num} ---\n[Extraction failed]")

        full_text = "\n\n".join(text_parts)

        # Read the metadata attribute once instead of once per field.
        info = reader.metadata
        metadata = {}
        if info:
            metadata = {
                "author": info.get("/Author", "Unknown"),
                "creator": info.get("/Creator", "Unknown"),
                "producer": info.get("/Producer", "Unknown"),
                "subject": info.get("/Subject", "Unknown"),
                "title": info.get("/Title", "Unknown"),
                "creation_date": str(info.get("/CreationDate", "Unknown")),
            }

        return {
            "text": full_text,
            "pages": len(reader.pages),
            "metadata": metadata,
        }

    except ImportError:
        logger.error("PyPDF2 not installed. Install with: pip install pypdf2")
        raise
    except Exception as e:
        logger.error("Error reading PDF: %s", e)
        raise


def get_pdf_info(file_path: str) -> Dict[str, Any]:
    """
    Get basic information about a PDF without extracting its text.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with four keys:
            "page_count": Number of pages reported by the reader.
            "is_encrypted": The reader's is_encrypted flag.
            "file_size_bytes": Size of the file on disk, in bytes.
            "file_name": Final component of file_path.

    Raises:
        FileNotFoundError: If file_path does not exist. This is checked
            before PyPDF2 is imported, so a bad path is reported as such
            even when the library is not installed.
        ImportError: If PyPDF2 is not installed.
        Exception: Any other error raised while opening or inspecting the
            PDF is logged and re-raised unchanged.
    """
    pdf_path = Path(file_path)

    # Fail fast on a bad path: this needs nothing from PyPDF2, and the
    # exception message already names the offending file.
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {file_path}")

    try:
        # Imported lazily so the module can be loaded without PyPDF2.
        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)

        return {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "file_size_bytes": pdf_path.stat().st_size,
            "file_name": pdf_path.name,
        }
    except ImportError:
        logger.error("PyPDF2 not installed. Install with: pip install pypdf2")
        raise
    except Exception as e:
        logger.error("Error getting PDF info: %s", e)
        raise