File size: 885 Bytes
c9ed90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from pypdf import PdfReader
from io import BytesIO
from typing import Union

def extract_text_from_pdf(pdf_content: Union[bytes, BytesIO]) -> str:
    """
    Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        pdf_content: PDF file content, either as raw bytes (``bytearray`` and
            ``memoryview`` are accepted as well) or as an already-open
            ``BytesIO`` stream.

    Returns:
        str: The text of all pages, in page order, separated by ``"\n"``.

    Raises:
        ValueError: If anything goes wrong while reading or parsing the PDF.
            The underlying exception is attached as ``__cause__`` so the
            original failure is still visible in tracebacks.
    """
    try:
        # Wrap raw byte buffers in a seekable stream; anything else is
        # handed to PdfReader unchanged.
        if isinstance(pdf_content, (bytes, bytearray, memoryview)):
            pdf_content = BytesIO(pdf_content)

        pdf_reader = PdfReader(pdf_content)

        # Defensive: treat a falsy extraction result (None or "") as an
        # empty page so that join never sees a non-str element.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

        return "\n".join(page_texts)

    except Exception as e:
        # Keep the single ValueError contract callers rely on, but chain the
        # original error explicitly instead of hiding it behind the message.
        raise ValueError(f"Error extracting text from PDF: {str(e)}") from e