VidSimplify / manimator /inputs /pdf_parser.py
Adityahulk
adding integration for video directly in streamlit app
12fe8d7
Raw
History Blame Contribute Delete
1.3 kB
import logging
from pathlib import Path
from typing import Optional, Union
import PyPDF2
logger = logging.getLogger(__name__)
class PDFParser:
    """
    Extracts text from PDF files.

    The class is a stateless namespace: its only entry point is the static
    method :meth:`parse`, so no instance needs to be created.
    """

    @staticmethod
    def parse(file_path: Union[str, Path]) -> str:
        """
        Extract text from a PDF file.

        Every page is read in order with ``PyPDF2.PdfReader``. Pages that
        yield no text are skipped, and the remaining page texts are joined
        with a blank line between them.

        Args:
            file_path: Location of the PDF, as a string or ``Path``.

        Returns:
            The concatenated text of all pages that produced text; an empty
            string when no page produced any.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            RuntimeError: If the file cannot be opened or parsed. The
                underlying exception is attached as ``__cause__``.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {path}")

        logger.info("Parsing PDF: %s", path)

        try:
            # The reader pulls page data from the file handle on demand, so
            # all extraction has to happen while the file is still open.
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                logger.info("PDF has %s pages", len(reader.pages))
                extracted = (page.extract_text() for page in reader.pages)
                # Drop pages with no extractable text (empty or None result).
                text_content = [text for text in extracted if text]
        except Exception as e:
            # Deliberately broad: any open/parse failure is reported to the
            # caller as a single RuntimeError, chained to the original error.
            logger.exception("Error parsing PDF: %s", e)
            raise RuntimeError(f"Failed to parse PDF: {e}") from e

        full_text = "\n\n".join(text_content)
        logger.info("Extracted %s characters from PDF", len(full_text))
        return full_text