Spaces:
Running
Running
| import pymupdf | |
| from PyPDF2 import PdfReader | |
| from pdfminer.high_level import extract_text | |
| from langchain.document_loaders import PDFPlumberLoader | |
| import streamlit as st | |
def pymupdf_pdf_to_text(file_path):
    """
    Extract text from a PDF using PyMuPDF.

    Args:
        file_path: Either a binary file-like object exposing ``read()``
            (its bytes are handed to PyMuPDF as an in-memory stream), or a
            filesystem path that PyMuPDF opens directly.

    Returns:
        str: Text of every page in order, each page followed by a newline.
    """
    if hasattr(file_path, "read"):
        # Stream input: PyMuPDF cannot sniff the type from a name, so the
        # file type is stated explicitly.
        open_kwargs = {"stream": file_path.read(), "filetype": "pdf"}
    else:
        open_kwargs = {"filename": file_path}

    # The context manager closes the document even if a page fails to parse.
    with pymupdf.open(**open_kwargs) as doc:
        return "".join(page.get_text() + "\n" for page in doc)
def pypdf2_pdf_to_text(file_path):
    """
    Extract text from a PDF using PyPDF2.

    Args:
        file_path: The PDF source, passed unchanged to ``PdfReader``.

    Returns:
        str: Text of every page in order, each page followed by a newline.
    """
    pdf = PdfReader(file_path)
    page_texts = [page.extract_text() + "\n" for page in pdf.pages]
    return "".join(page_texts)
def pdfminer_pdf_to_text(pdf_path: str) -> str:
    """
    Extract text from a PDF using pdfminer.

    Args:
        pdf_path: The PDF source, passed unchanged to ``extract_text``.

    Returns:
        str: Extracted text with leading and trailing whitespace stripped,
        or an empty string if extraction raised an exception.

    Extraction errors are not propagated; they are shown to the user via
    ``st.error``.
    """
    try:
        text = extract_text(pdf_path)
    except Exception as e:
        # Deliberate best-effort at the UI boundary: report the failure in
        # the Streamlit page and return an empty result instead of crashing.
        st.error(f"Error extracting text: {e}")
        return ""
    return text.strip()
def pdfplumber_pdf_to_text(file_path):
    """
    Extract text from a PDF through LangChain's ``PDFPlumberLoader``.

    Args:
        file_path (str): Path to the PDF file, handed to the loader.

    Returns:
        str: The ``page_content`` of each loaded document in order, each
        followed by a newline.
    """
    loaded_docs = PDFPlumberLoader(file_path).load()
    return "".join(item.page_content + "\n" for item in loaded_docs)