Spaces:
Running
Running
File size: 1,896 Bytes
44be36b 7381c1f 44be36b 7381c1f 44be36b 7381c1f 44be36b 7381c1f 44be36b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import pymupdf
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
from langchain.document_loaders import PDFPlumberLoader
import streamlit as st
def pymupdf_pdf_to_text(file_path):
    """
    Extract text from a PDF using PyMuPDF.

    Args:
        file_path: Either a binary file-like object exposing ``read()``
            (its bytes are handed to PyMuPDF as an in-memory stream), or a
            filesystem path to the PDF file.

    Returns:
        str: The text of every page, each page followed by a newline.
    """
    if hasattr(file_path, "read"):
        # File-like input: PyMuPDF cannot sniff a name from raw bytes, so the
        # file type has to be stated explicitly.
        doc = pymupdf.open(stream=file_path.read(), filetype="pdf")
    else:
        # Plain path input, as the parameter name suggests.
        doc = pymupdf.open(file_path)

    # The context manager closes the document (releasing its buffers/handle)
    # even if text extraction raises part-way through.
    with doc:
        return "".join(page.get_text() + "\n" for page in doc)
def pypdf2_pdf_to_text(file_path):
    """
    Extract text from a PDF using PyPDF2.

    Args:
        file_path: The PDF source, passed unchanged to ``PdfReader``.

    Returns:
        str: The extracted text of every page, each page followed by a
        newline.
    """
    pdf = PdfReader(file_path)
    # One "<page text>\n" chunk per page, concatenated in page order.
    page_chunks = (pg.extract_text() + "\n" for pg in pdf.pages)
    return "".join(page_chunks)
# def pdfminer_pdf_to_text(file_path):
# """
# Extract text from a PDF file using pdfminer.
# Args:
# file_path (str): Path to the PDF file.
# Returns:
# str: Extracted text from the PDF file.
# """
# # Implementation for pdfminer extraction goes here
# text = extract_text(file_path)
# return text
def pdfminer_pdf_to_text(pdf_path: str) -> str:
    """
    Extract text from a PDF using pdfminer.

    Any exception raised during extraction is reported through
    ``st.error`` and an empty string is returned instead of propagating.

    Args:
        pdf_path (str): The PDF source, passed unchanged to ``extract_text``.

    Returns:
        str: The extracted text with leading and trailing whitespace
        stripped, or ``""`` when extraction failed.
    """
    try:
        return extract_text(pdf_path).strip()
    except Exception as e:
        # Best-effort: surface the failure in the UI, then fall through to
        # the empty-string result.
        st.error(f"Error extracting text: {e}")
    return ""
def pdfplumber_pdf_to_text(file_path):
    """
    Extract text from a PDF using LangChain's ``PDFPlumberLoader``.

    Args:
        file_path: The PDF location, passed unchanged to
            ``PDFPlumberLoader``.

    Returns:
        str: The ``page_content`` of every loaded document, each followed
        by a newline, in the order the loader returned them.
    """
    loaded = PDFPlumberLoader(file_path).load()
    # One "<content>\n" chunk per loaded document.
    return "".join(item.page_content + "\n" for item in loaded)
|