Spaces:

EN-collab
/

HQ_Project_EN

Running

App Files Files Community

HQ_Project_EN / src /functions_pdf.py

1mpreccable

reworked and updated RAG ED

7381c1f 4 months ago

raw

history blame contribute delete

1.9 kB

	import pymupdf
	from PyPDF2 import PdfReader
	from pdfminer.high_level import extract_text
	from langchain.document_loaders import PDFPlumberLoader
	import streamlit as st

	def pymupdf_pdf_to_text(file_path):
	    """
	    Extract text from a PDF using PyMuPDF.

	    Args:
	        file_path: Either a binary file-like object exposing ``read()``,
	            whose bytes are parsed as a PDF, or a filesystem path to a
	            PDF file.

	    Returns:
	        str: The text of every page, each page followed by a newline.
	    """
	    if hasattr(file_path, "read"):
	        # File-like input: pass the raw bytes to PyMuPDF as an in-memory stream.
	        doc = pymupdf.open(stream=file_path.read(), filetype="pdf")
	    else:
	        # Path input: let PyMuPDF open the file itself.
	        doc = pymupdf.open(file_path, filetype="pdf")
	    # The context manager closes the document even if a page fails to extract.
	    with doc:
	        return "".join(page.get_text() + "\n" for page in doc)

	def pypdf2_pdf_to_text(file_path):
	    """
	    Read a PDF with PyPDF2 and return its text content.

	    Args:
	        file_path (str): Path to the PDF file; passed unchanged to
	            ``PdfReader``.

	    Returns:
	        str: The extracted text of each page in order, with a newline
	        appended after every page.
	    """
	    pdf_pages = PdfReader(file_path).pages
	    return "".join(pdf_page.extract_text() + "\n" for pdf_page in pdf_pages)

	# def pdfminer_pdf_to_text(file_path):
	# """
	# Extract text from a PDF file using pdfminer.

	# Args:
	# file_path (str): Path to the PDF file.

	# Returns:
	# str: Extracted text from the PDF file.
	# """
	# # Implementation for pdfminer extraction goes here
	# text = extract_text(file_path)
	# return text

	def pdfminer_pdf_to_text(pdf_path: str) -> str:
	    """
	    Extract text from a PDF file using pdfminer.

	    Any exception raised during extraction is reported through
	    ``st.error`` and an empty string is returned instead of propagating.

	    Args:
	        pdf_path (str): Path to the PDF file; passed to ``extract_text``.

	    Returns:
	        str: The extracted text with leading and trailing whitespace
	        stripped, or ``""`` if extraction failed.
	    """
	    try:
	        stripped = extract_text(pdf_path).strip()
	    except Exception as e:
	        # Best-effort: show the failure in the Streamlit UI and fall back to "".
	        st.error(f"Error extracting text: {e}")
	        return ""
	    return stripped

	def pdfplumber_pdf_to_text(file_path):
	    """
	    Load a PDF through ``PDFPlumberLoader`` and return its text content.

	    Args:
	        file_path (str): Path to the PDF file; passed unchanged to
	            ``PDFPlumberLoader``.

	    Returns:
	        str: The ``page_content`` of each loaded document in order, with a
	        newline appended after every document.
	    """
	    loaded_docs = PDFPlumberLoader(file_path).load()
	    return "".join(item.page_content + "\n" for item in loaded_docs)