GVAmaresh
/

repetitive-project-backend

Model card Files Files and versions

repetitive-project-backend / comparator /extract /extract.py

GVAmaresh

ini

c87f53a almost 2 years ago

history blame contribute delete

1.63 kB

	import PyPDF2
	from PIL import Image
	import pytesseract
	import io
	import sys


	# def extract_text(file_path, output_file_path):
	# text = ""
	# try:
	# if file_path.lower().endswith(".pdf"):
	# text = extract_text_from_pdf(file_path)
	# else:
	# print("Unsupported file format")

	# with open(output_file_path, "w") as output_file:
	# print("Run output")
	# for line in text.splitlines():
	# print(line)
	# output_file.write(text)
	# print(f"Extracted text saved to {output_file_path}")

	# except Exception as e:
	# print("An error occurred:", e)


	# def extract_text_from_image(file_path):
	# image_path = file_path
	# img = Image.open(image_path)
	# text = pytesseract.image_to_string(img)
	# print(text[:-1])


	import PyPDF2

	def extract_text_from_pdf(pdf_file_path):
	    """Return the text of the first PDF page that contains "ABSTRACT".

	    Pages are scanned in document order and the search stops at the first
	    page whose extracted text contains the literal, case-sensitive
	    substring ``"ABSTRACT"``.

	    Args:
	        pdf_file_path: Path of the PDF file; it is opened in binary mode.

	    Returns:
	        The matching page's text followed by a newline, an empty string
	        when no page contains the marker, or ``None`` when opening or
	        parsing the file raised an exception (the error is printed to
	        stdout rather than propagated).
	    """
	    try:
	        with open(pdf_file_path, "rb") as file:
	            for page in PyPDF2.PdfReader(file).pages:
	                # Treat a page with no extractable text as empty so one
	                # blank/scanned page cannot abort the whole search.
	                page_text = page.extract_text() or ""
	                if "ABSTRACT" in page_text:
	                    return page_text + "\n"
	        # Every page was inspected and none contained the marker.
	        return ""
	    except Exception as e:
	        # Best-effort contract kept from the original: report and signal
	        # failure with None instead of raising to the caller.
	        print("An error occurred:", e)
	        return None

	# if __name__ == "__main__":
	# import PyPDF2

	# file_path = "./report.pdf"
	# output_file_path = "./extracted_text.txt"
	# extract_text_from_pdf(file_path, output_file_path)