| import PyPDF2 | |
| from PIL import Image | |
| import pytesseract | |
| import io | |
| import sys | |
| # def extract_text(file_path, output_file_path): | |
| # text = "" | |
| # try: | |
| # if file_path.lower().endswith(".pdf"): | |
| # text = extract_text_from_pdf(file_path) | |
| # else: | |
| # print("Unsupported file format") | |
| # with open(output_file_path, "w") as output_file: | |
| # print("Run output") | |
| # for line in text.splitlines(): | |
| # print(line) | |
| # output_file.write(text) | |
| # print(f"Extracted text saved to {output_file_path}") | |
| # except Exception as e: | |
| # print("An error occurred:", e) | |
| # def extract_text_from_image(file_path): | |
| # image_path = file_path | |
| # img = Image.open(image_path) | |
| # text = pytesseract.image_to_string(img) | |
| # print(text[:-1]) | |
| import PyPDF2 | |
def extract_text_from_pdf(pdf_file_path, *, marker="ABSTRACT"):
    """Return the text of the first PDF page that contains *marker*.

    Pages are scanned in document order and scanning stops at the first
    page whose extracted text contains ``marker`` (case-sensitive
    substring match).

    Args:
        pdf_file_path: Path of the PDF file to read; it is opened in
            binary mode.
        marker: Substring that identifies the wanted page. Defaults to
            ``"ABSTRACT"``, the value that was previously hard-coded.

    Returns:
        The matching page's text followed by a newline, ``""`` when no
        page contains ``marker``, or ``None`` when opening or parsing the
        file raised an exception (the error is printed to stdout).
    """
    try:
        with open(pdf_file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # Guard against a falsy extraction result (None or empty)
                # so one unreadable page does not abort the whole scan
                # with a TypeError on the membership test below.
                page_text = page.extract_text() or ""
                if marker in page_text:
                    # Only the first matching page is wanted, so return
                    # immediately instead of accumulating and breaking.
                    return page_text + "\n"
        return ""
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None instead of
        # an exception, and the message format is kept unchanged.
        print("An error occurred:", e)
        return None
| # if __name__ == "__main__": | |
| # import PyPDF2 | |
| # file_path = "./report.pdf" | |
| # output_file_path = "./extracted_text.txt" | |
| # extract_text_from_pdf(file_path, output_file_path) |