# PDF Processor
#
# Split pdf into smaller pieces
import os
import math
from PyPDF2 import PdfReader, PdfWriter

PART_SIZE = 10     # pages per output part used by the batch driver below
PART_OVERLAP = 0   # pages shared by consecutive parts in the batch driver below


def _plan_parts(total_pages: int, pages_per_part: int, overlap: int) -> list:
    """Return the half-open (start, end) page-index range of every part.

    Expects ``total_pages >= 1`` and ``0 <= overlap < pages_per_part``; the
    caller is responsible for validating that.
    """
    step = pages_per_part - overlap
    if total_pages > pages_per_part:
        # One part for the first window, plus one per additional step needed
        # for the window to reach the last page.
        total_parts = math.ceil((total_pages - pages_per_part) / step) + 1
    else:
        total_parts = 1
    return [
        (part_num * step, min(part_num * step + pages_per_part, total_pages))
        for part_num in range(total_parts)
    ]


def split_pdf(pdf_file_path: str, output_dir: str, pages_per_part: int = 30, overlap: int = 5):
    """
    Splits a PDF file into multiple parts with overlapping pages.

    Each part is written to ``output_dir`` as
    ``<source name>_Part<k>_of_<total>.pdf``. Problems are reported with
    ``print`` instead of being raised: invalid arguments or an unreadable
    source abort the split, while a part that cannot be written is reported
    and skipped so the remaining parts are still produced.

    Args:
        pdf_file_path (str): The full path to the source PDF file.
        output_dir (str): The directory where the output parts will be saved.
            It is created if it does not exist.
        pages_per_part (int): The number of pages each output part should
            have (the last part may be shorter). Must be at least 1.
            Defaults to 30.
        overlap (int): The number of pages that should overlap between
            consecutive parts. Must satisfy ``0 <= overlap < pages_per_part``.
            Defaults to 5.

    Returns:
        None.
    """
    # isfile (not exists) so that a directory path is rejected up front.
    if not os.path.isfile(pdf_file_path):
        print(f"Error: The file '{pdf_file_path}' was not found.")
        return

    if overlap >= pages_per_part:
        print("Error: Overlap must be smaller than the number of pages per part.")
        return

    # A non-positive part size would produce empty output files.
    if pages_per_part < 1:
        print("Error: pages_per_part must be at least 1.")
        return

    # A negative overlap makes the step larger than the part size, which
    # would silently drop the pages that fall between consecutive parts.
    if overlap < 0:
        print("Error: Overlap must not be negative.")
        return

    try:
        os.makedirs(output_dir, exist_ok=True)
        reader = PdfReader(pdf_file_path)
        total_pages = len(reader.pages)
        if total_pages == 0:
            print("Error: The source PDF has no pages.")
            return
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
        return

    part_ranges = _plan_parts(total_pages, pages_per_part, overlap)
    total_parts = len(part_ranges)
    base_filename = os.path.splitext(os.path.basename(pdf_file_path))[0]

    for part_num, (start_page, end_page) in enumerate(part_ranges):
        output_filename = f"{base_filename}_Part{part_num + 1}_of_{total_parts}.pdf"
        output_filepath = os.path.join(output_dir, output_filename)

        writer = PdfWriter()
        for page_index in range(start_page, end_page):
            writer.add_page(reader.pages[page_index])

        try:
            with open(output_filepath, "wb") as out_pdf:
                writer.write(out_pdf)
        except Exception as e:
            # Best effort: report the failed part and continue with the rest.
            print(f"Could not write file '{output_filepath}'. Reason: {e}")


if __name__ == '__main__':
    # NOTE(review): machine-specific absolute path; point this at your own
    # directory of scanned PDFs before running.
    pdf_dir = r"C:\Users\vuvan\Desktop\An_Plaza\ViMedLLM\Vietnamese-Medical-LLM\dataset\RAG_Data\Download sach y\Scan"

    # Match the extension case-insensitively so files named "*.PDF" are not
    # skipped, and sort because os.listdir() order is platform-dependent.
    pdf_files = sorted(
        os.path.join(pdf_dir, f)
        for f in os.listdir(pdf_dir)
        if f.lower().endswith('.pdf')
    )

    from tqdm import tqdm

    # The batch split is intentionally left disabled; uncomment to run it.
    # output_directory = r"C:\Users\vuvan\Desktop\An_Plaza\ViMedLLM\Vietnamese-Medical-LLM\dataset\RAG_Data\Download sach y\Scan\Splitted"
    # for pdf_to_split in tqdm(pdf_files, desc="Processing PDFs", total=len(pdf_files)):
    #     split_pdf(
    #         pdf_file_path=pdf_to_split,
    #         output_dir=output_directory,
    #         pages_per_part=PART_SIZE,
    #         overlap=PART_OVERLAP
    #     )
"Cardiovascular Intervention - A Companion to Braunwald’s Heart Disease 1e.pdf\n", "Chronic Coronary Artery Disease - A Companion to Braunwald’s Heart Disease.pdf\n", "Clinical Arrhythmology and Electrophysiology - A Companion to Braunwald's Heart Disease 3rd Edition 2019.pdf\n", "Error reading Clinical Arrhythmology and Electrophysiology - A Companion to Braunwald's Heart Disease 3rd Edition 2019.pdf: Invalid Elementary Object starting with b'\\xd4' @13655304: b'\\xc7z\\xfb\\xc6\\xd4\\xd6\\xdc\\x8d\\x8a\\xf2=j\\xca\\xc4\\x9dS\\x0c\\xb4 \\xfb\\xd4\\xbei\\x07\\xa5\\xab{e\\xd4\\xcd\\x07\\x1eN\\xe6\\xd6\\ny\\xa0\\xd7&/\\xf1\\x1eF9?\\xbe\\xd2\\xb7\\xf7\\x959}\\x12\"\\x94\\xf9&I\\x82\\x00\\x01\\x00\\x00\\x80\\x00@\\x00 \\x00\\x10\\x00\\x08\\x00\\x04\\x00\\x02\\x00\\x01\\x00'\n", "Clinical Arrhythmology and Electrophysiology 2e.pdf\n", "Clinical Lipidology A Campanion Braunwalds Heart Disease 2nd Edition.pdf\n", "Color Atlas and Synopsis of Vascular Disease 1e (1).pdf\n", "Color Atlas and Synopsis of Vascular Disease 1e.pdf\n", "Current Diagnosis and Treatment Cardiology 5e.pdf\n", "Diabetes in Cardiovascular Disease - A Companion to Braunwalds Heart Disease.pdf\n", "Essential Echocardiography - A Companion to Braunwald’s Heart Disease.pdf\n", "Fundamentals of Cardiology For the USMLE and General Medics.pdf\n", "Goldberger's Clinical Electrocardiography 9e.pdf\n", "Harrison’s Cardiovascular Medicine 3e.pdf\n", "Heart Failure A Companion to Braunwalds Heart Disease 3rd Edition[PNT].pdf\n", "Hghlghts_2020ECCGuidelines_hoi_sinh_tim_phoi_Vietnamese.pdf\n", "Hurst's the Heart - 2 Vol Set - 14e.pdf\n", "Hypertension A Companion to Braunwald's Heart Disease 3e.pdf\n", "Imaging Coronary Arteries-Springer-Verlag Mailand (2013).pdf\n", "Interventional Cardiology - Principles and Practice 2e.pdf\n", "Kaplans Clinical Hypertension 11th edition.pdf\n", "Khuyen Cao Chan Doan Va Dieu Tri Tang Huyet Ap 2015.pdf\n", "Khuyen Cao VNHA 2010.pdf\n", "Myocardial Infarction - A Companion to 
Braunwald's Heart Disease (2016).pdf\n", "PD-dieutri2022-BV Tim.pdf\n", "Textbook of Interventional Cardiology 7th 2015.pdf\n", "The EHRA Book of Interventional Electrophysiology.pdf\n", "The Washington Manual of Cardiology Subspecialty Consult-LWW (2014).pdf\n", "Thuc Hanh Benh Tim Mach - Nguyen Lan Viet.pdf\n", "Thuoc Tim Mach.pdf\n", "Thuốc chẹn beta trong lâm sàng.pdf\n", "Vascular Medicine- A Companion to Braunwalds Heart Disease 2ed.pdf\n" ] }, { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "file_name", "rawType": "object", "type": "string" }, { "name": "file_size", "rawType": "float64", "type": "float" }, { "name": "num_pages", "rawType": "int64", "type": "integer" } ], "ref": "77060bd6-9391-4b57-905f-f08a09dccd24", "rows": [ [ "0", "Abrams Angiography Interventional Radiology-LWW (2013).pdf", "128.5", "1240" ], [ "1", "Benh Mach Vanh - Nguyen Huy Dung.pdf", "120.24", "475" ], [ "2", "Braunwald's Heart Disease Review and Assessment 11e.pdf", "26.97", "315" ], [ "3", "Braunwald's Heart Disease-A Textbook of Cardiovascular Medicine, 2-Volume Set, 11e.pdf", "529.44", "2350" ], [ "4", "Cardiac Electrophysiology From Cell to Bedside 6e.pdf", "295.21", "1320" ], [ "5", "Cardiology An Illustrated Textbook (Jaypee) (2013).pdf", "72.42", "2174" ], [ "6", "Cardiology Board Review 2019.pdf", "10.97", "234" ], [ "7", "Cardiovascular Intervention - A Companion to Braunwald’s Heart Disease 1e.pdf", "53.59", "653" ], [ "8", "Chronic Coronary Artery Disease - A Companion to Braunwald’s Heart Disease.pdf", "51.18", "514" ], [ "9", "Clinical Arrhythmology and Electrophysiology - A Companion to Braunwald's Heart Disease 3rd Edition 2019.pdf", "89.97", "-1" ], [ "10", "Clinical Arrhythmology and Electrophysiology 2e.pdf", "100.53", "734" ], [ "11", "Clinical Lipidology A Campanion Braunwalds Heart Disease 2nd Edition.pdf", "68.53", "540" ], [ "12", "Color Atlas and 
Synopsis of Vascular Disease 1e (1).pdf", "79.85", "495" ], [ "13", "Color Atlas and Synopsis of Vascular Disease 1e.pdf", "79.85", "495" ], [ "14", "Current Diagnosis and Treatment Cardiology 5e.pdf", "24.7", "1050" ], [ "15", "Diabetes in Cardiovascular Disease - A Companion to Braunwalds Heart Disease.pdf", "31.37", "393" ], [ "16", "Essential Echocardiography - A Companion to Braunwald’s Heart Disease.pdf", "42.59", "571" ], [ "17", "Fundamentals of Cardiology For the USMLE and General Medics.pdf", "4.59", "287" ], [ "18", "Goldberger's Clinical Electrocardiography 9e.pdf", "17.7", "359" ], [ "19", "Harrison’s Cardiovascular Medicine 3e.pdf", "112.7", "760" ], [ "20", "Heart Failure A Companion to Braunwalds Heart Disease 3rd Edition[PNT].pdf", "118.31", "757" ], [ "21", "Hghlghts_2020ECCGuidelines_hoi_sinh_tim_phoi_Vietnamese.pdf", "8.85", "32" ], [ "22", "Hurst's the Heart - 2 Vol Set - 14e.pdf", "218.3", "2613" ], [ "23", "Hypertension A Companion to Braunwald's Heart Disease 3e.pdf", "9.94", "503" ], [ "24", "Imaging Coronary Arteries-Springer-Verlag Mailand (2013).pdf", "16.57", "265" ], [ "25", "Interventional Cardiology - Principles and Practice 2e.pdf", "49.37", "810" ], [ "26", "Kaplans Clinical Hypertension 11th edition.pdf", "19.61", "675" ], [ "27", "Khuyen Cao Chan Doan Va Dieu Tri Tang Huyet Ap 2015.pdf", "1.8", "36" ], [ "28", "Khuyen Cao VNHA 2010.pdf", "11.26", "273" ], [ "29", "Myocardial Infarction - A Companion to Braunwald's Heart Disease (2016).pdf", "33.64", "527" ], [ "30", "PD-dieutri2022-BV Tim.pdf", "17.34", "728" ], [ "31", "Textbook of Interventional Cardiology 7th 2015.pdf", "236.29", "1114" ], [ "32", "The EHRA Book of Interventional Electrophysiology.pdf", "19.28", "321" ], [ "33", "The Washington Manual of Cardiology Subspecialty Consult-LWW (2014).pdf", "20.96", "729" ], [ "34", "Thuc Hanh Benh Tim Mach - Nguyen Lan Viet.pdf", "9.8", "301" ], [ "35", "Thuoc Tim Mach.pdf", "149.2", "704" ], [ "36", "Thuốc chẹn beta trong lâm 
sàng.pdf", "51.81", "158" ], [ "37", "Vascular Medicine- A Companion to Braunwalds Heart Disease 2ed.pdf", "74.61", "859" ] ], "shape": { "columns": 3, "rows": 38 } }, "text/html": [ "
| \n", " | file_name | \n", "file_size | \n", "num_pages | \n", "
|---|---|---|---|
| 0 | \n", "Abrams Angiography Interventional Radiology-LW... | \n", "128.50 | \n", "1240 | \n", "
| 1 | \n", "Benh Mach Vanh - Nguyen Huy Dung.pdf | \n", "120.24 | \n", "475 | \n", "
| 2 | \n", "Braunwald's Heart Disease Review and Assessmen... | \n", "26.97 | \n", "315 | \n", "
| 3 | \n", "Braunwald's Heart Disease-A Textbook of Cardio... | \n", "529.44 | \n", "2350 | \n", "
| 4 | \n", "Cardiac Electrophysiology From Cell to Bedside... | \n", "295.21 | \n", "1320 | \n", "
| 5 | \n", "Cardiology An Illustrated Textbook (Jaypee) (2... | \n", "72.42 | \n", "2174 | \n", "
| 6 | \n", "Cardiology Board Review 2019.pdf | \n", "10.97 | \n", "234 | \n", "
| 7 | \n", "Cardiovascular Intervention - A Companion to B... | \n", "53.59 | \n", "653 | \n", "
| 8 | \n", "Chronic Coronary Artery Disease - A Companion ... | \n", "51.18 | \n", "514 | \n", "
| 9 | \n", "Clinical Arrhythmology and Electrophysiology -... | \n", "89.97 | \n", "-1 | \n", "
| 10 | \n", "Clinical Arrhythmology and Electrophysiology 2... | \n", "100.53 | \n", "734 | \n", "
| 11 | \n", "Clinical Lipidology A Campanion Braunwalds Hea... | \n", "68.53 | \n", "540 | \n", "
| 12 | \n", "Color Atlas and Synopsis of Vascular Disease 1... | \n", "79.85 | \n", "495 | \n", "
| 13 | \n", "Color Atlas and Synopsis of Vascular Disease 1... | \n", "79.85 | \n", "495 | \n", "
| 14 | \n", "Current Diagnosis and Treatment Cardiology 5e.pdf | \n", "24.70 | \n", "1050 | \n", "
| 15 | \n", "Diabetes in Cardiovascular Disease - A Compani... | \n", "31.37 | \n", "393 | \n", "
| 16 | \n", "Essential Echocardiography - A Companion to Br... | \n", "42.59 | \n", "571 | \n", "
| 17 | \n", "Fundamentals of Cardiology For the USMLE and G... | \n", "4.59 | \n", "287 | \n", "
| 18 | \n", "Goldberger's Clinical Electrocardiography 9e.pdf | \n", "17.70 | \n", "359 | \n", "
| 19 | \n", "Harrison’s Cardiovascular Medicine 3e.pdf | \n", "112.70 | \n", "760 | \n", "
| 20 | \n", "Heart Failure A Companion to Braunwalds Heart ... | \n", "118.31 | \n", "757 | \n", "
| 21 | \n", "Hghlghts_2020ECCGuidelines_hoi_sinh_tim_phoi_V... | \n", "8.85 | \n", "32 | \n", "
| 22 | \n", "Hurst's the Heart - 2 Vol Set - 14e.pdf | \n", "218.30 | \n", "2613 | \n", "
| 23 | \n", "Hypertension A Companion to Braunwald's Heart ... | \n", "9.94 | \n", "503 | \n", "
| 24 | \n", "Imaging Coronary Arteries-Springer-Verlag Mail... | \n", "16.57 | \n", "265 | \n", "
| 25 | \n", "Interventional Cardiology - Principles and Pra... | \n", "49.37 | \n", "810 | \n", "
| 26 | \n", "Kaplans Clinical Hypertension 11th edition.pdf | \n", "19.61 | \n", "675 | \n", "
| 27 | \n", "Khuyen Cao Chan Doan Va Dieu Tri Tang Huyet Ap... | \n", "1.80 | \n", "36 | \n", "
| 28 | \n", "Khuyen Cao VNHA 2010.pdf | \n", "11.26 | \n", "273 | \n", "
| 29 | \n", "Myocardial Infarction - A Companion to Braunwa... | \n", "33.64 | \n", "527 | \n", "
| 30 | \n", "PD-dieutri2022-BV Tim.pdf | \n", "17.34 | \n", "728 | \n", "
| 31 | \n", "Textbook of Interventional Cardiology 7th 2015... | \n", "236.29 | \n", "1114 | \n", "
| 32 | \n", "The EHRA Book of Interventional Electrophysiol... | \n", "19.28 | \n", "321 | \n", "
| 33 | \n", "The Washington Manual of Cardiology Subspecial... | \n", "20.96 | \n", "729 | \n", "
| 34 | \n", "Thuc Hanh Benh Tim Mach - Nguyen Lan Viet.pdf | \n", "9.80 | \n", "301 | \n", "
| 35 | \n", "Thuoc Tim Mach.pdf | \n", "149.20 | \n", "704 | \n", "
| 36 | \n", "Thuốc chẹn beta trong lâm sàng.pdf | \n", "51.81 | \n", "158 | \n", "
| 37 | \n", "Vascular Medicine- A Companion to Braunwalds H... | \n", "74.61 | \n", "859 | \n", "