{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "287f0df4", "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipython-input-3329394316.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mgoogle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolab\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdrive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdrive\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/content/drive'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36mmount\u001b[0;34m(mountpoint, force_remount, timeout_ms, readonly)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_ms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m120000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreadonly\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;34m\"\"\"Mount your Google Drive at the specified mountpoint path.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 97\u001b[0;31m return _mount(\n\u001b[0m\u001b[1;32m 98\u001b[0m \u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforce_remount\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36m_mount\u001b[0;34m(mountpoint, force_remount, timeout_ms, ephemeral, readonly)\u001b[0m\n\u001b[1;32m 132\u001b[0m )\n\u001b[1;32m 133\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mephemeral\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m _message.blocking_request(\n\u001b[0m\u001b[1;32m 135\u001b[0m \u001b[0;34m'request_auth'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'authType'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'dfs_ephemeral'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mblocking_request\u001b[0;34m(request_type, request, timeout_sec, parent)\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0mrequest_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpect_reply\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m )\n\u001b[0;32m--> 176\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mread_reply_from_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_sec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mread_reply_from_input\u001b[0;34m(message_id, timeout_sec)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_next_input_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_NOT_READY\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreply\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.025\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m if (\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive', force_remount=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "f6891108", "metadata": {}, "outputs": [], "source": [ "# 2. Install dependencies\n", "# Cài đặt hệ thống Tesseract, ngôn ngữ Tiếng Việt và các thư viện development cần thiết để build tesserocr\n", "!sudo apt-get update > /dev/null\n", "!sudo apt-get install -y tesseract-ocr tesseract-ocr-vie libtesseract-dev libleptonica-dev pkg-config > /dev/null\n", "\n", "# Cài đặt tesserocr (Python wrapper cho Tesseract) và docling\n", "# Lưu ý: tesserocr cần được build từ source nên cần các thư viện dev ở trên\n", "!pip install tesserocr docling pypdfium2" ] }, { "cell_type": "code", "execution_count": null, "id": "ca42bfce", "metadata": {}, "outputs": [], "source": [ "# 3. Extract Data\n", "import os\n", "import zipfile\n", "\n", "# Path to your zip file on Drive\n", "zip_path = '/content/drive/MyDrive/data_rag.zip' \n", "extract_path = '/content/data_rag/files'\n", "\n", "if not os.path.exists(extract_path):\n", " os.makedirs(extract_path, exist_ok=True)\n", " print(f\"Extracting {zip_path}...\")\n", " try:\n", " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n", " zip_ref.extractall(extract_path)\n", " print(\"Done extraction!\")\n", " except FileNotFoundError:\n", " print(f\"❌ File not found: {zip_path}. Please check the path.\")\n", "else:\n", " print(\"Files already extracted.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "988f7e96", "metadata": {}, "outputs": [], "source": [ "# 4. Define Processor Class (Refactored for High Quality & Performance with Tesseract)\n", "import json\n", "import os\n", "import logging\n", "import shutil\n", "import re\n", "import gc\n", "import signal\n", "from pathlib import Path\n", "from typing import Optional\n", "\n", "# --- AUTO-CONFIG TESSERACT DATA PATH ---\n", "# Fix lỗi \"No language models have been detected\"\n", "# Tự động tìm đường dẫn chứa file ngôn ngữ (vie.traineddata) và set biến môi trường\n", "def setup_tesseract_path():\n", " possible_paths = [\n", " \"/usr/share/tesseract-ocr/4.00/tessdata\",\n", " \"/usr/share/tesseract-ocr/5/tessdata\",\n", " \"/usr/share/tesseract-ocr/tessdata\",\n", " \"/usr/local/share/tessdata\"\n", " ]\n", " \n", " found = False\n", " for path in possible_paths:\n", " if os.path.exists(os.path.join(path, \"vie.traineddata\")):\n", " os.environ[\"TESSDATA_PREFIX\"] = path\n", " print(f\"✅ Found Tesseract data at: {path}\")\n", " print(f\" Set TESSDATA_PREFIX={path}\")\n", " found = True\n", " break\n", " \n", " if not found:\n", " print(\"⚠️ WARNING: Could not find 'vie.traineddata'. Tesseract might fail.\")\n", " print(\" Please run Cell #2 to install tesseract-ocr-vie.\")\n", "\n", "setup_tesseract_path()\n", "\n", "# Setup logging\n", "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')\n", "logger = logging.getLogger(__name__)\n", "\n", "# Docling imports\n", "from docling.document_converter import DocumentConverter, FormatOption\n", "from docling.datamodel.base_models import InputFormat\n", "from docling.datamodel.pipeline_options import (\n", " PdfPipelineOptions, \n", " TableStructureOptions,\n", " AcceleratorOptions,\n", " AcceleratorDevice,\n", " TesseractOcrOptions # SỬ DỤNG TESSERACT CHO ĐỘ CHÍNH XÁC CAO NHẤT\n", ")\n", "from docling.datamodel.settings import settings\n", "from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend\n", "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline\n", "\n", "class ColabDoclingProcessor:\n", " def __init__(self, output_dir: str, use_ocr: bool = True, timeout: int = 300):\n", " self.output_dir = output_dir\n", " self.use_ocr = use_ocr\n", " self.timeout = timeout\n", " os.makedirs(output_dir, exist_ok=True)\n", " \n", " # 1. Cấu hình Pipeline Options\n", " pipeline_options = PdfPipelineOptions()\n", " \n", " # --- Cấu hình TableFormer (Ưu tiên số 1) ---\n", " # Kích hoạt nhận diện cấu trúc bảng\n", " pipeline_options.do_table_structure = True\n", " # Sử dụng chế độ ACCURATE để đảm bảo bảng biểu phức tạp (điểm số, học phí) không bị vỡ\n", " pipeline_options.table_structure_options = TableStructureOptions(\n", " do_cell_matching=True, # Khớp text vào ô chính xác hơn\n", " mode=\"accurate\" # Chế độ chính xác cao\n", " )\n", "\n", " # --- FIX LỖI ẢNH MỜ (QUAN TRỌNG) ---\n", " # Tăng độ phân giải ảnh lên gấp 3 lần để Tesseract nhìn rõ dấu tiếng Việt\n", " # Mặc định là 1.0 (mờ), set lên 3.0 sẽ nét căng.\n", " pipeline_options.images_scale = 3.0\n", "\n", " # --- Chiến lược OCR với Tesseract ---\n", " if use_ocr:\n", " pipeline_options.do_ocr = True\n", " \n", " # --- CẤU HÌNH TESSERACT TƯỜNG MINH ---\n", " ocr_options = TesseractOcrOptions()\n", " \n", " # Cấu hình ngôn ngữ tiếng Việt (vie) - Phải khớp với gói tesseract-ocr-vie\n", " ocr_options.lang = [\"vie\"] \n", " \n", " # --- CHẾ ĐỘ HYBRID (THÔNG MINH) ---\n", " # Tắt force_full_page_ocr để Docling tự quyết định:\n", " # 1. Nếu text layer tốt -> Dùng text layer (Nhanh, nhẹ)\n", " # 2. Nếu text layer lỗi hoặc là ảnh -> Dùng OCR\n", " ocr_options.force_full_page_ocr = False\n", " \n", " # Gán options vào pipeline\n", " pipeline_options.ocr_options = ocr_options\n", " else:\n", " pipeline_options.do_ocr = False\n", "\n", " # --- Tối ưu phần cứng (GPU Acceleration) ---\n", " # Tự động phát hiện và sử dụng GPU nếu có (Colab T4/L4)\n", " pipeline_options.accelerator_options = AcceleratorOptions(\n", " num_threads=8, # Tăng thread cho Tesseract\n", " device=AcceleratorDevice.AUTO \n", " )\n", "\n", " # 2. Tạo Format Options\n", " format_options = {\n", " InputFormat.PDF: FormatOption(\n", " backend=PyPdfiumDocumentBackend,\n", " pipeline_cls=StandardPdfPipeline,\n", " pipeline_options=pipeline_options\n", " )\n", " }\n", " \n", " # Khởi tạo Converter\n", " self.converter = DocumentConverter(format_options=format_options)\n", " print(f\"🚀 Docling Processor Initialized\")\n", " print(f\" - OCR Engine: TESSERACT (Vietnamese)\")\n", " print(f\" - Mode: HYBRID (Text Layer + OCR fallback)\")\n", " print(f\" - Image Scale: 3.0 (High Resolution)\")\n", " print(f\" - Table Mode: Accurate\")\n", " print(f\" - Device: Auto-detect (GPU/CPU)\")\n", " print(f\" - Timeout: {self.timeout}s per file\")\n", "\n", " def clean_markdown(self, text: str) -> str:\n", " \"\"\"Hậu xử lý: Làm sạch Markdown.\"\"\"\n", " # 1. Xóa dòng \"Trang x\" (An toàn)\n", " text = re.sub(r'\\n\\s*Trang\\s+\\d+\\s*\\n', '\\n', text)\n", " \n", " # 3. Xóa nhiều dòng trống (An toàn & Cần thiết)\n", " text = re.sub(r'\\n{3,}', '\\n\\n', text)\n", " return text.strip()\n", "\n", " def parse_directory(self, source_dir: str):\n", " print(f\"📂 Parsing PDFs in: {source_dir}\")\n", " source_path = Path(source_dir)\n", " pdf_files = list(source_path.rglob(\"*.pdf\"))\n", " print(f\" Found {len(pdf_files)} PDF files.\")\n", " \n", " results = {\"total\": 0, \"parsed\": 0, \"skipped\": 0, \"errors\": 0}\n", " \n", " # Define timeout handler\n", " def timeout_handler(signum, frame):\n", " raise TimeoutError(\"Processing timeout\")\n", " \n", " # Register signal for timeout\n", " signal.signal(signal.SIGALRM, timeout_handler)\n", " \n", " for i, file_path in enumerate(pdf_files):\n", " filename = file_path.name\n", " \n", " # --- GIỮ NGUYÊN CẤU TRÚC THƯ MỤC ---\n", " # Tính toán đường dẫn tương đối: data/files/subdir/file.pdf -> subdir/file.pdf\n", " try:\n", " relative_path = file_path.relative_to(source_path)\n", " except ValueError:\n", " # Fallback nếu file không nằm trong source_dir (ít khi xảy ra với rglob)\n", " relative_path = Path(filename)\n", "\n", " # Tạo đường dẫn output tương ứng: output_dir/subdir/file.md\n", " output_file_path = Path(self.output_dir) / relative_path.with_suffix(\".md\")\n", " \n", " # Tạo thư mục con nếu chưa tồn tại\n", " output_file_path.parent.mkdir(parents=True, exist_ok=True)\n", " \n", " output_path = str(output_file_path)\n", " \n", " # --- TỐI ƯU 1: SKIP NẾU ĐÃ CÓ KẾT QUẢ (Checkpoint) ---\n", " if os.path.exists(output_path):\n", " results[\"skipped\"] += 1\n", " if results[\"skipped\"] % 50 == 0:\n", " print(f\"⏩ Skipped {results['skipped']} files (already processed)...\")\n", " continue\n", "\n", " try:\n", " # Set timeout\n", " signal.alarm(self.timeout)\n", " \n", " # Convert\n", " result = self.converter.convert(str(file_path))\n", " \n", " # Cancel timeout\n", " signal.alarm(0)\n", " \n", " # Export to Markdown (Làm sạch dữ liệu ảnh rác)\n", " markdown_content = result.document.export_to_markdown(image_placeholder=\"\")\n", " \n", " # Post-processing cleaning\n", " markdown_content = self.clean_markdown(markdown_content)\n", " \n", " # Metadata Extraction (Chuẩn bị cho RAG)\n", " metadata_header = f\"\"\"---\n", "filename: {filename}\n", "filepath: {file_path}\n", "page_count: {len(result.document.pages)}\n", "processed_at: {os.path.getmtime(file_path)}\n", "---\n", "\n", "\"\"\"\n", " final_content = metadata_header + markdown_content\n", " \n", " # Save\n", " with open(output_path, 'w', encoding='utf-8') as f:\n", " f.write(final_content)\n", " \n", " results[\"parsed\"] += 1\n", " \n", " # --- TỐI ƯU 2: GIẢI PHÓNG RAM ---\n", " del result\n", " del markdown_content\n", " \n", " if (i+1) % 10 == 0:\n", " gc.collect()\n", " print(f\"✅ Processed {i+1}/{len(pdf_files)} files (Skipped: {results['skipped']})\")\n", " \n", " except TimeoutError:\n", " print(f\"⏰ Timeout parsing {filename} (>{self.timeout}s)\")\n", " results[\"errors\"] += 1\n", " except Exception as e:\n", " print(f\"❌ Failed to parse {filename}: {e}\")\n", " results[\"errors\"] += 1\n", " finally:\n", " signal.alarm(0) # Ensure alarm is off\n", " \n", " return results" ] }, { "cell_type": "code", "execution_count": null, "id": "0b87fec5", "metadata": {}, "outputs": [], "source": [ "# 5.5. Test Run on Specific File\n", "# Chạy cell này để kiểm tra chất lượng trên file cụ thể (giống Marker)\n", "import os\n", "from pathlib import Path\n", "\n", "# Setup paths (đồng bộ với Cell 3)\n", "source_dir = '/content/data_rag/files'\n", "root = Path(source_dir)\n", "\n", "if not root.exists():\n", " print(f\"❌ Source directory not found: {root}\")\n", " print(\"⚠️ Hãy chạy Cell 3 (Extract Data) trước.\")\n", "else:\n", " # Nếu zip giải nén ra 1 thư mục con 'files' thì đi vào đó\n", " nested_files = root / 'files'\n", " if nested_files.exists():\n", " root = nested_files\n", "\n", " # Tìm file cụ thể\n", " target_filename = \"1.1. Kỹ thuật Cơ điện tử.pdf\"\n", " # Nếu bạn biết chắc thư mục con, điền ở đây (vd: 'quy_che'); nếu không chắc có thể để None\n", " target_subdir = \"quy_che\"\n", "\n", " preferred_path = (root / target_subdir / target_filename) if target_subdir else (root / target_filename)\n", " target_path = preferred_path\n", "\n", " if not target_path.exists():\n", " # Fallback: tự động tìm theo tên file trong toàn bộ cây thư mục\n", " matches = list(root.rglob(target_filename))\n", " if len(matches) == 1:\n", " target_path = matches[0]\n", " print(f\"🔎 Auto-found file at: {target_path}\")\n", " elif len(matches) > 1:\n", " print(\"⚠️ Found multiple matches. Showing up to 20:\")\n", " for p in matches[:20]:\n", " print(f\" - {p}\")\n", " target_path = matches[0]\n", " print(f\"➡️ Using first match: {target_path}\")\n", " else:\n", " print(f\"❌ File not found: {preferred_path}\")\n", " print(f\"Searching in: {root}\")\n", " # Gợi ý: in ra các thư mục cấp 1 để bạn chọn đúng target_subdir\n", " subdirs = sorted([p.name for p in root.iterdir() if p.is_dir()])\n", " if subdirs:\n", " print(\"📁 Top-level folders:\")\n", " for name in subdirs[:30]:\n", " print(f\" - {name}\")\n", " raise FileNotFoundError(target_filename)\n", "\n", " print(f\"🧪 Using target file: {target_path}\")\n", " \n", " # Initialize processor for test\n", " test_output_dir = '/content/data/test_output'\n", " os.makedirs(test_output_dir, exist_ok=True)\n", " \n", " print(\"🚀 Initializing processor for test run (OCR Enabled - Default)...\")\n", " # Use ColabDoclingProcessor defined in previous cell\n", " test_processor = ColabDoclingProcessor(\n", " output_dir=test_output_dir,\n", " use_ocr=True,\n", " )\n", " \n", " try:\n", " print(f\"⏳ Processing {target_path.name}...\")\n", " result = test_processor.converter.convert(str(target_path))\n", " markdown_content = result.document.export_to_markdown()\n", " \n", " # Save to local output\n", " output_file = Path(test_output_dir) / f\"{target_path.stem}.md\"\n", " with open(output_file, 'w', encoding='utf-8') as f:\n", " f.write(markdown_content)\n", " \n", " print(f\"💾 Saved local test file: {output_file}\")\n", " \n", " print(\"\\n\" + \"=\"*50)\n", " print(\"📄 RESULT PREVIEW (First 2000 characters)\")\n", " print(\"=\"*50)\n", " print(markdown_content[:2000])\n", " print(\"\\n\" + \"=\"*50)\n", " print(\"✅ Test completed! Hãy chạy cell tiếp theo để lưu kết quả lên Drive.\")\n", " \n", " except Exception as e:\n", " print(f\"❌ Test failed: {e}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "a46429ed", "metadata": {}, "outputs": [], "source": [ "# 5.6. Save Test Result to Google Drive\n", "import shutil\n", "\n", "# Cấu hình đường dẫn lưu trên Drive (Lưu vào folder riêng để dễ so sánh với Marker)\n", "drive_test_folder = '/content/drive/MyDrive/docling/test_result_docling'\n", "\n", "# Biến test_output_dir được định nghĩa ở cell 5.5\n", "if 'test_output_dir' in locals() and os.path.exists(test_output_dir):\n", " # Tạo thư mục cha trên Drive nếu chưa có\n", " if not os.path.exists(os.path.dirname(drive_test_folder)):\n", " os.makedirs(os.path.dirname(drive_test_folder), exist_ok=True)\n", " \n", " print(f\"📂 Copying test results to: {drive_test_folder}\")\n", " \n", " # Sử dụng copytree với dirs_exist_ok=True để copy cả thư mục con và ghi đè nếu cần\n", " # Cách này giữ nguyên cấu trúc thư mục (subdir)\n", " try:\n", " shutil.copytree(test_output_dir, drive_test_folder, dirs_exist_ok=True)\n", " print(f\" ✅ Copied entire folder structure successfully!\")\n", " except Exception as e:\n", " print(f\" ❌ Error copying folder: {e}\")\n", " \n", " print(f\"\\n🎉 Done! Bạn có thể mở Drive để xem file markdown đầy đủ tại: {drive_test_folder}\")\n", "else:\n", " print(\"❌ Không tìm thấy thư mục kết quả test hoặc biến 'test_output_dir' chưa được định nghĩa.\")\n", " print(\"⚠️ Hãy chạy cell 5.5 (Test Run) trước khi chạy cell này!\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8228498a", "metadata": {}, "outputs": [], "source": [ "# 6. Run Processing & Save Results\n", "output_dir = '/content/data/docling_output'\n", "# Bật OCR \n", "processor = ColabDoclingProcessor(output_dir=output_dir, use_ocr=True) \n", "\n", "# Determine source directory (handle if zip extracted to subfolder)\n", "source_dir = '/content/data_rag/files' \n", "# Check if files are in a subfolder named 'files' inside the extraction path\n", "if os.path.exists(os.path.join(source_dir, 'files')):\n", " source_dir = os.path.join(source_dir, 'files')\n", "\n", "# Run\n", "processor.parse_directory(source_dir)\n", "\n", "# Zip output and save to Drive\n", "output_zip_path = '/content/drive/MyDrive/docling/docling_output.zip'\n", "print(f\"Zipping output to {output_zip_path}...\")\n", "shutil.make_archive(output_zip_path.replace('.zip', ''), 'zip', output_dir)\n", "print(\"🎉 Done! Check your Google Drive for docling_output.zip\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }