DoAn

File size: 25,883 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "287f0df4",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipython-input-3329394316.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mgoogle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolab\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdrive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdrive\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/content/drive'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36mmount\u001b[0;34m(mountpoint, force_remount, timeout_ms, readonly)\u001b[0m\n\u001b[1;32m     95\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_ms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m120000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreadonly\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     96\u001b[0m   \u001b[0;34m\"\"\"Mount your Google Drive at the specified mountpoint path.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 97\u001b[0;31m   return _mount(\n\u001b[0m\u001b[1;32m     98\u001b[0m       \u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     99\u001b[0m       \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforce_remount\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36m_mount\u001b[0;34m(mountpoint, force_remount, timeout_ms, ephemeral, readonly)\u001b[0m\n\u001b[1;32m    132\u001b[0m   )\n\u001b[1;32m    133\u001b[0m   \u001b[0;32mif\u001b[0m \u001b[0mephemeral\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m     _message.blocking_request(\n\u001b[0m\u001b[1;32m    135\u001b[0m         \u001b[0;34m'request_auth'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'authType'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'dfs_ephemeral'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mblocking_request\u001b[0;34m(request_type, request, timeout_sec, parent)\u001b[0m\n\u001b[1;32m    174\u001b[0m       \u001b[0mrequest_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpect_reply\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    175\u001b[0m   )\n\u001b[0;32m--> 176\u001b[0;31m   \u001b[0;32mreturn\u001b[0m \u001b[0mread_reply_from_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_sec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mread_reply_from_input\u001b[0;34m(message_id, timeout_sec)\u001b[0m\n\u001b[1;32m     94\u001b[0m     \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_next_input_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     95\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_NOT_READY\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreply\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m       \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.025\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     97\u001b[0m       \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     98\u001b[0m     if (\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/drive', force_remount=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6891108",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Install dependencies\n",
    "# Cài đặt hệ thống Tesseract, ngôn ngữ Tiếng Việt và các thư viện development cần thiết để build tesserocr\n",
    "!sudo apt-get update > /dev/null\n",
    "!sudo apt-get install -y tesseract-ocr tesseract-ocr-vie libtesseract-dev libleptonica-dev pkg-config > /dev/null\n",
    "\n",
    "# Cài đặt tesserocr (Python wrapper cho Tesseract) và docling\n",
    "# Lưu ý: tesserocr cần được build từ source nên cần các thư viện dev ở trên\n",
    "!pip install tesserocr docling pypdfium2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca42bfce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Extract Data\n",
    "import os\n",
    "import zipfile\n",
    "\n",
    "# Path to your zip file on Drive\n",
    "zip_path = '/content/drive/MyDrive/data_rag.zip' \n",
    "extract_path = '/content/data_rag/files'\n",
    "\n",
    "if not os.path.exists(extract_path):\n",
    "    os.makedirs(extract_path, exist_ok=True)\n",
    "    print(f\"Extracting {zip_path}...\")\n",
    "    try:\n",
    "        with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
    "            zip_ref.extractall(extract_path)\n",
    "        print(\"Done extraction!\")\n",
    "    except FileNotFoundError:\n",
    "        print(f\"❌ File not found: {zip_path}. Please check the path.\")\n",
    "else:\n",
    "    print(\"Files already extracted.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "988f7e96",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. Define Processor Class (Refactored for High Quality & Performance with Tesseract)\n",
    "import json\n",
    "import os\n",
    "import logging\n",
    "import shutil\n",
    "import re\n",
    "import gc\n",
    "import signal\n",
    "from pathlib import Path\n",
    "from typing import Optional\n",
    "\n",
    "# --- AUTO-CONFIG TESSERACT DATA PATH ---\n",
    "# Fix lỗi \"No language models have been detected\"\n",
    "# Tự động tìm đường dẫn chứa file ngôn ngữ (vie.traineddata) và set biến môi trường\n",
    "def setup_tesseract_path():\n",
    "    possible_paths = [\n",
    "        \"/usr/share/tesseract-ocr/4.00/tessdata\",\n",
    "        \"/usr/share/tesseract-ocr/5/tessdata\",\n",
    "        \"/usr/share/tesseract-ocr/tessdata\",\n",
    "        \"/usr/local/share/tessdata\"\n",
    "    ]\n",
    "    \n",
    "    found = False\n",
    "    for path in possible_paths:\n",
    "        if os.path.exists(os.path.join(path, \"vie.traineddata\")):\n",
    "            os.environ[\"TESSDATA_PREFIX\"] = path\n",
    "            print(f\"✅ Found Tesseract data at: {path}\")\n",
    "            print(f\"   Set TESSDATA_PREFIX={path}\")\n",
    "            found = True\n",
    "            break\n",
    "            \n",
    "    if not found:\n",
    "        print(\"⚠️ WARNING: Could not find 'vie.traineddata'. Tesseract might fail.\")\n",
    "        print(\"   Please run Cell #2 to install tesseract-ocr-vie.\")\n",
    "\n",
    "setup_tesseract_path()\n",
    "\n",
    "# Setup logging\n",
    "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')\n",
    "logger = logging.getLogger(__name__)\n",
    "\n",
    "# Docling imports\n",
    "from docling.document_converter import DocumentConverter, FormatOption\n",
    "from docling.datamodel.base_models import InputFormat\n",
    "from docling.datamodel.pipeline_options import (\n",
    "    PdfPipelineOptions, \n",
    "    TableStructureOptions,\n",
    "    AcceleratorOptions,\n",
    "    AcceleratorDevice,\n",
    "    TesseractOcrOptions # SỬ DỤNG TESSERACT CHO ĐỘ CHÍNH XÁC CAO NHẤT\n",
    ")\n",
    "from docling.datamodel.settings import settings\n",
    "from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend\n",
    "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline\n",
    "\n",
    "class ColabDoclingProcessor:\n",
    "    def __init__(self, output_dir: str, use_ocr: bool = True, timeout: int = 300):\n",
    "        self.output_dir = output_dir\n",
    "        self.use_ocr = use_ocr\n",
    "        self.timeout = timeout\n",
    "        os.makedirs(output_dir, exist_ok=True)\n",
    "        \n",
    "        # 1. Cấu hình Pipeline Options\n",
    "        pipeline_options = PdfPipelineOptions()\n",
    "        \n",
    "        # --- Cấu hình TableFormer (Ưu tiên số 1) ---\n",
    "        # Kích hoạt nhận diện cấu trúc bảng\n",
    "        pipeline_options.do_table_structure = True\n",
    "        # Sử dụng chế độ ACCURATE để đảm bảo bảng biểu phức tạp (điểm số, học phí) không bị vỡ\n",
    "        pipeline_options.table_structure_options = TableStructureOptions(\n",
    "            do_cell_matching=True,  # Khớp text vào ô chính xác hơn\n",
    "            mode=\"accurate\"         # Chế độ chính xác cao\n",
    "        )\n",
    "\n",
    "        # --- FIX LỖI ẢNH MỜ (QUAN TRỌNG) ---\n",
    "        # Tăng độ phân giải ảnh lên gấp 3 lần để Tesseract nhìn rõ dấu tiếng Việt\n",
    "        # Mặc định là 1.0 (mờ), set lên 3.0 sẽ nét căng.\n",
    "        pipeline_options.images_scale = 3.0\n",
    "\n",
    "        # --- Chiến lược OCR với Tesseract ---\n",
    "        if use_ocr:\n",
    "            pipeline_options.do_ocr = True\n",
    "            \n",
    "            # --- CẤU HÌNH TESSERACT TƯỜNG MINH ---\n",
    "            ocr_options = TesseractOcrOptions()\n",
    "            \n",
    "            # Cấu hình ngôn ngữ tiếng Việt (vie) - Phải khớp với gói tesseract-ocr-vie\n",
    "            ocr_options.lang = [\"vie\"] \n",
    "            \n",
    "            # --- CHẾ ĐỘ HYBRID (THÔNG MINH) ---\n",
    "            # Tắt force_full_page_ocr để Docling tự quyết định:\n",
    "            # 1. Nếu text layer tốt -> Dùng text layer (Nhanh, nhẹ)\n",
    "            # 2. Nếu text layer lỗi hoặc là ảnh -> Dùng OCR\n",
    "            ocr_options.force_full_page_ocr = False\n",
    "            \n",
    "            # Gán options vào pipeline\n",
    "            pipeline_options.ocr_options = ocr_options\n",
    "        else:\n",
    "            pipeline_options.do_ocr = False\n",
    "\n",
    "        # --- Tối ưu phần cứng (GPU Acceleration) ---\n",
    "        # Tự động phát hiện và sử dụng GPU nếu có (Colab T4/L4)\n",
    "        pipeline_options.accelerator_options = AcceleratorOptions(\n",
    "            num_threads=8, # Tăng thread cho Tesseract\n",
    "            device=AcceleratorDevice.AUTO \n",
    "        )\n",
    "\n",
    "        # 2. Tạo Format Options\n",
    "        format_options = {\n",
    "            InputFormat.PDF: FormatOption(\n",
    "                backend=PyPdfiumDocumentBackend,\n",
    "                pipeline_cls=StandardPdfPipeline,\n",
    "                pipeline_options=pipeline_options\n",
    "            )\n",
    "        }\n",
    "        \n",
    "        # Khởi tạo Converter\n",
    "        self.converter = DocumentConverter(format_options=format_options)\n",
    "        print(f\"🚀 Docling Processor Initialized\")\n",
    "        print(f\"   - OCR Engine: TESSERACT (Vietnamese)\")\n",
    "        print(f\"   - Mode: HYBRID (Text Layer + OCR fallback)\")\n",
    "        print(f\"   - Image Scale: 3.0 (High Resolution)\")\n",
    "        print(f\"   - Table Mode: Accurate\")\n",
    "        print(f\"   - Device: Auto-detect (GPU/CPU)\")\n",
    "        print(f\"   - Timeout: {self.timeout}s per file\")\n",
    "\n",
    "    def clean_markdown(self, text: str) -> str:\n",
    "        \"\"\"Hậu xử lý: Làm sạch Markdown.\"\"\"\n",
    "        # 1. Xóa dòng \"Trang x\" (An toàn)\n",
    "        text = re.sub(r'\\n\\s*Trang\\s+\\d+\\s*\\n', '\\n', text)\n",
    "        \n",
    "        # 3. Xóa nhiều dòng trống (An toàn & Cần thiết)\n",
    "        text = re.sub(r'\\n{3,}', '\\n\\n', text)\n",
    "        return text.strip()\n",
    "\n",
    "    def parse_directory(self, source_dir: str):\n",
    "        print(f\"📂 Parsing PDFs in: {source_dir}\")\n",
    "        source_path = Path(source_dir)\n",
    "        pdf_files = list(source_path.rglob(\"*.pdf\"))\n",
    "        print(f\"   Found {len(pdf_files)} PDF files.\")\n",
    "        \n",
    "        results = {\"total\": 0, \"parsed\": 0, \"skipped\": 0, \"errors\": 0}\n",
    "        \n",
    "        # Define timeout handler\n",
    "        def timeout_handler(signum, frame):\n",
    "            raise TimeoutError(\"Processing timeout\")\n",
    "        \n",
    "        # Register signal for timeout\n",
    "        signal.signal(signal.SIGALRM, timeout_handler)\n",
    "        \n",
    "        for i, file_path in enumerate(pdf_files):\n",
    "            filename = file_path.name\n",
    "            \n",
    "            # --- GIỮ NGUYÊN CẤU TRÚC THƯ MỤC ---\n",
    "            # Tính toán đường dẫn tương đối: data/files/subdir/file.pdf -> subdir/file.pdf\n",
    "            try:\n",
    "                relative_path = file_path.relative_to(source_path)\n",
    "            except ValueError:\n",
    "                # Fallback nếu file không nằm trong source_dir (ít khi xảy ra với rglob)\n",
    "                relative_path = Path(filename)\n",
    "\n",
    "            # Tạo đường dẫn output tương ứng: output_dir/subdir/file.md\n",
    "            output_file_path = Path(self.output_dir) / relative_path.with_suffix(\".md\")\n",
    "            \n",
    "            # Tạo thư mục con nếu chưa tồn tại\n",
    "            output_file_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "            \n",
    "            output_path = str(output_file_path)\n",
    "            \n",
    "            # --- TỐI ƯU 1: SKIP NẾU ĐÃ CÓ KẾT QUẢ (Checkpoint) ---\n",
    "            if os.path.exists(output_path):\n",
    "                results[\"skipped\"] += 1\n",
    "                if results[\"skipped\"] % 50 == 0:\n",
    "                    print(f\"⏩ Skipped {results['skipped']} files (already processed)...\")\n",
    "                continue\n",
    "\n",
    "            try:\n",
    "                # Set timeout\n",
    "                signal.alarm(self.timeout)\n",
    "                \n",
    "                # Convert\n",
    "                result = self.converter.convert(str(file_path))\n",
    "                \n",
    "                # Cancel timeout\n",
    "                signal.alarm(0)\n",
    "                \n",
    "                # Export to Markdown (Làm sạch dữ liệu ảnh rác)\n",
    "                markdown_content = result.document.export_to_markdown(image_placeholder=\"\")\n",
    "                \n",
    "                # Post-processing cleaning\n",
    "                markdown_content = self.clean_markdown(markdown_content)\n",
    "                \n",
    "                # Metadata Extraction (Chuẩn bị cho RAG)\n",
    "                metadata_header = f\"\"\"---\n",
    "filename: {filename}\n",
    "filepath: {file_path}\n",
    "page_count: {len(result.document.pages)}\n",
    "processed_at: {os.path.getmtime(file_path)}\n",
    "---\n",
    "\n",
    "\"\"\"\n",
    "                final_content = metadata_header + markdown_content\n",
    "                \n",
    "                # Save\n",
    "                with open(output_path, 'w', encoding='utf-8') as f:\n",
    "                    f.write(final_content)\n",
    "                \n",
    "                results[\"parsed\"] += 1\n",
    "                \n",
    "                # --- TỐI ƯU 2: GIẢI PHÓNG RAM ---\n",
    "                del result\n",
    "                del markdown_content\n",
    "                \n",
    "                if (i+1) % 10 == 0:\n",
    "                    gc.collect()\n",
    "                    print(f\"✅ Processed {i+1}/{len(pdf_files)} files (Skipped: {results['skipped']})\")\n",
    "                    \n",
    "            except TimeoutError:\n",
    "                print(f\"⏰ Timeout parsing {filename} (>{self.timeout}s)\")\n",
    "                results[\"errors\"] += 1\n",
    "            except Exception as e:\n",
    "                print(f\"❌ Failed to parse {filename}: {e}\")\n",
    "                results[\"errors\"] += 1\n",
    "            finally:\n",
    "                signal.alarm(0) # Ensure alarm is off\n",
    "                \n",
    "        return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b87fec5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5.5. Test Run on Specific File\n",
    "# Chạy cell này để kiểm tra chất lượng trên file cụ thể (giống Marker)\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# Setup paths (đồng bộ với Cell 3)\n",
    "source_dir = '/content/data_rag/files'\n",
    "root = Path(source_dir)\n",
    "\n",
    "if not root.exists():\n",
    "    print(f\"❌ Source directory not found: {root}\")\n",
    "    print(\"⚠️ Hãy chạy Cell 3 (Extract Data) trước.\")\n",
    "else:\n",
    "    # Nếu zip giải nén ra 1 thư mục con 'files' thì đi vào đó\n",
    "    nested_files = root / 'files'\n",
    "    if nested_files.exists():\n",
    "        root = nested_files\n",
    "\n",
    "    # Tìm file cụ thể\n",
    "    target_filename = \"1.1. Kỹ thuật Cơ điện tử.pdf\"\n",
    "    # Nếu bạn biết chắc thư mục con, điền ở đây (vd: 'quy_che'); nếu không chắc có thể để None\n",
    "    target_subdir = \"quy_che\"\n",
    "\n",
    "    preferred_path = (root / target_subdir / target_filename) if target_subdir else (root / target_filename)\n",
    "    target_path = preferred_path\n",
    "\n",
    "    if not target_path.exists():\n",
    "        # Fallback: tự động tìm theo tên file trong toàn bộ cây thư mục\n",
    "        matches = list(root.rglob(target_filename))\n",
    "        if len(matches) == 1:\n",
    "            target_path = matches[0]\n",
    "            print(f\"🔎 Auto-found file at: {target_path}\")\n",
    "        elif len(matches) > 1:\n",
    "            print(\"⚠️ Found multiple matches. Showing up to 20:\")\n",
    "            for p in matches[:20]:\n",
    "                print(f\" - {p}\")\n",
    "            target_path = matches[0]\n",
    "            print(f\"➡️ Using first match: {target_path}\")\n",
    "        else:\n",
    "            print(f\"❌ File not found: {preferred_path}\")\n",
    "            print(f\"Searching in: {root}\")\n",
    "            # Gợi ý: in ra các thư mục cấp 1 để bạn chọn đúng target_subdir\n",
    "            subdirs = sorted([p.name for p in root.iterdir() if p.is_dir()])\n",
    "            if subdirs:\n",
    "                print(\"📁 Top-level folders:\")\n",
    "                for name in subdirs[:30]:\n",
    "                    print(f\" - {name}\")\n",
    "            raise FileNotFoundError(target_filename)\n",
    "\n",
    "    print(f\"🧪 Using target file: {target_path}\")\n",
    "    \n",
    "    # Initialize processor for test\n",
    "    test_output_dir = '/content/data/test_output'\n",
    "    os.makedirs(test_output_dir, exist_ok=True)\n",
    "    \n",
    "    print(\"🚀 Initializing processor for test run (OCR Enabled - Default)...\")\n",
    "    # Use ColabDoclingProcessor defined in previous cell\n",
    "    test_processor = ColabDoclingProcessor(\n",
    "        output_dir=test_output_dir,\n",
    "        use_ocr=True,\n",
    "    )\n",
    "    \n",
    "    try:\n",
    "        print(f\"⏳ Processing {target_path.name}...\")\n",
    "        result = test_processor.converter.convert(str(target_path))\n",
    "        markdown_content = result.document.export_to_markdown()\n",
    "        \n",
    "        # Save to local output\n",
    "        output_file = Path(test_output_dir) / f\"{target_path.stem}.md\"\n",
    "        with open(output_file, 'w', encoding='utf-8') as f:\n",
    "            f.write(markdown_content)\n",
    "            \n",
    "        print(f\"💾 Saved local test file: {output_file}\")\n",
    "        \n",
    "        print(\"\\n\" + \"=\"*50)\n",
    "        print(\"📄 RESULT PREVIEW (First 2000 characters)\")\n",
    "        print(\"=\"*50)\n",
    "        print(markdown_content[:2000])\n",
    "        print(\"\\n\" + \"=\"*50)\n",
    "        print(\"✅ Test completed! Hãy chạy cell tiếp theo để lưu kết quả lên Drive.\")\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"❌ Test failed: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a46429ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5.6. Save Test Result to Google Drive\n",
    "import shutil\n",
    "\n",
    "# Cấu hình đường dẫn lưu trên Drive (Lưu vào folder riêng để dễ so sánh với Marker)\n",
    "drive_test_folder = '/content/drive/MyDrive/docling/test_result_docling'\n",
    "\n",
    "# Biến test_output_dir được định nghĩa ở cell 5.5\n",
    "if 'test_output_dir' in locals() and os.path.exists(test_output_dir):\n",
    "    # Tạo thư mục cha trên Drive nếu chưa có\n",
    "    if not os.path.exists(os.path.dirname(drive_test_folder)):\n",
    "        os.makedirs(os.path.dirname(drive_test_folder), exist_ok=True)\n",
    "        \n",
    "    print(f\"📂 Copying test results to: {drive_test_folder}\")\n",
    "    \n",
    "    # Sử dụng copytree với dirs_exist_ok=True để copy cả thư mục con và ghi đè nếu cần\n",
    "    # Cách này giữ nguyên cấu trúc thư mục (subdir)\n",
    "    try:\n",
    "        shutil.copytree(test_output_dir, drive_test_folder, dirs_exist_ok=True)\n",
    "        print(f\"   ✅ Copied entire folder structure successfully!\")\n",
    "    except Exception as e:\n",
    "        print(f\"   ❌ Error copying folder: {e}\")\n",
    "            \n",
    "    print(f\"\\n🎉 Done! Bạn có thể mở Drive để xem file markdown đầy đủ tại: {drive_test_folder}\")\n",
    "else:\n",
    "    print(\"❌ Không tìm thấy thư mục kết quả test hoặc biến 'test_output_dir' chưa được định nghĩa.\")\n",
    "    print(\"⚠️ Hãy chạy cell 5.5 (Test Run) trước khi chạy cell này!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8228498a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Run Processing & Save Results\n",
    "output_dir = '/content/data/docling_output'\n",
    "# Bật OCR \n",
    "processor = ColabDoclingProcessor(output_dir=output_dir, use_ocr=True) \n",
    "\n",
    "# Determine source directory (handle if zip extracted to subfolder)\n",
    "source_dir = '/content/data_rag/files' \n",
    "# Check if files are in a subfolder named 'files' inside the extraction path\n",
    "if os.path.exists(os.path.join(source_dir, 'files')):\n",
    "    source_dir = os.path.join(source_dir, 'files')\n",
    "\n",
    "# Run\n",
    "processor.parse_directory(source_dir)\n",
    "\n",
    "# Zip output and save to Drive\n",
    "output_zip_path = '/content/drive/MyDrive/docling/docling_output.zip'\n",
    "print(f\"Zipping output to {output_zip_path}...\")\n",
    "shutil.make_archive(output_zip_path.replace('.zip', ''), 'zip', output_dir)\n",
    "print(\"🎉 Done! Check your Google Drive for docling_output.zip\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}