File size: 25,883 Bytes
6bd96d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1c0676
 
 
6bd96d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1c0676
 
 
 
 
 
 
 
 
 
 
 
6bd96d9
c1c0676
 
 
 
6bd96d9
c1c0676
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6bd96d9
 
 
 
 
 
 
 
c1c0676
 
6bd96d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1c0676
6bd96d9
 
 
 
 
 
c1c0676
6bd96d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1c0676
6bd96d9
 
 
c1c0676
6bd96d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "287f0df4",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipython-input-3329394316.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mgoogle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolab\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdrive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdrive\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/content/drive'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36mmount\u001b[0;34m(mountpoint, force_remount, timeout_ms, readonly)\u001b[0m\n\u001b[1;32m     95\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_ms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m120000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreadonly\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     96\u001b[0m   \u001b[0;34m\"\"\"Mount your Google Drive at the specified mountpoint path.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 97\u001b[0;31m   return _mount(\n\u001b[0m\u001b[1;32m     98\u001b[0m       \u001b[0mmountpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     99\u001b[0m       \u001b[0mforce_remount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforce_remount\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/drive.py\u001b[0m in \u001b[0;36m_mount\u001b[0;34m(mountpoint, force_remount, timeout_ms, ephemeral, readonly)\u001b[0m\n\u001b[1;32m    132\u001b[0m   )\n\u001b[1;32m    133\u001b[0m   \u001b[0;32mif\u001b[0m \u001b[0mephemeral\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m     _message.blocking_request(\n\u001b[0m\u001b[1;32m    135\u001b[0m         \u001b[0;34m'request_auth'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'authType'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'dfs_ephemeral'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mblocking_request\u001b[0;34m(request_type, request, timeout_sec, parent)\u001b[0m\n\u001b[1;32m    174\u001b[0m       \u001b[0mrequest_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpect_reply\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    175\u001b[0m   )\n\u001b[0;32m--> 176\u001b[0;31m   \u001b[0;32mreturn\u001b[0m \u001b[0mread_reply_from_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_sec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mread_reply_from_input\u001b[0;34m(message_id, timeout_sec)\u001b[0m\n\u001b[1;32m     94\u001b[0m     \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_next_input_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     95\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_NOT_READY\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreply\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m       \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.025\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     97\u001b[0m       \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     98\u001b[0m     if (\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/drive', force_remount=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6891108",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Install dependencies\n",
    "# Cài đặt hệ thống Tesseract, ngôn ngữ Tiếng Việt và các thư viện development cần thiết để build tesserocr\n",
    "!sudo apt-get update > /dev/null\n",
    "!sudo apt-get install -y tesseract-ocr tesseract-ocr-vie libtesseract-dev libleptonica-dev pkg-config > /dev/null\n",
    "\n",
    "# Cài đặt tesserocr (Python wrapper cho Tesseract), docling và pypdfium2\n",
    "# Lưu ý: tesserocr cần được build từ source nên cần các thư viện dev ở trên\n",
    "!pip install tesserocr docling pypdfium2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca42bfce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Extract Data\n",
    "import os\n",
    "import zipfile\n",
    "\n",
    "# Path to your zip file on Drive\n",
    "zip_path = '/content/drive/MyDrive/data_rag.zip' \n",
    "extract_path = '/content/data_rag/files'\n",
    "\n",
    "if not os.path.exists(extract_path):\n",
    "    os.makedirs(extract_path, exist_ok=True)\n",
    "    print(f\"Extracting {zip_path}...\")\n",
    "    try:\n",
    "        with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
    "            zip_ref.extractall(extract_path)\n",
    "        print(\"Done extraction!\")\n",
    "    except FileNotFoundError:\n",
    "        print(f\"❌ File not found: {zip_path}. Please check the path.\")\n",
    "else:\n",
    "    print(\"Files already extracted.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "988f7e96",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. Define Processor Class (Refactored for High Quality & Performance with Tesseract)\n",
    "import json\n",
    "import os\n",
    "import logging\n",
    "import shutil\n",
    "import re\n",
    "import gc\n",
    "import signal\n",
    "from pathlib import Path\n",
    "from typing import Optional\n",
    "\n",
    "# --- AUTO-CONFIG TESSERACT DATA PATH ---\n",
    "# Fix lỗi \"No language models have been detected\"\n",
    "# Tự động tìm đường dẫn chứa file ngôn ngữ (vie.traineddata) và set biến môi trường\n",
    "def setup_tesseract_path():\n",
    "    possible_paths = [\n",
    "        \"/usr/share/tesseract-ocr/4.00/tessdata\",\n",
    "        \"/usr/share/tesseract-ocr/5/tessdata\",\n",
    "        \"/usr/share/tesseract-ocr/tessdata\",\n",
    "        \"/usr/local/share/tessdata\"\n",
    "    ]\n",
    "    \n",
    "    found = False\n",
    "    for path in possible_paths:\n",
    "        if os.path.exists(os.path.join(path, \"vie.traineddata\")):\n",
    "            os.environ[\"TESSDATA_PREFIX\"] = path\n",
    "            print(f\"✅ Found Tesseract data at: {path}\")\n",
    "            print(f\"   Set TESSDATA_PREFIX={path}\")\n",
    "            found = True\n",
    "            break\n",
    "            \n",
    "    if not found:\n",
    "        print(\"⚠️ WARNING: Could not find 'vie.traineddata'. Tesseract might fail.\")\n",
    "        print(\"   Please run Cell #2 to install tesseract-ocr-vie.\")\n",
    "\n",
    "setup_tesseract_path()\n",
    "\n",
    "# Setup logging\n",
    "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')\n",
    "logger = logging.getLogger(__name__)\n",
    "\n",
    "# Docling imports\n",
    "from docling.document_converter import DocumentConverter, FormatOption\n",
    "from docling.datamodel.base_models import InputFormat\n",
    "from docling.datamodel.pipeline_options import (\n",
    "    PdfPipelineOptions, \n",
    "    TableStructureOptions,\n",
    "    AcceleratorOptions,\n",
    "    AcceleratorDevice,\n",
    "    TesseractOcrOptions # SỬ DỤNG TESSERACT CHO ĐỘ CHÍNH XÁC CAO NHẤT\n",
    ")\n",
    "from docling.datamodel.settings import settings\n",
    "from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend\n",
    "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline\n",
    "\n",
    "class ColabDoclingProcessor:\n",
    "    def __init__(self, output_dir: str, use_ocr: bool = True, timeout: int = 300):\n",
    "        self.output_dir = output_dir\n",
    "        self.use_ocr = use_ocr\n",
    "        self.timeout = timeout\n",
    "        os.makedirs(output_dir, exist_ok=True)\n",
    "        \n",
    "        # 1. Cấu hình Pipeline Options\n",
    "        pipeline_options = PdfPipelineOptions()\n",
    "        \n",
    "        # --- Cấu hình TableFormer (Ưu tiên số 1) ---\n",
    "        # Kích hoạt nhận diện cấu trúc bảng\n",
    "        pipeline_options.do_table_structure = True\n",
    "        # Sử dụng chế độ ACCURATE để đảm bảo bảng biểu phức tạp (điểm số, học phí) không bị vỡ\n",
    "        pipeline_options.table_structure_options = TableStructureOptions(\n",
    "            do_cell_matching=True,  # Khớp text vào ô chính xác hơn\n",
    "            mode=\"accurate\"         # Chế độ chính xác cao\n",
    "        )\n",
    "\n",
    "        # --- FIX LỖI ẢNH MỜ (QUAN TRỌNG) ---\n",
    "        # Tăng độ phân giải ảnh lên gấp 3 lần để Tesseract nhìn rõ dấu tiếng Việt\n",
    "        # Mặc định là 1.0 (mờ), set lên 3.0 sẽ nét căng.\n",
    "        pipeline_options.images_scale = 3.0\n",
    "\n",
    "        # --- Chiến lược OCR với Tesseract ---\n",
    "        if use_ocr:\n",
    "            pipeline_options.do_ocr = True\n",
    "            \n",
    "            # --- CẤU HÌNH TESSERACT TƯỜNG MINH ---\n",
    "            ocr_options = TesseractOcrOptions()\n",
    "            \n",
    "            # Cấu hình ngôn ngữ tiếng Việt (vie) - Phải khớp với gói tesseract-ocr-vie\n",
    "            ocr_options.lang = [\"vie\"] \n",
    "            \n",
    "            # --- CHẾ ĐỘ HYBRID (THÔNG MINH) ---\n",
    "            # Tắt force_full_page_ocr để Docling tự quyết định:\n",
    "            # 1. Nếu text layer tốt -> Dùng text layer (Nhanh, nhẹ)\n",
    "            # 2. Nếu text layer lỗi hoặc là ảnh -> Dùng OCR\n",
    "            ocr_options.force_full_page_ocr = False\n",
    "            \n",
    "            # Gán options vào pipeline\n",
    "            pipeline_options.ocr_options = ocr_options\n",
    "        else:\n",
    "            pipeline_options.do_ocr = False\n",
    "\n",
    "        # --- Tối ưu phần cứng (GPU Acceleration) ---\n",
    "        # Tự động phát hiện và sử dụng GPU nếu có (Colab T4/L4)\n",
    "        pipeline_options.accelerator_options = AcceleratorOptions(\n",
    "            num_threads=8, # Tăng thread cho Tesseract\n",
    "            device=AcceleratorDevice.AUTO \n",
    "        )\n",
    "\n",
    "        # 2. Tạo Format Options\n",
    "        format_options = {\n",
    "            InputFormat.PDF: FormatOption(\n",
    "                backend=PyPdfiumDocumentBackend,\n",
    "                pipeline_cls=StandardPdfPipeline,\n",
    "                pipeline_options=pipeline_options\n",
    "            )\n",
    "        }\n",
    "        \n",
    "        # Khởi tạo Converter\n",
    "        self.converter = DocumentConverter(format_options=format_options)\n",
    "        print(f\"🚀 Docling Processor Initialized\")\n",
    "        print(f\"   - OCR Engine: TESSERACT (Vietnamese)\")\n",
    "        print(f\"   - Mode: HYBRID (Text Layer + OCR fallback)\")\n",
    "        print(f\"   - Image Scale: 3.0 (High Resolution)\")\n",
    "        print(f\"   - Table Mode: Accurate\")\n",
    "        print(f\"   - Device: Auto-detect (GPU/CPU)\")\n",
    "        print(f\"   - Timeout: {self.timeout}s per file\")\n",
    "\n",
    "    def clean_markdown(self, text: str) -> str:\n",
    "        \"\"\"Hậu xử lý: Làm sạch Markdown.\"\"\"\n",
    "        # 1. Xóa dòng \"Trang x\" (An toàn)\n",
    "        text = re.sub(r'\\n\\s*Trang\\s+\\d+\\s*\\n', '\\n', text)\n",
    "        \n",
    "        # 2. Xóa nhiều dòng trống (An toàn & Cần thiết)\n",
    "        text = re.sub(r'\\n{3,}', '\\n\\n', text)\n",
    "        return text.strip()\n",
    "\n",
    "    def parse_directory(self, source_dir: str):\n",
    "        print(f\"📂 Parsing PDFs in: {source_dir}\")\n",
    "        source_path = Path(source_dir)\n",
    "        pdf_files = list(source_path.rglob(\"*.pdf\"))\n",
    "        print(f\"   Found {len(pdf_files)} PDF files.\")\n",
    "        \n",
    "        results = {\"total\": 0, \"parsed\": 0, \"skipped\": 0, \"errors\": 0}\n",
    "        \n",
    "        # Define timeout handler\n",
    "        def timeout_handler(signum, frame):\n",
    "            raise TimeoutError(\"Processing timeout\")\n",
    "        \n",
    "        # Register signal for timeout\n",
    "        signal.signal(signal.SIGALRM, timeout_handler)\n",
    "        \n",
    "        for i, file_path in enumerate(pdf_files):\n",
    "            filename = file_path.name\n",
    "            \n",
    "            # --- GIỮ NGUYÊN CẤU TRÚC THƯ MỤC ---\n",
    "            # Tính toán đường dẫn tương đối: data/files/subdir/file.pdf -> subdir/file.pdf\n",
    "            try:\n",
    "                relative_path = file_path.relative_to(source_path)\n",
    "            except ValueError:\n",
    "                # Fallback nếu file không nằm trong source_dir (ít khi xảy ra với rglob)\n",
    "                relative_path = Path(filename)\n",
    "\n",
    "            # Tạo đường dẫn output tương ứng: output_dir/subdir/file.md\n",
    "            output_file_path = Path(self.output_dir) / relative_path.with_suffix(\".md\")\n",
    "            \n",
    "            # Tạo thư mục con nếu chưa tồn tại\n",
    "            output_file_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "            \n",
    "            output_path = str(output_file_path)\n",
    "            \n",
    "            # --- TỐI ƯU 1: SKIP NẾU ĐÃ CÓ KẾT QUẢ (Checkpoint) ---\n",
    "            if os.path.exists(output_path):\n",
    "                results[\"skipped\"] += 1\n",
    "                if results[\"skipped\"] % 50 == 0:\n",
    "                    print(f\"⏩ Skipped {results['skipped']} files (already processed)...\")\n",
    "                continue\n",
    "\n",
    "            try:\n",
    "                # Set timeout\n",
    "                signal.alarm(self.timeout)\n",
    "                \n",
    "                # Convert\n",
    "                result = self.converter.convert(str(file_path))\n",
    "                \n",
    "                # Cancel timeout\n",
    "                signal.alarm(0)\n",
    "                \n",
    "                # Export to Markdown (Làm sạch dữ liệu ảnh rác)\n",
    "                markdown_content = result.document.export_to_markdown(image_placeholder=\"\")\n",
    "                \n",
    "                # Post-processing cleaning\n",
    "                markdown_content = self.clean_markdown(markdown_content)\n",
    "                \n",
    "                # Metadata Extraction (Chuẩn bị cho RAG)\n",
    "                metadata_header = f\"\"\"---\n",
    "filename: {filename}\n",
    "filepath: {file_path}\n",
    "page_count: {len(result.document.pages)}\n",
    "processed_at: {os.path.getmtime(file_path)}\n",
    "---\n",
    "\n",
    "\"\"\"\n",
    "                final_content = metadata_header + markdown_content\n",
    "                \n",
    "                # Save\n",
    "                with open(output_path, 'w', encoding='utf-8') as f:\n",
    "                    f.write(final_content)\n",
    "                \n",
    "                results[\"parsed\"] += 1\n",
    "                \n",
    "                # --- TỐI ƯU 2: GIẢI PHÓNG RAM ---\n",
    "                del result\n",
    "                del markdown_content\n",
    "                \n",
    "                if (i+1) % 10 == 0:\n",
    "                    gc.collect()\n",
    "                    print(f\"✅ Processed {i+1}/{len(pdf_files)} files (Skipped: {results['skipped']})\")\n",
    "                    \n",
    "            except TimeoutError:\n",
    "                print(f\"⏰ Timeout parsing {filename} (>{self.timeout}s)\")\n",
    "                results[\"errors\"] += 1\n",
    "            except Exception as e:\n",
    "                print(f\"❌ Failed to parse {filename}: {e}\")\n",
    "                results[\"errors\"] += 1\n",
    "            finally:\n",
    "                signal.alarm(0) # Ensure alarm is off\n",
    "                \n",
    "        return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b87fec5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5.5. Test Run on Specific File\n",
    "# Chạy cell này để kiểm tra chất lượng trên file cụ thể (giống Marker)\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# Setup paths (đồng bộ với Cell 3)\n",
    "source_dir = '/content/data_rag/files'\n",
    "root = Path(source_dir)\n",
    "\n",
    "if not root.exists():\n",
    "    print(f\"❌ Source directory not found: {root}\")\n",
    "    print(\"⚠️ Hãy chạy Cell 3 (Extract Data) trước.\")\n",
    "else:\n",
    "    # Nếu zip giải nén ra 1 thư mục con 'files' thì đi vào đó\n",
    "    nested_files = root / 'files'\n",
    "    if nested_files.exists():\n",
    "        root = nested_files\n",
    "\n",
    "    # Tìm file cụ thể\n",
    "    target_filename = \"1.1. Kỹ thuật Cơ điện tử.pdf\"\n",
    "    # Nếu bạn biết chắc thư mục con, điền ở đây (vd: 'quy_che'); nếu không chắc có thể để None\n",
    "    target_subdir = \"quy_che\"\n",
    "\n",
    "    preferred_path = (root / target_subdir / target_filename) if target_subdir else (root / target_filename)\n",
    "    target_path = preferred_path\n",
    "\n",
    "    if not target_path.exists():\n",
    "        # Fallback: tự động tìm theo tên file trong toàn bộ cây thư mục\n",
    "        matches = list(root.rglob(target_filename))\n",
    "        if len(matches) == 1:\n",
    "            target_path = matches[0]\n",
    "            print(f\"🔎 Auto-found file at: {target_path}\")\n",
    "        elif len(matches) > 1:\n",
    "            print(\"⚠️ Found multiple matches. Showing up to 20:\")\n",
    "            for p in matches[:20]:\n",
    "                print(f\" - {p}\")\n",
    "            target_path = matches[0]\n",
    "            print(f\"➡️ Using first match: {target_path}\")\n",
    "        else:\n",
    "            print(f\"❌ File not found: {preferred_path}\")\n",
    "            print(f\"Searching in: {root}\")\n",
    "            # Gợi ý: in ra các thư mục cấp 1 để bạn chọn đúng target_subdir\n",
    "            subdirs = sorted([p.name for p in root.iterdir() if p.is_dir()])\n",
    "            if subdirs:\n",
    "                print(\"📁 Top-level folders:\")\n",
    "                for name in subdirs[:30]:\n",
    "                    print(f\" - {name}\")\n",
    "            raise FileNotFoundError(target_filename)\n",
    "\n",
    "    print(f\"🧪 Using target file: {target_path}\")\n",
    "    \n",
    "    # Initialize processor for test\n",
    "    test_output_dir = '/content/data/test_output'\n",
    "    os.makedirs(test_output_dir, exist_ok=True)\n",
    "    \n",
    "    print(\"🚀 Initializing processor for test run (OCR Enabled - Default)...\")\n",
    "    # Use ColabDoclingProcessor defined in previous cell\n",
    "    test_processor = ColabDoclingProcessor(\n",
    "        output_dir=test_output_dir,\n",
    "        use_ocr=True,\n",
    "    )\n",
    "    \n",
    "    try:\n",
    "        print(f\"⏳ Processing {target_path.name}...\")\n",
    "        result = test_processor.converter.convert(str(target_path))\n",
    "        markdown_content = result.document.export_to_markdown()\n",
    "        \n",
    "        # Save to local output\n",
    "        output_file = Path(test_output_dir) / f\"{target_path.stem}.md\"\n",
    "        with open(output_file, 'w', encoding='utf-8') as f:\n",
    "            f.write(markdown_content)\n",
    "            \n",
    "        print(f\"💾 Saved local test file: {output_file}\")\n",
    "        \n",
    "        print(\"\\n\" + \"=\"*50)\n",
    "        print(\"📄 RESULT PREVIEW (First 2000 characters)\")\n",
    "        print(\"=\"*50)\n",
    "        print(markdown_content[:2000])\n",
    "        print(\"\\n\" + \"=\"*50)\n",
    "        print(\"✅ Test completed! Hãy chạy cell tiếp theo để lưu kết quả lên Drive.\")\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"❌ Test failed: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a46429ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5.6. Save Test Result to Google Drive\n",
    "import shutil\n",
    "\n",
    "# Cấu hình đường dẫn lưu trên Drive (Lưu vào folder riêng để dễ so sánh với Marker)\n",
    "drive_test_folder = '/content/drive/MyDrive/docling/test_result_docling'\n",
    "\n",
    "# Biến test_output_dir được định nghĩa ở cell 5.5\n",
    "if 'test_output_dir' in locals() and os.path.exists(test_output_dir):\n",
    "    # Tạo thư mục cha trên Drive nếu chưa có\n",
    "    if not os.path.exists(os.path.dirname(drive_test_folder)):\n",
    "        os.makedirs(os.path.dirname(drive_test_folder), exist_ok=True)\n",
    "        \n",
    "    print(f\"📂 Copying test results to: {drive_test_folder}\")\n",
    "    \n",
    "    # Sử dụng copytree với dirs_exist_ok=True để copy cả thư mục con và ghi đè nếu cần\n",
    "    # Cách này giữ nguyên cấu trúc thư mục (subdir)\n",
    "    try:\n",
    "        shutil.copytree(test_output_dir, drive_test_folder, dirs_exist_ok=True)\n",
    "        print(f\"   ✅ Copied entire folder structure successfully!\")\n",
    "    except Exception as e:\n",
    "        print(f\"   ❌ Error copying folder: {e}\")\n",
    "            \n",
    "    print(f\"\\n🎉 Done! Bạn có thể mở Drive để xem file markdown đầy đủ tại: {drive_test_folder}\")\n",
    "else:\n",
    "    print(\"❌ Không tìm thấy thư mục kết quả test hoặc biến 'test_output_dir' chưa được định nghĩa.\")\n",
    "    print(\"⚠️ Hãy chạy cell 5.5 (Test Run) trước khi chạy cell này!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8228498a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Run Processing & Save Results\n",
    "output_dir = '/content/data/docling_output'\n",
    "# Bật OCR \n",
    "processor = ColabDoclingProcessor(output_dir=output_dir, use_ocr=True) \n",
    "\n",
    "# Determine source directory (handle if zip extracted to subfolder)\n",
    "source_dir = '/content/data_rag/files' \n",
    "# Check if files are in a subfolder named 'files' inside the extraction path\n",
    "if os.path.exists(os.path.join(source_dir, 'files')):\n",
    "    source_dir = os.path.join(source_dir, 'files')\n",
    "\n",
    "# Run\n",
    "processor.parse_directory(source_dir)\n",
    "\n",
    "# Zip output and save to Drive\n",
    "output_zip_path = '/content/drive/MyDrive/docling/docling_output.zip'\n",
    "print(f\"Zipping output to {output_zip_path}...\")\n",
    "shutil.make_archive(output_zip_path.replace('.zip', ''), 'zip', output_dir)\n",
    "print(\"🎉 Done! Check your Google Drive for docling_output.zip\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}