BaHung committed on
Commit
54b2662
·
0 Parent(s):

Clean repo

Browse files
.gitattributes ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
60
+ *.pdf filter=lfs diff=lfs merge=lfs -text
61
+ data/files/*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ target/
74
+
75
+ # Jupyter Notebook
76
+ .ipynb_checkpoints
77
+
78
+ # IPython
79
+ profile_default/
80
+ ipython_config.py
81
+
82
+ # pyenv
83
+ .python-version
84
+
85
+ __pypackages__/
86
+
87
+ # Celery stuff
88
+ celerybeat-schedule
89
+ celerybeat.pid
90
+
91
+ # SageMath parsed files
92
+ *.sage.py
93
+
94
+ # Environment variables
95
+ .env
96
+ .venv
97
+ env/
98
+ venv/
99
+ ENV/
100
+ env.bak/
101
+ venv.bak/
102
+
103
+ # Spyder project settings
104
+ .spyderproject
105
+ .spyproject
106
+
107
+ # Rope project settings
108
+ .ropeproject
109
+
110
+ # mkdocs documentation
111
+ /site
112
+
113
+ # mypy
114
+ .mypy_cache/
115
+ .dmypy.json
116
+ dmypy.json
117
+
118
+ # Pyre type checker
119
+ .pyre/
120
+
121
+ # IDE files
122
+ .vscode/
123
+ .idea/
124
+ *.swp
125
+ *.swo
126
+ *~
127
+
128
+ # OS generated files
129
+ .DS_Store
130
+ .DS_Store?
131
+ ._*
132
+ .Spotlight-V100
133
+ .Trashes
134
+ ehthumbs.db
135
+ Thumbs.db
136
+
137
+ # Project specific
138
+ marker_out/
139
+ test_input/
140
+ test_output/
141
+ chunks/
142
+ chunking_analysis/
143
+ test_pipeline_*/
144
+
145
+ # Model cache
146
+ .cache/
147
+ models/
148
+ *.safetensors
149
+ *.bin
150
+ *.onnx
151
+
152
+ # Temporary files
153
+ *.tmp
154
+ *.temp
155
+ temp/
156
+ tmp/
157
+
158
+ # Log files
159
+ *.log
160
+ logs/
161
+
162
+ # API keys and sensitive data
163
+ config.json
164
+ secrets.json
165
+ api_keys.txt
166
+
167
+ # Backup files
168
+ *.bak
169
+ *.backup
170
+ *~
171
+ __pycache__/
172
+
173
+ /model/
174
+ /data/
Dockerfile ADDED
File without changes
README.md ADDED
File without changes
api/__init__.py ADDED
File without changes
api/app.py ADDED
File without changes
api/routes/chat.py ADDED
File without changes
api/routes/health.py ADDED
File without changes
api/schemas.py ADDED
File without changes
config/__init__.py ADDED
File without changes
config/base.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
from datasets import load_dataset

# Log in first (e.g. `huggingface-cli login`) so the dataset can be accessed.
ds = load_dataset("hungnha/Do_An_Dataset")
config/finetune_config.yaml ADDED
File without changes
config/parse_config.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ INPUT_PATH: ''
2
+ OUTPUT_PATH: ''
3
+
config/rag_config.yaml ADDED
File without changes
core/__init__.py ADDED
File without changes
core/embeddings/__init__.py ADDED
File without changes
core/embeddings/embedding_model.py ADDED
File without changes
core/embeddings/vector_store.py ADDED
File without changes
core/fine_tune/__init__.py ADDED
File without changes
core/fine_tune/data_prep.py ADDED
File without changes
core/fine_tune/evaluator.py ADDED
File without changes
core/fine_tune/trainer.py ADDED
File without changes
core/hash_file/__init__.py ADDED
File without changes
core/hash_file/hash_data_goc.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import sys
import os
import json
from pathlib import Path

# Make the project root importable when this file is run as a script.
current_file = Path(__file__).resolve()
project_root = current_file.parent.parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from typing import cast, Dict, Any
from datasets import load_dataset, Dataset
from core.hash_file.hash_file import HashProcessor


def main():
    """Download every PDF of the HF dataset into data/files and hash it.

    Re-running is incremental: files whose on-disk hash matches the saved
    index are skipped instead of being downloaded and hashed again.
    """
    # Layout: data/files/train_NNNN.pdf + data/hash_data_goc_index.json
    data_dir = project_root / "data"
    files_dir = data_dir / "files"
    files_dir.mkdir(parents=True, exist_ok=True)

    hash_processor = HashProcessor(verbose=False)
    hash_file_path = data_dir / "hash_data_goc_index.json"

    # Load any existing hash index (dataset row index -> sha256).
    existing_hashes = {}
    if hash_file_path.exists():
        with open(hash_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        existing_hashes = {item['index']: item['hash'] for item in data.get('train', [])}
        print(f"📂 Đã tải {len(existing_hashes)} hash từ index cũ")

    # Fetch the dataset from the Hub.
    print("📥 Đang tải dataset từ Hugging Face...")
    dataset = load_dataset("hungnha/Do_An_Dataset")
    train_dataset = cast(Dataset, dataset['train'])
    print(f"✅ Đã tải {len(train_dataset)} files\n")

    hash_results = []
    skipped = 0
    processed = 0

    for idx, sample in enumerate(train_dataset):
        sample = cast(Dict[str, Any], sample)
        filename = f"train_{idx:04d}.pdf"
        filepath = files_dir / filename

        # Skip files already on disk whose hash still matches the index.
        if filepath.exists() and idx in existing_hashes:
            current_hash = hash_processor.get_file_hash(str(filepath))
            if current_hash == existing_hashes[idx]:
                hash_results.append({
                    'filename': filename,
                    'hash': current_hash,
                    'index': idx
                })
                skipped += 1
                continue

        try:
            pdf_obj = sample['pdf']

            # datasets may yield either a {'bytes': ...} dict or raw bytes.
            if isinstance(pdf_obj, dict) and 'bytes' in pdf_obj:
                pdf_bytes = pdf_obj['bytes']
            elif isinstance(pdf_obj, bytes):
                pdf_bytes = pdf_obj
            else:
                print(f"⚠️ Bỏ qua file {idx} - định dạng dữ liệu không hỗ trợ: {type(pdf_obj)}")
                continue

            filepath.write_bytes(pdf_bytes)

            file_hash = hash_processor.get_file_hash(str(filepath))
            if file_hash is None:
                print(f"❌ Lỗi tính hash cho file {idx}")
                continue

            hash_results.append({
                'filename': filename,
                'hash': file_hash,
                'index': idx
            })
            processed += 1

            if (idx + 1) % 10 == 0:
                print(f"📄 Đã xử lý {idx + 1}/{len(train_dataset)} files (mới: {processed}, bỏ qua: {skipped})")

        except Exception as e:
            print(f"❌ Lỗi xử lý file {idx}: {e}")
            continue

    # Persist the hash index. Explicit UTF-8: without it write_text uses the
    # platform default encoding, which breaks the ensure_ascii=False output
    # on non-UTF-8 locales (bug fix).
    hash_file_path.write_text(
        json.dumps({'train': hash_results}, indent=2, ensure_ascii=False),
        encoding='utf-8'
    )

    print(f"\n✅ Hoàn thành!")
    print(f" - Đã xử lý mới: {processed} files")
    print(f" - Đã bỏ qua: {skipped} files")
    print(f" - Tổng cộng: {len(hash_results)} files")
    print(f"📁 Thư mục files: {files_dir}")
    print(f"📄 Hash index: {hash_file_path}")


if __name__ == "__main__":
    main()
core/hash_file/hash_file.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import hashlib
import json
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional
from datetime import datetime

# Constants
CHUNK_SIZE = 8192  # 8KB chunks for reading files
DEFAULT_FILE_EXTENSION = '.pdf'

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


class HashProcessor:
    """Simplified HashProcessor for RAG system - only core functionality."""

    def __init__(self, verbose: bool = True):
        """
        Args:
            verbose: When False, suppress INFO-level log output.
        """
        self.verbose = verbose
        self.logger = logging.getLogger(__name__)
        if not verbose:
            self.logger.setLevel(logging.WARNING)

    def get_file_hash(self, path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file, or None when it cannot be read."""
        h = hashlib.sha256()
        try:
            with open(path, "rb") as f:
                # Stream in fixed-size chunks so large files stay cheap.
                while chunk := f.read(CHUNK_SIZE):
                    h.update(chunk)
            return h.hexdigest()
        except (IOError, OSError) as e:
            self.logger.error(f"Lỗi khi đọc file {path}: {e}")
            return None
        except Exception as e:
            self.logger.error(f"Lỗi không xác định khi xử lý file {path}: {e}")
            return None

    def scan_files_for_hash(
        self,
        source_dir: str,
        file_extension: str = DEFAULT_FILE_EXTENSION
    ) -> Dict[str, List[Dict[str, Any]]]:
        """Scan a directory and calculate the hash of each matching file.

        Returns a mapping sha256 -> list of {filename, path, size} entries,
        so duplicate files (identical content) group under one key.
        (Annotation fix: 'size' is an int, so the entry values are Any.)

        Raises:
            FileNotFoundError: *source_dir* does not exist.
            NotADirectoryError: *source_dir* is not a directory.
            PermissionError: directory cannot be listed.
        """
        if not os.path.exists(source_dir):
            raise FileNotFoundError(f"Thư mục không tồn tại: {source_dir}")

        if not os.path.isdir(source_dir):
            raise NotADirectoryError(f"Đường dẫn không phải là thư mục: {source_dir}")

        hash_to_files = defaultdict(list)

        self.logger.info(f"Đang quét file trong thư mục: {source_dir}")

        try:
            files = [f for f in os.listdir(source_dir)
                     if f.lower().endswith(file_extension.lower())]

            for filename in files:
                file_path = os.path.join(source_dir, filename)

                if not os.path.isfile(file_path):
                    continue

                # NOTE(review): this log line had an empty f-string where the
                # filename clearly belonged; restored the placeholder.
                self.logger.info(f"Đang tính hash cho: {filename}")

                file_hash = self.get_file_hash(file_path)
                if file_hash:
                    hash_to_files[file_hash].append({
                        'filename': filename,
                        'path': file_path,
                        'size': os.path.getsize(file_path)
                    })
        except PermissionError as e:
            self.logger.error(f"Không có quyền truy cập thư mục {source_dir}: {e}")
            raise

        return hash_to_files

    def load_processed_index(self, index_file: str) -> Dict:
        """Load the processed index from JSON; {} when missing or corrupt."""
        if os.path.exists(index_file):
            try:
                with open(index_file, "r", encoding="utf-8") as f:
                    return json.load(f)
            except json.JSONDecodeError as e:
                self.logger.error(f"Lỗi đọc file index {index_file}: {e}")
                return {}
            except Exception as e:
                self.logger.error(f"Lỗi không xác định khi đọc index: {e}")
                return {}
        return {}

    def save_processed_index(self, index_file: str, processed_hashes: Dict) -> None:
        """Save the processed index to JSON (best-effort; errors are logged)."""
        try:
            # Create the parent directory if needed. Guarded: for a bare
            # filename os.path.dirname() is "" and os.makedirs("") raises,
            # which previously made every such save fail silently (bug fix).
            parent = os.path.dirname(index_file)
            if parent:
                os.makedirs(parent, exist_ok=True)

            with open(index_file, "w", encoding="utf-8") as f:
                json.dump(processed_hashes, f, indent=2, ensure_ascii=False)
            self.logger.info(f"Đã lưu index file: {index_file}")
        except Exception as e:
            self.logger.error(f"Lỗi khi lưu index file {index_file}: {e}")

    def get_current_timestamp(self) -> str:
        """Get the current timestamp in ISO format."""
        return datetime.now().isoformat()

    def get_string_hash(self, text: str) -> str:
        """Calculate the SHA256 hash of a string (UTF-8 encoded)."""
        return hashlib.sha256(text.encode('utf-8')).hexdigest()
core/preprocessing/__init__.py ADDED
File without changes
core/preprocessing/chunker.py ADDED
File without changes
core/preprocessing/docling_processor.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os
import signal
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
import sys

# Import dependencies
from core.hash_file.hash_file import HashProcessor

from docling.document_converter import DocumentConverter, FormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


class DoclingProcessor:
    """Convert PDFs to Docling JSON with a SHA256-keyed cache.

    Files already converted in a previous run are reused from disk
    instead of being parsed again.
    """

    def __init__(self, output_dir: str, use_ocr: bool = False, timeout: int = 300):
        """
        Args:
            output_dir: Directory where Docling JSON outputs are written.
            use_ocr: Enable Docling's OCR pipeline (needed for scanned PDFs).
            timeout: Per-file processing limit in seconds (SIGALRM-based).
        """
        self.output_dir = output_dir
        self.use_ocr = use_ocr
        self.timeout = timeout
        self.hash_processor = HashProcessor(verbose=False)
        os.makedirs(output_dir, exist_ok=True)

        # Cache index lives next to (not inside) output_dir, so wiping the
        # outputs does not also destroy the bookkeeping file.
        data_dir = Path(output_dir).parent
        self.index_file = str(data_dir / "hash_docling_index.json")
        self.parsed_docs = self.hash_processor.load_processed_index(self.index_file)

        if not use_ocr:
            # PDF pipeline with OCR fully disabled; table structure is kept.
            pdf_pipeline_options = PdfPipelineOptions(
                do_ocr=False,
                do_table_structure=True,
                do_picture_classification=False,
                do_picture_description=False
            )

            format_options = {
                InputFormat.PDF: FormatOption(
                    backend=PyPdfiumDocumentBackend,
                    pipeline_cls=StandardPdfPipeline,
                    pipeline_options=pdf_pipeline_options
                )
            }

            self.converter = DocumentConverter(format_options=format_options)
            print("🔧 OCR completely disabled for docling")
        else:
            # Default converter with OCR enabled.
            self.converter = DocumentConverter()
            print("🔧 OCR enabled for docling")

    def parse_document(self, file_path: str) -> Optional[Dict]:
        """Parse one PDF and return its Docling JSON dict (cache-aware).

        Returns None on missing/unreadable files, timeout, or parse failure.
        """
        if not os.path.exists(file_path):
            return None

        filename = os.path.basename(file_path)
        file_hash = self.hash_processor.get_file_hash(file_path)
        if file_hash is None:
            # Bug fix: an unreadable file previously crashed later on
            # file_hash[:8]; bail out early instead.
            return None

        # Cache hit: reuse the previously generated JSON if still on disk.
        if file_hash in self.parsed_docs:
            cached_info = self.parsed_docs[file_hash]
            output_path = os.path.join(self.output_dir, cached_info['output_file'])
            if os.path.exists(output_path):
                # NOTE(review): these status messages printed a literal
                # "(unknown)"; restored the obvious {filename} placeholder.
                print(f"⏭️ Already parsed: {filename}")
                with open(output_path, 'r', encoding='utf-8') as f:
                    return json.load(f)

        try:
            print(f"🔄 Processing: {filename}...")

            # SIGALRM-based timeout — only works on Unix, in the main thread.
            signal.signal(signal.SIGALRM, lambda s, f: (_ for _ in ()).throw(TimeoutError("Processing timeout")))
            signal.alarm(self.timeout)

            result = self.converter.convert(file_path)
            docling_json = result.document.export_to_dict()

            # Cancel the pending alarm.
            signal.alarm(0)

        except TimeoutError:
            print(f"⏰ Timeout processing {filename} (>{self.timeout}s)")
            signal.alarm(0)
            return None
        except Exception as e:
            print(f"❌ Failed to parse {filename}: {e}")
            signal.alarm(0)
            return None

        # Hash suffix in the name avoids collisions between files sharing a stem.
        output_file = f"{Path(filename).stem}_{file_hash[:8]}.json"
        output_path = os.path.join(self.output_dir, output_file)

        # Minified JSON, matching Docling's own default output.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(docling_json, f, ensure_ascii=False)

        # Record the result in the cache index.
        self.parsed_docs[file_hash] = {
            "filename": filename,
            "output_file": output_file,
            "parsed_date": datetime.now().isoformat()
        }
        self.hash_processor.save_processed_index(self.index_file, self.parsed_docs)

        print(f"✓ Parsed: {filename}")
        return docling_json

    def parse_directory(self, source_dir: str) -> Dict:
        """Parse every PDF under *source_dir*; return total/parsed/error counts."""
        print(f"Parsing PDFs in: {source_dir}")

        # Reuse HashProcessor to enumerate and hash the files.
        hash_to_files = self.hash_processor.scan_files_for_hash(source_dir, '.pdf')

        results = {"total": 0, "parsed": 0, "errors": 0}

        for file_hash, file_list in hash_to_files.items():
            for file_info in file_list:
                results["total"] += 1
                if self.parse_document(file_info['path']):
                    results["parsed"] += 1
                else:
                    results["errors"] += 1

        print(f"Summary: {results['parsed']}/{results['total']} files parsed")
        return results
core/preprocessing/pdf_parser.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import traceback
4
+ import warnings
5
+ from pathlib import Path
6
+
7
+
8
+ from core.preprocessing.docling_processor import DoclingProcessor
9
+
10
+ # Tắt cảnh báo pin_memory từ docling/PyTorch
11
+ warnings.filterwarnings("ignore", message=".*pin_memory.*")
12
+
13
+
14
def get_project_paths():
    """Return (source_dir, output_dir) as absolute path strings under data/."""
    source_dir = Path("data/files").resolve()
    output_dir = Path("data/docling_output").resolve()
    return str(source_dir), str(output_dir)


def main(source_dir=None, output_dir=None, use_ocr=False, timeout=300):
    """Parse PDF documents with Docling.

    Args:
        source_dir: Directory containing PDFs (auto-detected when None).
        output_dir: Directory for JSON outputs (auto-detected when None).
        use_ocr: Enable Docling OCR.
        timeout: Per-file timeout in seconds.

    Returns:
        Process-style exit code: 0 on success, 1 on error.
    """
    # Fill in any path the caller did not supply.
    if source_dir is None or output_dir is None:
        auto_source, auto_output = get_project_paths()
        source_dir = source_dir or auto_source
        output_dir = output_dir or auto_output

    # Bail out early when there is nothing to parse.
    if not os.path.exists(source_dir):
        print(f"❌ Source not found: {source_dir}")
        print(f"\n💡 Solution:")
        print(f" 1. Run hash_data_goc.py first to download PDFs")
        print(f" 2. Or specify path: python parse_data_hash.py --source /path/to/pdfs")
        return 1

    print(f"📂 Source: {source_dir}")
    print(f"📁 Output: {output_dir}\n")

    try:
        processor = DoclingProcessor(
            output_dir=output_dir,
            use_ocr=use_ocr,
            timeout=timeout
        )

        results = processor.parse_directory(source_dir)

        print(f"\n📊 Total: {results['total']} docs | "
              f"Parsed: {results['parsed']} | Errors: {results['errors']}\n")

        return 0

    except Exception as e:
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Parse PDFs with Docling")
    parser.add_argument("--source", help="Source directory with PDFs")
    parser.add_argument("--output", help="Output directory for results")
    parser.add_argument("--ocr", action="store_true", help="Enable OCR")
    parser.add_argument("--timeout", type=int, default=300, help="Timeout per file in seconds (default: 300)")
    args = parser.parse_args()

    # Fix: use sys.exit (sys is already imported) instead of the interactive
    # builtin exit(), which is not guaranteed to exist in all interpreters.
    sys.exit(main(
        source_dir=args.source,
        output_dir=args.output,
        use_ocr=args.ocr,
        timeout=args.timeout
    ))
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #unsloth
2
+ #langchain
3
+ docling
4
+ datasets
5
+ pdfplumber
test/parse_data_hash_test.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import random
4
+
5
+ # Ensure project root is on sys.path so `core` and `config` can be imported
6
+ _PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
7
+ if _PROJECT_ROOT not in sys.path:
8
+ sys.path.insert(0, _PROJECT_ROOT)
9
+
10
+ from huggingface_hub import HfApi, hf_hub_download
11
+ from core.preprocessing.docling_processor import DoclingProcessor
12
+ from config.base import ds
13
+
14
+ REPO_ID = "hungnha/Do_An_Dataset"
15
+
16
def _extract_pdf_path_from_example(example):
    """Return the first string value in *example* naming an existing local
    .pdf file, or None when no such path can be found."""
    if not isinstance(example, dict):
        return None
    for candidate in example.values():
        if (
            isinstance(candidate, str)
            and candidate.lower().endswith('.pdf')
            and os.path.exists(candidate)
        ):
            return candidate
    return None
24
+
25
def _download_random_pdf_from_hub(repo_id: str) -> str | None:
    """Download one randomly chosen PDF from the dataset repo.

    Returns the local cached path, or None when the repo contains no PDFs or
    the download fails. (Annotation fix: the original claimed `-> str` but
    returns None on both failure paths.)
    """
    api = HfApi()
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    pdf_files = [f for f in files if f.lower().endswith('.pdf')]
    if not pdf_files:
        return None
    chosen = random.choice(pdf_files)
    # Download into the local HF cache; best-effort.
    try:
        return hf_hub_download(repo_id=repo_id, filename=chosen, repo_type="dataset")
    except Exception:
        return None
38
+
39
def main(output_dir=None, use_ocr=False):
    """Smoke-test Docling on a single randomly chosen PDF from the dataset.

    Returns a process-style exit code: 0 on success, 1 on any failure.
    """
    # Only the output location is local; PDFs come from the HF hub/cache.
    if output_dir is None:
        output_dir = "core/data"

    # Pick the primary split, preferring 'train' when present.
    split = 'train' if 'train' in ds else list(ds.keys())[0]
    dataset_split = ds[split]
    print(f"📚 Using split: {split} (n={len(dataset_split)})")

    try:
        # First choice: pull one random PDF straight from the dataset repo.
        file_path = _download_random_pdf_from_hub(REPO_ID)

        if file_path is None:
            # Fallback: probe random cached examples for a usable local path.
            if len(dataset_split) == 0:
                print("❌ Dataset split is empty")
                return 1
            for _ in range(32):
                candidate = _extract_pdf_path_from_example(
                    dataset_split[random.randint(0, len(dataset_split) - 1)]
                )
                if candidate is not None:
                    file_path = candidate
                    break
            if file_path is None:
                print("❌ Could not locate any PDF (hub or cache)")
                return 1

        random_file = os.path.basename(file_path)

        print(f"🎯 Testing with: {random_file}\n")

        # Processor still goes through the normal cache system.
        processor = DoclingProcessor(
            output_dir=output_dir,
            use_ocr=use_ocr,
            timeout=300
        )

        result = processor.parse_document(file_path)

        if not result:
            print(f"\n❌ Test failed for: {random_file}")
            return 1

        print(f"\n✅ Test successful!")
        print(f"📊 Parsed: {random_file}")
        # Report the generated output file, if it can be located.
        random_stem = os.path.splitext(random_file)[0]
        output_files = [
            f for f in os.listdir(output_dir)
            if random_stem in f
        ]
        if output_files:
            print(f"📄 Output: {output_files[0]}")

        return 0

    except Exception as e:
        print(f"\n❌ Error: {e}")
        return 1


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Test Docling with 1 random PDF from HF cache")
    parser.add_argument("--output", help="Output directory")
    parser.add_argument("--ocr", action="store_true", help="Enable OCR")
    args = parser.parse_args()

    sys.exit(main(
        output_dir=args.output,
        use_ocr=args.ocr
    ))
utils/__init__.py ADDED
File without changes
utils/helpers.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ import re
5
+ import time
6
+ from contextlib import contextmanager
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Any, Callable, Dict, Generator, Iterable, Iterator, List, Optional, Sequence, Tuple, TypeVar
10
+
11
+ import yaml
12
+
13
+ T = TypeVar("T")
14
+
15
+
16
+ # Filesystem helpers
17
+ def ensure_dir(path: str | os.PathLike) -> str:
18
+ p = Path(path)
19
+ p.mkdir(parents=True, exist_ok=True)
20
+ return str(p)
21
+
22
+
23
+ def read_json(path: str | os.PathLike) -> Any:
24
+ with open(path, "r", encoding="utf-8") as f:
25
+ return json.load(f)
26
+
27
+
28
+ def write_json(data: Any, path: str | os.PathLike, *, indent: int = 2) -> None:
29
+ Path(path).parent.mkdir(parents=True, exist_ok=True)
30
+ with open(path, "w", encoding="utf-8") as f:
31
+ json.dump(data, f, ensure_ascii=False, indent=indent)
32
+
33
+
34
+ def read_yaml(path: str | os.PathLike) -> Any:
35
+ with open(path, "r", encoding="utf-8") as f:
36
+ return yaml.safe_load(f)
37
+
38
+
39
+ def write_yaml(data: Any, path: str | os.PathLike) -> None:
40
+ Path(path).parent.mkdir(parents=True, exist_ok=True)
41
+ with open(path, "w", encoding="utf-8") as f:
42
+ yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True)
43
+
44
+
45
+ # General helpers
46
+ def set_seed(seed: int) -> None:
47
+ random.seed(seed)
48
+ try:
49
+ import numpy as np # type: ignore
50
+ np.random.seed(seed)
51
+ except Exception:
52
+ pass
53
+ try:
54
+ import torch # type: ignore
55
+ torch.manual_seed(seed)
56
+ torch.cuda.manual_seed_all(seed)
57
+ torch.backends.cudnn.deterministic = True # type: ignore[attr-defined]
58
+ torch.backends.cudnn.benchmark = False # type: ignore[attr-defined]
59
+ except Exception:
60
+ pass
61
+
62
+
63
+ def load_env(key: str, default: Optional[str] = None) -> Optional[str]:
64
+ val = os.getenv(key)
65
+ return val if val is not None else default
66
+
67
+
68
+ def slugify_filename(name: str, max_len: int = 128) -> str:
69
+ base = re.sub(r"[^a-zA-Z0-9._-]+", "-", name).strip("-._")
70
+ return base[:max_len]
71
+
72
+
73
+ def safe_stem(path: str | os.PathLike) -> str:
74
+ p = Path(path)
75
+ return p.stem
76
+
77
+
78
+ def batched(iterable: Iterable[T], batch_size: int) -> Iterator[List[T]]:
79
+ batch: List[T] = []
80
+ for item in iterable:
81
+ batch.append(item)
82
+ if len(batch) >= batch_size:
83
+ yield batch
84
+ batch = []
85
+ if batch:
86
+ yield batch
87
+
88
+
89
# Timing and retry utilities
def timeit(func: Callable[..., T]) -> Callable[..., T]:
    """Decorator: print how long each call to *func* takes, in milliseconds."""
    from functools import wraps  # local import: module header has no functools

    @wraps(func)  # fix: preserve __name__/__doc__ of the wrapped function
    def wrapper(*args: Any, **kwargs: Any) -> T:
        start = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            elapsed = (time.perf_counter() - start) * 1000
            print(f"⏱️ {func.__name__} took {elapsed:.2f} ms")
    return wrapper


def retry(
    exceptions: Tuple[type[BaseException], ...] = (Exception,),
    tries: int = 3,
    delay: float = 0.5,
    backoff: float = 2.0,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Decorator factory: retry the wrapped call up to *tries* times.

    Sleeps *delay* seconds after each failure, multiplying the delay by
    *backoff* before the next attempt. The final attempt is made outside
    the loop so its exception propagates to the caller unchanged.
    """
    from functools import wraps  # local import: module header has no functools

    def decorator(fn: Callable[..., T]) -> Callable[..., T]:
        @wraps(fn)  # fix: preserve the wrapped function's metadata
        def inner(*args: Any, **kwargs: Any) -> T:
            attempts_left, wait = tries, delay
            while attempts_left > 1:
                try:
                    return fn(*args, **kwargs)
                except exceptions:
                    time.sleep(wait)
                    attempts_left -= 1
                    wait *= backoff
            # Last attempt: let any exception propagate.
            return fn(*args, **kwargs)
        return inner
    return decorator
120
+
121
+
122
# Text utilities helpful for RAG
def normalize_text(text: str) -> str:
    """Collapse whitespace runs (incl. NBSP) to single spaces and trim."""
    without_nbsp = text.replace("\u00A0", " ")  # non-breaking space
    collapsed = re.sub(r"\s+", " ", without_nbsp)
    return collapsed.strip()


def strip_markdown(text: str) -> str:
    """Very light-weight markdown stripper for indexing."""
    text = re.sub(r"`{1,3}[^`]*`{1,3}", " ", text)     # code spans/blocks
    text = re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", text)  # [label](url) -> label
    text = re.sub(r"[#>*_~`]+", " ", text)             # punctuation markers
    return normalize_text(text)
135
+
136
+
utils/logger.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from typing import Optional
4
+
5
+
6
+ def _get_level(level: Optional[str | int]) -> int:
7
+ if isinstance(level, int):
8
+ return level
9
+ if isinstance(level, str):
10
+ try:
11
+ return getattr(logging, level.upper())
12
+ except AttributeError:
13
+ return logging.INFO
14
+ # ENV override
15
+ env_level = os.getenv("LOG_LEVEL")
16
+ if env_level:
17
+ return getattr(logging, env_level.upper(), logging.INFO)
18
+ return logging.INFO
19
+
20
+
21
+ def setup_root_logger(level: Optional[str | int] = None) -> None:
22
+ """Configure root logger once. Safe to call multiple times."""
23
+ if getattr(setup_root_logger, "_configured", False):
24
+ return
25
+
26
+ resolved = _get_level(level)
27
+ logging.basicConfig(
28
+ level=resolved,
29
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
30
+ )
31
+ setup_root_logger._configured = True # type: ignore[attr-defined]
32
+
33
+
34
+ def get_logger(name: Optional[str] = None, level: Optional[str | int] = None) -> logging.Logger:
35
+ """Create or fetch a module-scoped logger with consistent formatting.
36
+
37
+ - Honors LOG_LEVEL env if level not provided.
38
+ - Does not add duplicate handlers on repeated calls.
39
+ """
40
+ setup_root_logger(level)
41
+ logger = logging.getLogger(name if name else __name__)
42
+ if level is not None:
43
+ logger.setLevel(_get_level(level))
44
+ return logger
45
+
46
+
utils/metrics.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import time
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
5
+
6
+
7
@dataclass
class RollingAverage:
    """Fixed-size sliding window of float samples with an arithmetic mean."""

    # Maximum number of samples retained.
    window: int = 100
    # Oldest-first sample buffer.
    values: List[float] = field(default_factory=list)

    def add(self, x: float) -> None:
        """Append a sample, evicting the oldest once the window is full."""
        self.values.append(x)
        while len(self.values) > self.window:
            del self.values[0]

    def mean(self) -> float:
        """Average of the retained samples; 0.0 when the window is empty."""
        if not self.values:
            return 0.0
        return sum(self.values) / len(self.values)
19
+
20
+
21
class LatencyTracker:
    """Context manager that records elapsed wall-clock time (ms) into a sink."""

    def __init__(self, name: str = "latency_ms", sink: Optional[RollingAverage] = None):
        self.name = name
        # Fall back to a fresh rolling window when no sink is supplied.
        self.sink = sink if sink else RollingAverage()
        self._start = 0.0

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc, tb):
        # Record even when the body raised; the exception still propagates
        # because we do not return True here.
        self.sink.add(1000 * (time.perf_counter() - self._start))

    @property
    def avg_ms(self) -> float:
        """Mean of the recorded latencies, in milliseconds."""
        return self.sink.mean()
38
+
39
+
40
+ def _safe_div(num: float, den: float) -> float:
41
+ return num / den if den else 0.0
42
+
43
+
44
+ # Retrieval metrics for RAG
45
def hit_rate_at_k(retrieved: Sequence[Sequence[str]], gold: Sequence[Sequence[str]], k: int = 5) -> float:
    """Fraction of queries whose top-k retrieved ids contain any gold id.

    Denominator is len(retrieved); an empty input yields 0.0.
    """
    total = len(retrieved)
    hits = sum(
        1
        for preds, truths in zip(retrieved, gold)
        if set(preds[:k]) & set(truths)
    )
    return _safe_div(hits, total)
53
+
54
+
55
def recall_at_k(retrieved: Sequence[Sequence[str]], gold: Sequence[Sequence[str]], k: int = 5) -> float:
    """Mean per-query recall of the gold ids within the top-k predictions."""
    total = len(retrieved)
    total_recall = 0.0
    for preds, truths in zip(retrieved, gold):
        truths_set = set(truths)
        if not truths_set:
            # Queries without gold ids contribute zero recall, as before.
            continue
        overlap = set(preds[:k]) & truths_set
        total_recall += len(overlap) / len(truths_set)
    return _safe_div(total_recall, total)
64
+
65
+
66
def mrr_at_k(retrieved: Sequence[Sequence[str]], gold: Sequence[Sequence[str]], k: int = 5) -> float:
    """Mean reciprocal rank of the first gold hit within the top-k predictions."""
    total = len(retrieved)
    score = 0.0
    for preds, truths in zip(retrieved, gold):
        truth_set = set(truths)
        # Reciprocal rank of the first relevant prediction; 0.0 if none in top-k.
        reciprocal = next(
            (1.0 / rank for rank, pid in enumerate(preds[:k], start=1) if pid in truth_set),
            0.0,
        )
        score += reciprocal
    return _safe_div(score, total)
78
+
79
+
80
def evaluate_retrieval(
    retrieved: Sequence[Sequence[str]],
    gold: Sequence[Sequence[str]],
    k: int = 5,
) -> Dict[str, float]:
    """Bundle hit-rate, recall, and MRR at the same cutoff into one report."""
    report: Dict[str, float] = {}
    report["hit_rate@k"] = hit_rate_at_k(retrieved, gold, k)
    report["recall@k"] = recall_at_k(retrieved, gold, k)
    report["mrr@k"] = mrr_at_k(retrieved, gold, k)
    return report
90
+
91
+
92
+ # Token utilities
93
def estimate_tokens(text: str, model_name: Optional[str] = None) -> int:
    """Best-effort token count: tiktoken when available, else ~4 chars/token."""
    try:
        import tiktoken  # type: ignore

        if model_name:
            encoder = tiktoken.encoding_for_model(model_name)
        else:
            encoder = tiktoken.get_encoding("cl100k_base")
        return len(encoder.encode(text))
    except Exception:
        # tiktoken missing or the model name unknown: fall back to a rough
        # heuristic of about four characters per token, never below one.
        return max(1, math.ceil(len(text) / 4))
101
+
102
+