|
|
import sys |
|
|
import json |
|
|
import shutil |
|
|
from pathlib import Path |
|
|
|
|
|
# Project root: two levels above the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Put the project root at the front of the import path (once) so that
# project-local packages can be imported when this file is run as a script.
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
|
from core.hash_file.hash_file import HashProcessor |
|
|
|
|
|
|
|
|
# Hugging Face repository id of the raw PDF data; download_from_hf passes it
# to snapshot_download with repo_type="dataset".
HF_RAW_PDF_REPO = "hungnha/Do_An_Dataset"
|
|
|
|
|
|
|
|
def download_from_hf(cache_dir: Path) -> Path:
    """Download the raw PDF dataset from Hugging Face into ``cache_dir``.

    The download is skipped when ``cache_dir / "data_rag"`` already exists
    and contains at least one entry.

    Args:
        cache_dir: Local directory that receives the dataset snapshot.

    Returns:
        ``cache_dir / "data_rag"``. The path is returned even if the
        downloaded snapshot does not contain such a folder, so callers
        should check that it exists before using it.
    """
    pdf_root = cache_dir / "data_rag"

    # Judge the cache by the folder that is actually returned rather than by
    # ``cache_dir`` as a whole: an interrupted download can leave ``cache_dir``
    # non-empty without a usable ``data_rag`` folder, and that state must
    # trigger a new download instead of being reported as a cache hit.
    if pdf_root.is_dir() and any(pdf_root.iterdir()):
        print(f"Cache đã tồn tại: {cache_dir}")
        return pdf_root

    # Imported lazily so that a cache hit does not require huggingface_hub.
    from huggingface_hub import snapshot_download

    print(f"Đang tải từ HuggingFace: {HF_RAW_PDF_REPO}")
    snapshot_download(
        repo_id=HF_RAW_PDF_REPO,
        repo_type="dataset",
        local_dir=str(cache_dir),
        # NOTE(review): this flag appears to be deprecated in recent
        # huggingface_hub releases; kept for older versions - TODO confirm
        # against the installed version.
        local_dir_use_symlinks=False,
    )
    return pdf_root
|
|
|
|
|
|
|
|
def load_existing_hashes(path: Path) -> dict:
    """Load the previous hash index and map each filename to its hash.

    The file is read as UTF-8 JSON. Entries are taken from the list stored
    under the top-level ``"train"`` key; each entry must provide the keys
    ``"filename"`` and ``"hash"``.

    Args:
        path: Location of the JSON index file.

    Returns:
        ``{filename: hash}`` for every entry under ``"train"``. An empty dict
        is returned when the file does not exist, cannot be read or parsed,
        or does not have the expected structure.
    """
    if not path.exists():
        # Normal first run: there is no index yet, so nothing to warn about.
        return {}

    try:
        data = json.loads(path.read_text(encoding='utf-8'))
        return {item['filename']: item['hash'] for item in data.get('train', [])}
    except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
        # OSError: unreadable file; ValueError: invalid JSON or bad encoding;
        # KeyError / TypeError / AttributeError: unexpected JSON structure.
        # Stay best-effort (every file is then treated as new), but say so
        # instead of silently discarding the old index.
        print(f"Cảnh báo: không đọc được hash index {path}: {exc!r}")
        return {}
|
|
|
|
|
|
|
|
def process_pdfs(source_root: Path, dest_dir: Path, existing_hashes: dict) -> tuple:
    """Copy every PDF under ``source_root`` into ``dest_dir`` and hash the copies.

    A file is skipped (not copied again) when its destination already exists
    and the destination's current hash equals the hash recorded for it in
    ``existing_hashes``. Otherwise the source is copied over the destination
    and the copy is hashed. Copy/hash errors are printed and the file is left
    out of the results; processing continues with the next file.

    Args:
        source_root: Directory searched recursively for ``*.pdf`` files.
        dest_dir: Directory that receives the copies; each file keeps its
            path relative to ``source_root``.
        existing_hashes: Mapping of relative path (as a string) to the hash
            recorded by a previous run.

    Returns:
        ``(results, processed, skipped)`` where ``results`` is a list of dicts
        with the keys ``'filename'``, ``'hash'`` and ``'index'`` (the file's
        position in the scan), ``processed`` counts newly copied files and
        ``skipped`` counts unchanged files.
    """
    hasher = HashProcessor(verbose=False)
    pdf_files = list(source_root.rglob("*.pdf"))
    total = len(pdf_files)
    print(f"Tìm thấy {total} file PDF\n")

    results, processed, skipped = [], 0, 0

    for idx, src in enumerate(pdf_files):
        rel_path = str(src.relative_to(source_root))
        dest = dest_dir / rel_path
        dest.parent.mkdir(parents=True, exist_ok=True)

        # Unchanged = the copy is present and still matches the old index.
        unchanged = False
        if rel_path in existing_hashes and dest.exists():
            current_hash = hasher.get_file_hash(str(dest))
            unchanged = current_hash == existing_hashes[rel_path]

        if unchanged:
            results.append({'filename': rel_path, 'hash': current_hash, 'index': idx})
            skipped += 1
        else:
            try:
                shutil.copy2(src, dest)
                file_hash = hasher.get_file_hash(str(dest))
            except Exception as e:
                # Best-effort: report the failure and move on to the next file.
                print(f"Lỗi: {rel_path} - {e}")
            else:
                if file_hash:
                    results.append({'filename': rel_path, 'hash': file_hash, 'index': idx})
                    processed += 1
                else:
                    # NOTE(review): HashProcessor is defined elsewhere; an
                    # empty hash presumably signals a hashing failure - TODO
                    # confirm. Report it instead of dropping the file silently.
                    print(f"Lỗi: {rel_path} - không tính được hash")

        # Report progress for every file, including skipped ones, so that
        # incremental runs (mostly skips) still show progress.
        if (idx + 1) % 20 == 0:
            print(f"Tiến độ: {idx + 1}/{total}")

    return results, processed, skipped
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: fetch the PDFs and (re)build the hash index.

    Options:
        --source PATH     Use a local PDF directory instead of downloading
                          from Hugging Face.
        --download-only   Only download into the cache; do not copy or hash.
                          Only consulted when ``--source`` is not given.

    PDFs are copied to ``<PROJECT_ROOT>/data/files`` and the index is written
    to ``<PROJECT_ROOT>/data/hash_data_goc_index.json``. Always returns None;
    problems are reported by printing a message.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Tải PDF và tạo hash index")
    cli.add_argument("--source", type=str, help="Đường dẫn local tới PDFs (bỏ qua tải HF)")
    cli.add_argument("--download-only", action="store_true", help="Chỉ tải về, không copy")
    args = cli.parse_args()

    data_dir = PROJECT_ROOT / "data"
    files_dir = data_dir / "files"
    files_dir.mkdir(parents=True, exist_ok=True)
    hash_file = data_dir / "hash_data_goc_index.json"

    # Resolve where the PDFs come from: a local folder or the HF cache.
    if args.source:
        source_root = Path(args.source)
        if not source_root.exists():
            print(f"Không tìm thấy thư mục nguồn: {source_root}")
            return
    else:
        source_root = download_from_hf(data_dir / "raw_pdf_cache")
        if args.download_only:
            print(f"PDF đã cache tại: {source_root}")
            return

    # The download helper returns its target path even if it was not created.
    if not source_root.exists():
        print(f"Không tìm thấy thư mục PDF: {source_root}")
        return

    known_hashes = load_existing_hashes(hash_file)
    print(f"Đã tải {len(known_hashes)} hash từ index cũ")

    results, processed, skipped = process_pdfs(source_root, files_dir, known_hashes)

    # Overwrite the index with the result of this run.
    index_payload = {
        'train': results,
        'total_files': len(results),
    }
    hash_file.write_text(
        json.dumps(index_payload, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    print(f"\nHoàn tất! Tổng: {len(results)} | Mới: {processed} | Bỏ qua: {skipped}")
    print(f"File index: {hash_file}")
|
|
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|