DoAn / core / hash_file / hash_data_goc.py

change commit

b91b0a5 24 days ago

4.45 kB

	import sys
	import json
	import shutil
	from pathlib import Path

	# Project root is parents[2] of this file's resolved path; put it at the
	# front of sys.path (once) so the absolute ``core.*`` import below resolves
	# when the file is run directly as a script.
	PROJECT_ROOT = Path(__file__).resolve().parents[2]
	if sys.path.count(str(PROJECT_ROOT)) == 0:
	    sys.path.insert(0, str(PROJECT_ROOT))

	from core.hash_file.hash_file import HashProcessor

	# HuggingFace dataset repo that holds the original (raw) PDFs
	HF_RAW_PDF_REPO = "hungnha/Do_An_Dataset"


	def download_from_hf(cache_dir: Path) -> Path:
	    """Download the raw PDFs from HuggingFace and return the ``data_rag`` folder path.

	    Args:
	        cache_dir: Local directory that receives (or already holds) the
	            snapshot of ``HF_RAW_PDF_REPO``.

	    Returns:
	        ``cache_dir / "data_rag"``. The path is only built here, it is not
	        checked for existence.
	    """
	    # Any content inside cache_dir counts as an existing cache. This check runs
	    # before huggingface_hub is imported so that a cached run works even when
	    # the package is not installed.
	    if cache_dir.exists() and any(cache_dir.iterdir()):
	        print(f"Cache đã tồn tại: {cache_dir}")
	        return cache_dir / "data_rag"

	    # Imported lazily: only needed when a download actually happens.
	    from huggingface_hub import snapshot_download

	    print(f"Đang tải từ HuggingFace: {HF_RAW_PDF_REPO}")
	    snapshot_download(
	        repo_id=HF_RAW_PDF_REPO,
	        repo_type="dataset",
	        local_dir=str(cache_dir),
	        local_dir_use_symlinks=False,
	    )
	    return cache_dir / "data_rag"


	def load_existing_hashes(path: Path) -> dict:
	    """Load the previous hash index from a JSON file.

	    The file is read as a JSON object whose ``'train'`` list holds entries
	    with ``'filename'`` and ``'hash'`` keys.

	    Args:
	        path: Location of the JSON index file.

	    Returns:
	        A ``{filename: hash}`` mapping. An empty dict is returned when the
	        file is missing, cannot be read or parsed, or does not have the
	        expected shape.
	    """
	    if not path.exists():
	        return {}
	    try:
	        data = json.loads(path.read_text(encoding='utf-8'))
	        return {item['filename']: item['hash'] for item in data.get('train', [])}
	    except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
	        # Best-effort: a broken index must not abort the run, but report it,
	        # because an empty index makes every file look new.
	        print(f"Cảnh báo: không đọc được index cũ {path} - {e}")
	        return {}


	def process_pdfs(source_root: Path, dest_dir: Path, existing_hashes: dict) -> tuple:
	    """Copy PDFs into ``dest_dir`` and compute their hashes.

	    Every ``*.pdf`` found recursively under ``source_root`` is mirrored to the
	    same relative path under ``dest_dir``. A file whose destination copy
	    already exists and still hashes to the value stored in
	    ``existing_hashes`` is not copied again.

	    Args:
	        source_root: Folder searched recursively for ``*.pdf`` files.
	        dest_dir: Folder that receives the copies.
	        existing_hashes: ``{relative path: hash}`` mapping from the previous index.

	    Returns:
	        ``(results, processed, skipped)``: ``results`` is a list of
	        ``{'filename', 'hash', 'index'}`` dicts, ``processed`` counts files
	        copied and hashed in this run, ``skipped`` counts unchanged files.
	    """
	    hasher = HashProcessor(verbose=False)
	    pdf_files = list(source_root.rglob("*.pdf"))
	    total = len(pdf_files)
	    print(f"Tìm thấy {total} file PDF\n")

	    results, processed, skipped = [], 0, 0

	    for idx, src in enumerate(pdf_files):
	        rel_path = str(src.relative_to(source_root))
	        dest = dest_dir / rel_path
	        dest.parent.mkdir(parents=True, exist_ok=True)

	        # Unchanged means: the copy in dest_dir exists and its hash still
	        # matches the previous index entry.
	        unchanged = False
	        if dest.exists() and rel_path in existing_hashes:
	            current_hash = hasher.get_file_hash(str(dest))
	            unchanged = current_hash == existing_hashes[rel_path]

	        if unchanged:
	            results.append({'filename': rel_path, 'hash': current_hash, 'index': idx})
	            skipped += 1
	        else:
	            # Copy and hash; a failure on one file is reported and the
	            # loop carries on with the next file.
	            try:
	                shutil.copy2(src, dest)
	                file_hash = hasher.get_file_hash(str(dest))
	                # NOTE(review): presumably get_file_hash returns a falsy
	                # value on failure; confirm against HashProcessor.
	                if file_hash:
	                    results.append({'filename': rel_path, 'hash': file_hash, 'index': idx})
	                    processed += 1
	            except Exception as e:
	                print(f"Lỗi: {rel_path} - {e}")

	        # Report progress for every 20th file, including skipped ones
	        # (no `continue` above, so this line is always reached).
	        if (idx + 1) % 20 == 0:
	            print(f"Tiến độ: {idx + 1}/{total}")

	    return results, processed, skipped


	def main():
	    """Command-line entry point: obtain the source PDFs and rebuild the hash index.

	    Options:
	        --source PATH    Use a local PDF folder instead of downloading from
	                         HuggingFace.
	        --download-only  Stop right after the HuggingFace download; it has no
	                         effect when --source is given.

	    PDFs are copied under ``PROJECT_ROOT / "data" / "files"`` and the index is
	    written to ``PROJECT_ROOT / "data" / "hash_data_goc_index.json"``.
	    Problems are reported with a printed message followed by an early return.
	    """
	    import argparse

	    parser = argparse.ArgumentParser(description="Tải PDF và tạo hash index")
	    parser.add_argument("--source", type=str, help="Đường dẫn local tới PDFs (bỏ qua tải HF)")
	    parser.add_argument("--download-only", action="store_true", help="Chỉ tải về, không copy")
	    args = parser.parse_args()

	    data_dir = PROJECT_ROOT / "data"
	    files_dir = data_dir / "files"
	    # Also creates data_dir, which the index file written below lives in.
	    files_dir.mkdir(parents=True, exist_ok=True)
	    hash_file = data_dir / "hash_data_goc_index.json"

	    # Resolve the source folder
	    if args.source:
	        source_root = Path(args.source)
	        if not source_root.exists():
	            print(f"Không tìm thấy thư mục nguồn: {source_root}")
	            return
	    else:
	        # Download from HuggingFace
	        source_root = download_from_hf(data_dir / "raw_pdf_cache")
	        if args.download_only:
	            print(f"PDF đã cache tại: {source_root}")
	            return

	    if not source_root.exists():
	        print(f"Không tìm thấy thư mục PDF: {source_root}")
	        return

	    # Process
	    existing = load_existing_hashes(hash_file)
	    print(f"Đã tải {len(existing)} hash từ index cũ")

	    results, processed, skipped = process_pdfs(source_root, files_dir, existing)

	    # Save the results
	    hash_file.write_text(json.dumps({
	        'train': results,
	        'total_files': len(results)
	    }, ensure_ascii=False, indent=2), encoding='utf-8')

	    # Plain "|" separators: "\|" is not a valid escape sequence and would
	    # print a literal backslash.
	    print(f"\nHoàn tất! Tổng: {len(results)} | Mới: {processed} | Bỏ qua: {skipped}")
	    print(f"File index: {hash_file}")


	# Run the CLI only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()