import sys
import json
import shutil
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from core.hash_file.hash_file import HashProcessor
# HuggingFace repo containing raw PDFs
HF_RAW_PDF_REPO = "hungnha/Do_An_Dataset"
def download_from_hf(cache_dir: Path) -> Path:
    """Fetch the raw-PDF dataset snapshot from HuggingFace into *cache_dir*.

    A non-empty cache directory is reused as-is; otherwise the whole dataset
    repo is downloaded. Returns the expected PDF root: ``cache_dir / "data_rag"``
    (existence is checked by the caller).
    """
    from huggingface_hub import snapshot_download

    pdf_root = cache_dir / "data_rag"

    # Reuse a previously populated cache instead of re-downloading.
    cache_populated = cache_dir.exists() and any(cache_dir.iterdir())
    if cache_populated:
        print(f"Cache already exists: {cache_dir}")
        return pdf_root

    print(f"Downloading from HuggingFace: {HF_RAW_PDF_REPO}")
    snapshot_download(
        repo_id=HF_RAW_PDF_REPO,
        repo_type="dataset",
        local_dir=str(cache_dir),
        local_dir_use_symlinks=False,
    )
    return pdf_root
def load_existing_hashes(path: Path) -> dict:
    """Load the previously saved hash index.

    Args:
        path: JSON index file with schema
            ``{"train": [{"filename": ..., "hash": ..., ...}, ...]}``.

    Returns:
        Mapping of relative filename -> stored hash. Empty dict when the
        file is missing, unreadable, or not valid JSON. Individual
        malformed entries are skipped rather than discarding the whole
        index (the previous blanket ``except`` threw everything away on a
        single bad record).
    """
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding='utf-8'))
    except (OSError, json.JSONDecodeError):
        # Best-effort: a corrupt or unreadable index means "no prior hashes".
        return {}
    entries = data.get('train', []) if isinstance(data, dict) else []
    hashes = {}
    for item in entries:
        # Keep only well-formed records; skip the rest silently.
        if isinstance(item, dict) and 'filename' in item and 'hash' in item:
            hashes[item['filename']] = item['hash']
    return hashes
def process_pdfs(source_root: Path, dest_dir: Path, existing_hashes: dict) -> tuple:
    """Mirror every PDF under *source_root* into *dest_dir* and hash each one.

    Files whose destination copy already matches the stored hash are kept
    as-is (counted as skipped); everything else is copied and re-hashed.

    Args:
        source_root: Directory tree containing the source PDFs.
        dest_dir: Directory the PDFs are copied into (tree structure preserved).
        existing_hashes: Mapping of relative path -> previously stored hash.

    Returns:
        ``(results, processed, skipped)`` where *results* is a list of
        ``{'filename', 'hash', 'index'}`` dicts, *processed* counts newly
        copied files and *skipped* counts unchanged ones.
    """
    hasher = HashProcessor(verbose=False)
    # Sort so the persisted 'index' values are deterministic across runs:
    # rglob enumeration order is filesystem-dependent.
    pdf_files = sorted(source_root.rglob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files\n")
    results, processed, skipped = [], 0, 0
    for idx, src in enumerate(pdf_files):
        rel_path = str(src.relative_to(source_root))
        dest = dest_dir / rel_path
        dest.parent.mkdir(parents=True, exist_ok=True)
        # Skip if file unchanged (hash matches)
        if dest.exists() and rel_path in existing_hashes:
            current_hash = hasher.get_file_hash(str(dest))
            if current_hash == existing_hashes[rel_path]:
                results.append({'filename': rel_path, 'hash': current_hash, 'index': idx})
                skipped += 1
                continue
        # Copy and compute hash
        try:
            shutil.copy2(src, dest)
            file_hash = hasher.get_file_hash(str(dest))
            if file_hash:
                results.append({'filename': rel_path, 'hash': file_hash, 'index': idx})
                processed += 1
            else:
                # File was copied but hashing returned nothing: surface it
                # instead of silently leaving the copy out of the index.
                print(f"Warning: could not hash {rel_path}; excluded from index")
        except Exception as e:
            # Best-effort per-file: report and continue with the rest.
            print(f"Error: {rel_path} - {e}")
        # Display progress
        if (idx + 1) % 20 == 0:
            print(f"Progress: {idx + 1}/{len(pdf_files)}")
    return results, processed, skipped
def main():
    """CLI entry point: obtain PDFs (HuggingFace or local directory), mirror
    them into ``data/files``, and persist a JSON hash index for change
    detection on subsequent runs."""
    import argparse

    parser = argparse.ArgumentParser(description="Download PDFs and build hash index")
    parser.add_argument("--source", type=str, help="Local path to PDFs (skip HF download)")
    parser.add_argument("--download-only", action="store_true", help="Download only, no copy")
    args = parser.parse_args()

    data_dir = PROJECT_ROOT / "data"
    files_dir = data_dir / "files"
    files_dir.mkdir(parents=True, exist_ok=True)
    hash_file = data_dir / "hash_data_goc_index.json"

    # Pick the PDF source: an explicit local directory wins over the HF cache.
    if args.source:
        source_root = Path(args.source)
        if not source_root.exists():
            print(f"Source directory not found: {source_root}")
            return
    else:
        source_root = download_from_hf(data_dir / "raw_pdf_cache")
        if args.download_only:
            print(f"PDFs cached at: {source_root}")
            return
        if not source_root.exists():
            print(f"PDF directory not found: {source_root}")
            return

    # Copy + hash every PDF, reusing stored hashes for unchanged files.
    existing = load_existing_hashes(hash_file)
    print(f"Loaded {len(existing)} hashes from existing index")
    results, processed, skipped = process_pdfs(source_root, files_dir, existing)

    # Persist the refreshed index.
    payload = {
        'train': results,
        'total_files': len(results)
    }
    hash_file.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )
    print(f"\nDone! Total: {len(results)} | New: {processed} | Skipped: {skipped}")
    print(f"Index file: {hash_file}")
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()