File size: 4,136 Bytes
54b2662
 
9348624
54b2662
 
b91b0a5
 
 
54b2662
 
 
92c9b4d
c429a2d
 
 
 
b91b0a5
c429a2d
92c9b4d
c429a2d
92c9b4d
c429a2d
 
92c9b4d
c429a2d
 
 
 
 
 
 
 
 
b91b0a5
 
 
 
 
 
 
 
 
 
 
 
 
92c9b4d
b91b0a5
 
 
 
 
 
 
 
92c9b4d
b91b0a5
 
 
 
 
 
 
92c9b4d
b91b0a5
 
 
 
 
 
 
92c9b4d
b91b0a5
92c9b4d
b91b0a5
92c9b4d
b91b0a5
 
 
 
54b2662
c429a2d
92c9b4d
 
 
c429a2d
 
b91b0a5
c429a2d
54b2662
b91b0a5
54b2662
92c9b4d
c429a2d
 
 
92c9b4d
c429a2d
92c9b4d
b91b0a5
c429a2d
92c9b4d
c429a2d
9348624
92c9b4d
54b2662
92c9b4d
b91b0a5
92c9b4d
9348624
b91b0a5
54b2662
92c9b4d
b91b0a5
 
 
 
54b2662
92c9b4d
 
9348624
54b2662
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import sys
import json
import shutil
from pathlib import Path

# Make the repository root importable so `core.*` resolves when this
# script is executed directly from its subdirectory.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from core.hash_file.hash_file import HashProcessor

# HuggingFace repo containing raw PDFs
HF_RAW_PDF_REPO = "hungnha/Do_An_Dataset"


def download_from_hf(cache_dir: Path) -> Path:
    """Fetch the raw-PDF dataset snapshot from HuggingFace Hub into *cache_dir*.

    Returns the ``data_rag`` subdirectory of *cache_dir* (the layout used by
    the HF_RAW_PDF_REPO dataset). If *cache_dir* already exists and is
    non-empty the download is skipped entirely.

    Raises whatever ``huggingface_hub.snapshot_download`` raises on network
    or auth failure when a download is actually needed.
    """
    # Check the cache BEFORE importing huggingface_hub: the cached fast-path
    # must not require the package to be installed at all.
    if cache_dir.exists() and any(cache_dir.iterdir()):
        print(f"Cache already exists: {cache_dir}")
        return cache_dir / "data_rag"

    from huggingface_hub import snapshot_download

    print(f"Downloading from HuggingFace: {HF_RAW_PDF_REPO}")
    # NOTE: `local_dir_use_symlinks` was dropped — it is deprecated and a
    # no-op in current huggingface_hub; `local_dir` alone copies real files.
    snapshot_download(
        repo_id=HF_RAW_PDF_REPO,
        repo_type="dataset",
        local_dir=str(cache_dir),
    )
    return cache_dir / "data_rag"


def load_existing_hashes(path: Path) -> dict:
    """Load a previously written hash index and map filename -> hash.

    Returns an empty dict when *path* does not exist (first run) or when the
    file is unreadable/malformed — the caller then simply rebuilds the index.
    """
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding='utf-8'))
        return {item['filename']: item['hash'] for item in data.get('train', [])}
    # Narrowed from a bare `except Exception`: OSError (read failure),
    # JSONDecodeError (corrupt file), KeyError (missing 'filename'/'hash'),
    # TypeError/AttributeError (wrong top-level structure, e.g. a list).
    except (OSError, json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
        # Best-effort by design, but say why instead of failing silently.
        print(f"Warning: could not read hash index {path}: {e}")
        return {}


def process_pdfs(source_root: Path, dest_dir: Path, existing_hashes: dict) -> tuple:
    """Mirror every PDF under *source_root* into *dest_dir* and hash each copy.

    A destination file whose hash matches the one recorded in
    *existing_hashes* (keyed by path relative to *source_root*) is left
    untouched and counted as skipped; everything else is (re)copied.

    Returns ``(results, processed, skipped)`` where ``results`` is a list of
    ``{'filename', 'hash', 'index'}`` records in discovery order.
    """
    hasher = HashProcessor(verbose=False)
    all_pdfs = list(source_root.rglob("*.pdf"))
    total = len(all_pdfs)
    print(f"Found {total} PDF files\n")

    records = []
    copied_count = 0
    unchanged_count = 0

    for position, source_file in enumerate(all_pdfs):
        relative = str(source_file.relative_to(source_root))
        target = dest_dir / relative
        target.parent.mkdir(parents=True, exist_ok=True)

        # Reuse the existing copy when its on-disk hash matches the index.
        known_hash = existing_hashes.get(relative)
        if known_hash is not None and target.exists():
            on_disk = hasher.get_file_hash(str(target))
            if on_disk == known_hash:
                records.append({'filename': relative, 'hash': on_disk, 'index': position})
                unchanged_count += 1
                continue

        # Otherwise copy (metadata-preserving) and hash the fresh copy;
        # a failure is reported and the file is simply left out of the index.
        try:
            shutil.copy2(source_file, target)
            fresh_hash = hasher.get_file_hash(str(target))
            if fresh_hash:
                records.append({'filename': relative, 'hash': fresh_hash, 'index': position})
                copied_count += 1
        except Exception as e:
            print(f"Error: {relative} - {e}")

        # Periodic progress line every 20 files.
        if (position + 1) % 20 == 0:
            print(f"Progress: {position + 1}/{total}")

    return records, copied_count, unchanged_count


def main():
    """CLI entry point: fetch raw PDFs, mirror them into data/files,
    and write the JSON hash index used by later pipeline stages."""
    import argparse
    arg_parser = argparse.ArgumentParser(description="Download PDFs and build hash index")
    arg_parser.add_argument("--source", type=str, help="Local path to PDFs (skip HF download)")
    arg_parser.add_argument("--download-only", action="store_true", help="Download only, no copy")
    opts = arg_parser.parse_args()

    data_root = PROJECT_ROOT / "data"
    pdf_store = data_root / "files"
    pdf_store.mkdir(parents=True, exist_ok=True)
    index_path = data_root / "hash_data_goc_index.json"

    # Pick the PDF source: an explicit local directory, or the HF snapshot cache.
    if opts.source:
        pdf_root = Path(opts.source)
        if not pdf_root.exists():
            print(f"Source directory not found: {pdf_root}")
            return
    else:
        pdf_root = download_from_hf(data_root / "raw_pdf_cache")
        if opts.download_only:
            print(f"PDFs cached at: {pdf_root}")
            return

    if not pdf_root.exists():
        print(f"PDF directory not found: {pdf_root}")
        return

    # Prior index lets unchanged files be skipped instead of re-copied.
    known_hashes = load_existing_hashes(index_path)
    print(f"Loaded {len(known_hashes)} hashes from existing index")

    records, new_count, skip_count = process_pdfs(pdf_root, pdf_store, known_hashes)

    # Persist the refreshed index.
    index_path.write_text(
        json.dumps({'train': records, 'total_files': len(records)},
                   ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    print(f"\nDone! Total: {len(records)} | New: {new_count} | Skipped: {skip_count}")
    print(f"Index file: {index_path}")


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()