import sys
import json
import shutil
from pathlib import Path

# Make the project root importable before the project-local import below.
# parents[2] is the directory two levels above the folder containing this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    # Prepend (rather than append) so this entry is searched first.
    sys.path.insert(0, str(PROJECT_ROOT))

from core.hash_file.hash_file import HashProcessor

# HuggingFace repo that holds the original PDFs (fetched as a "dataset" repo).
HF_RAW_PDF_REPO = "hungnha/Do_An_Dataset"


def download_from_hf(cache_dir: Path) -> Path:
    """Download the raw PDFs from HuggingFace and return the ``data_rag`` folder.

    The dataset snapshot is stored under ``cache_dir``. When
    ``cache_dir / "data_rag"`` already exists and is non-empty, the download
    is skipped and ``huggingface_hub`` is not imported at all.

    Args:
        cache_dir: Local directory that receives the dataset snapshot.

    Returns:
        ``cache_dir / "data_rag"``. The function does not verify that this
        folder exists after a download; callers should check it themselves.
    """
    pdf_root = cache_dir / "data_rag"

    # Judge the cache by the folder that is actually returned: a non-empty
    # cache_dir on its own (e.g. left behind by an interrupted download) does
    # not prove that the PDFs are present.
    if pdf_root.is_dir() and any(pdf_root.iterdir()):
        print(f"Cache đã tồn tại: {cache_dir}")
        return pdf_root

    # Imported lazily so a warm cache works without huggingface_hub installed.
    from huggingface_hub import snapshot_download

    print(f"Đang tải từ HuggingFace: {HF_RAW_PDF_REPO}")
    snapshot_download(
        repo_id=HF_RAW_PDF_REPO,
        repo_type="dataset",
        local_dir=str(cache_dir),
        # NOTE(review): newer huggingface_hub releases appear to deprecate this
        # argument - confirm against the pinned version before removing it.
        local_dir_use_symlinks=False,
    )
    return pdf_root


def load_existing_hashes(path: Path) -> dict:
    """Read the previous hash index from a JSON file.

    The file is expected to hold an object whose ``'train'`` key is a list of
    ``{'filename': ..., 'hash': ...}`` entries. Entries that do not have that
    shape are skipped individually, so one bad record does not throw away the
    rest of the index.

    Args:
        path: Location of the JSON index file.

    Returns:
        A mapping of filename to hash. Empty when the file is missing,
        unreadable, not valid JSON, or has no usable entries.
    """
    if not path.exists():
        return {}

    try:
        data = json.loads(path.read_text(encoding='utf-8'))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        # Stay best-effort (start from an empty index) but say why.
        print(f"Cảnh báo: không đọc được hash index cũ ({path}): {e}")
        return {}

    entries = data.get('train', []) if isinstance(data, dict) else []
    if not isinstance(entries, list):
        return {}

    hashes = {}
    for item in entries:
        if not isinstance(item, dict):
            continue
        filename = item.get('filename')
        # Only string filenames can ever match the relative paths looked up later.
        if isinstance(filename, str) and 'hash' in item:
            hashes[filename] = item['hash']
    return hashes


def process_pdfs(source_root: Path, dest_dir: Path, existing_hashes: dict) -> tuple:
    """Mirror every PDF under ``source_root`` into ``dest_dir`` and hash it.

    A file is skipped (not copied again) only when the destination copy exists
    and both the destination and the source still hash to the value recorded
    in ``existing_hashes``. Checking the source as well means an updated
    source PDF is picked up instead of being masked by an unchanged old copy.

    Args:
        source_root: Directory searched recursively for ``*.pdf`` files.
        dest_dir: Directory that receives the copies, keeping relative paths.
        existing_hashes: Mapping of relative path to previously recorded hash.

    Returns:
        A ``(results, processed, skipped)`` tuple. ``results`` is a list of
        ``{'filename', 'hash', 'index'}`` dicts for every file that was
        skipped or successfully copied and hashed; ``processed`` and
        ``skipped`` are the respective counts.
    """
    hasher = HashProcessor(verbose=False)
    # Sorted so the 'index' values do not depend on directory traversal order.
    pdf_files = sorted(source_root.rglob("*.pdf"))
    total = len(pdf_files)
    print(f"Tìm thấy {total} file PDF\n")

    results, processed, skipped = [], 0, 0

    for idx, src in enumerate(pdf_files):
        rel_path = str(src.relative_to(source_root))
        dest = dest_dir / rel_path
        dest.parent.mkdir(parents=True, exist_ok=True)

        known_hash = existing_hashes.get(rel_path)
        unchanged = (
            known_hash is not None
            and dest.exists()
            and hasher.get_file_hash(str(dest)) == known_hash
            # Also hash the source: costs one extra read per candidate file,
            # but catches PDFs that were replaced upstream.
            and hasher.get_file_hash(str(src)) == known_hash
        )

        if unchanged:
            results.append({'filename': rel_path, 'hash': known_hash, 'index': idx})
            skipped += 1
        else:
            # Copy, then hash the copy that was actually written.
            try:
                shutil.copy2(src, dest)
                file_hash = hasher.get_file_hash(str(dest))
            except Exception as e:
                # Best-effort: report and move on to the next file.
                print(f"Lỗi: {rel_path} - {e}")
            else:
                if file_hash:
                    results.append({'filename': rel_path, 'hash': file_hash, 'index': idx})
                    processed += 1
                else:
                    # A falsy hash is treated as "no result"; say so instead
                    # of silently leaving the file out of the index.
                    print(f"Lỗi: {rel_path} - không tính được hash")

        # Report progress for skipped files too.
        if (idx + 1) % 20 == 0:
            print(f"Tiến độ: {idx + 1}/{total}")

    return results, processed, skipped


def main():
    """Command-line entry point.

    Resolves the PDF source (a local ``--source`` folder, or the HuggingFace
    snapshot cached under ``data/raw_pdf_cache``), mirrors the PDFs into
    ``data/files`` and rewrites ``data/hash_data_goc_index.json``. With
    ``--download-only`` (and no ``--source``) it stops after the download.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Tải PDF và tạo hash index")
    cli.add_argument("--source", type=str, help="Đường dẫn local tới PDFs (bỏ qua tải HF)")
    cli.add_argument("--download-only", action="store_true", help="Chỉ tải về, không copy")
    opts = cli.parse_args()

    data_root = PROJECT_ROOT / "data"
    pdf_dest = data_root / "files"
    pdf_dest.mkdir(parents=True, exist_ok=True)
    index_path = data_root / "hash_data_goc_index.json"

    # Work out where the PDFs come from.
    if opts.source:
        source_root = Path(opts.source)
        if not source_root.exists():
            print(f"Không tìm thấy thư mục nguồn: {source_root}")
            return
    else:
        # No local source given: fall back to the HuggingFace snapshot.
        source_root = download_from_hf(data_root / "raw_pdf_cache")
        if opts.download_only:
            print(f"PDF đã cache tại: {source_root}")
            return

    if not source_root.exists():
        print(f"Không tìm thấy thư mục PDF: {source_root}")
        return

    # Copy and hash, reusing whatever the previous index recorded.
    known_hashes = load_existing_hashes(index_path)
    print(f"Đã tải {len(known_hashes)} hash từ index cũ")

    entries, copied, unchanged = process_pdfs(source_root, pdf_dest, known_hashes)

    # Persist the fresh index.
    index_payload = {
        'train': entries,
        'total_files': len(entries),
    }
    index_path.write_text(
        json.dumps(index_payload, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    print(f"\nHoàn tất! Tổng: {len(entries)} | Mới: {copied} | Bỏ qua: {unchanged}")
    print(f"File index: {index_path}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()