File size: 4,242 Bytes
55d584b
0ccf2f0
55d584b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ccf2f0
55d584b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ccf2f0
55d584b
 
 
 
 
 
0ccf2f0
55d584b
 
0ccf2f0
55d584b
0ccf2f0
 
 
 
 
 
 
 
 
 
 
 
 
 
c2695d4
0ccf2f0
55d584b
 
 
 
0ccf2f0
55d584b
0ccf2f0
55d584b
0ccf2f0
 
 
55d584b
 
0ccf2f0
55d584b
 
 
 
 
 
 
0ccf2f0
 
 
 
55d584b
 
0ccf2f0
55d584b
 
ec38897
55d584b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
Pack presence verification and sync status reporting.

Checks that the local packs built from upstream HuggingFace datasets exist and
are readable. This module does not re-ingest anything itself; when packs are
missing it only suggests the ingest command to run.
"""

import json
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from datetime import datetime

logger = logging.getLogger(__name__)


class PackSync:
    """Check that the expected local packs exist and report their status.

    A pack named ``<name>`` is expected at ``<packs_dir>/<name>/<name>.jsonl``.
    Verification only checks that the file exists and can be read as UTF-8
    text; it does not contact the upstream source or compare contents.

    Attributes:
        packs_dir: Directory that contains one sub-directory per pack.
        metadata_file: Path of the JSON file written by ``save_metadata``.
    """

    # Packs that verify_packs() looks for, keyed by pack name. Each entry
    # records the upstream dataset identifier, the source type and a short
    # human-readable description.
    PACK_MANIFEST = {
        "warbler-pack-hf-arxiv": {
            "source": "nick007x/arxiv-papers",
            "type": "huggingface",
            "description": "Scholarly papers",
        },
        "warbler-pack-hf-prompt-report": {
            "source": "PromptSystematicReview/ThePromptReport",
            "type": "huggingface",
            "description": "Prompt engineering documentation",
        },
        "warbler-pack-hf-novels": {
            "source": "GOAT-AI/generated-novels",
            "type": "huggingface",
            "description": "Generated novels",
        },
        "warbler-pack-hf-manuals": {
            "source": "nlasso/anac-manuals-23",
            "type": "huggingface",
            "description": "Technical manuals",
        },
        "warbler-pack-hf-enterprise": {
            "source": "AST-FRI/EnterpriseBench",
            "type": "huggingface",
            "description": "Enterprise benchmarks",
        },
        "warbler-pack-hf-portuguese-edu": {
            "source": "Solshine/Portuguese_Language_Education_Texts",
            "type": "huggingface",
            "description": "Portuguese education texts",
        },
    }

    def __init__(self, packs_dir: Optional[Path] = None):
        """Initialize the pack synchronizer.

        Args:
            packs_dir: Directory holding the pack sub-directories. When
                omitted, defaults to the ``packs`` directory located next to
                the parent directory of this module's file.
        """
        if packs_dir is None:
            packs_dir = Path(__file__).parent.parent / "packs"
        self.packs_dir = Path(packs_dir)
        self.metadata_file = self.packs_dir / ".pack_metadata.json"

    def verify_packs(self) -> Dict[str, Any]:
        """Verify that every pack in ``PACK_MANIFEST`` exists and is readable.

        Returns:
            A dict with three keys:

            * ``"verified"``: list of dicts with ``"pack"`` (name),
              ``"documents"`` (number of lines in the pack file) and
              ``"path"`` (pack directory as a string).
            * ``"missing"``: list of pack names that were not found or could
              not be read.
            * ``"timestamp"``: ISO-8601 local time at which the check started.
        """
        status: Dict[str, Any] = {
            "verified": [],
            "missing": [],
            "timestamp": datetime.now().isoformat(),
        }

        for pack_name in self.PACK_MANIFEST:
            pack_dir = self.packs_dir / pack_name
            pack_file = pack_dir / f"{pack_name}.jsonl"

            if not (pack_dir.exists() and pack_file.exists()):
                status["missing"].append(pack_name)
                logger.warning("⚠️  %s not found", pack_name)
                continue

            try:
                with open(pack_file, "r", encoding="utf-8") as f:
                    line_count = sum(1 for _ in f)
            except (OSError, UnicodeDecodeError) as e:
                # UnicodeDecodeError is a ValueError, not an OSError: without
                # catching it, one corrupt pack file would abort the whole
                # scan instead of being reported as unreadable.
                logger.warning("⚠️  %s exists but unable to read: %s", pack_name, e)
                status["missing"].append(pack_name)
                continue

            status["verified"].append(
                {"pack": pack_name, "documents": line_count, "path": str(pack_dir)}
            )
            logger.info("✓ %s: %d documents", pack_name, line_count)

        return status

    def save_metadata(self, status: Dict[str, Any]) -> None:
        """Write ``status`` as indented JSON to ``self.metadata_file``.

        I/O failures are logged as warnings and otherwise ignored, so saving
        metadata is best-effort.

        Args:
            status: JSON-serializable mapping, typically the result of
                ``verify_packs``.

        Raises:
            TypeError: If ``status`` contains a value that ``json`` cannot
                serialize. The existing metadata file is left untouched.
        """
        # Serialize before opening the file: opening in "w" mode truncates it,
        # so a serialization error must surface before that happens.
        payload = json.dumps(status, indent=2)
        try:
            with open(self.metadata_file, "w", encoding="utf-8") as f:
                f.write(payload)
            logger.debug("Saved pack metadata to %s", self.metadata_file)
        except OSError as e:
            logger.warning("Could not save pack metadata: %s", e)

    def get_sync_status(self) -> str:
        """Return a one-line human-readable summary of the pack check.

        Runs ``verify_packs`` and reports how many packs were verified and,
        if any, how many are missing.
        """
        status = self.verify_packs()
        verified_count = len(status["verified"])
        missing_count = len(status["missing"])

        if missing_count == 0:
            return f"✓ All {verified_count} packs verified and ready"
        return (
            f"⚠️  {verified_count} packs verified, {missing_count} "
            f"missing (run ingest to rebuild)"
        )

    def suggest_reingest(self) -> Optional[str]:
        """Return the re-ingest command if any pack is missing, else ``None``.

        Runs ``verify_packs`` to decide; the command itself is not executed.
        """
        status = self.verify_packs()
        if status["missing"]:
            return "python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
        return None