File size: 4,720 Bytes
0b39aef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a731017
 
 
6878a93
0b39aef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Configuration for PubGuard classifier.

Mirrors openalex_classifier.config with multi-head additions.
"""

from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional
import os


def _find_models_dir() -> Path:
    """Locate the PubGuard models directory.

    Search order (the first hit wins):

    1. ``$PUBGUARD_MODELS_DIR`` -- explicit override.  It is accepted
       whenever it names an existing *directory*; the marker file is not
       required here.  A value that is empty, missing on disk, or that
       points at a regular file is ignored and the search continues.
    2. ``models/`` next to this module (package data).
    3. ``pubguard_models/`` under the current working directory.
    4. ``models/`` three levels above this file (repo dev path,
       ``pub_check/models``).
    5. ``~/.pubguard/models`` (default install location).

    Candidates 2-5 are accepted only if they contain
    ``head_doc_type.npz``.  That marker distinguishes PubGuard models
    from other model directories (e.g. OpenAlex) that may exist nearby.

    Returns:
        The first matching directory.  When nothing matches,
        ``~/.pubguard/models`` is created if necessary and returned even
        though it holds no models yet.
    """
    marker = "head_doc_type.npz"

    if env_dir := os.environ.get("PUBGUARD_MODELS_DIR"):
        override = Path(env_dir)
        # The result is used as the parent of the model files, so a path
        # that exists but is a regular file must not be accepted.
        if override.is_dir():
            return override

    # Package data
    pkg = Path(__file__).parent / "models"
    if (pkg / marker).exists():
        return pkg

    # CWD
    cwd = Path.cwd() / "pubguard_models"
    if (cwd / marker).exists():
        return cwd

    # Repo dev path (pub_check/models)
    repo = Path(__file__).parent.parent.parent / "models"
    if (repo / marker).exists():
        return repo

    # User home (default install location)
    home = Path.home() / ".pubguard" / "models"
    if (home / marker).exists():
        return home

    # Fallback: use the home dir even if empty (training will populate it)
    home.mkdir(parents=True, exist_ok=True)
    return home


# ── Label schemas ────────────────────────────────────────────────

# Class names for the document-type head.
# NOTE(review): presumably list position is the class index used by the
# trained heads -- confirm before reordering any of these lists.
DOC_TYPE_LABELS: List[str] = [
    # Full research article / journal paper
    "scientific_paper",
    # Conference poster (often single-page, visual)
    "poster",
    # Standalone abstract without full paper body
    "abstract_only",
    # Flyers, advertisements, non-scholarly PDFs
    "junk",
]

# Class names for the AI-detection head.
AI_DETECT_LABELS: List[str] = ["human", "ai_generated"]

# Class names for the toxicity head.
TOXICITY_LABELS: List[str] = ["clean", "toxic"]


@dataclass
class PubGuardConfig:
    """Settings consumed by PubGuard at run time.

    Groups the embedding backbone, the per-head decision thresholds, the
    pass/block gate switches, batching limits, and the location of the
    trained model files.  ``models_dir`` is resolved in ``__post_init__``
    and every ``*_path`` property is derived from it.
    """

    # ── Embedding backbone ──────────────────────────────────────
    # Shares the distilled model already cached for OpenAlex so that a
    # second 50 MB blob is not downloaded.  Any model2vec-compatible
    # StaticModel can be substituted.
    model_name: str = "minishlab/potion-base-32M"
    embedding_dim: int = 512  # output dimension of potion-base-32M

    # ── Per-head thresholds ─────────────────────────────────────
    # Posterior-probability cut-offs applied to each softmax head.  A
    # prediction below its cut-off is treated as "uncertain" and falls
    # back to the majority class.  Calibrate on held-out data.
    doc_type_threshold: float = 0.50
    ai_detect_threshold: float = 0.55
    toxicity_threshold: float = 0.50

    # ── Pipeline gate logic ─────────────────────────────────────
    # The overall `.screen()` yields pass=True ONLY for doc_type
    # 'scientific_paper'; posters, abstracts, and junk are all blocked
    # because the PubVerse pipeline processes publications only.
    # AI detection and toxicity are informational unless switched on.
    require_scientific: bool = True
    block_ai_generated: bool = False  # informational by default
    block_toxic: bool = False  # informational by default

    # ── Batch / performance ─────────────────────────────────────
    batch_size: int = 256
    max_text_chars: int = 4000  # long texts are truncated before embedding

    # ── Paths ───────────────────────────────────────────────────
    models_dir: Optional[Path] = None

    def __post_init__(self) -> None:
        """Resolve ``models_dir`` and normalise it to a ``Path``.

        When no directory was supplied, ``_find_models_dir()`` chooses
        one; a supplied value (string or path) is wrapped in ``Path``.
        """
        chosen = self.models_dir
        if chosen is None:
            chosen = _find_models_dir()
        self.models_dir = Path(chosen)

    # ── Derived paths ───────────────────────────────────────────
    @property
    def distilled_model_path(self) -> Path:
        """Location of the ``pubguard-embedding`` entry in ``models_dir``."""
        base = self.models_dir
        return base / "pubguard-embedding"

    @property
    def doc_type_head_path(self) -> Path:
        """Location of ``head_doc_type.npz`` in ``models_dir``."""
        base = self.models_dir
        return base / "head_doc_type.npz"

    @property
    def ai_detect_head_path(self) -> Path:
        """Location of ``head_ai_detect.npz`` in ``models_dir``."""
        base = self.models_dir
        return base / "head_ai_detect.npz"

    @property
    def toxicity_head_path(self) -> Path:
        """Location of ``head_toxicity.npz`` in ``models_dir``."""
        base = self.models_dir
        return base / "head_toxicity.npz"

    @property
    def label_schemas(self) -> Dict[str, List[str]]:
        """Map each head name to its module-level label list."""
        return dict(
            doc_type=DOC_TYPE_LABELS,
            ai_detect=AI_DETECT_LABELS,
            toxicity=TOXICITY_LABELS,
        )