File size: 4,720 Bytes
0b39aef a731017 6878a93 0b39aef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
"""
Configuration for PubGuard classifier.
Mirrors openalex_classifier.config with multi-head additions.
"""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional
import os
def _find_models_dir() -> Path:
"""Locate PubGuard models directory.
Checks for 'head_doc_type.npz' to distinguish PubGuard models
from other model directories (e.g. OpenAlex) that may exist nearby.
"""
marker = "head_doc_type.npz"
if env_dir := os.environ.get("PUBGUARD_MODELS_DIR"):
path = Path(env_dir)
if path.exists():
return path
# Package data
pkg = Path(__file__).parent / "models"
if (pkg / marker).exists():
return pkg
# CWD
cwd = Path.cwd() / "pubguard_models"
if (cwd / marker).exists():
return cwd
# Repo dev path (pub_check/models)
repo = Path(__file__).parent.parent.parent / "models"
if (repo / marker).exists():
return repo
# User home (default install location)
home = Path.home() / ".pubguard" / "models"
if (home / marker).exists():
return home
# Fallback β use home dir even if empty (training will populate it)
home.mkdir(parents=True, exist_ok=True)
return home
# ── Label schemas ────────────────────────────────────────────────
# One label list per classifier head. NOTE(review): label position
# presumably matches the corresponding head's output index — confirm
# against the training code before reordering.
DOC_TYPE_LABELS: List[str] = [
    "scientific_paper",  # Full research article / journal paper
    "poster",            # Conference poster (often single-page, visual)
    "abstract_only",     # Standalone abstract without full paper body
    "junk",              # Flyers, advertisements, non-scholarly PDFs
]

# Binary labels for the AI-generated-text detection head.
AI_DETECT_LABELS: List[str] = [
    "human",
    "ai_generated",
]

# Binary labels for the toxicity head.
TOXICITY_LABELS: List[str] = [
    "clean",
    "toxic",
]
@dataclass
class PubGuardConfig:
    """Runtime configuration for PubGuard."""

    # ── Embedding backbone ──────────────────────────────────────
    # Re-use the same distilled model already cached for OpenAlex to
    # avoid downloading a second 50 MB blob. Any model2vec-compatible
    # StaticModel works here.
    model_name: str = "minishlab/potion-base-32M"
    embedding_dim: int = 512  # potion-base-32M output dim

    # ── Per-head thresholds ─────────────────────────────────────
    # Posterior-probability thresholds from the softmax head; anything
    # below is "uncertain" and falls back to the majority class.
    # Calibrate on held-out data.
    doc_type_threshold: float = 0.50
    ai_detect_threshold: float = 0.55
    toxicity_threshold: float = 0.50

    # ── Pipeline gate logic ─────────────────────────────────────
    # The overall `.screen()` returns pass=True ONLY when doc_type is
    # 'scientific_paper'. Posters, abstracts, and junk are all blocked —
    # the PubVerse pipeline processes publications only. AI detection
    # and toxicity are informational by default.
    require_scientific: bool = True
    block_ai_generated: bool = False  # informational by default
    block_toxic: bool = False         # informational by default

    # ── Batch / performance ─────────────────────────────────────
    batch_size: int = 256
    max_text_chars: int = 4000  # Truncate long texts for embedding

    # ── Paths ───────────────────────────────────────────────────
    models_dir: Optional[Path] = None

    def __post_init__(self):
        # Auto-discover when no directory was supplied, then normalize
        # whatever we have (str or Path) into a Path.
        resolved = self.models_dir if self.models_dir is not None else _find_models_dir()
        self.models_dir = Path(resolved)

    # Derived paths
    @property
    def distilled_model_path(self) -> Path:
        """Directory holding the distilled embedding model."""
        return self.models_dir / "pubguard-embedding"

    @property
    def doc_type_head_path(self) -> Path:
        """Weights file for the document-type head."""
        return self.models_dir / "head_doc_type.npz"

    @property
    def ai_detect_head_path(self) -> Path:
        """Weights file for the AI-detection head."""
        return self.models_dir / "head_ai_detect.npz"

    @property
    def toxicity_head_path(self) -> Path:
        """Weights file for the toxicity head."""
        return self.models_dir / "head_toxicity.npz"

    @property
    def label_schemas(self) -> Dict[str, List[str]]:
        """Map each head name to its ordered label list."""
        return dict(
            doc_type=DOC_TYPE_LABELS,
            ai_detect=AI_DETECT_LABELS,
            toxicity=TOXICITY_LABELS,
        )
|