Spaces:
Running
Running
"""
Patch A — paradigm_classifier

Classifica pares (q_bad, q_candidato) como melhoria genuina (label=1)
ou probe adversarial / pergunta inocua (label=0).

Dataset de treino:
    - pairs_layer2.jsonl       : (q_bad, q_good)      -> 163 positivos (label=1)
    - adversarial_probes       : (q_bad, q_good_fake) -> 210 negativos (label=0)
    - adversarial_probes_cross : cross-domain         -> 200 negativos (label=0)
    Total: 573 pares

Layout de X por par (1185 colunas):
    [0:32]      estruturais (32): len, spec, op, meas, hedg, nomin, wh, meth (x bad+cand+delta) + sim + jaccard
    [32:416]    emb_bad  (384): embedding sentence-transformer do q_bad
    [416:800]   emb_cand (384): embedding sentence-transformer do q_cand
    [800:1184]  emb_diff (384): emb_cand - emb_bad
    [1184]      cross_enc (1): score do cross-encoder ms-marco (relevancia q_bad -> q_cand)

Pipeline de treino:
    ColumnTransformer:
        struct + cross_enc            -> StandardScaler
        emb_bad, emb_cand, emb_diff   -> StandardScaler + PCA(40)
    HistGradientBoostingClassifier(class_weight='balanced')

Gate: accuracy > 0.85 e kappa > 0.70
Saida: data/models/paradigm_classifier.pkl
"""
| from __future__ import annotations | |
| import re | |
| from pathlib import Path | |
| import joblib | |
| import numpy as np | |
| # --------------------------------------------------------------------------- | |
| # Wordlists para features estruturais | |
| # --------------------------------------------------------------------------- | |
# Speculative / metaphysical vocabulary: a high density of these terms marks
# a question as philosophically framed rather than empirically operational.
_SPECULATIVE = {
    "consciousness", "awareness", "qualia", "phenomenal", "subjective",
    "essence", "nature", "meaning", "reality", "existence", "being",
    "ontological", "ontology", "metaphysical", "metaphysics", "noumenal",
    "teleological", "teleology", "intrinsic", "ineffable", "transcendent",
    "transcendence", "ultimate", "fundamental", "emergent", "emergence",
    "substrate", "irreducible", "irreducibility", "holistic", "holism",
    "vitalism", "panpsychism", "epiphenomenal", "epiphenomenalism",
    "supervenience", "grounding", "instantiation", "potentiality",
    "actuality", "telos", "logos", "ontogenetic", "morphogenetic",
}
# Operational / empirical vocabulary: terms indicating that a question is
# phrased in terms of measurable, testable procedures.
_OPERATIONAL = {
    "measure", "measures", "measured", "measuring", "measurement",
    "test", "tests", "tested", "testing",
    "experiment", "experimental",
    "analyze", "analysis", "compare", "comparison",
    "correlate", "correlation",
    "predict", "prediction", "predictive",
    "quantify", "quantification",
    "identify", "isolate", "control", "replicate",
    "simulate", "simulation", "model", "models",
    "optimize", "detect", "estimate", "calculate", "compute",
    "observe", "observation", "classify",
    "validate", "validation", "calibrate", "calibration",
    "statistically", "empirically",
    "operationalize", "operationalized",
    "protocol", "methodology", "randomized", "controlled", "blinded",
    "sequence", "genome", "gene", "protein", "pathway", "mechanism",
    "circuit", "neural", "behavioral", "cognitive", "physiological",
    "specific", "particular", "defined", "characterized",
}
# Measurement vocabulary: quantities and metrics whose presence suggests the
# question refers to something concretely measurable.
_MEASUREMENT = {
    "rate", "frequency", "level", "concentration", "correlation",
    "coefficient", "proportion", "ratio", "percentage", "threshold",
    "range", "scale", "index", "score", "metric",
    "magnitude", "intensity", "duration", "latency",
    "accuracy", "precision", "recall", "sensitivity", "specificity",
    "variance", "deviation", "gradient", "density", "flux",
    "potential", "resistance", "temperature", "velocity",
    "mass", "volume", "charge",
}
# Hedging vocabulary: modal/qualifying adverbs signalling speculation rather
# than a committed empirical claim.
_HEDGING = {
    "might", "could", "possibly", "perhaps", "presumably",
    "arguably", "allegedly", "seemingly", "apparently",
    "conceivably", "hypothetically", "theoretically",
    "speculatively", "putatively",
}
# Interrogative word -> integer id, used as a categorical feature for the
# first word of a question (-1 when the question starts with none of these).
_WH_WORDS = {
    "what": 0,
    "how": 1,
    "why": 2,
    "which": 3,
    "whether": 4,
    "when": 5,
    "where": 6,
    "who": 7,
    "whom": 8,
    "whose": 9,
}
# Matches nominalized words (abstract-noun suffixes such as -tion, -ness,
# -ism). A high nominalization ratio correlates with abstract phrasing.
_NOMINALIZATIONS = re.compile(
    r"\b\w+(?:tion|ity|ness|ism|ence|ance|ment|hood|ship|ics)\b",
    re.IGNORECASE,
)
| # --------------------------------------------------------------------------- | |
| # Constantes de layout | |
| # --------------------------------------------------------------------------- | |
N_STRUCT = 10  # structural features per question (see _structural_features)
N_STRUCTURAL_TOTAL = N_STRUCT * 3 + 2  # 32: bad + cand + delta + cosine sim + jaccard
EMB_DIM = 384  # all-MiniLM-L6-v2 embedding width
N_CROSS_ENC = 1  # single cross-encoder relevance score
# Total raw columns: 32 + 384*3 + 1 = 1185
N_FEATURES_RAW = N_STRUCTURAL_TOTAL + EMB_DIM * 3 + N_CROSS_ENC
| # --------------------------------------------------------------------------- | |
| # Extratores de features | |
| # --------------------------------------------------------------------------- | |
| def _words(text: str) -> list[str]: | |
| return re.findall(r"[a-zA-Z']+", text.lower()) | |
def _structural_features(q: str) -> list[float]:
    """Return the 10 per-question structural features for *q*.

    Order: [n_words, speculative_ratio, operational_ratio, measurement_ratio,
    hedging_ratio, nominalization_ratio, wh_word_id, has_method_marker,
    n_speculative, n_operational].
    """
    toks = _words(q)
    vocab = set(toks)
    # Guard against empty questions: ratios divide by at least 1.
    denom = max(len(toks), 1)
    n_spec = sum(1 for t in toks if t in _SPECULATIVE)
    n_op = sum(1 for t in toks if t in _OPERATIONAL)
    n_meas = sum(1 for t in toks if t in _MEASUREMENT)
    n_hedge = sum(1 for t in toks if t in _HEDGING)
    # Categorical id of the leading interrogative word; -1 if absent.
    wh_id = _WH_WORDS.get(toks[0] if toks else "", -1)
    has_method = not vocab.isdisjoint({"protocol", "methodology", "randomized", "blinded"})
    return [
        len(toks),
        n_spec / denom,
        n_op / denom,
        n_meas / denom,
        n_hedge / denom,
        len(_NOMINALIZATIONS.findall(q)) / denom,
        float(wh_id),
        float(has_method),
        float(n_spec),
        float(n_op),
    ]
# Stopwords ignored by _jaccard. Hoisted to module level: the original built
# this 33-element set literal on every call; a frozenset built once at import
# time gives the same membership semantics for free.
_JACCARD_STOPWORDS = frozenset({
    "is", "are", "the", "a", "an", "of", "in", "on", "at", "to",
    "do", "does", "did", "can", "could",
    "how", "what", "why", "which", "when", "where", "who",
    "that", "this", "these", "those",
    "and", "or", "but", "for", "with", "between", "among",
})


def _jaccard(q1: str, q2: str) -> float:
    """Jaccard overlap of the content words of two questions.

    Tokens of length <= 3 and stopwords are discarded before comparing.
    Returns 0.0 when neither question has any content word (avoids a
    zero-division on the empty union).
    """
    w1 = {w for w in _words(q1) if len(w) > 3 and w not in _JACCARD_STOPWORDS}
    w2 = {w for w in _words(q2) if len(w) > 3 and w not in _JACCARD_STOPWORDS}
    if not w1 and not w2:
        return 0.0
    return len(w1 & w2) / len(w1 | w2)
def build_structural_matrix(
    q_bads: list[str],
    q_cands: list[str],
    emb_bad: np.ndarray,
    emb_cand: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Assemble structural features for a batch of (q_bad, q_cand) pairs.

    Returns (X_struct, emb_bad, emb_cand, emb_diff); X_struct has shape
    (n, N_STRUCTURAL_TOTAL): bad features + cand features + their delta +
    cosine similarity + Jaccard overlap.
    """
    # Embeddings are L2-normalized upstream, so the row-wise dot product
    # is the cosine similarity.
    cosines = (emb_bad * emb_cand).sum(axis=1)
    feature_rows = []
    for bad, cand, cos in zip(q_bads, q_cands, cosines):
        f_bad = _structural_features(bad)
        f_cand = _structural_features(cand)
        deltas = [c - b for b, c in zip(f_bad, f_cand)]
        feature_rows.append(f_bad + f_cand + deltas + [float(cos), _jaccard(bad, cand)])
    X_struct = np.array(feature_rows, dtype=float)
    return X_struct, emb_bad, emb_cand, emb_cand - emb_bad
| # --------------------------------------------------------------------------- | |
| # Wrapper do modelo treinado | |
| # --------------------------------------------------------------------------- | |
| class ParadigmClassifier: | |
| """Classificador de pares (q_bad, q_candidato).""" | |
| CROSS_ENC_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" | |
| EMBED_MODEL = "all-MiniLM-L6-v2" | |
| def __init__(self, model_path: str | Path | None = None): | |
| self.model_path = Path(model_path or "data/models/paradigm_classifier.pkl") | |
| self._pipeline = None | |
| self._embedder = None | |
| self._cross_encoder = None | |
| # ------------------------------------------------------------------ | |
| # Lazy loaders | |
| # ------------------------------------------------------------------ | |
| def _load_embedder(self): | |
| if self._embedder is None: | |
| from sentence_transformers import SentenceTransformer | |
| self._embedder = SentenceTransformer(self.EMBED_MODEL) | |
| return self._embedder | |
| def _load_cross_encoder(self): | |
| if self._cross_encoder is None: | |
| from sentence_transformers import CrossEncoder | |
| self._cross_encoder = CrossEncoder(self.CROSS_ENC_MODEL) | |
| return self._cross_encoder | |
| def load(self) -> "ParadigmClassifier": | |
| self._pipeline = joblib.load(self.model_path) | |
| return self | |
| # ------------------------------------------------------------------ | |
| # Computacao de features | |
| # ------------------------------------------------------------------ | |
| def _embed_batch(self, texts: list[str]) -> np.ndarray: | |
| return self._load_embedder().encode( | |
| texts, batch_size=64, show_progress_bar=False, normalize_embeddings=True | |
| ) | |
| def _cross_enc_batch(self, pairs: list[tuple[str, str]]) -> np.ndarray: | |
| """Retorna array (n, 1) com scores do cross-encoder.""" | |
| scores = self._load_cross_encoder().predict(pairs) | |
| return scores.reshape(-1, 1).astype(float) | |
| def build_features(self, pairs: list[tuple[str, str]]) -> np.ndarray: | |
| """ | |
| Extrai X de shape (n, N_FEATURES_RAW = 1185). | |
| Layout: | |
| [0:32] estruturais | |
| [32:416] emb_bad | |
| [416:800] emb_cand | |
| [800:1184] emb_diff | |
| [1184] cross_encoder score | |
| """ | |
| q_bads = [p[0] for p in pairs] | |
| q_cands = [p[1] for p in pairs] | |
| all_embs = self._embed_batch(q_bads + q_cands) | |
| emb_bad = all_embs[: len(q_bads)] | |
| emb_cand = all_embs[len(q_bads) :] | |
| X_struct, X_eb, X_ec, X_ed = build_structural_matrix(q_bads, q_cands, emb_bad, emb_cand) | |
| X_ce = self._cross_enc_batch(pairs) # (n, 1) | |
| return np.hstack([X_struct, X_eb, X_ec, X_ed, X_ce]) | |
| # ------------------------------------------------------------------ | |
| # Predicao (requer load() antes) | |
| # ------------------------------------------------------------------ | |
| def predict(self, q_bad: str, q_cand: str) -> int: | |
| X = self.build_features([(q_bad, q_cand)]) | |
| return int(self._pipeline.predict(X)[0]) | |
| def predict_proba(self, q_bad: str, q_cand: str) -> float: | |
| """Retorna P(label=1) — probabilidade de melhoria genuina.""" | |
| X = self.build_features([(q_bad, q_cand)]) | |
| return float(self._pipeline.predict_proba(X)[0, 1]) | |