chairulridjal committed on 8 days ago

Commit

a0b4998

verified ·

1 Parent(s): bf013b2

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
data/processed/backup/enriched_13class_train.jsonl +3 -0
data/processed/backup/enriched_5class_train.jsonl +3 -0
data/processed/enriched_13class_train.jsonl +3 -0
data/processed/enriched_5class_train.jsonl +3 -0
data/processed/enriched_5class_train_cleaned.jsonl +3 -0
data/processed/enriched_5class_train_cleaned_deleaked.jsonl +3 -0
data/processed/enriched_5class_train_cleaned_trimmed.jsonl +3 -0
data/processed/r7_5class_train.jsonl +3 -0
data/processed/r8_5class_train.jsonl +3 -0
data/processed/r8_5class_train_propagated.jsonl +3 -0
data/processed/r9_5class_train.jsonl +3 -0
research/decisions/ADR-001-use-case-cybersecurity.md +45 -0
research/decisions/ADR-002-strict-r9-and-benchmark-portfolio.md +57 -0
research/notes/class_balance_audit_2026-04-24.md +30 -0
research/notes/progress/2026-04-24-12-r8-dataset-build.md +27 -0
research/notes/progress/2026-04-24-16-baseline-eval-script.md +33 -0
research/notes/progress/2026-04-24-20-paper-direction-decided.md +24 -0
research/notes/progress/2026-04-24-24-cyner2-baseline-discovered.md +30 -0
research/notes/progress/2026-04-24-25-competitor-landscape-deep-dive.md +257 -0
research/notes/progress/2026-04-24-26-dataset-aggregation-plan.md +99 -0
research/notes/progress/2026-04-24-29-final-llm-merge-complete.md +43 -0
research/notes/progress/2026-04-24-30-data-quality-audit.md +245 -0
research/notes/progress/2026-04-24-32-round4-training-overfitting.md +25 -0
research/notes/progress/2026-04-24-36-dapt-research.md +230 -0
research/notes/progress/2026-04-24-37-training-tricks-research.md +259 -0
research/notes/progress/2026-04-24-39-class-weighting-data-scaling-research.md +296 -0
research/notes/progress/2026-04-24-40-round4b-killed-no-checkpoint.md +29 -0
research/notes/progress/2026-04-24-44-r5a-baseline-results.md +48 -0
research/notes/progress/2026-04-24-45-data-quality-audit.md +175 -0
research/notes/progress/2026-04-24-46-competitor-deep-dive.md +261 -0
research/notes/progress/2026-04-24-49-moe-finetuning-research.md +189 -0
research/notes/progress/2026-04-24-50-r7-data-pipeline-plan.md +262 -0
research/notes/progress/2026-04-24-51-audit-ioc-coverage.md +62 -0
research/notes/progress/2026-04-24-53-audit-label-consistency.md +147 -0
research/notes/progress/2026-04-24-59-aptner-held-out-test.md +37 -0
research/notes/progress/2026-04-24-cyner-deep-dive-and-datasets.md +408 -0
research/notes/progress/2026-04-24-landscape-research-opus.md +390 -0
research/notes/progress/2026-04-24-ner-recall-improvement-techniques.md +61 -0
research/notes/progress/2026-04-26-01-feasibility-check-approach.md +48 -0
research/notes/progress/2026-04-26-02-cyner-exact-match-and-gap-analysis.md +85 -0
research/paper/outline.md +150 -0
research/securebert2/.gitignore +12 -0
research/securebert2/CODE_OF_CONDUCT.md +132 -0
research/securebert2/CONTRIBUTING.md +58 -0
research/securebert2/LICENSE +201 -0
research/securebert2/MAINTAINERS.md +2 -0
research/securebert2/README.md +231 -0
research/securebert2/SECURITY.md +57 -0
research/securebert2/dataset.py +179 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/processed/r8_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/enriched_13class_train.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/enriched_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/enriched_5class_train_cleaned_trimmed.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/r8_5class_train_propagated.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/enriched_5class_train_cleaned.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/r7_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/r9_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/enriched_5class_train_cleaned_deleaked.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/backup/enriched_13class_train.jsonl filter=lfs diff=lfs merge=lfs -text
+data/processed/backup/enriched_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text

data/processed/backup/enriched_13class_train.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5d1e4f4d9bd3a414fcd81d05242c1f913f575a553b3233adb15a8ae51740ecf
+size 24261655

data/processed/backup/enriched_5class_train.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90c6603e997c5844aec40404907210816886f6a56bce2acc5f27577b2d7f9469
+size 21643218

data/processed/enriched_13class_train.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f723221d386fe83d06916f9c1b0885e52327750bcfa4d9ccac36d0143b79d410
+size 21203019

data/processed/enriched_5class_train.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae602b2b8c89136ac80c49061c23fc0b41edeeb677a56888580feea5476dd21a
+size 19573553

data/processed/enriched_5class_train_cleaned.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0f49bff19319f8210cc3e1ecfbc18488ef60d73682f318ced1f314afbe44297
+size 18736010

data/processed/enriched_5class_train_cleaned_deleaked.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a047e2b5d5d8197d2601e5ed575082026876be6861eed46cb9ace034213c6d0
+size 16417825

data/processed/enriched_5class_train_cleaned_trimmed.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f59710bfdd33666816cb79282c812ae8db62052478d3b740b46ec827fcad71e8
+size 18097434

data/processed/r7_5class_train.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:772a78f8c6fce9a7f0b81120082e0f5579c89b6abd2dbb4c62ccebbd182b3508
+size 19579089

data/processed/r8_5class_train.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5172ffe17d2f266d1aa9f0e815a7f03045afab7b42b9d2fdaf95555cb23fbd8
+size 18041668

data/processed/r8_5class_train_propagated.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04d647cb77ac4a81db2d69e11cca934baed8fd6562488e99c67bd292e016118c
+size 22410416

data/processed/r9_5class_train.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8d4b825c5eab6b6ac91915aac239ead3b5622844d69163fe51af4b8f9826d7f
+size 14918444

research/decisions/ADR-001-use-case-cybersecurity.md ADDED Viewed

	@@ -0,0 +1,45 @@

+# ADR-001: Use Case Selection — Cybersecurity IOC/Entity Extraction
+**Date:** 2026-04-24
+**Status:** Accepted
+**Deciders:** Human (lead) + Claude (research partner)
+## Context
+We evaluated 10+ verticals for repurposing OpenAI's Privacy Filter (50M active MoE bidirectional token classifier) beyond PII detection. The Opus research agent conducted a comprehensive landscape analysis covering existing tools, market gaps, available datasets, and architectural fit.
+## Options Considered
+1. **Cybersecurity IOC extraction** — Score 9/10
+2. **Developer tools (secret/annotation scanning)** — Score 8.5/10
+3. **Clinical de-identification** — Score 7.5/10
+4. **Improved PII (Presidio backend)** — Score 7/10
+5. **Financial entity extraction** — Score 6/10
+6. **Energy/power systems** — Low (no data, tiny market)
+7. Several others scored lower (legal, scientific, education, supply chain, HR)
+## Decision
+**Cybersecurity IOC/entity extraction from threat intelligence reports**, with CyNER (560M params) as the primary benchmark competitor.
+## Reasoning
+- **Biggest efficiency gap:** CyNER uses 560M dense params; we use 50M active (MoE). 11x compute reduction is the clearest "same accuracy, fraction of the cost" story.
+- **Architecture fit:** Cybersecurity entities (IPs, hashes, CVEs, malware names, threat actors) are short-to-medium spans with clear boundaries — ideal for BIOES + Viterbi.
+- **257-token window is sufficient:** IOC context is almost always within 1-2 sentences.
+- **Data exists:** PRISM benchmark, CyNER corpus, Pile-NER cybersecurity subset, MITRE ATT&CK structured data.
+- **Privacy argument is strong:** Threat reports contain internal network topology and therefore can't be sent to cloud APIs.
+- **Publishable:** "Sparse MoE vs. dense transformer for cybersecurity NER" is a clean research question.
+- **Practical tool:** Every SOC team, every SIEM vendor needs lightweight local IOC extraction.
+## Deliverables
+1. **Research paper** — Rigorous comparison of Arcspan vs. CyNER (and other baselines)
+2. **Open-source tool** — Fine-tuned checkpoint + CLI/library for cybersecurity entity extraction
+## Consequences
+- Need to acquire and convert multiple cybersecurity NER datasets to BIOES JSONL format
+- Need to design a unified label taxonomy across datasets
+- Need reproducible experimental setup (fixed seeds, documented hyperparameters, held-out test sets)
+- Energy/power systems remains a potential future vertical once the platform is proven

research/decisions/ADR-002-strict-r9-and-benchmark-portfolio.md ADDED Viewed

	@@ -0,0 +1,57 @@

+# ADR-002: Strict R9 Dataset and Multi-Benchmark Evaluation
+**Date:** 2026-04-26
+**Status:** Accepted
+**Deciders:** Human (lead) + Codex (replacing Claude as research partner)
+## Context
+R8 proved that OpenAI Privacy Filter can learn the 5-class cyber NER task, but the honest exact-match results show different benchmark weaknesses:
+- APTNER exact-match micro F1: 0.4982
+- CyNER exact-match micro F1: 0.4050
+APTNER mainly exposes APT-report-style Organization/System recall gaps. CyNER mainly exposes Indicator boundary and format coverage gaps, especially defanged or unusual IOCs.
+The entity-propagated R8 file is now available, but its audit found 156,929 added spans on top of 76,824 base spans, including many generic or ambiguous surfaces. Including it in the next run would make any result hard to interpret.
+## Options Considered
+1. **Strict R9 only:** Train on R8 + deleaked CyberNER_harmonized + deleaked DNRTI, with validation/test overlap removed before deduplication.
+2. **R9 plus propagated R8:** Add the full propagated dataset immediately to maximize recall.
+3. **Delay R9 for a larger data rebuild:** Wait until we harvest much more targeted APT-style and CyNER-style data.
+## Decision
+Run **strict R9** next.
+Do **not** include propagated R8 in strict R9. Treat propagation as a separate future experiment only after filtering/auditing.
+Report R9 with a benchmark portfolio:
+- APTNER exact-match as the independent APT-report benchmark
+- CyNER exact-match as the original CyNER benchmark comparison
+- Enriched 5-class and SecureBERT2 5-class as supplementary continuity checks
+- OPF containment metrics as diagnostics only, not the primary paper-comparable score
+## Reasoning
+- Strict R9 is leakage-clean after the readiness gate: zero exact and zero prefix-80 train overlap with validation, enriched test, CyNER, SecureBERT2, and APTNER.
+- The propagated dataset is too noisy for the next controlled experiment. It would likely improve some recall numbers while injecting false positives and benchmark memorization risk.
+- A multi-benchmark protocol is necessary because improving APTNER and improving CyNER are not the same task. A single benchmark can be overfit unintentionally even with honest intent.
+- Strict R9 gives a clean signal before larger data scaling. If it helps APTNER but not CyNER, the next branch should target Indicators. If it helps neither, we revisit training/decoding rather than blindly adding data.
+## Consequences
+- R9 may score lower than a noisy propagation-boosted run, but its result will be interpretable.
+- Future data work should split into two explicit tracks:
+  - **Track A:** APT-report-style Organization/System examples.
+  - **Track B:** CyNER-style Indicator examples, including defanged domains/IPs/URLs, file paths, registry paths, package names, and odd multi-token indicators.
+- Decode calibration should happen after strict R9, using validation only, then evaluated unchanged across the benchmark portfolio.
+## Source
+- R9 readiness audit: `results/r9_readiness_audit.md`
+- Propagation audit: `results/entity_propagation_audit.md`
+- R8 CyNER exact-match note: `research/notes/progress/2026-04-26-02-cyner-exact-match-and-gap-analysis.md`
+- R9 readiness note: `research/notes/progress/2026-04-26-03-r9-readiness-and-propagation-audit.md`

research/notes/class_balance_audit_2026-04-24.md ADDED Viewed

	@@ -0,0 +1,30 @@

+# Arcspan NER Dataset Class Balance Audit (2026-04-24)
+## Summary
+Analyzed 5 fixed/deleaked training files comprising **54,139 examples** and **152,941 entity spans** across 5 security NER classes.
+| Dataset | Examples | All-O % | Total Spans | Imbalance |
+|---------|----------|---------|-------------|-----------|
+| **enriched_trimmed** | 25,127 | 10.0% | 75,677 | 2.46x |
+| **enriched_deleaked** | 24,339 | 19.3% | 63,831 | 2.77x |
+| **aptner_deleaked** | 3,078 | 33.1% | 4,627 | 16.77x |
+| **securebert2_deleaked** | 316 | 47.5% | 344 | 11.42x |
+| **defanged_augmented** | 1,279 | 0.0% | 8,462 | 11.41x |
+| **COMBINED** | **54,139** | **15.5%** | **152,941** | **2.94x** |
+## Entity Distribution (Combined)
+- **Indicator**: 44,282 (28.9%) — most common
+- **Malware**: 35,646 (23.3%)
+- **Organization**: 31,946 (20.9%)
+- **System**: 25,995 (17.0%)
+- **Vulnerability**: 15,072 (9.8%) — least common
+## Key Findings
+1. **Enriched files dominate**: `enriched_trimmed` + `enriched_deleaked` = 49.5k examples (91% of dataset)
+2. **Moderate imbalance**: 2.94x ratio is within the acceptable range for sequence labeling
+3. **All-O distribution**: 15.5% negative examples (reasonable for NER)
+4. **Defanged boost**: Augmentation adds ~8.5k spans (8,462), particularly boosting Indicator class
+5. **Smaller sources volatile**: `aptner` and `securebert2` show high imbalance (11–17x) but together contribute only ~6% of examples (~3% of spans)
+## Recommendation
+**Dataset is well-balanced for training.** The 2.94x imbalance is healthy—Vulnerability's underrepresentation (9.8%) is acceptable given domain scarcity. Enriched files provide stable foundation; defanged augmentation adds diversity without distorting class ratios.

research/notes/progress/2026-04-24-12-r8-dataset-build.md ADDED Viewed

	@@ -0,0 +1,27 @@

+# R8 Dataset Build
+## What we found
+Built the R8 (likely final) cybersecurity NER training dataset from deleaked sources:
+- **26,079 train** examples, **76,824 entities**, 12.0% all-O rate
+- **2,999 valid** examples, **5,927 entities**, 12.3% all-O rate
+- Sources: enriched (deleaked), APTNER (deleaked), SecureBERT2 (deleaked), defanged augmented
+- Stucco excluded (too noisy)
+- Trimmed all-O from 20% down to 12% by random subsampling negative examples
+## Entity distribution (train)
+- Indicator: 24,685
+- Malware: 16,887
+- Organization: 14,815
+- System: 13,320
+- Vulnerability: 7,117
+## Leakage verification
+- **Zero exact matches** against all 4 test sets
+- Prefix-80 matches are false positives (different texts sharing common openings)
+## Why it matters
+This is the final clean dataset for training. All known leakage issues resolved.
+## Open questions
+- Entity propagation (cross-document) running — will it meaningfully boost recall?
+- Vulnerability class is smallest (7K) — may be the hardest to learn

research/notes/progress/2026-04-24-16-baseline-eval-script.md ADDED Viewed

	@@ -0,0 +1,33 @@

+# Baseline Evaluation Script Created
+**Date:** 2026-04-24
+## What we built
+`src/arcspan/eval/run_baselines.py` — evaluates HF NER models against our CyNER test data (748 examples, 5 entity types) with span-level exact-match P/R/F1.
+Two baselines wired up:
+1. **SecureBERT2.0-NER** (`cisco-ai/SecureBERT2.0-NER`) — TF-based, BIO, 5 entity types matching ours directly
+2. **SecureModernBERT-NER** (`attack-vector/SecureModernBERT-NER`) — PyTorch, 22 entity types mapped to our 5-class space
+## Key findings from 20-example smoke test
+| Model | Overall P | Overall R | Overall F1 |
+|---|---|---|---|
+| SecureBERT2.0-NER | 14.8% | 40.0% | 21.6% |
+| SecureModernBERT-NER | 55.0% | 55.0% | 55.0% |
+- SecureBERT2.0 is very noisy — over-predicts spans (low precision), includes trailing punctuation and non-entity text
+- SecureModernBERT is substantially better on exact match; cleaner span boundaries
+- Both models produce offsets with leading whitespace; we strip it in post-processing
+- The first 20 examples contain no Malware or Indicator entities (those types appear later in the dataset), so neither model was exercised on those types
+## Why it matters
+These are the baselines our fine-tuned Arcspan model will be measured against. The script is modular (`BASELINES` registry dict) so adding more models is trivial.
+## Open questions
+- Need to run full 748-example eval for real numbers
+- Should we add a "relaxed match" mode (overlapping spans count as partial credit)?
+- The 20-example sample is Organization-heavy; full eval will give better per-type coverage

research/notes/progress/2026-04-24-20-paper-direction-decided.md ADDED Viewed

	@@ -0,0 +1,24 @@

+# Paper Direction Decided + Experimental Framework
+## Decision
+Both a **publishable research paper** and an **open-source tool**. If Arcspan matches or surpasses CyNER (560M dense) at 50M active params, it's a genuine contribution.
+## Paper Thesis
+"Sparse MoE token classifiers, fine-tuned with minimal data, can match dense transformer NER models at 1/11th the active compute for cybersecurity entity extraction."
+## The Five Key Experiments
+1. **Main comparison table:** Arcspan vs CyNER vs BERT-base vs GLiNER-zero-shot vs regex-only
+2. **Data efficiency curve (Figure 1 — the money chart):** F1 at 1%/5%/10%/25%/50%/100% of data
+3. **Per-entity-type breakdown:** Where does MoE win vs lose?
+4. **Viterbi vs argmax:** Our unique architectural advantage
+5. **Expert routing ablation:** top-2 vs top-4 via OPF_EXPERTS_PER_TOKEN
+## Baselines to Implement
+- CyNER (560M) — primary competitor
+- BERT-base fine-tuned on same data (110M) — standard NER baseline
+- GLiNER-M zero-shot (90M) — zero-shot ceiling
+- Regex-only — lower bound
+- SpaCy trf (110M) — out-of-domain baseline
+## What's Blocking Progress
+Waiting on Opus agent for: CyNER exact label schema, dataset locations, PRISM benchmark details. Once we have those, we can design the label space JSON and start data conversion.

research/notes/progress/2026-04-24-24-cyner2-baseline-discovered.md ADDED Viewed

	@@ -0,0 +1,30 @@

+# CyNER 2.0 DeBERTa-v3-base — New Baseline Discovered
+## Key Facts
+- **Model**: DeBERTa-v3-base (200M params, dense)
+- **F1**: 91.88% (self-reported, needs verification)
+- **Training data**: 11,074 examples (7,751 train) — augmented from bnsapa/cybersecurity-ner + AlienVault/OpenCTI
+- **Label space**: 8 entity types (original 5 + Date, Location, ThreatGroup)
+- **Training**: lr=2e-5, 3 epochs, batch_size=8, weight_decay=0.01
+- **License**: MIT
+- **HuggingFace**: https://huggingface.co/PranavaKailash/CyNER-2.0-DeBERTa-v3-base
+## Critical Observations
+1. **91.88% F1 is likely on their own augmented test set** — NOT on the original CyNER test set. This makes direct comparison tricky. We need to eval them on the same test set.
+2. **They use 8 entity types** vs CyNER's 5 — added Date, Location, ThreatGroup. Not apples-to-apples.
+3. **7.8K training examples (11K total) vs our 2.8K** — they have ~3x more training data from augmentation.
+4. **LR = 2e-5** — 10x lower than our first run (2e-4). This is a strong hint for our hyperparameter tuning.
+5. **DeBERTa-v3-base is 200M dense params** — 4x our active params (50M).
+## What We Can Use From This
+- **Their augmented dataset** (MIT license) — we should download and convert it. 7,751 training examples is much better than our 2,811.
+- **LR = 2e-5 as reference point** — our 2e-4 was too aggressive, confirmed.
+- **As a baseline** — run their model on our CyNER test set for fair comparison.
+- **Their additional entity types** (ThreatGroup, Date) overlap with our planned Tier 1 expansion.
+## Source
+- Model: https://huggingface.co/PranavaKailash/CyNER-2.0-DeBERTa-v3-base
+- Dataset: https://huggingface.co/datasets/PranavaKailash/CyNER2.0_augmented_dataset
+- GitHub: https://github.com/Pranava-Kailash/CyNER_2.0_API

research/notes/progress/2026-04-24-25-competitor-landscape-deep-dive.md ADDED Viewed

	@@ -0,0 +1,257 @@

+# Cybersecurity NER Competitor Landscape Deep Dive
+**Date:** 2026-04-24
+**Purpose:** Map the competitive landscape for cybersecurity NER to inform Arcspan's positioning, baselines, and related work section.
+---
+## Summary Comparison Table
+| Model | Architecture | Active Params | Entity Types | Overall F1 | Dataset | Weights Public? | License | Runnable Baseline? |
+|---|---|---|---|---|---|---|---|---|
+| **SecureBERT 2.0 NER** | ModernBERT-base (22L, 768d) | ~149M | 5 (Malware, Indicator, Vulnerability, System, Organization) | **0.945** | Cisco internal (3,400 train / 717 test) | Yes ([HF](https://huggingface.co/cisco-ai/SecureBERT2.0-NER)) | Apache 2.0 | **Yes** |
+| **SecureModernBERT-NER** | ModernBERT-large | ~395M | 22 fine-grained (MALWARE, THREAT-ACTOR, CVE, IPV4, IPV6, DOMAIN, URL, HASHES, EMAIL, REGISTRY-KEYS, ORG, PRODUCT, PLATFORM, SERVICE, SECTOR, LOC, FILEPATH, MITRE-TACTIC, TOOL, CAMPAIGN, ...) | **0.848** | 502,726 curated spans from real-world CTI | Yes ([HF](https://huggingface.co/attack-vector/SecureModernBERT-NER)) | MIT | **Yes** |
+| **CyberLLaMA** | LLaMA-3.2-3B + BiLSTM + CRF | ~3B | BIO-tagged cybersecurity terms (4,788 unique) | **0.989** | 42,404 articles (newspapers, blogs, official sites) | No (paper only) | Unknown | **No** |
+| **XLNet-CRF** | XLNet-base + CRF | ~110M | CTI entities (malware, IP, URL, hash, etc.) | **0.974** (CTI-Reports), **0.887** (MalwareTextDB) | CTI-Reports, MalwareTextDB | Code on GitHub (no pretrained weights) | Unknown | **Partial** (retrain needed) |
+| **BERT-CRF for CTI** | BERT-base + CRF | ~110M | 13 types (DNRTI), malware/IP/URL/hash (CTI-Reports) | **0.900** (DNRTI), **0.773** (CTI-Reports) | DNRTI (182K words), CTI-Reports (310K records), MalwareTextDB | Code on [GitHub](https://github.com/stwater20/NER-BERT-CRF-for-CTI) | Unknown | **Partial** |
+| **CyNER** | Transformer + heuristics ensemble | Varies | Malware, threat actors, indicators, vulnerabilities | ~0.74 (on CyberNER harmonized) | CyNER dataset | Yes ([GitHub](https://github.com/aiforsec/CyNER)) | Unknown | **Yes** |
+| **CyberNER (Harmonized)** | RoBERTa/SecureBERT/CySecBERT + CRF | ~125M | 21 STIX 2.1 entity types | **0.736** (RoBERTa best) | 610K tokens, 23,477 sentences from 4 merged datasets | Dataset public, code public | Unknown | **Yes** (benchmark) |
+| **SecLMNER** | LLM (generative) + SecureBERT (encoder) | <10B + 110M | Multi-source cybersecurity entities | SecureBERT +6-17% F1 | 5 cybersecurity text sources | No (paper only) | Unknown | **No** |
+---
+## Detailed Model Profiles
+### 1. SecureBERT 2.0 NER (Cisco AI)
+**Architecture:** ModernBERT-base with 22 hidden layers, 768 hidden size, 12 attention heads, max 8,192 tokens. Fine-tuned for token classification.
+**Training data:** Cisco's internal hand-labeled NER corpus: 3,400 training samples, 717 test samples. The base model (SecureBERT 2.0) was pretrained on 13B+ text tokens and 53M code tokens from cybersecurity sources.
+**Performance:**
+| Model | F1 | Recall | Precision |
+|---|---|---|---|
+| CyBERT | 0.351 | 0.281 | 0.467 |
+| SecureBERT 1.0 | 0.734 | 0.759 | 0.717 |
+| **SecureBERT 2.0** | **0.945** | **0.965** | **0.927** |
+Per-entity breakdown not publicly reported (only aggregate). Entity types: Malware, Indicator, Vulnerability, System, Organization (5 categories with 11 labels via BIO).
+**Availability:**
+- HuggingFace: `cisco-ai/SecureBERT2.0-NER` (Apache 2.0)
+- GitHub: `cisco-ai-defense/securebert2`
+- Uses TF model (`TFAutoModelForTokenClassification`) -- note TensorFlow dependency
+**Paper:** Aghaei, E. et al. "SecureBERT 2.0: Advanced Language Model for Cybersecurity Intelligence." arXiv:2510.00240 (2025). https://arxiv.org/abs/2510.00240
+**Baseline verdict:** **PRIMARY BASELINE.** Directly downloadable and runnable. The 0.945 F1 is on their own dataset with only 5 entity types -- important caveat. We must either (a) eval on their dataset with their labels, or (b) eval on a shared benchmark.
+---
+### 2. SecureModernBERT-NER (attack-vector)
+**Architecture:** ModernBERT-large (answerdotai/ModernBERT-large), ~395M params, fine-tuned for token classification.
+**Training data:** 502,726 manually curated text spans from real-world threat reports, vulnerability advisories, and incident analyses. Max sequence length 128 tokens during training.
+**Performance:**
+- Precision: 0.847, Recall: 0.848, F1: 0.848, Accuracy: 0.959
+- Strong per-label: CVE (0.9995), SHA256 (0.9874), URL (0.9801), LOC (0.9557)
+- Weaker: IPV6, EMAIL (rare types)
+**Entity types (22):** MALWARE, THREAT-ACTOR, CVE, IPV4, IPV6, DOMAIN, URL, MD5, SHA1, SHA256, EMAIL, REGISTRY-KEYS, ORG, PRODUCT, PLATFORM, SERVICE, SECTOR, LOC, FILEPATH, MITRE-TACTIC, TOOL, CAMPAIGN
+**Availability:**
+- HuggingFace: `attack-vector/SecureModernBERT-NER` (MIT license)
+- PyTorch model, standard `pipeline("token-classification")` inference
+**Paper:** No academic paper. Community model card only.
+**Baseline verdict:** **STRONG BASELINE.** 22 entity types makes this the most comprehensive label space. The 0.848 F1 across 22 types is arguably more impressive than SecureBERT 2.0's 0.945 across only 5 types. Directly runnable. MIT license is ideal.
+---
+### 3. CyMapNER / CyNER
+**Note:** "CyMapNER" does not appear to be a real model. The actual model is **CyNER** — an open-source Python library from `aiforsec`.
+**Architecture:** Ensemble approach combining transformer-based models, heuristics for IOC extraction, and publicly available NER models.
+**Training data:** Custom cybersecurity corpus; integrates with MALOnt2.0 ontology.
+**Performance:** On the CyberNER harmonized benchmark, transformer models trained on CyNER data achieve ~0.74 F1.
+**Availability:**
+- GitHub: `aiforsec/CyNER`
+- arXiv: https://arxiv.org/abs/2204.05754
+**Paper:** Alam, M.T. et al. "CyNER: A Python Library for Cybersecurity Named Entity Recognition." arXiv:2204.05754 (2022).
+**Baseline verdict:** **SECONDARY BASELINE.** Useful as a reference point for the CyberNER harmonized benchmark. Older (2022) and lower performance.
+---
+### 4. CTI-BERT / BERT-CRF for CTI
+**Architecture:** BERT-base-uncased + CRF layer. Also evaluated with secBERT (domain-adapted BERT).
+**Training data:** Three public datasets:
+- DNRTI: 182,452 words, 300+ threat reports, 13 entity classes
+- CTI-Reports: 310,406 records (malware, IP, URL, hash)
+- MalwareTextDB: malware text samples
+**Performance:**
+- DNRTI: 90.02% F1
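+- CTI-Reports: 77.29% F1 (high precision 98.37%, low recall 74.10%) — NOTE: these precision/recall values would imply F1 ≈ 84.5% if all three use the same averaging; verify the figures against the paper before citing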
+- MalwareTextDB: 58.57% F1
+- Real-world OSINT: 82.64% accuracy
+**Entity types:** 13 types on DNRTI (hacker groups, attacks, tools, vulnerabilities, methods); 4 types on CTI-Reports (malware, IP, URL, hash).
+**Availability:**
+- GitHub: `stwater20/NER-BERT-CRF-for-CTI`
+- No pretrained weights hosted; training code available
+**Paper:** Authors from NYCU. Published as a conference/workshop paper. PDF: https://speed.cs.nycu.edu.tw/~ydlin/Enhancing%20Cyber%20Threat%20Intelligence%20with%20Named%20Entity%20Recognition%20using%20BERT-CRF.pdf
+**Baseline verdict:** **REFERENCE ONLY.** No hosted weights. Useful as a literature comparison point for DNRTI/CTI-Reports benchmarks.
+---
+### 5. LANCE
+**Note:** No model called "LANCE" was found in the cybersecurity NER literature. This may be a confusion with:
+- **LanG** — a governance-aware agentic AI platform (unrelated)
+- **SecLMNER** — the LLM+encoder pipeline framework
+- **TTPrompt** — the retrieval-to-reasoning CTI NER framework
+The closest match to "LLM-based pipeline using GPT-4o/Llama on PRISM benchmark" is the **CyberBench** evaluation, which tested GPT-4 and Llama-2 on cybersecurity tasks. **PRISM** appears to be a GLM model variant, not a cybersecurity benchmark.
+**CyberBench results (AAAI-24 Workshop):**
+- GPT-4: 69.6 average across all tasks
+- GPT-3.5-Turbo: 62.6
+- Llama-2-13B: 54.1
+- CyberInstruct-13B (fine-tuned Llama-2): 70.4
+- For NER specifically: BERT-based models outperformed generative LLMs
+**Baseline verdict:** **NOT A REAL COMPETITOR.** "LANCE" likely doesn't exist as described. CyberBench/CyberInstruct results confirm that generative LLMs underperform specialized encoder models on NER.
+---
+### 6. Additional High-Performers
+#### CyberLLaMA (2025)
+- **Architecture:** LLaMA-3.2-3B + BiLSTM + CRF
+- **F1: 98.88%** — but this is on their own custom dataset (42,404 articles, 4,788 terms). No cross-benchmark validation.
+- **Paper:** Zhang, H. et al. "CyberLLaMA: A fine-tuned large language model for cybersecurity named entity recognition." Knowledge-Based Systems 328:114183 (2025).
+- **Weights:** NOT public. Paper only.
+- **Baseline verdict:** **NOT USABLE.** No weights, no shared benchmark. The 98.88% F1 is likely inflated by narrow label space and custom eval. Include in related work, not in experiments.
+#### XLNet-CRF (2025)
+- **Architecture:** XLNet-base + CRF
+- **F1: 97.43%** on CTI-Reports, 88.65% on MalwareTextDB
+- **Paper:** Wang, T. et al. "XLNet-CRF: Efficient Named Entity Recognition for Cyber Threat Intelligence with Permutation Language Modeling." Electronics 14(15):3034 (2025). https://www.mdpi.com/2079-9292/14/15/3034
+- **Code:** GitHub (training code, no pretrained weights)
+- **Baseline verdict:** **REFERENCE ONLY.** We can cite their numbers on CTI-Reports/MalwareTextDB. Could retrain if we use those datasets.
+#### CyberNER Harmonized Benchmark (2025)
+- **Architecture:** Various (RoBERTa+CRF best at 0.736 F1)
+- **21 STIX 2.1 entity types**, 610K tokens
+- **Paper:** Ech-Chammakhy, Y. et al. "CyberNER: A Harmonized STIX Corpus for Cybersecurity Named Entity Recognition." arXiv:2510.26499 (2025).
+- **Data + code:** Publicly available
+- **Baseline verdict:** **USE AS BENCHMARK.** This is the most principled evaluation framework -- STIX-aligned, multi-dataset, public. Best baseline F1 is only 0.736, leaving huge room for Arcspan to demonstrate value.
+#### SecLMNER (2025)
+- **Architecture:** Two-stage: generative LLM (<10B params) reformats text, then SecureBERT does NER
+- **Performance:** +6-17% F1 over SecureBERT alone
+- **Paper:** Zhang, Y. et al. "SecLMNER: A framework for enhanced NER in multi-source cybersecurity data using LLMs." Expert Systems with Applications 271:126651 (2025).
+- **Weights:** NOT public.
+- **Baseline verdict:** **REFERENCE ONLY.** Interesting architecture comparison (two-stage LLM+encoder vs. our single-pass approach).
+---
+## Key Observations
+1. **The field is fragmented.** No single benchmark dominates. Everyone evaluates on different datasets with different label spaces, making direct comparison nearly impossible.
+2. **CyberNER harmonized benchmark is the best shared eval.** 21 STIX entity types, public data+code, multiple baselines. Best result is only 0.736 F1 -- enormous headroom.
+3. **SecureBERT 2.0's 0.945 F1 is on only 5 coarse entity types** with a small private dataset. Impressive but not directly comparable to models handling 20+ types.
+4. **SecureModernBERT-NER is our closest competitor** in terms of practical utility (22 types, MIT license, public weights, standard inference). Its 0.848 F1 is the number to beat.
+5. **The claimed 97%+ F1 scores (CyberLLaMA 0.989, XLNet-CRF 0.974) are on narrow/custom benchmarks** and weights are not public. Not practically threatening.
+6. **Arcspan's architectural advantages:** 50M active params (vs. 149-395M for competitors), 128K context window (vs. 128-8192 for competitors), single-pass Viterbi decoding (vs. pipeline approaches), BIOES scheme (vs. BIO).
+---
+## Recommended Baselines for Our Paper
+### Tier 1: Must Include (runnable, public weights)
+| Model | How to Run | What to Report |
+|---|---|---|
+| **SecureBERT 2.0 NER** | `pip install transformers tensorflow`; load `cisco-ai/SecureBERT2.0-NER`; standard NER pipeline. **Note:** TF model, may need `TFAutoModelForTokenClassification`. | F1 on our dataset + their dataset if we can get it |
+| **SecureModernBERT-NER** | `pip install transformers`; load `attack-vector/SecureModernBERT-NER`; `pipeline("token-classification")`. PyTorch, straightforward. | F1 on our dataset (22 entity types, map to our label space) |
+| **CyNER** | `pip install cyner`; GitHub `aiforsec/CyNER`. Ensemble approach. | F1 on CyberNER benchmark |
+### Tier 2: Benchmark Comparison (shared datasets)
+| Benchmark | Source | Best Published F1 | Our Target |
+|---|---|---|---|
+| **CyberNER (STIX harmonized)** | arXiv:2510.26499, public | 0.736 (RoBERTa+CRF) | >0.80 |
+| **DNRTI** | Public, 13 entity types | 0.900 (BERT-CRF) | >0.90 |
+| **CTI-Reports** | Public, 4 entity types | 0.974 (XLNet-CRF) | Competitive |
+### Tier 3: Literature Comparison (cite numbers, can't re-run)
+| Model | Reported F1 | Notes |
+|---|---|---|
+| CyberLLaMA | 0.989 | Custom dataset, no weights, 3B params |
+| XLNet-CRF | 0.974 | CTI-Reports only, no pretrained weights |
+| SecLMNER | SecureBERT +6-17% | Two-stage pipeline, no weights |
+| BERT-CRF for CTI | 0.900 (DNRTI) | Can retrain from code if needed |
+### Practical Instructions
+```bash
+# SecureBERT 2.0 NER
+pip install transformers tensorflow
+python -c "
+from transformers import AutoTokenizer, TFAutoModelForTokenClassification, pipeline
+model = TFAutoModelForTokenClassification.from_pretrained('cisco-ai/SecureBERT2.0-NER')
+tokenizer = AutoTokenizer.from_pretrained('cisco-ai/SecureBERT2.0-NER')
+nlp = pipeline('ner', model=model, tokenizer=tokenizer)
+print(nlp('APT29 exploited CVE-2024-1234 using Cobalt Strike against Microsoft Exchange.'))
+"
+# SecureModernBERT-NER
+pip install transformers torch
+python -c "
+from transformers import pipeline
+nlp = pipeline('token-classification', model='attack-vector/SecureModernBERT-NER', aggregation_strategy='first')
+print(nlp('APT29 exploited CVE-2024-1234 using Cobalt Strike against Microsoft Exchange.'))
+"
+# CyNER
+pip install cyner
+python -c "
+import cyner
+model = cyner.CyNER()
+print(model.get_entities('APT29 exploited CVE-2024-1234 using Cobalt Strike.'))
+"
+```
+---
+## Arcspan Positioning
+Our key differentiators vs. the field:
+1. **Roughly 3-8x smaller active footprint** (50M vs. 149-395M) -- crucial for edge/SOC deployment
+2. **128K context window** -- can process entire threat reports in one pass (competitors max at 512-8192)
+3. **Constrained Viterbi decoding with BIOES** -- structurally guaranteed valid spans (competitors use BIO + greedy/CRF)
+4. **Single-pass architecture** -- no two-stage LLM preprocessing (vs. SecLMNER)
+5. **MoE efficiency** -- 1.5B total params but only 50M active per token
+The CyberNER harmonized benchmark (0.736 best F1, 21 STIX types) is our ideal proving ground. If Arcspan can hit >0.80 F1 on that benchmark with 50M active params, the story writes itself.

research/notes/progress/2026-04-24-26-dataset-aggregation-plan.md ADDED Viewed

	@@ -0,0 +1,99 @@

+# Cybersecurity NER Dataset Aggregation — Research & Results
+## What We Built
+A master aggregation pipeline (`src/arcspan/data/aggregate_datasets.py`) that combines 4 public cybersecurity NER datasets into a unified 13-class and 5-class OPF BIOES JSONL format, with deduplication.
+## Datasets Found & Status
+### ✅ Successfully Aggregated (4 datasets)
+| Dataset | Source | Format | Raw Types | Sentences | Spans | Notes |
+|---------|--------|--------|-----------|-----------|-------|-------|
+| **CyNER original** | `data/raw/CyNER/dataset/mitre/` | CoNLL BIO | 5 | 4,372 | 3,040 | Baseline dataset |
+| **CyNER 2.0 augmented** | `data/raw/cyner2_augmented/hf_dataset/` | HF datasets | 8 | 11,074 | 15,036 | Adds ThreatActor, Date, Location |
+| **CyberNER harmonized** | `data/raw/CyberNER_harmonized/` (GitHub: yasirech-chammakhy/CyberNER) | CSV w/ STIX tags | 21 | 10,042 | 42,329 | **Best single source** — harmonizes CyNER+DNRTI+APTNER+Attacker onto STIX 2.1 |
+| **DNRTI** | `data/raw/DNRTI/DNRTI_Dataset/` (GitHub: LiuPeiP-CS/NER4CTI) | CoNLL BIO | 13 | 6,577 | 12,974 | Chinese-origin CTI dataset, 13 cybersec types |
+### ❌ Not Usable for NER Training
+| Dataset | Why Not |
+|---------|---------|
+| **SecureModernBERT-NER** | Only the *model* is on HuggingFace (attack-vector/SecureModernBERT-NER). Training data (502K spans, 22 classes) is NOT published. Model card describes the data but doesn't share it. |
+| **PRISM** | IOC *classification* (IoC vs nonIoC per indicator), not span-level NER annotations. Already at `data/raw/LANCE/PRISM/GT.json`. |
+| **CTI-Reports** | Behind IEEE DataPort download wall. XML format with IOC extractions, not token-level NER. |
+| **MalwareTextDB** | Requires manual download from statnlp.org (link may be dead). Only has generic "Entity" labels — no typed NER. |
+| **bnsapa/cybersecurity-ner** | Just a distilBERT fine-tune on original CyNER MITRE data — same data we already have. |
+| **Pile-NER cybersecurity subset** | General-purpose GPT-3.5-generated NER, not cybersecurity-specific. Would need heavy filtering and label mapping. Low quality. |
+| **MITRE ATT&CK STIX data** | Structured KB, not annotated text. Useful for distant supervision / data augmentation but not direct NER training. |
+### 🔍 Notable: CyberNER Already Subsumes Multiple Sources
+The CyberNER harmonized corpus (arXiv:2510.26499) already harmonizes CyNER, DNRTI, APTNER, and Attacker datasets. This means our aggregation has **significant overlap** between CyberNER and the individual CyNER/DNRTI datasets. The deduplication step removed ~3,766 duplicates (exact text match), but some paraphrased overlap likely remains. This is acceptable — the STIX-harmonized labels from CyberNER are higher quality than the raw source labels.
+## Final Aggregated Stats
+After deduplication:
+| Split | Sentences | Spans |
+|-------|-----------|-------|
+| Train | 20,436 | 52,331 |
+| Valid | 3,966 | 8,229 |
+| Test | 3,897 | 7,903 |
+| **Total** | **28,299** | **68,463** |
+### 13-Class Label Distribution (train)
+| Label | Count | % |
+|-------|-------|---|
+| MALWARE | 12,537 | 24.0% |
+| ORGANIZATION | 12,036 | 23.0% |
+| THREAT_ACTOR | 11,589 | 22.1% |
+| TOOL | 7,459 | 14.3% |
+| SYSTEM | 3,672 | 7.0% |
+| VULNERABILITY | 2,709 | 5.2% |
+| FILEPATH | 1,764 | 3.4% |
+| DOMAIN | 298 | 0.6% |
+| IP_ADDRESS | 168 | 0.3% |
+| URL | 71 | 0.1% |
+| EMAIL | 28 | <0.1% |
+| CVE_ID | 0 | 0% |
+| HASH | 0 | 0% |
+## Unified Label Mapping
+### CyNER (5 types) → 13-class
+- Malware → MALWARE
+- System → SYSTEM
+- Organization → ORGANIZATION
+- Vulnerability → VULNERABILITY
+- Indicator → **dropped** (mixed IOC types, can't reliably split)
+### CyNER 2.0 (8 types) → 13-class
+- Malware → MALWARE, ThreatActor → THREAT_ACTOR, System → SYSTEM, Organization → ORGANIZATION, Vulnerability → VULNERABILITY
+- Indicator → dropped, Date → dropped, Location → dropped
+### CyberNER STIX (21 types) → 13-class
+- Malware → MALWARE, Threat-Actor → THREAT_ACTOR, Intrusion-Set → THREAT_ACTOR
+- Tool → TOOL, Software → SYSTEM, Infrastructure → SYSTEM
+- Identity → ORGANIZATION, Vulnerability → VULNERABILITY
+- Domain-Name → DOMAIN, IPv4-Addr → IP_ADDRESS, URL → URL, Email-Addr → EMAIL, File → FILEPATH
+- Dropped: Campaign, Attack-Pattern, Course-of-Action, Indicator, Location, Observed-Data, Malware-Analysis, Network-Traffic
+### DNRTI (13 types) → 13-class
+- HackOrg → THREAT_ACTOR, SamFile → MALWARE, Tool → TOOL
+- SecTeam → ORGANIZATION, Org → ORGANIZATION, Exp → VULNERABILITY
+- Dropped: OffAct, Time, Purp, Area, Idus, Way, Features
+## Open Questions
+1. **CVE_ID and HASH have zero examples.** Need regex-based distant supervision from MITRE ATT&CK or synthetic generation to populate these.
+2. **IOC classes are severely underrepresented** (IP, DOMAIN, URL, EMAIL, FILEPATH total ~2,329 in train). Consider augmenting with regex-extracted IOCs from CTI reports.
+3. **CyberNER overlap with CyNER/DNRTI.** We deduplicate by exact text, but the same sentences appear with different tokenizations. Could do fuzzy dedup but risk losing valid data.
+4. **SecureModernBERT training data** (502K spans) would be transformative if released. Worth reaching out to the authors.
+## Sources
+- CyberNER: https://github.com/yasirech-chammakhy/CyberNER | arXiv:2510.26499
+- DNRTI: https://github.com/LiuPeiP-CS/NER4CTI
+- CyNER 2.0: HuggingFace PranavaKailash/CyNER2.0_augmented_dataset
+- SecureModernBERT-NER: https://huggingface.co/attack-vector/SecureModernBERT-NER (model only)

research/notes/progress/2026-04-24-29-final-llm-merge-complete.md ADDED Viewed

	@@ -0,0 +1,43 @@

+# Final LLM Annotation Merge — All 8 Sources Complete
+## Enriched Dataset Stats
+- **enriched_13class_train**: 22,052 examples (20,436 aggregated + 1,616 LLM)
+- **enriched_5class_train**: 21,891 examples
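+- Total LLM spans: 6,060 across all 13 entity types (sum of the label table below; note that the per-source span counts in the Sources table sum to 6,459, so one of the two tables needs re-checking)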
+## LLM Span Distribution (all sources combined)
+| Label | Count |
+|-------|-------|
+| MALWARE | 1,638 |
+| THREAT_ACTOR | 959 |
+| SYSTEM | 796 |
+| CVE_ID | 485 |
+| VULNERABILITY | 425 |
+| TOOL | 325 |
+| ORGANIZATION | 315 |
+| DOMAIN | 271 |
+| IP_ADDRESS | 248 |
+| HASH | 248 |
+| FILEPATH | 234 |
+| URL | 69 |
+| EMAIL | 47 |
+## Sources (8 annotation agents)
+| Source | Examples | Spans |
+|--------|----------|-------|
+| MITRE ATT&CK | 954 | 2,750 |
+| NVD CVEs | 339 | 990 |
+| Synthetic | 100 | 752 |
+| Vendor blogs | 67 | 446 |
+| News articles | 51 | 362 |
+| CISA advisories | 40 | 400 |
+| AlienVault OTX | 40 | 295 |
+| Malware reports | 25 | 464 |
+## Why It Matters
+- Zero-count entity classes eliminated (CVE_ID: 0→485, HASH: 0→248); FILEPATH, which already had 1,764 spans in the aggregated train set, gains another 234
+- 8% more training data for Round 5
+- Diverse sources = better generalization
+## Next
+Round 4 training in progress (~1h36m). Round 5 script staged and ready.

research/notes/progress/2026-04-24-30-data-quality-audit.md ADDED Viewed

	@@ -0,0 +1,245 @@

+# Data Quality Audit — LLM-Annotated Cybersecurity NER Data
+**Date:** 2026-04-24
+**Auditor:** Automated script + manual review
+**Scope:** All 13 files in `data/processed/`, 17,516 total records
+---
+## Executive Summary
+| Issue | Count | Severity | Action Required |
+|-------|-------|----------|-----------------|
+| Offset errors | **0** | — | None |
+| Duplicate texts | **1,727 unique** (4,408 records) | HIGH | Deduplicate before training |
+| Short texts (<20 chars) | **71** | MEDIUM | Remove — too short for meaningful NER |
+| Mislabeled entities | **~10,854** | CRITICAL | See breakdown — most are label-space design issues |
+| Overlapping spans | **1,060** | HIGH | Fix or pick longest-match |
+| Garbage text (real HTML) | **~471** | MEDIUM | Strip HTML markup |
+| Repetitive entities (50+) | **100 entities** | MEDIUM | Review for template artifacts |
+| Empty spans (no annotations) | **942** | LOW-MEDIUM | Decide: keep as negatives or remove |
+**Overall data health: FAIR.** Offsets are clean (big win), but label consistency, overlaps, and duplicates need remediation before training.
+---
+## 1. Offset Errors: 0 ✅
+All `text[start:end]` slices match their declared entity text across all 17,516 records. The annotation pipeline produced correct character offsets.
+---
+## 2. Duplicate Texts: 1,727 unique texts appear 2+ times (4,408 total records)
+**Within-file duplicates:** 78 unique texts
+**Cross-file duplicates:** 1,649 unique texts
+### Worst offenders:
+- `"Ransomware."` — **44 copies** in `llm_annotated_apt.jsonl`
+- `"Ransomware"` — 7 copies in same file
+- Many MITRE descriptions appear in **both** `llm_annotated_mitre.jsonl` AND `llm_annotated_mitre_v2.jsonl` AND `llm_annotated_apt.jsonl` (3-4 copies each)
+- Oracle NVD boilerplate descriptions appear 4-6 times in `llm_annotated_nvd_v2.jsonl`
+### Root cause:
+- `mitre` and `mitre_v2` are overlapping dataset versions that were both kept
+- `apt` dataset ingested MITRE descriptions alongside its own data
+- Very short texts like "Ransomware." are degenerate entries from APT descriptions
+### Recommendation:
+**Deduplicate globally.** Keep the version with the best annotations when spans differ. Priority: `mitre_v2` > `mitre`, `nvd_v2` > `nvd`.
+---
+## 3. Short Texts (<20 chars): 71
+All 71 are from `llm_annotated_apt.jsonl`. Examples:
+- `"WebShell."` (9 chars) — 2 occurrences
+- `"Ransomware."` (11 chars) — 44+ occurrences
+- `"Keylogger."` (10 chars)
+- `"PyVil RAT"` (9 chars)
+These are malware "descriptions" that are just a single word. They have no spans (empty annotations) and provide zero training signal.
+### Recommendation:
+**Remove all records with text <20 chars.** They cannot produce useful span examples.
+---
+## 4. Mislabeled Entities: ~10,854 flagged
+This is the highest-count issue but most are **label-space design disagreements**, not random errors. Breakdown:
+### 4a. Security vendors labeled as SYSTEM instead of ORGANIZATION (200 instances)
+| Entity | Count |
+|--------|-------|
+| ESET | 37 |
+| Trend Micro | 25 |
+| Kaspersky | 16 |
+| Symantec | 11 |
+| SentinelOne | 8 |
+| Avast | 7 |
+| Fortinet | 7 |
+| Bitdefender | 3 |
+| Sophos | 2 |
+| Palo Alto | 2 |
+| McAfee | 1 |
+**Analysis:** The LLM annotator read vendor names as product names — e.g., "Kaspersky" the antivirus product rather than "Kaspersky" the company — and so labeled them SYSTEM. This is genuinely ambiguous, but for cybersecurity NER, these should be **ORGANIZATION**.
+**Severity: HIGH.** These are real errors that will confuse the model. Fix by relabeling.
+### 4b. CVE_ID vs VULNERABILITY label (30 instances)
+CVE identifiers (e.g., `CVE-2023-1389`) are labeled as `CVE_ID` but the audit expected `VULNERABILITY`.
+**Analysis:** This is actually a **label-space design question**. If the label space includes both `CVE_ID` and `VULNERABILITY`, then CVE IDs should indeed be `CVE_ID`. Check if `CVE_ID` is in the intended label space.
+**Severity: LOW** if `CVE_ID` is a valid label. **HIGH** if it's not in the final label space.
+### 4c. URL and HASH labeled as their own types instead of INDICATOR (51 instances)
+URLs labeled `URL`, hashes labeled `HASH` — audit expected `INDICATOR`.
+**Analysis:** Same as 4b — depends on label-space design. If `URL`, `HASH`, `IP_ADDRESS`, `DOMAIN`, `EMAIL` are all valid labels (they appear in the label distribution), then these are **correct**. The audit's expectation of a single `INDICATOR` class was wrong.
+**Severity: NOT AN ISSUE** — the data uses fine-grained IOC labels which is actually better for cybersecurity NER.
+### 4d. Revised mislabel count
+Excluding label-space design issues (4b, 4c), the **real mislabel count is ~200** (security vendors as SYSTEM). This is much more manageable.
+---
+## 5. Overlapping Spans: 1,060
+### Dominant patterns:
+1. **"Google Play" triple overlap** (~100+ instances):
+   - `ORGANIZATION: Google [26:32]`
+   - `SYSTEM: Google Play [26:37]`
+   - `MALWARE: Play [33:37]` ← **This is wrong** — "Play" (as in Google Play) is not malware
+2. **Nested entity annotations** (e.g., `SYSTEM: Cisco` inside `ORGANIZATION: Cisco Talos`)
+3. **Partial overlaps** (e.g., `SYSTEM: Android` overlapping `SYSTEM: Android operating system`)
+### Root cause:
+The LLM annotator is producing **all possible readings** of ambiguous spans instead of picking one. The BIOES tagging scheme used by the model **cannot represent overlapping spans** — the Viterbi decoder produces exactly one label per token.
+### Recommendation:
+**Resolve all overlaps before training.** Strategy:
+- For nested spans: keep the **longest** span
+- For `Google Play`: annotate as `SYSTEM: Google Play` only (not three separate entities)
+- For `MALWARE: Play`: **remove** — this is a false annotation. "Play" in "Google Play" is not the Play ransomware group
+- General rule: prefer the span that covers the full entity mention
+---
+## 6. Garbage Text / HTML Artifacts: ~471 records with real HTML
+Of 1,119 records flagged for HTML-like patterns:
+- **~471** contain actual HTML markup tags (`<p>`, `<code>`, `<a>`, etc.)
+- **~648** contain legitimate code references (`<script>`, `<EXEC>`, `<guid>`) that are valid cybersecurity text
+The real HTML artifacts are concentrated in:
+- `llm_annotated_apt.jsonl` — MITRE technique descriptions with residual HTML
+- `llm_annotated_nvd_v2.jsonl` — NVD descriptions with markup
+1 record has encoding issues (high non-ASCII ratio).
+### Recommendation:
+**Strip HTML tags** from the ~471 affected records (careful not to remove code references). Re-run annotation on cleaned text since offsets will shift.
+---
+## 7. Repetitive Entities
+### Legitimate high-frequency entities (expected):
+- `SYSTEM: Windows` (1,011), `SYSTEM: Linux` (465), `SYSTEM: Linux kernel` (1,262)
+- `ORGANIZATION: Microsoft` (431), `ORGANIZATION: Google` (297)
+- `TOOL: PowerShell` (229), `TOOL: Metasploit` (171)
+- `THREAT_ACTOR: APT29` (149)
+### Suspicious / problematic:
+| Entity | Count | Issue |
+|--------|-------|-------|
+| `TOOL: at` | 495 | **FALSE POSITIVE** — English word "at" (as in "since at least 2020") labeled as the Unix `at` command |
+| `FILEPATH: /01/2014` | 155 | **FALSE POSITIVE** — date substrings from Linux kernel commit references labeled as file paths |
+| `VULNERABILITY: phishing` | 240 | **Debatable** — phishing is an attack technique, not a vulnerability |
+| `SYSTEM: .NET` | 225 | **Debatable** — .NET is a framework, could be SYSTEM or TOOL |
+| `SYSTEM: QEMU` | 196 | Correct for NVD kernel data |
+| `SYSTEM: Python` | 177 | **Debatable** — Python is a language, not a system |
+### Critical fix needed:
+- **`TOOL: at`** — 495 false positives will heavily poison the model. The word "at" appears thousands of times in text; labeling it as a tool will cause massive false positive rates at inference. **Must remove all instances and re-annotate only genuine uses of the `at` command.**
+- **`FILEPATH: /01/2014`** — 155 false positives from date strings in kernel changelogs. **Must remove.**
+---
+## 8. Empty Spans: 942 records (5.4% of data)
+| File | Empty records | Total records | % Empty |
+|------|--------------|---------------|---------|
+| llm_annotated_apt.jsonl | 517 | 4,554 | 11.4% |
+| llm_annotated_mitre_v2.jsonl | 244 | 1,984 | 12.3% |
+| llm_annotated_nvd_v2.jsonl | 175 | 3,000 | 5.8% |
+| llm_annotated_news.jsonl | 3 | 51 | 5.9% |
+| llm_annotated_vendor_blogs.jsonl | 3 | 67 | 4.5% |
+**Analysis:** These are texts where the LLM annotator found no entities. Some are legitimate (generic descriptions without named entities), others are short/degenerate texts that should have been filtered.
+### Recommendation:
+- **Keep ~50%** as negative examples (texts with no entities are useful for training the model to predict `O` tags)
+- **Remove** the ones that are short (<50 chars) or degenerate ("Ransomware.", "WebShell.")
+- Cap negatives at ~5% of training data to avoid class imbalance
+---
+## 9. Label Distribution
+| Label | Count | % of all spans |
+|-------|-------|----------------|
+| SYSTEM | 13,085 | 20.1% |
+| FILEPATH | 8,012 | 12.3% |
+| MALWARE | 7,821 | 12.0% |
+| VULNERABILITY | 7,617 | 11.7% |
+| THREAT_ACTOR | 4,028 | 6.2% |
+| IP_ADDRESS | 3,994 | 6.1% |
+| ORGANIZATION | 3,734 | 5.7% |
+| TOOL | 3,683 | 5.7% |
+| HASH | 3,322 | 5.1% |
+| URL | 3,180 | 4.9% |
+| DOMAIN | 2,658 | 4.1% |
+| EMAIL | 2,106 | 3.2% |
+| CVE_ID | 1,417 | 2.2% |
+**13 label types total.** Distribution is reasonably balanced. `SYSTEM` dominates (inflated by the `Linux kernel` repetitions in NVD data). `CVE_ID` and `EMAIL` are underrepresented.
+---
+## Priority Remediation Plan
+### P0 — Must fix before any training
+1. **Remove `TOOL: at` false positives** (495 instances) — will poison the model
+2. **Remove `FILEPATH: /01/2014` false positives** (155 instances)
+3. **Resolve all 1,060 overlapping spans** — model architecture cannot handle overlaps
+4. **Deduplicate** across files (especially mitre/mitre_v2/apt overlaps)
+### P1 — Should fix
+5. **Relabel security vendors** (ESET, Kaspersky, etc.) from SYSTEM → ORGANIZATION (200 instances)
+6. **Remove `MALWARE: Play`** false annotations from Google Play contexts
+7. **Remove records with text <20 chars** (71 records)
+8. **Strip real HTML tags** from ~471 records and re-align offsets
+### P2 — Nice to have
+9. Review `VULNERABILITY: phishing` label consistency
+10. Decide on `SYSTEM` vs `TOOL` for Python, .NET
+11. Cap empty-span records at 5% of training data
+12. Review `SYSTEM: QEMU` concentration from NVD kernel data (may cause domain bias)
+---
+## Audit Script
+The full audit script is at `scripts/audit_data_quality.py`. Detailed JSON results at `scripts/audit_results.json`.

research/notes/progress/2026-04-24-32-round4-training-overfitting.md ADDED Viewed

	@@ -0,0 +1,25 @@

+# Round 4 Training Launched + Overfitting Analysis
+## Round 4 Configuration
+- **Data**: 20,436 aggregated examples (5-class + 13-class)
+- **LR**: 5e-5, **Epochs**: 15, **BS**: 1, **Grad accum**: 8
+- **GPU**: RTX 5090 32GB on Vast.ai
+- **Script**: `/root/run_train_v4c.sh` in tmux session `arcspan`
+## Overfitting Pattern (epoch-by-epoch)
+| Epoch | Train Loss | Val Loss | Val Acc |
+|-------|-----------|----------|---------|
+| 1 | 0.248 | 0.164 | 95.1% |
+| 2 | 0.156 | 0.136 | 96.0% |
+| 3 | 0.120 | **0.126** | 96.4% |
+| **4** | 0.097 | **0.126** ← best | 96.5% |
+| 5 | 0.078 | 0.134 ↑ | 96.7% |
+| 9 | 0.042 | 0.165 ↑↑ | 96.9% |
+## Key Insight
+Best checkpoint at epoch 3-4. Classic overfitting after that — train loss keeps dropping but val loss climbs. The model learns fast but runs out of new signal in 20K examples. More data should push the sweet spot later.
+## Why This Matters
+- Confirms data scaling is the bottleneck, not model capacity
+- 15 epochs is too many for 20K data with this LR
+- Round 5 with 32K enriched data should allow deeper training

research/notes/progress/2026-04-24-36-dapt-research.md ADDED Viewed

	@@ -0,0 +1,230 @@

+# Domain-Adaptive Pretraining (DAPT) for Arcspan Cybersecurity NER
+**Date:** 2026-04-24
+**Status:** Research complete — decision needed on whether to pursue DAPT
+---
+## 1. How DAPT Works for Token Classifiers — Literature Review
+### "Don't Stop Pretraining" (Gururangan et al., ACL 2020)
+- **Core finding:** Continued MLM pretraining on in-domain unlabeled text before fine-tuning improves downstream task performance across all four tested domains (biomed, CS, news, reviews).
+- **DAPT + TAPT stacking:** Domain-adaptive pretraining → Task-adaptive pretraining (on the unlabeled task corpus itself) → supervised fine-tuning yields the best results. Sequential DAPT→TAPT is optimal.
+- **Critical caveat:** Adapting on domain-*irrelevant* corpora **degrades** performance. Domain match matters.
+- **Gains:** Consistent improvements across high- and low-resource settings. Typical improvements are 1-3 F1 points on classification tasks; larger gains in low-resource scenarios.
+- **Source:** https://aclanthology.org/2020.acl-main.740/
+### SecureBERT (Aghaei et al., 2022) → SecureBERT 2.0 (2025)
+- **Original SecureBERT:** Continued pretraining of RoBERTa on 1.1B words (98K cybersecurity documents). Custom BPE tokenizer added 17,673 cyber-specific tokens. Used Gaussian noise injection (μ=0, σ=0.01) on embeddings to prevent overfitting.
+  - NER on MalwareTextDB: F1=86.65 (vs RoBERTa-base 86.20) — modest +0.45 gain
+- **SecureBERT 2.0:** Built on ModernBERT, pretrained on **13B+ text tokens + 53M code tokens**. 13x more data than v1.
+  - NER F1=**0.945** (vs original SecureBERT 0.734, vs CyBERT 0.351)
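+  - The massive DAPT corpus is likely the primary differentiator: the NER fine-tuning approach is the same and the results are dramatically better. Note, however, that the base architecture also changed (RoBERTa → ModernBERT), so the gain cannot be attributed to corpus scale alone.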
+- **Lesson:** Scale of DAPT corpus matters enormously for cybersecurity NER. 1B tokens → marginal gains. 13B tokens → transformative gains.
+- **Sources:** https://huggingface.co/cisco-ai/SecureBERT2.0-NER, https://arxiv.org/abs/2510.00240
+### CySecBERT (Bayer et al., 2022)
+- Domain-adapted BERT using diverse sub-corpora: blogs, arXiv papers, NVD data, Twitter.
+- **Source:** https://arxiv.org/pdf/2212.02974
+### CyLLM-DAP (2025) — Efficient DAPT
+- Achieved competitive cybersecurity performance with only **118.8M tokens** (vs 2.77B in comparable models).
+- Shows that careful curation can substitute for raw scale.
+- **Source:** https://arxiv.org/html/2507.02964v1
+---
+## 2. Can We Do DAPT with This MoE Architecture?
+### Architecture Recap
+- 1.5B total params, 50M active (top-4 of 128 experts per token)
+- 8 transformer layers, d_model=640, banded attention (257-token window)
+- Output head: Linear(640, num_labels) — this is the NER classification head
+- Embedding: `nn.Embedding(vocab_size, 640)` — standard token embeddings
+- Custom tiktoken tokenizer
+### What DAPT Would Require
+For MLM continued pretraining, we need to:
+1. **Replace the output head** with an MLM head: `Linear(640, vocab_size)` instead of `Linear(640, num_labels)`. The current unembedding is a small NER head (33 or 97 classes). MLM needs vocab_size (~100K+) output.
+2. **Implement MLM masking:** Randomly mask 15% of input tokens, predict original tokens.
+3. **Write a custom training loop** — the existing `opf train` runner only does supervised token classification (cross-entropy on label predictions). It cannot do MLM.
+### MoE-Specific Risks
+**Catastrophic Forgetting in MoE:**
+- Recent research (DES-MoE, EMNLP 2025) shows MoE models are particularly susceptible to catastrophic forgetting during domain adaptation, due to cross-domain interference across expert routing.
+- DES-MoE proposed: (1) adaptive router with distillation, (2) real-time expert-domain correlation mapping, (3) three-phase progressive freezing. Achieved 89% reduction in forgetting.
+- **For our case:** We're adapting to a *single* new domain (cyber), not multi-domain. This is simpler. The main risk is degrading the model's general English token-classification ability.
+- **Source:** https://arxiv.org/abs/2509.16882
+**Mitigation strategies:**
+- **Replay mixing:** Include some general English text (e.g., Wikipedia) alongside cyber text during DAPT (SaulLM approach). Ratio: ~80% domain, 20% general.
+- **Lower learning rate:** Use 1/10th to 1/5th of original pretraining LR for continued pretraining.
+- **No warmup:** Research shows warmup during continued pretraining causes regressions.
+- **Progressive expert freezing:** Optionally freeze router weights after initial adaptation.
+### Critical Question: Does the Tokenizer Limit Us?
+- The model uses a custom tiktoken encoding. Cybersecurity terms like "CVE-2024-12345", "Emotet", "mimikatz" may be over-tokenized into subwords.
+- SecureBERT added 17,673 custom tokens to handle this. **We cannot easily do this** — adding tokens would require resizing the embedding matrix and invalidating pretrained weights.
+- **Mitigation:** The existing tokenizer likely handles most cyber terms via subword composition (BPE). The model can still learn representations for multi-token entities via DAPT, just less efficiently than with a custom tokenizer.
+---
+## 3. Available Raw Cybersecurity Text
+### What We Already Have (~200K documents)
+| Source | Count | Est. Tokens |
+|--------|-------|-------------|
+| NVD CVE descriptions | 193K | ~50-80M |
+| APT reports (CyberCorpus) | 4.5K | ~20-40M |
+| Exploit-DB entries | 4.3K | ~5-10M |
+| MITRE ATT&CK descriptions | 2K | ~2-3M |
+| **Total existing** | **~204K** | **~80-130M** |
+### Freely Available Additional Sources
+**Large-scale pretraining corpora:**
+| Dataset | Size | Access |
+|---------|------|--------|
+| **Alpha-Root** (Common Crawl cyber extraction) | 3B tokens, 2.8M webpages | HuggingFace |
+| **PRIMUS / Primus-FineWeb** | 2.57B tokens, 3.38M examples | HuggingFace (filtered FineWeb) |
+| **STUCCO auto-labeled corpus** | Unknown size | GitHub (github.com/stucco/auto-labeled-corpus) |
+**Specialized text sources:**
+| Source | Content | Access |
+|--------|---------|--------|
+| MITRE CVE full database | 200K+ CVE records with descriptions | cve.org bulk download |
+| MITRE CWE descriptions | ~900 weakness types | mitre.org |
+| MITRE CAPEC | ~500 attack pattern descriptions | mitre.org |
+| SecurityFocus/BugTraq archives | Vulnerability advisories | Web archives |
+| APTnotes (GitHub) | ~500+ APT report PDFs | github.com/aptnotes |
+| Malpedia descriptions | Malware family descriptions | malpedia.caad.fkie.fraunhofer.de |
+| NIST SP 800-series | Security standards/guides | nist.gov |
+| RFC security-related docs | Protocol security specs | ietf.org |
+### Realistic Corpus Assembly
+- **Quick win (minimal effort):** Use Alpha-Root or PRIMUS from HuggingFace → 2.5-3B tokens, ready to use.
+- **Medium effort:** Our existing 80-130M tokens + Alpha-Root 3B tokens = ~3.1B tokens.
+- **Maximum effort:** Curate custom corpus from all sources above → potentially 5-10B tokens but requires significant data engineering.
+**Recommendation:** Use **Alpha-Root (3B tokens)** as the primary DAPT corpus. It's already curated, cyber-specific, and available on HuggingFace. Supplement with our existing 130M tokens for domain specificity.
+---
+## 4. How Much DAPT Is Enough?
+### Empirical Benchmarks from Literature
+| Model | DAPT Tokens | NER Gain | Notes |
+|-------|-------------|----------|-------|
+| SecureBERT v1 | ~1.3B | +0.45 F1 | Modest gain on MalwareTextDB |
+| SecureBERT 2.0 | 13B | +21 F1 points | Transformative — but also changed base arch |
+| CyLLM-DAP | 118.8M | Competitive | Careful curation compensated for scale |
+| "Don't Stop Pretraining" | Varied | +1-3 F1 | Across 4 domains |
+### Guidelines for Our Model (50M active params)
+Our model is **much smaller** than BERT-base (110M) or SecureBERT 2.0 (ModernBERT, ~150M). With only 50M active parameters:
+- **Minimum viable DAPT:** ~100-500M tokens (1-3 epochs over our existing corpus + supplements). Expect marginal gains (+0.5-2 F1).
+- **Sweet spot:** ~1-3B tokens. This is where Alpha-Root fits perfectly. For a 50M active param model, 3B tokens represents ~60 tokens per parameter — well within the "compute-optimal" range for continued pretraining.
+- **Diminishing returns:** Beyond 3-5B tokens for this model size.
+### Learning Rate Schedule
+- **Start LR:** 1/10th of original pretraining LR. If original was ~3e-4, use ~3e-5 for DAPT.
+- **Schedule:** Cosine decay to 1/100th of max LR (i.e., 3e-7).
+- **No warmup** — research shows warmup hurts continued pretraining.
+- **Epochs:** 1-2 passes over the corpus if <1B tokens; single pass if >1B tokens.
+### Masking Rate
+- Standard 15% masking for MLM. Some research suggests dynamic masking schedules, but standard 15% is robust.
+---
+## 5. Implementation Feasibility
+### What Exists (in `vendor/privacy-filter/`)
+- `opf/_train/runner.py`: Full supervised fine-tuning loop with AdamW, gradient accumulation, best-epoch checkpointing, safetensors output. **Well-structured, 920 lines.**
+- `opf/_model/model.py`: `Transformer` class with `embedding → blocks → norm → unembedding` pipeline. Clean forward pass.
+- The model loads from checkpoint via `Transformer.from_checkpoint()`.
+### What We'd Need to Build
+**1. MLM Head Swap (Easy — ~20 lines)**
+```python
+# Replace unembedding (NER head) with MLM head (vocab prediction)
+mlm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+# Optionally tie weights with embedding: mlm_head.weight = model.embedding.weight
+model.unembedding = mlm_head
+```
+**2. MLM Data Pipeline (Medium — ~100 lines)**
+- Load raw text → tokenize with tiktoken → chunk into 257-token windows (matching the banded attention window)
+- Apply random 15% masking: replace with [MASK] token (80%), random token (10%), keep (10%)
+- Yield (masked_tokens, original_tokens, mask_positions) batches
+**3. MLM Training Loop (Medium — ~150 lines)**
+- Can largely copy from `_train_one_epoch` in runner.py
+- Replace label-based loss with: `CrossEntropyLoss(logits[mask_positions], original_tokens[mask_positions])`
+- Add cosine LR scheduler
+- Add checkpoint saving (reuse `save_named_tensors`)
+**4. Head Swap Back + Fine-tune (Trivial)**
+- After DAPT: save the backbone weights (everything except unembedding)
+- Load backbone → attach new NER head → run normal `opf train`
+### Estimated Implementation Effort
+- **~300-400 lines of new Python code** for a complete DAPT pipeline
+- **~2-3 days of engineering** for a clean implementation
+- **Training time:** With 3B tokens on a single A100 (80GB), ~50M active params, batch size 64, expect ~12-24 hours of DAPT training
+### Key Risk: The `num_labels` Config Entanglement
+The `config.json` ties `num_labels` to the unembedding size. During DAPT we'd set `num_labels=vocab_size` (or bypass this field). After DAPT, we restore the NER config. This requires careful checkpoint management but is straightforward.
+---
+## 6. Recommendation
+### Should We Do DAPT?
+**Arguments FOR:**
+- SecureBERT 2.0 proves massive gains from cyber DAPT on NER (0.734 → 0.945 F1)
+- Alpha-Root provides 3B ready-to-use cyber tokens on HuggingFace
+- Implementation is tractable (~300 lines, ~2-3 days)
+- Our model's general-English pretraining likely has poor coverage of cyber vocabulary
+- DAPT + TAPT + supervised fine-tuning is the empirically optimal pipeline
+**Arguments AGAINST:**
+- Our model is tiny (50M active params) — may not have capacity to absorb much domain knowledge
+- MoE routing adds catastrophic forgetting risk
+- Custom tokenizer may not handle cyber terms well regardless of DAPT
+- Engineering effort competes with other priorities (more labeled data, better label space design)
+- SecureBERT 2.0's gains may be confounded by the architecture change (BERT → ModernBERT)
+**Verdict: Worth trying, but AFTER establishing a supervised-only baseline.**
+The recommended pipeline is:
+1. **Phase 1 (now):** Fine-tune on labeled NER data → establish baseline F1
+2. **Phase 2 (if baseline is weak):** Do DAPT with Alpha-Root (3B tokens) → re-fine-tune → measure delta
+3. **Phase 3 (optional):** TAPT on our unlabeled cyber corpus → re-fine-tune → measure delta
+This way we know exactly how much DAPT buys us, with minimal wasted effort if the supervised-only baseline is already strong.
+---
+## Key Sources
+- Gururangan et al. 2020 — "Don't Stop Pretraining": https://aclanthology.org/2020.acl-main.740/
+- SecureBERT 2.0: https://arxiv.org/abs/2510.00240
+- SecureBERT 2.0 NER model card: https://huggingface.co/cisco-ai/SecureBERT2.0-NER
+- DES-MoE (catastrophic forgetting in MoE): https://arxiv.org/abs/2509.16882
+- CyLLM-DAP (efficient DAPT): https://arxiv.org/html/2507.02964v1
+- Alpha-Root (3B cyber tokens): https://arxiv.org/html/2602.22218
+- "Reuse, Don't Retrain" (LR schedules for continued pretraining): https://arxiv.org/html/2407.07263v1
+- CySecBERT: https://arxiv.org/pdf/2212.02974
+## Open Questions
+1. What is the actual vocab size of the OPF tokenizer? (Determines MLM head size and memory)
+2. Does Alpha-Root overlap significantly with our existing NVD/ATT&CK data?
+3. Can we tie MLM head weights to the embedding matrix to save memory?
+4. Should we freeze the MoE router during DAPT to prevent expert collapse?
+5. What's the [MASK] token ID in the OPF tiktoken encoding? (Or do we use a span corruption objective instead?)

research/notes/progress/2026-04-24-37-training-tricks-research.md ADDED Viewed

	@@ -0,0 +1,259 @@

+# Training Tricks Research for Arcspan NER Fine-Tuning
+**Date:** 2026-04-24
+**Context:** We use flat LR=5e-5, AdamW, no warmup, no decay. Overfitting at epoch 3-4 on 20K examples. Goal: squeeze maximum F1 from our 50M-active-param MoE model.
+---
+## 1. LR Scheduling: Warmup + Decay
+### Key Findings
+**Linear warmup is essential for transformer fine-tuning.** Without it, early gradient estimates are noisy and can push the model into bad basins. Standard recommendation: **warmup for 6-10% of total training steps**, linear ramp from 0 to peak LR.
+**Best schedules for NER fine-tuning (ranked):**
+1. **Linear warmup + cosine decay** — The workhorse. Used in most BERT/RoBERTa NER papers. Peak LR 2e-5 to 5e-5, warmup 6-10%, then cosine anneal to ~0.
+2. **Warmup-Stable-Decay (WSD)** — Emerging best practice from 2024-2025. Three phases: warmup (1-2%), stable plateau at peak LR (60-80%), then decay (10-25%). The long plateau maximizes exploration. Sqrt decay shape slightly outperforms cosine in recent benchmarks (+0.2-0.5% relative).
+3. **Linear warmup + linear decay** — Simpler, nearly as good. HuggingFace default.
+**Concrete recommendation for our setup:**
+- With ~20K examples, BS=8, ~5 epochs → ~12,500 steps
+- Warmup: 750-1250 steps (6-10%)
+- Schedule: cosine decay (simplest to implement via HF `get_scheduler`)
+- This alone should help significantly vs. flat LR — flat LR keeps updating aggressively in later epochs when the model should be settling
+**AdamW β₂ tuning:** Setting β₂=0.98 (vs default 0.999) can improve stability during decay phase. Worth trying.
+**Sources:**
+- WSD scheduling: https://www.emergentmind.com/topics/warmup-stable-decay-wsd-learning-rate-scheduling
+- Advanced fine-tuning techniques: https://towardsdatascience.com/advanced-techniques-for-fine-tuning-transformers-82e4e61e16e/
+- EMNLP 2024 LR transitions: https://aclanthology.org/2024.findings-emnlp.954.pdf
+---
+## 2. Weight Decay
+### Key Findings
+**Standard value: 0.01** for fine-tuning. This is the BERT/RoBERTa default and works well across NER benchmarks.
+**Parameter group strategy (important):**
+- Weight matrices: weight_decay=0.01
+- Bias terms: weight_decay=0.0
+- LayerNorm params: weight_decay=0.0
+This is standard practice but we should verify our training code implements it. Regularizing biases and norms is counterproductive.
+**For our overfitting problem:** Could try 0.05-0.1 (higher decay = stronger regularization). But LR scheduling will likely have a bigger impact than weight decay tuning.
+**Key insight:** In AdamW, effective regularization = lr × weight_decay. As LR decays via schedule, regularization automatically weakens — this is desirable as it lets the model settle into fine-grained adjustments late in training.
+**Sources:**
+- https://aicompetence.org/fine-tuning-with-adamw/
+- https://mbrenndoerfer.com/writing/adamw-optimizer-decoupled-weight-decay
+---
+## 3. Curriculum Learning
+### Key Findings
+**Yes, curriculum learning helps NER**, especially in low-resource settings. The Dual-Stage Curriculum Learning (DCL) framework (2024) shows consistent gains.
+**How to define difficulty for NER samples:**
+1. **Model uncertainty (best):** Train a teacher model for a few epochs, then score samples by prediction uncertainty (Monte Carlo dropout or simple softmax entropy). High uncertainty = hard.
+2. **Top-N Least Confidence:** Average confidence of the N least-confident tokens in a sentence.
+3. **Sentence length:** Longer = harder. Crude but effective as a baseline.
+4. **Entity density:** More entities per sentence = harder (our intuition is correct).
+5. **Entity type rarity:** Sentences with rare entity types are harder.
+**Practical implementation:**
+1. Train for 1-2 epochs on full data (or use a pretrained checkpoint)
+2. Score all training samples by model uncertainty
+3. Sort easy → hard
+4. Start training on easiest 30%, gradually add harder examples using a root scheduling function
+5. Full dataset incorporated by ~60% of training
+**Expected gains:** +0.5-2.0 F1 points depending on dataset difficulty distribution. The DCL paper reports 25% faster convergence with improved final performance.
+**Relevance for Arcspan:** Medium-high. Our cybersecurity NER data likely has a wide difficulty range (simple "CVE-2024-1234" mentions vs. complex nested vulnerability descriptions). Worth implementing after we nail LR scheduling.
+**Sources:**
+- DCL framework: https://arxiv.org/html/2402.13534
+- Low-resource NER + curriculum: https://www.researchgate.net/publication/353753078
+---
+## 4. Label Smoothing
+### Key Findings
+**Standard label smoothing (ε=0.1) has mixed results for token classification.** The issue is that BIOES tags have strict structural meaning — smoothing uniformly across O/B/I/E/S tags can confuse the Viterbi decoder.
+**Boundary Smoothing (NER-specific, ACL 2022):** A much better approach. Instead of uniform smoothing, it redistributes probability mass specifically to adjacent spans. E.g., if "tokens 3-5" is annotated as an entity, boundary smoothing gives small probability to "tokens 2-5" and "tokens 3-6". This:
+- Mitigates over-confidence at entity boundaries
+- Improves calibration
+- Produces flatter loss landscapes
+- Achieves SOTA on 8 NER benchmarks
+**Practical recommendation:**
+- **Don't use standard label smoothing** with our BIOES + Viterbi setup — it would smooth probability toward invalid tag transitions
+- **Boundary smoothing is architecturally compatible** with our approach but requires custom implementation
+- **Priority: LOW for now.** The gains are real but implementation effort is non-trivial given our Viterbi constraint. Revisit after higher-impact changes.
+**Sources:**
+- Boundary Smoothing for NER (ACL 2022): https://aclanthology.org/2022.acl-long.490/
+- Paper: https://arxiv.org/abs/2204.12031
+---
+## 5. Stochastic Weight Averaging (SWA) / Checkpoint Averaging
+### Key Findings
+**This is a near-free lunch. High priority.**
+**Simple checkpoint averaging:** Average the weights of the last N checkpoints (e.g., last 3-5 epochs). This is trivially implementable and consistently yields +0.3-1.0 F1 improvement over picking the single best checkpoint.
+**Why it works:** Checkpoints from late in a single run tend to sit at different points near the edge of the same low-loss region. Averaging them moves toward the flatter centre of that region, which generalizes better. Izmailov et al. (2018) showed SWA finds wider optima.
+**SWA (more sophisticated):**
+- After normal training, continue for extra steps with constant or cyclic LR
+- Maintain running average of weights
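+- Apply batch normalization update at the end (only relevant for models with BatchNorm layers; models that use only LayerNorm/RMSNorm, as transformers typically do, can skip this step)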
+- Typically uses a lower constant LR (e.g., 2e-6)
+**Practical recommendation for Arcspan:**
+1. **Immediate win: Average top-3 checkpoints by val F1.** Just load checkpoints, average state_dicts, evaluate. Zero training cost.
+2. **Next: Average last 3-5 epoch checkpoints** (regardless of individual val scores). Often works even better than cherry-picking.
+3. **Later: Full SWA** with cyclical LR for the last 25% of training.
+**Implementation is trivial:**
+```python
+import torch
+ckpts = [torch.load(f"epoch_{i}.pt") for i in range(3, 6)]
+avg = {k: sum(c[k] for c in ckpts) / len(ckpts) for k in ckpts[0]}
+torch.save(avg, "averaged.pt")
+```
+**Sources:**
+- Izmailov et al. 2018: https://arxiv.org/abs/1803.05407
+- Checkpoint ensembles: https://arxiv.org/abs/1710.03282
+- SWA applied to Arabic NER: https://aclanthology.org/2023.arabicnlp-1.86/
+---
+## 6. Discriminative Fine-Tuning / Layer-wise LR Decay (LLRD)
+### Key Findings
+**LLRD assigns different learning rates to different layers**, with lower LR for earlier (more general) layers and higher LR for later (more task-specific) layers + classification head.
+**Standard approach:**
+- Classification head: base_lr (e.g., 5e-5)
+- Layer N (top): base_lr × decay^1
+- Layer N-1: base_lr × decay^2
+- ...
+- Embeddings: base_lr × decay^(N+1)
+- **Typical decay factor: 0.85-0.95 per layer**
+**For our 8-layer model:**
+- Head: 5e-5
+- Layer 7: 4.25e-5 (×0.85)
+- Layer 6: 3.6e-5
+- Layer 5: 3.1e-5
+- Layer 4: 2.6e-5
+- Layer 3: 2.2e-5
+- Layer 2: 1.9e-5
+- Layer 1: 1.6e-5
+- Layer 0: 1.36e-5
+- Embeddings: 1.16e-5
+**Evidence:** Originally from ULMFiT (Howard & Ruder, 2018). Widely adopted in NLP fine-tuning. The "Advanced Techniques for Fine-Tuning Transformers" guide recommends it with decay=0.9 per layer. Gains are typically +0.3-0.8 F1 for NER.
+**Relevance for Arcspan:** Our model is only 8 layers, so the range between top and bottom is smaller than for BERT-24. Still worth trying — the MoE routing layers in early blocks encode general token representations that benefit from conservative updates.
+**Implementation:** Requires custom param groups in the optimizer. Moderate effort.
+**Sources:**
+- Howard & Ruder, ULMFiT (2018): https://arxiv.org/abs/1801.06146
+- Advanced fine-tuning: https://towardsdatascience.com/advanced-techniques-for-fine-tuning-transformers-82e4e61e16e/
+---
+## 7. Data Augmentation for NER
+### Key Findings
+**Token-level Mixup is NOT straightforward for NER** because labels are tied to specific token positions. Standard Mixup (interpolating two input embeddings) destroys token-label alignment.
+**Effective NER augmentation techniques:**
+1. **Entity replacement / mention swapping:** Replace entity mentions with other entities of the same type from a gazetteer. E.g., swap "CVE-2024-1234" with "CVE-2023-5678". Preserves label structure. **High priority for cybersecurity NER** — we can generate synthetic CVEs, IPs, hashes easily.
+2. **Synonym replacement (non-entity tokens only):** Replace context words with synonyms. Preserves entity spans. Helps model learn that entities are context-independent.
+3. **Random token dropout:** Randomly mask/drop non-entity tokens. Forces model to rely on entity-internal patterns rather than context.
+4. **Sentence cropping:** Take sub-spans of long sentences as new training examples. Effectively increases dataset size.
+5. **Back-translation:** Translate to another language and back. Paraphrases context while (ideally) preserving entities. Noisy but effective.
+6. **LLM-generated synthetic data:** Use an LLM to generate new sentences containing target entity types. **Already in our pipeline — this is our primary augmentation strategy.**
+**Priority for Arcspan:** Entity replacement is the highest-value technique we're not already doing. For cybersecurity entities (CVEs, IPs, domains, malware names), we can build simple gazetteers and do systematic replacement augmentation.
+---
+## 8. Batch Size and Gradient Accumulation
+### Key Findings
+**Current setup:** BS=4, grad_accum=2 → effective BS=8.
+**General principles:**
+- For fine-tuning transformers, **effective BS of 16-32 is the sweet spot** for most NER tasks
+- Larger BS → more stable gradients → can use higher LR (linear scaling rule: double BS → double LR)
+- But for small datasets, **smaller BS provides implicit regularization** through gradient noise
+- Diminishing returns above BS=32 for NER; some papers report degradation above BS=64
+**Recommendation:**
+- Try grad_accum=4 (effective BS=16) with proportionally higher LR
+- Try grad_accum=8 (effective BS=32) as upper bound
+- With 20K examples and BS=32: ~625 steps/epoch, ~3125 total steps for 5 epochs — still enough for meaningful LR scheduling
+**Important interaction:** Larger batch size + LR warmup work synergistically. Warmup is MORE important with larger batches because the proportionally scaled-up LR makes the first updates much more aggressive while the model is still far from a stable region.
+**The "linear scaling rule" (Goyal et al., 2017):** When multiplying batch size by k, multiply LR by k. So BS=8 at LR=5e-5 → BS=32 at LR=2e-4. But for fine-tuning (vs. pretraining), be more conservative — try BS=32 at LR=1e-4.
+---
+## Priority-Ranked Implementation Plan
+Based on expected impact vs. implementation effort:
+### Tier 1: Do Immediately (high impact, low effort)
+1. **LR warmup + cosine decay** — Expected to directly address our overfitting. 10% warmup, cosine to 0. Implementation: 2 lines of code change.
+2. **Checkpoint averaging** — Average top-3 or last-3 checkpoints. Zero training cost, +0.3-1.0 F1. Implementation: 10 lines of post-processing.
+### Tier 2: Do Next (high impact, moderate effort)
+3. **Weight decay param groups** — Exclude bias/LayerNorm from decay. Set decay=0.01-0.05. Implementation: refactor optimizer setup.
+4. **Increase effective batch size** — Try BS=16 and BS=32 with proportional LR scaling. Implementation: change config values.
+5. **Layer-wise LR decay** — Decay=0.9 per layer. Implementation: custom param groups.
+### Tier 3: Experiment After Basics (moderate impact, higher effort)
+6. **Entity replacement augmentation** — Build cybersecurity gazetteers, systematic replacement. Implementation: data pipeline changes.
+7. **Curriculum learning** — Score samples by model uncertainty, train easy→hard. Implementation: training loop refactor.
+### Tier 4: Later Investigation (uncertain impact, high effort)
+8. **Boundary smoothing** — NER-specific label smoothing. Needs custom loss + Viterbi interaction analysis.
+9. **Full SWA** — Cyclical LR + weight averaging in final training phase.
+---
+## Open Questions
+- Does our Viterbi decoding interact with LR scheduling in unexpected ways? (The transition constraints should be independent of training dynamics, but worth verifying.)
+- Our MoE routing: should expert selection layers have their own LR group? The routing weights are crucial — too-aggressive updates could destabilize expert assignment.
+- For checkpoint averaging: do we average the full model including MoE routing weights, or exclude them? Averaging routing weights from different checkpoints might create inconsistent expert assignments.
+- β₂=0.98 vs 0.999: worth a quick ablation.

research/notes/progress/2026-04-24-39-class-weighting-data-scaling-research.md ADDED Viewed

	@@ -0,0 +1,296 @@

+# Class Weighting, Data Scaling & Error Analysis for NER
+**Date:** 2026-04-24
+**Context:** Arcspan cybersecurity NER — severe class imbalance (Indicator ~3% F1, Vulnerability 0% in early rounds). Enriched dataset: ~32K examples, ~79K spans. Model: OpenAI Privacy Filter (50M active params, BIOES + Viterbi).
+---
+## 1. Class-Weighted Cross-Entropy for BIOES Token Classification
+### How to Compute Weights
+Two strategies, combinable:
+**A. Weight by tag frequency (inverse frequency):**
+- Count occurrences of each of the `1 + N*4` BIOES classes across training tokens
+- Weight_c = total_tokens / (num_classes * count_c), or simpler: Weight_c = 1 / freq_c, normalized
+- The O tag (often 85-95% of tokens) gets weight ~0.05-0.1; rare B-Vulnerability tags get weight ~5-20
+**B. Weight by entity type (group BIOES tags):**
+- All B/I/E/S tags for the same entity type share a base weight derived from entity-type frequency (the O tag belongs to no entity type and keeps its own weight)
+- Then optionally further scale B and S tags higher (they're critical for span detection)
+### Practical Implementation
+```python
+# Inverse-frequency weights
+from collections import Counter
+counts = Counter(all_training_tags)
+total = sum(counts.values())
+weights = {tag: total / (len(counts) * counts[tag]) for tag in counts}
+# Clamp to avoid extreme values
+weights = {t: min(w, 20.0) for t, w in weights.items()}
+```
+### Key Findings
+- Standard approach in NER literature; used in spaCy, HuggingFace token classifiers
+- **Risk**: Extreme weights destabilize training. Clamp at 10-20x max.
+- **Recommendation**: Start with sqrt(inverse frequency) — less aggressive than raw inverse frequency but still effective. Monitor whether model stops predicting O entirely (overcorrection).
+**Sources:**
+- https://datascience.stackexchange.com/questions/94021/imbalance-classes-in-named-entity-recognition
+- https://stats.stackexchange.com/questions/63635/named-entity-recognition-and-class-imbalance
+---
+## 2. Focal Loss for NER
+### Formula
+`FL(p_t) = -α_t * (1 - p_t)^γ * log(p_t)`
+### Does It Help with O-Tag Dominance?
+**Yes, directly.** O-tag tokens are "easy" examples (model quickly learns to predict them with high confidence). Focal loss down-weights these easy examples via the (1-p_t)^γ factor, automatically focusing gradient on hard tokens (entity boundaries, rare types).
+### Recommended Gamma Values
+| Source | Recommended γ | Context |
+|--------|--------------|---------|
+| Lin et al. (original paper) | **2.0** | Object detection (analogous imbalance) |
+| NLP practitioners | **0.5 - 1.5** | Text classification, moderate imbalance |
+| Extreme imbalance | **2.0 - 5.0** | When majority class >90% |
+**For our case** (O-tag likely 85-92%): Start with **γ=2.0**, sweep {1.0, 2.0, 3.0, 5.0}.
+### Focal vs. Class-Weighted CE
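+- Focal loss is **adaptive** — it re-weights each token on the fly according to the model's current confidence, so hard tokens get more gradient (no extra parameters are learned)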
+- Class-weighted CE is **static** — same weight regardless of model confidence
+- Can combine both: weighted focal loss (α per class + γ focusing)
+- **Recommendation**: Try focal loss alone first (γ=2), then weighted focal if needed
+### MoM (Majority or Minority) Learning — Novel Alternative
+Paper: "Majority or Minority: Data Imbalance Learning Method for NER" (arXiv 2401.11431, 2024):
+- Adds auxiliary loss computed only on O-class samples
+- Single hyperparameter λ instead of per-class weights
+- Outperformed focal loss and dice loss on CoNLL2003, OntoNotes5.0, KWDLC
+- **Worth investigating** for our use case
+**Sources:**
+- https://arxiv.org/html/2401.11431
+- https://stats.stackexchange.com/questions/567859/how-to-choose-gamma-parameter-in-focal-loss
+- https://aicompetence.org/tuning-gamma-in-focal-loss/
+---
+## 3. Dice Loss / F1-Oriented Loss
+### Key Insight
+Dice coefficient ≡ F1 score mathematically: `2TP / (2TP + FP + FN)`
+Dice loss = 1 - Dice coefficient. Directly optimizes the metric we care about.
+### Paper: "Dice Loss for Data-imbalanced NLP Tasks" (Li et al., 2020)
+- Proposed replacing CE with Dice loss for NER, text classification, MRC
+- Dice loss **attaches equal importance to FP and FN**, making it robust to class imbalance
+- On NER benchmarks: improved F1 for minority classes without hurting majority
+### Implementation for Token Classification
+- Compute per-class soft Dice over all tokens in a batch
+- Average across entity classes (optionally excluding O)
+- Self-adjusting Dice (DSC) variant adds a smoothing term
+### Tradeoffs
+| Aspect | Dice Loss | CE Loss |
+|--------|-----------|---------|
+| Class imbalance handling | Excellent (inherent) | Requires weighting |
+| Gradient quality | Can be noisy for rare classes | Stable |
+| Optimization landscape | Non-convex | Convex |
+| Common practice | Growing adoption | Standard default |
+### Recommendation
+**Use as secondary experiment.** Dice loss is promising but the non-convex optimization can cause training instability. Try: (1) CE baseline, (2) focal loss γ=2, (3) Dice loss, and compare.
+**Sources:**
+- https://www.semanticscholar.org/paper/Dice-Loss-for-Data-imbalanced-NLP-Tasks-Li-Sun/5487dadb5b4b8b240ab4ae28705acc0b9f138db0
+- https://www.researchgate.net/publication/343302480_Dice_Loss_for_Data-imbalanced_NLP_Tasks
+---
+## 4. Data Scaling — Expected Gains from 3-5x More Training Data
+### Current State
+- Using ~3K of 193K NVD records, ~1.7K of ~2K MITRE records
+- Total: ~32K examples, ~79K spans
+- Potential: 100K+ examples with more NVD sampling
+### Scaling Laws for NER
+NER follows **power-law scaling**: performance ∝ D^α where α is typically 0.1-0.3 for token classification tasks with pre-trained models.
+**What this means concretely:**
+- Going from 32K → 100K examples (3x) would yield roughly **3^0.2 ≈ 1.25x** improvement factor
+- If current F1 gap from ceiling is 20 points, expect ~5 point gain
+- Going from 32K → 160K (5x) → **5^0.2 ≈ 1.38x** → ~7-8 point gain
+### Key Findings from Literature
+1. **Pre-trained models reduce data hunger**: Since our model is fine-tuned (not trained from scratch), we're in the "transfer learning" regime where gains from more data are more modest
+2. **Diminishing returns are real**: Most gains come in the first 10-50K examples for pre-trained models
+3. **Per-entity scaling matters more than total**: Adding 10K more examples with zero Vulnerability mentions won't help Vulnerability F1. **Data scaling should target underrepresented entity types**
+4. **Quality > quantity past a threshold**: At 100K+ examples, annotation noise dominates gains
+### Recommendation
+- **High priority**: Scale Vulnerability and Indicator examples specifically (these are our failure modes)
+- **Medium priority**: Go from 3K → 15K NVD records (5x) — expect meaningful but not transformative gains
+- **Track learning curves**: Train on 10%, 25%, 50%, 100% of data and plot F1 per entity type to find actual diminishing returns point for OUR data
+**Sources:**
+- https://arxiv.org/abs/2001.08361 (Scaling Laws for Neural Language Models)
+- https://pmc.ncbi.nlm.nih.gov/articles/PMC11228526/ (Explaining Neural Scaling Laws)
+---
+## 5. Active Learning for NER
+### Does It Help?
+**Yes — up to 66% annotation savings** compared to random sampling (PMC4934373).
+### Best Strategy: Uncertainty Sampling
+1. Train initial model on small seed set
+2. Run inference on unlabeled pool
+3. Select examples where model is **least confident** (highest token-level entropy, especially on entity tokens)
+4. Annotate those, retrain, repeat
+### Specific Methods (ranked by effectiveness)
+1. **Entity Entropy**: Sum entropy only over tokens the model predicts as B-* tags. Best for targeting entity-specific uncertainty.
+2. **N-best Sequence Entropy**: Entropy over top-N label sequences (N=3 typical)
+3. **Least Confidence**: 1 - P(best sequence)
+4. **Margin Sampling**: Difference between top-2 predictions
+### For Our Use Case
+Since we have 193K unlabeled NVD records, of which ~190K are still unused:
+- Train initial model on current 32K
+- Score all 190K unused NVD records by uncertainty
+- Select top 5-10K with highest entity entropy
+- Generate spans for those (via our LLM pipeline)
+- This targets exactly the examples our model struggles with
+### Practical Concern
+Our annotation is LLM-based (not human), so "annotation cost" is API cost, not human time. Active learning still helps by **ensuring we generate training data for the hardest cases** rather than easy/redundant ones.
+**Sources:**
+- https://pmc.ncbi.nlm.nih.gov/articles/PMC4934373/
+- https://journals.sagepub.com/doi/10.3233/IDT-200048
+---
+## 6. Oversampling Rare Entity Types
+### Approaches
+1. **Simple duplication**: Repeat examples containing rare entities N times in training data
+2. **Sentence-level oversampling**: Duplicate entire sentences, not just the entity tokens
+3. **Context variation**: Use the rare entity in different synthetic contexts (LLM-generated)
+4. **Entity replacement augmentation**: Replace common entities with rare type entities in existing sentences
+### Risks
+- **Overfitting**: Duplicated examples reduce effective dataset diversity; model memorizes specific contexts
+- **Distribution shift**: Training distribution no longer matches real-world frequency
+- **Boundary artifacts**: If always seeing the same token patterns around rare entities, model learns spurious boundary cues
+### Better Alternatives (ranked)
+1. **Loss weighting** (focal/class-weighted) — addresses imbalance without altering data distribution
+2. **Targeted data generation** — generate NEW diverse examples for rare types via LLM, don't just duplicate
+3. **Moderate oversampling (2-3x)** combined with loss weighting — lower overfitting risk than aggressive oversampling
+4. **SMOTE-like approaches** — don't translate well to NER (token sequences aren't continuous vectors)
+### Recommendation
+**Don't oversample more than 3x.** Prefer generating genuinely new examples for rare types (we already have the LLM pipeline for this). Combine with focal loss γ=2 for remaining imbalance.
+---
+## 7. Error Analysis Methodology
+### Span-Level Error Taxonomy
+Standard NER error categories:
+| Error Type | Description | Example |
+|-----------|-------------|---------|
+| **Missing** | Entity in gold, not predicted | Failed to detect CVE-2024-1234 |
+| **Spurious** | Predicted entity not in gold | Marked "version" as Software |
+| **Type error** | Correct span, wrong label | Detected "log4j" but labeled as Indicator instead of Software |
+| **Boundary error (left)** | Span starts too early/late | "Apache Log4j" → only detected "Log4j" |
+| **Boundary error (right)** | Span ends too early/late | "CVE-2024-1234" → detected "CVE-2024" |
+| **Boundary + Type** | Both span and label wrong | Combined error |
+### Tools
+1. **seqeval** (Python): Standard NER evaluation, entity-level P/R/F1 per type. `pip install seqeval`
+2. **nervaluate** (Python): Supports partial match scoring (exact, partial, type, overlap). `pip install nervaluate`
+3. **Custom span-diff scripts**: Compare gold vs pred span lists, categorize each error
+4. **Confusion matrices**: Build entity-type confusion matrix (what gets confused with what)
+5. **Length-stratified analysis**: Group spans by token length (1, 2-3, 4+) and compute F1 per bucket
+### Recommended Error Analysis Pipeline for Arcspan
+```
+1. Run inference on eval set
+2. Extract (gold_spans, pred_spans) per example
+3. Align spans using IoU overlap
+4. Categorize each error: missing/spurious/type/boundary
+5. Aggregate by entity type → identify systematic patterns
+6. Stratify by span length, position in sentence, entity density
+7. Sample 50 worst errors per type for qualitative review
+```
+### Key Diagnostic Questions
+- Which entity types have high recall but low precision? (spurious predictions)
+- Which have low recall? (model can't detect them — need more data or better features)
+- Are boundary errors concentrated on multi-token spans? (common with BIOES)
+- Does the model confuse specific type pairs? (e.g., Indicator vs. Software)
+---
+## 8. Ensemble Methods for NER with Viterbi Decoding
+### Standard Approaches
+1. **Logit averaging**: Average pre-softmax logits from N models, then run single Viterbi decode
+2. **Probability averaging**: Average softmax probabilities, then Viterbi decode
+3. **Span-level voting**: Each model produces spans independently; take majority vote on span boundaries + types
+4. **Stacking**: Train a meta-model on concatenated features from N base models
+### Compatibility with Viterbi Decoding
+**Logit/probability averaging + Viterbi is the cleanest approach:**
+- Each model produces per-token logits over BIOES classes
+- Average the logits (or log-probs) across models
+- Run Viterbi on the averaged emission scores with the standard BIOES transition constraints
+- This preserves valid BIOES sequences while benefiting from ensemble diversity
+**Span-level voting is simpler but loses Viterbi benefits:**
+- Each model independently runs Viterbi → produces span list
+- Merge spans: keep spans where ≥K of N models agree (on type + boundary overlap ≥50%)
+- Risk: may produce fewer spans (conservative) or need tie-breaking rules
+### Expected Gains
+- Typical NER ensemble gains: **+0.5 to 2.0 F1 points** over best single model
+- Gains are larger when individual models have diverse errors (different seeds, different data subsets)
+- Diminishing returns beyond 3-5 models
+### Practical Considerations for Arcspan
+- Our model is small (50M active params) → training 3-5 variants is cheap
+- Diversity sources: different random seeds, different data subsets (bagging), different loss functions (CE vs focal vs dice)
+- **Recommendation**: Train 3 models (different seeds + CE/focal/dice), average logits before Viterbi
+- Inference cost: 3x for ensemble, but base model is already fast (50M active params)
+---
+## Summary: Priority-Ordered Action Plan
+| Priority | Action | Expected Impact | Effort |
+|----------|--------|----------------|--------|
+| **P0** | Implement focal loss (γ=2) | +3-5 F1 on rare types | Low |
+| **P0** | Class-weighted CE as baseline comparison | Baseline improvement | Low |
+| **P0** | Error analysis pipeline | Diagnostic (guides all other work) | Medium |
+| **P1** | Scale Vulnerability/Indicator training data specifically | +5-10 F1 on those types | Medium |
+| **P1** | Learning curve analysis (10/25/50/100% data) | Know where we are on scaling curve | Medium |
+| **P1** | Active learning for NVD selection | Better data efficiency | Medium |
+| **P2** | Dice loss experiment | May outperform focal for rare types | Low |
+| **P2** | MoM loss (arxiv 2401.11431) | Novel, shown to beat focal+dice | Medium |
+| **P2** | 3-model ensemble (logit averaging + Viterbi) | +1-2 F1 overall | Medium |
+| **P3** | Moderate oversampling (2-3x) of rare types | Small gains, risk of overfitting | Low |
+### Key Insight
+**Loss function changes (P0) are highest ROI**: they require minimal code changes and directly address our core problem. Data scaling (P1) is next — but should be targeted at underperforming entity types, not uniform scaling. Ensembles (P2) are a reliable but incremental improvement for after the fundamentals are solid.

research/notes/progress/2026-04-24-40-round4b-killed-no-checkpoint.md ADDED Viewed

	@@ -0,0 +1,29 @@

+# Round 4b (13-class) Training Results
+## Configuration
+- **Data**: 20,436 aggregated examples, 13-class (53 token labels)
+- **LR**: 5e-5, **Epochs**: 15, **BS**: 1, **Grad accum**: 8
+- **GPU**: RTX 5090 32GB
+## Epoch-by-Epoch Val Loss
+| Epoch | Train Loss | Val Loss | Val Acc |
+|-------|-----------|----------|---------|
+| 1 | 0.360 | 0.237 | 92.8% |
+| 2 | 0.237 | 0.196 | 94.1% |
+| 3 | 0.188 | 0.179 | 94.7% |
+| **4** | **0.155** | **0.178** ← best | **94.8%** |
+| 5 | 0.129 | 0.178 | 95.1% |
+| 6 | 0.109 | 0.183 ↑ | 95.3% |
+| 7 | 0.092 | 0.198 ↑↑ | 95.3% |
+Killed at epoch 7 due to clear overfitting (no val_loss improvement for 3 consecutive epochs after the epoch-4 best: flat at epoch 5, then rising at epochs 6 and 7).
+## Key Finding
+- **No checkpoint saved** — `opf train` saves best checkpoint at end of training, not per-epoch. Killing mid-run = no checkpoint.
+- Same overfitting pattern as R4a (5-class): best at epoch 3-4, then degradation
+- 13-class bottoms out at a higher val_loss (0.178 vs 0.126 for 5-class) — expected with 53 vs 21 token classes
+- **Lesson**: Need to let training finish naturally or implement per-epoch checkpoint saving
+## Impact
+- R4b eval results lost. R5b on enriched 32K data will supersede this anyway.
+- Confirms early stopping at patience=3 is correct strategy for R5.

research/notes/progress/2026-04-24-44-r5a-baseline-results.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# R5a Baseline Results — 5-class Enriched (Epoch 3 Best)
+## Training Summary
+- **Config**: flat LR=5e-5, CE loss, no warmup, no LLRD, BS=4 + grad_accum=2
+- **Best epoch**: 3 (val_loss=0.145), early stopping killed at epoch 6 (val_loss=0.160)
+- **Overfitting**: train_loss dropped to 0.032 by epoch 6, val_loss plateau at 0.145-0.147
+## Span F1 Results
+| Test Set | Span F1 | Span Precision | Span Recall |
+|----------|---------|----------------|-------------|
+| Enriched test (3853 ex) | **0.6315** | 0.7776 | 0.5316 |
+| CyNER test (748 ex) | **0.5153** | 0.8261 | 0.3744 |
+| SecureBERT2 test (200 ex) | **0.4654** | 0.8421 | 0.3216 |
+## Per-Class Span F1 (Enriched test)
+| Class | Precision | Recall | F1 |
+|-------|-----------|--------|----|
+| Vulnerability | 0.816 | 0.674 | 0.738 |
+| Malware | 0.817 | 0.561 | 0.665 |
+| Organization | 0.780 | 0.526 | 0.628 |
+| Indicator | 0.531 | 0.635 | 0.579 |
+| System | 0.768 | 0.364 | 0.494 |
+## Key Findings
+1. **Precision >> Recall everywhere** — model is conservative, under-predicts spans
+2. **Indicator collapses on CyNER/SB2** — 4-5% recall vs 64% on enriched test. Distribution mismatch.
+3. **System weakest class** — 36% recall, struggles with entity boundaries
+4. **Vulnerability strongest** — best F1 at 0.738, probably cleanest label boundaries
+5. **S- (single-token) tags severely underperformed** — S-Malware F1=0.26, S-Indicator F1=0.00
+## Gap Analysis (target: 80% span F1)
+Current: 63% → Need: 80% = **17 point gap**
+Levers to close it:
+- **Focal loss** (γ=2): Should boost recall by down-weighting easy O tokens
+- **Cosine LR + warmup**: Better convergence, less overfitting
+- **LLRD** (0.9): Preserve pretrained features in lower layers
+- **Viterbi tuning**: Free +1-3 F1 from transition bias calibration
+- **Checkpoint averaging**: Smooth out noise
+## Inference Speed
+- Enriched: 7,757 tok/s (longer sequences = better batching)
+- CyNER: 4,043 tok/s
+- SB2: 3,577 tok/s
+## Source
+- Checkpoint: `checkpoints/r5a_enriched_5class/epoch_3/`
+- Logs: `eval_r5a_enriched.log`, `eval_r5a_cyner.log`, `eval_r5a_sb2.log`

research/notes/progress/2026-04-24-45-data-quality-audit.md ADDED Viewed

	@@ -0,0 +1,175 @@

+# Data Quality Audit: enriched_5class Training Data
+**Date:** 2026-04-24
+**Context:** Model trained on this data has precision=0.78 but recall=0.53, suggesting systematic missed-entity problems in training annotations.
+---
+## 1. Dataset Overview
+| Split | Examples | Empty (no entities) | Entity char density |
+|-------|---------|-------------------|-------------------|
+| Train | 31,510 | 5,183 (16.4%) | 9.7% |
+| Enriched Test | 3,853 | 1,242 (32.2%) | 7.4% |
+| CyNER Test | 748 | 317 (42.4%) | 10.5% |
+**Train sources** (17 sources, highly heterogeneous):
+| Source | Examples | Entity density |
+|--------|---------|---------------|
+| cyberner_stix_train | 7,922 | 9.1% |
+| cyner2_train | 4,952 | 7.0% |
+| **exploitdb** | **4,344** | **78.6%** |
+| dnrti_train | 3,187 | 5.2% |
+| nvd_v2 | 2,661 | 2.8% |
+| apt_reports | 2,295 | 6.7% |
+| **synthetic_v2** | **2,000** | **36.3%** |
+| cyner_train | 1,893 | 4.2% |
+| mitre_attack_v2 | 1,544 | 3.5% |
+| synthetic_ioc | 100 | 40.7% |
+| + 7 smaller sources | 512 | varies |
+Duplicates: only 2 exact-duplicate texts found, so exact duplication is not a major issue.
+---
+## 2. CRITICAL: Massive Unlabeled Indicators (Root Cause of Low Recall)
+**Finding:** In the first 5,000 training examples alone, **367 obvious IP addresses and hashes appear in the text but are NOT labeled as Indicator entities.** Extrapolating to the full 31.5K examples, there are likely **2,000+ unlabeled IOCs** in the training data.
+**Examples of missed labels:**
+- `"MD5 : 69b4b32e4636f1981841cbbe3b927560"` — hash appears in text immediately after "MD5 :" but is not labeled (cyner_train)
+- `"b45defca452a640b303288131eb64c485f442aae0682a3c56489d24d59439b47 d96017..."` — a line full of hashes, none labeled (cyner_train)
+- `"Indicators of Compromise SHA256 Package App label 332e68d8..."` — IOC listing section where multiple hashes go unlabeled
+**Impact:** This is the single biggest driver of low recall. The model learns from these examples that hashes/IPs in text should be tagged `O`, directly suppressing Indicator detection. This creates a **false-negative training signal** that explains both:
+- Low overall recall (0.53)
+- Catastrophic CyNER Indicator recall (4%)
+---
+## 3. S-tag (Single-Token Entity) Analysis
+Train span counts and single-token fractions:
+| Class | Total spans | Single-token | % Single |
+|-------|-----------|-------------|---------|
+| Indicator | 17,964 | 17,621 | **98.1%** |
+| Malware | 18,700 | 15,243 | 81.5% |
+| Organization | 16,892 | 10,894 | 64.5% |
+| System | 16,444 | 7,946 | 48.3% |
+| Vulnerability | 11,854 | 4,738 | 40.0% |
+**S-tag representation is actually abundant in training.** The problem is NOT insufficient S-tag examples. Rather:
+- **S-Indicator F1=0.00** is caused by the massive missed-entity problem (Section 2) — the model learns to suppress Indicator detection.
+- **S-Malware F1=0.26** is not explained by a lack of examples either: 81.5% of Malware spans are single-token, which is very high. The issue is more likely label noise (see Section 7), with boundary confusion as a possible secondary factor.
+---
+## 4. Indicator Distribution Mismatch (Enriched vs CyNER)
+### Type distribution comparison:
+| Indicator type | Train | CyNER Test | Enriched Test |
+|---------------|-------|-----------|--------------|
+| Domain | 6,356 (35%) | 139 (53%) | similar |
+| IP/partial-IP | 4,507 (25%) | 7 (3%) | similar |
+| Hash | 3,692 (21%) | 23 (9%) | similar |
+| URL | 1,907 (11%) | 0 (0%) | similar |
+| Filename | 590 (3%) | 23 (9%) | similar |
+| Filepath | 43 (<1%) | 17 (7%) | minimal |
+| Other | 819 (5%) | 51 (20%) | minimal |
+**Key mismatches:**
+1. **CyNER is domain-heavy (53%)** while train has more IPs and hashes. But more critically, CyNER's "domains" are often **defanged** (`uyghurapps [ .`, `negg.ddns [ .`) while training data mostly contains clean indicators.
+2. **CyNER has many "other" indicators (20%)** including registry paths (`Software\Microsoft\Windows\CurrentVersion\Run`), package names (`jp.naver.line.android`, `com.whatsapp`), and tool names (`DroidVPN`). These types are rare in training.
+3. **CyNER uses defanged notation** (`hxxp`, `[ .`, `[.]`) in 38/748 examples (5%). Training has defanged indicators at a similar rate — 1,588/31,510 examples (5%) — but even then they're handled inconsistently.
+**This explains the 64% enriched Indicator recall vs 4% CyNER Indicator recall:** the enriched test set was drawn from the same distribution as training (clean IOCs, standard formats), while CyNER contains defanged indicators, partial IPs (`222.139.212 [ .`), and unconventional indicator types that the model never learned to recognize.
+---
+## 5. ExploitDB Source: Degenerate Data Distribution
+The `exploitdb` source (4,344 examples = 13.8% of training) has **78.6% entity density** — nearly the entire text is entities. These examples are exploit titles:
+```
+TEXT: "Android - ashmem Readonly Bypasses via remap_file_pages() and ASHMEM_UNPIN"
+SPANS: System: "Android", Vulnerability: "ashmem Readonly Bypasses via remap_file_pages()..."
+TEXT: "FLEX 1080 < 1085 Web 1.6.0 - Denial of Service"
+SPANS: System: "FLEX 1080 < 1085 Web 1.6.0", Vulnerability: "Denial of Service"
+```
+**Problems:**
+1. **Not real sentences** — these are structured titles with a `System - Vulnerability` pattern. The model learns a degenerate heuristic: "everything before the dash is System, everything after is Vulnerability."
+2. **Inflates System and Vulnerability counts** with formulaic patterns, hurting generalization to natural text.
+3. **System spans include version numbers** (e.g., "FLEX 1080 < 1085 Web 1.6.0") which creates artificially long, oddly-bounded System entities.
+### Synthetic data (synthetic_v2, synthetic_ioc): 2,100 examples, 36-41% density
+Similarly templated:
+```
+"FireEye published a threat intelligence report linking Velvet Tempest to a new
+campaign exploiting CVE-2021-10425 in Apache Struts."
+```
+All follow identical sentence structure. While less degenerate than ExploitDB, these teach the model template-matching rather than genuine entity recognition.
+---
+## 6. Span Length Distribution Issues
+**Vulnerability spans are frequently multi-token, with a long tail:**
+- Train: mean 2.1 tokens, but many 3-token spans (2,756 instances). Common pattern: `"Denial of Service"`, `"Remote Code Execution"`, `"SQL Injection"`.
+- ExploitDB contributes very long vulnerability descriptions as single spans: `"ashmem Readonly Bypasses via remap_file_pages() and ASHMEM_UNPIN"` (8+ tokens).
+**Indicator spans are almost always single-token (98.1%):**
+This makes sense — IPs, hashes, and URLs are typically single whitespace-delimited tokens. But it means multi-token Indicator spans in CyNER (like `"% APPDATA % /myupd/gen/ % Y % m..."` at 55 tokens!) are completely out-of-distribution.
+---
+## 7. Label Consistency Issues (Spot-Check)
+Examining specific examples:
+1. **"Apple" labeled inconsistently:** In line 21 it's `System: Apple` ("Apple devices"), in line 22 it's `Organization: Apple` ("Apple phishing site"). Both are arguably correct in context, but this teaches the model conflicting signals for the same surface form.
+2. **File extensions labeled as Indicator:** `.dll`, `.exe`, `.NET`, `.PDF` etc. appear as Indicator spans (67 total). These are not IOCs — they're generic file types. This adds noise to the Indicator class.
+3. **Generic terms as Indicator:** `".NET"` appears as both Indicator and as part of malware names. The class is polluted with non-IOC content.
+4. **Vulnerability spans too vague:** `"security vulnerabilities"` (CyNER test line 9) is labeled as Vulnerability — this is a generic phrase, not a specific vulnerability.
+---
+## 8. Summary of Root Causes for Low Recall
+Ranked by estimated impact:
+| # | Issue | Impact | Affected classes |
+|---|-------|--------|-----------------|
+| 1 | **~2,000+ unlabeled IOCs in training** | **CRITICAL** — teaches model to suppress Indicator detection | Indicator (primary), overall recall |
+| 2 | **Indicator distribution mismatch** — defanged notation, package names, registry paths rare or inconsistently labeled in training | **HIGH** — explains 64%→4% CyNER Indicator recall drop | Indicator |
+| 3 | **ExploitDB degenerate patterns** — 4,344 title-format examples with 78.6% entity density | **MEDIUM** — inflates System/Vuln counts with non-generalizable patterns | System, Vulnerability |
+| 4 | **Synthetic template monotony** — 2,100 examples from identical sentence templates | **MEDIUM** — model memorizes template rather than entity features | All classes |
+| 5 | **Label inconsistency** — same surface form gets different labels (Apple=System vs Organization) | **LOW-MEDIUM** — creates conflicting gradients | Organization, System |
+| 6 | **Indicator class pollution** — file extensions, protocol names labeled as Indicator | **LOW** — 76 noisy instances out of 17,964 | Indicator |
+---
+## 9. Recommendations
+1. **Fix unlabeled IOCs** — Run regex-based annotation over training data to label all IPs, hashes, URLs, and domains that currently have no span annotation. This is the highest-ROI fix.
+2. **Add defanged indicator examples** — Either add CyNER-style defanged IOCs to training, or normalize both train and test to the same format.
+3. **Downweight or remove ExploitDB** — These 4,344 title-format examples are hurting generalization. Either remove them or downweight by 5-10x during training.
+4. **Diversify synthetic templates** — The 2,000 synthetic_v2 examples use identical structure. Rewrite with varied sentence patterns.
+5. **Add CyNER-style indicator types to training** — Package names, registry paths, and file paths are underrepresented. Add examples from CyNER train or synthesize them.
+6. **Clean Indicator class** — Remove bare file extensions (`.dll`, `.exe`) from Indicator labels. These aren't IOCs.

research/notes/progress/2026-04-24-46-competitor-deep-dive.md ADDED Viewed

	@@ -0,0 +1,261 @@

+# Competitor Deep-Dive: Cybersecurity NER Models
+**Date:** 2026-04-24
+**Context:** Our Arcspan baseline gets 63% span F1 on cybersecurity NER. Understanding exactly what the top models did to reach their numbers.
+---
+## 1. SecureBERT 2.0 (Cisco AI) — F1: 94.5%
+**Paper:** Aghaei et al., "SecureBERT 2.0: Advanced Language Model for Cybersecurity Intelligence" (arXiv:2510.00240, Sep 2025)
+**HuggingFace:** [cisco-ai/SecureBERT2.0-NER](https://huggingface.co/cisco-ai/SecureBERT2.0-NER)
+**GitHub:** [cisco-ai-defense/securebert2](https://github.com/cisco-ai-defense/securebert2)
+### Architecture
+- **Base model:** ModernBERT (custom pretrained) — 22 layers, d_model=768, 12 heads
+- **Max seq length:** 8192 (fine-tuned on 1024)
+- **Vocab:** 50,368
+### Training Recipe
+| Setting | Value |
+|---------|-------|
+| Optimizer | AdamW |
+| LR | **1e-5** |
+| Scheduler | Linear |
+| Weight Decay | 0.001 |
+| Batch Size | 8 per GPU (8x A100) |
+| Epochs | **20** |
+| Max Seq Len | 1024 |
+| Grad Clip | 1.0 |
+| Precision | fp16 |
+| Loss | **Token-wise Cross Entropy** (no CRF, no class weights mentioned) |
+### Data Pipeline
+- **Train:** 3,400 samples / **Test:** 717 samples
+- **Source:** Manually annotated threat intelligence documents by domain experts
+- **Preprocessing:** Subword tokenization with label alignment to subword tokens
+- No augmentation mentioned
+### Label Space
+- **5 entity types:** Malware, Indicator, System, Organization, Vulnerability
+- **BIO scheme** (not BIOES): B-X, I-X, O → 11 classes total
+- Same label space as the CyNER dataset
+### Special Techniques
+- **None documented** — no CRF, no class weighting, no focal loss, no ensemble
+- The key weapon is **domain-adaptive pretraining**: SecureBERT 2.0 base was pretrained on 13.6B cybersecurity tokens (13x more than v1)
+### Per-Class Results
+- **Not published per-class** — only aggregate F1=0.945, R=0.965, P=0.927
+### What They Say Matters
+> "Domain-adaptive pretraining and fine-tuning on cybersecurity corpora dramatically improves NER performance."
+**Key insight for us:** The jump from SecureBERT v1 (73.4%) to v2 (94.5%) came primarily from **massive domain pretraining** (13.6B tokens), NOT from NER-specific tricks. The NER fine-tuning recipe is completely vanilla.
+---
+## 2. SecureBERT v1 (Original) — F1: 73.4%
+**Paper:** Aghaei et al., "SecureBERT: A Domain-Specific Language Model for Cybersecurity" (SecureComm 2022, Springer)
+**HuggingFace:** [ehsanaghaei/SecureBERT](https://huggingface.co/ehsanaghaei/SecureBERT)
+### Architecture
+- **Base model:** RoBERTa-base, further pretrained on cybersecurity corpus via MLM
+- Same size as RoBERTa-base (~125M params)
+### Training Recipe (NER fine-tuning)
+- Details not fully published in model card; paper behind Springer paywall
+- Same CyNER dataset and label space as v2
+- Standard token classification head
+### What We Know
+- Pretrained on a cybersecurity corpus (much smaller than v2's 13.6B tokens)
+- NER F1 of 73.4% on same eval set as v2
+- The 21-point gap to v2 is almost entirely attributable to the improved pretraining
+---
+## 3. SecureModernBERT-NER (attack-vector) — F1: 84.8%
+**HuggingFace:** [attack-vector/SecureModernBERT-NER](https://huggingface.co/attack-vector/SecureModernBERT-NER)
+**No paper** — community model, documented only on HF model card.
+### Architecture
+- **Base model:** answerdotai/ModernBERT-large (NOT domain-pretrained, just the general ModernBERT)
+- Standard token classification head
+### Training Recipe
+| Setting | Value |
+|---------|-------|
+| Optimizer | AdamW (torch) |
+| LR | **5e-5** |
+| Scheduler | **Cosine** |
+| Batch Size | **128** |
+| Epochs | **5** |
+| Max Seq Len | **128** |
+| Precision | fp16 |
+| Grad Accum | 1 |
+| Hardware | Single L40S |
+### Data Pipeline
+- **502,726 labeled spans** — one of the largest CTI NER datasets
+- Sources: real-world threat reports, vulnerability advisories, incident analyses
+- Manually curated + automated heuristic conflict resolution
+- Span distribution: ORG ~198k, PRODUCT ~79k, MALWARE ~67k, PLATFORM ~57k, THREAT-ACTOR ~49k, CVE ~41k
+### Label Space
+- **22 entity types** — much broader than SecureBERT's 5
+- **BIO scheme**
+- Types: URL, ORG, SERVICE, SECTOR, FILEPATH, DOMAIN, PLATFORM, THREAT-ACTOR, PRODUCT, MALWARE, LOC, CVE, TOOL, IPV4, MITRE-TACTIC, MD5, CAMPAIGN, SHA1, SHA256, EMAIL, IPV6, REGISTRY-KEYS
+- This is 45 classes (22 × 2 BIO tags + O)
+### Per-Class Results (accuracy, not F1)
+| Entity | Accuracy |
+|--------|----------|
+| CVE | 0.9995 |
+| SHA256 | 0.9874 |
+| URL | 0.9801 |
+| IPV4 | 0.9631 |
+| (others not published individually) | — |
+- Overall: P=0.8468, R=0.8484, **F1=0.8476**
+- Macro accuracy across all 22 types: 0.8776
+### Special Techniques
+- **None** — completely vanilla token classification
+- No CRF, no class weighting, no augmentation documented
+### What They Say Matters
+> "combining the state-of-the-art architecture of ModernBERT with one of the largest and most diverse CTI-labelled NER corpora ever built"
+**Key insight for us:** This model wins through **data scale** (500k+ spans) not architecture tricks. ModernBERT-large is general-purpose, not domain-pretrained. The 84.8% F1 with 22 classes is impressive but note: structured indicators (CVE, SHA256, URL, IPV4) are easy — regex could catch those. The harder classes (ORG, THREAT-ACTOR, TOOL) likely drag the average down.
+---
+## 4. CyNER (Alam et al., 2022) — F1: 76.7%
+**Paper:** "CyNER: A Python Library for Cybersecurity Named Entity Recognition" (arXiv:2204.05754)
+**GitHub:** [aiforsec/CyNER](https://github.com/aiforsec/CyNER)
+### Architecture
+- **Best model:** XLM-RoBERTa-large (~560M params)
+- Standard token classification head
+- Also integrates heuristic (regex) extractors and Flair/spaCy NER
+### Training Recipe
+- Standard Hugging Face token classification fine-tuning
+- Specific LR/epochs not documented in detail in the paper
+### Data Pipeline
+- **~60 threat intelligence reports** from MITRE ATT&CK (Android malware focus)
+- **106,000+ tokens**, **4,530 tagged entities**
+- Annotated using BRAT tool by cybersecurity-trained annotators
+- Very small dataset by modern standards
+### Label Space
+- **5 entity types:** Malware, Indicator, System, Organization, Vulnerability
+- **BIO scheme** (B-I-O tagging)
+- 11 classes total (same as SecureBERT's eval set — this IS the canonical benchmark)
+### Per-Class Results (XLM-RoBERTa-large)
+| Entity | Precision | Recall | F1 |
+|--------|-----------|--------|-----|
+| **Malware** | 79.82% | 75.11% | **77.39%** |
+| **Indicator** | 78.34% | 86.62% | **82.27%** |
+| **System** | 70.36% | 79.93% | **74.84%** |
+| **Organization** | 70.64% | 60.16% | **64.98%** |
+| **Vulnerability** | 100.0% | 80.0% | **88.89%** |
+| **Overall** | — | — | **76.66%** |
+### Special Techniques
+- Multi-approach fusion: transformer + regex heuristics + generic NER (Flair/spaCy)
+- Configurable priority between approaches
+### Hardest Classes
+1. **Organization** (64.98% F1) — low recall (60.16%), ambiguous entities
+2. **System** (74.84% F1) — broad category, confusable with products
+3. **Malware** (77.39% F1) — novel names hard to catch
+### What They Say Matters
+- Strong general-purpose pretraining transfers well even without domain-specific pretraining (XLM-RoBERTa wasn't cybersec-specific but still won)
+- Combining transformer + heuristic extractors for IOCs
+- Small dataset is the main bottleneck
+---
+## 5. CyberNER (Harmonized STIX Corpus, 2025) — Reference Dataset
+**Paper:** arXiv:2510.26499 — "CyberNER: A Harmonized STIX Corpus for Cybersecurity Named Entity Recognition"
+**GitHub:** [yasirech-chammakhy/CyberNER](https://github.com/yasirech-chammakhy/CyberNER)
+- Unifies CyNER + DNRTI + APTNER + Attacker datasets onto STIX 2.1 standard
+- ~610k tokens, 23,477 sentences, 21 entity types
+- Best model: RoBERTa → F1=0.736
+- ~30% relative improvement over naive dataset concatenation
+- Important as a future benchmark to consider
+---
+## Comparative Summary
+| Model | F1 | Params | Pretraining | NER Data | Labels | Scheme | Special |
+|-------|-----|--------|-------------|----------|--------|--------|---------|
+| SecureBERT 2.0 | **94.5%** | ~150M (ModernBERT-base: 22 layers, d_model=768) | 13.6B cybersec tokens | 3.4k samples | 5 types | BIO | None — pure domain pretraining |
+| SecureModernBERT-NER | **84.8%** | ~395M (ModernBERT-large) | General | 502k spans | 22 types | BIO | None — data scale |
+| CyNER | **76.7%** | ~560M (XLM-R-large) | General multilingual | 4.5k entities | 5 types | BIO | Regex + transformer fusion |
+| SecureBERT v1 | **73.4%** | ~125M (RoBERTa-base) | Cybersec MLM | 3.4k samples | 5 types | BIO | None |
+| **Arcspan (ours)** | **63%** | **50M active** (MoE) | General | ? | ? | **BIOES** | Viterbi decoding |
+---
+## Key Takeaways for Closing Our Gap
+### 1. Domain pretraining is the #1 lever (SecureBERT v1→v2: +21 points)
+We can't do this — we have 50M active params and a fixed pretrained model. This means we need to compensate with other levers.
+### 2. Data scale is the #2 lever (SecureModernBERT: 500k spans → 84.8%)
+Our most actionable lever. SecureBERT 2.0 achieved 94.5% on just 3.4k samples, but it had massive domain pretraining. Without that, SecureModernBERT needed 500k spans for 84.8%. We likely need **significantly more training data**.
+### 3. Nobody uses CRF or class weighting
+All competitors use vanilla cross-entropy + BIO + standard token classification head. No CRF layers, no focal loss, no class weighting, no curriculum learning. This is notable — the field hasn't found these necessary.
+### 4. All competitors use BIO, we use BIOES
+Our BIOES scheme with Viterbi decoding is architecturally different. BIOES is theoretically richer (encodes span boundaries better), but all competitors do fine with BIO. Our Viterbi constraint should be an advantage if the model learns the patterns correctly.
+### 5. Organization is the hardest class where per-class results exist
+CyNER: Organization=65% F1 vs Vulnerability=89% F1, a 24-point gap. CyNER is the only competitor above that publishes per-class F1, so this pattern can't be confirmed across models. If our per-class breakdown shows Organization dragging us down, that's expected.
+### 6. Small base models can do well with enough signal
+SecureBERT v1 at ~125M params got 73.4%. Our model at 50M active params getting 63% is not catastrophically behind — we're in the right ballpark for a model of our size without domain pretraining. Closing the ~10-point gap to SecureBERT v1 (73.4%), and then the ~14-point gap to CyNER (76.7%), seems very achievable.
+### 7. Immediate action items
+- **More training data** — consider CyberNER harmonized corpus (610k tokens), DNRTI, APTNER datasets
+- **Longer training** — SecureBERT 2.0 used 20 epochs; are we training long enough?
+- **LR tuning** — competitors use 1e-5 to 5e-5; our default is 2e-4 which may be too high
+- **Sequence length** — SecureBERT 2.0 uses 1024; SecureModernBERT uses 128. What are we using?
+- **Evaluate per-class** — identify our weakest categories to target
+---
+## Open Questions
+1. What is our current training data size? How does it compare to the 3.4k samples SecureBERT used?
+2. Are we using the CyNER dataset specifically? If so, same train/test split?
+3. What is our learning rate? The 2e-4 default may be too aggressive for NER fine-tuning.
+4. Per-class breakdown of our 63% F1 — which classes are we failing on?
+5. Should we try BIO instead of BIOES to match the evaluation setup of competitors?
+6. Can we incorporate the CyberNER harmonized corpus for more training data?
+---
+## Sources
+- SecureBERT 2.0 paper: https://arxiv.org/abs/2510.00240
+- SecureBERT 2.0 NER model: https://huggingface.co/cisco-ai/SecureBERT2.0-NER
+- SecureBERT v1: https://huggingface.co/ehsanaghaei/SecureBERT
+- SecureModernBERT-NER: https://huggingface.co/attack-vector/SecureModernBERT-NER
+- CyNER paper: https://arxiv.org/abs/2204.05754
+- CyNER repo: https://github.com/aiforsec/CyNER
+- CyberNER corpus: https://arxiv.org/html/2510.26499v1
+- SecureBERT 2.0 GitHub: https://github.com/cisco-ai-defense/securebert2

research/notes/progress/2026-04-24-49-moe-finetuning-research.md ADDED Viewed

	@@ -0,0 +1,189 @@

+# MoE Fine-Tuning Best Practices for Arcspan
+**Date:** 2026-04-24
+**Context:** We're fine-tuning OpenAI's Privacy Filter (1.5B params, 50M active, 128 experts top-4, 8 layers, d_model=640) for cybersecurity NER. This note collects best practices for fine-tuning sparse MoE models on downstream tasks.
+---
+## 1. Load Balancing & Auxiliary Loss
+**The core problem:** Without intervention, routers develop a positive feedback loop — a few "favorite" experts receive most tokens, others starve of gradient updates (**expert collapse**).
+### Switch Transformer Auxiliary Loss (standard approach)
+```
+L_balance = N × Σ(fᵢ × Pᵢ)
+L_total = L_task + α × L_balance
+```
+- `fᵢ` = fraction of tokens routed to expert i (hard assignment)
+- `Pᵢ` = mean router probability for expert i (soft probability)
+- **α typically 0.01** — highly sensitive, task-dependent
+- Too small → expert collapse; too large → hurts task performance
+**Key finding:** Keep auxiliary loss enabled during fine-tuning, even when freezing expert parameters. It prevents overfitting and improves performance.
+### DeepSeek V3 Auxiliary-Loss-Free Alternative
+- Apply expert-wise bias to routing scores before top-K selection
+- Dynamically update bias based on recent load (outside backprop)
+- Use biased scores for selection, unbiased scores for gate weights
+- Gradients flow through unbiased weights → task loss uncontaminated
+### Router Z-Loss (ST-MoE)
+```
+L_z = mean(logsumexp(router_logits)²)
+```
+Penalizes large router logits, prevents numerical instability and training spikes. Recommended for stability, especially with FP16/mixed precision.
+**Source:** [How MoE Models Actually Learn](https://medium.com/@chris.p.hughes10/how-moe-models-actually-learn-a-guide-to-auxiliary-losses-and-expert-balancing-293084e3f600), [HuggingFace MoE Blog](https://huggingface.co/blog/moe)
+---
+## 2. Expert Freezing Strategies
+### Expert-Specialized Fine-Tuning (ESFT)
+Paper: ["Let the Expert Stick to His Last"](https://arxiv.org/html/2407.01906v1) — Expert-Specialized Fine-Tuning for Sparse Architectural LLMs.
+- Different tasks activate different experts at different magnitudes
+- Experts do specialize meaningfully across tasks
+- This specialization transfers to unseen tasks (zero-shot generalization)
+- Strategy: identify which experts are most relevant to your task, freeze the rest
+### Practical guidance for our model
+With 128 experts and top-4 routing, most experts will be underutilized for cybersecurity NER. Options:
+1. **Full fine-tune all experts** — simplest, works well if data is sufficient
+2. **Freeze experts + fine-tune router + head** — prevents catastrophic forgetting, good for small datasets
+3. **Identify active experts on cyber NER data → fine-tune only those** — best of both worlds but requires analysis pass first
+**Keep auxiliary loss on even when freezing experts** — it regularizes and improves downstream performance.
+---
+## 3. Learning Rate for MoE vs Dense
+- **MoE models tolerate higher learning rates** than dense counterparts (e.g., 3e-4 vs 1e-5 typical for dense)
+- The Privacy Filter's existing `opf train` uses **lr=2e-4 with AdamW** — this is in the right ballpark
+- **Batch size is critical:** Smaller batch sizes (32-64) are important. Larger batches (256+) cause experts to collapse during fine-tuning
+- MoE models benefit more from instruction tuning / multi-task fine-tuning than dense models
+### Recommendation for Arcspan
+- Start with lr=2e-4 (existing default), sweep 1e-4 to 5e-4
+- Use batch size 32-64, not larger
+- Warmup + cosine decay schedule
+---
+## 4. Router Behavior During Fine-Tuning
+- Router patterns **do change** during fine-tuning — experts re-specialize for the new task
+- On small datasets, routing can become inconsistent/unstable
+- **Option: freeze routing during fine-tuning** to prevent inconsistent routing on small datasets, while keeping load balancing loss as regularization
+- With sufficient data, allowing router to adapt improves performance
+### Representation Collapse
+Beyond routing collapse, hidden representations can cluster around expert centroids. Mitigations:
+- Dimension reduction before routing
+- L2 normalization of token representations and expert embeddings
+- Learnable temperature parameter τ in gating function
+**Source:** [On the Representation Collapse of Sparse MoE](https://arxiv.org/abs/2204.09179)
+---
+## 5. MoE-Specific Regularization
+| Technique | Purpose | Recommendation |
+|---|---|---|
+| Auxiliary load balancing loss (α=0.01) | Prevent expert collapse | Always use during fine-tuning |
+| Router z-loss | Prevent logit explosion / training instability | Use if training in mixed precision |
+| Small batch size (32-64) | Prevent expert collapse | Critical for fine-tuning |
+| Expert dropout | Regularization | Not widely adopted; try if overfitting |
+| Router temperature | Control expert activation sharpness | Learnable τ can help |
+---
+## 6. Full Fine-Tune vs LoRA/Adapter for MoE
+### LoRA for MoE — Key Findings
+Source: [LoRA Without Regret](https://thinkingmachines.ai/blog/lora/), [LoRA vs Full Fine-tuning: An Illusion of Equivalence](https://arxiv.org/abs/2410.21228)
+**When LoRA matches full fine-tuning:**
+- LoRA applied to **all layers** (MLP/MoE + attention), not just attention
+- Dataset size roughly matches number of trainable LoRA parameters (1.1:1 to 1.5:1 ratio)
+- Learning rate **10× higher** than full fine-tuning equivalent
+- Separate LoRA adapters per expert, rank scaled by number of active experts
+**LoRA advantages:**
+- ~10× fewer accelerators needed
+- ~2/3 FLOPs per pass
+- Multiple adapters can serve simultaneously
+**LoRA caveats:**
+- Accesses different parts of solution space than full fine-tuning ("intruder dimensions" in SVD)
+- Different generalization behavior out-of-distribution
+- Less tolerant of large batch sizes
+### Recommendation for Arcspan (50M active params)
+**Full fine-tuning is the right default.** Reasons:
+- 50M active parameters is tiny — full fine-tuning is cheap
+- The existing `opf train` already does full fine-tuning with AdamW
+- No memory pressure justifying LoRA's complexity
+- Token classification benefits from full adaptation of all expert representations
+LoRA would only make sense if we need multi-domain adapters (e.g., cyber NER + medical NER switching at inference).
+---
+## 7. MoE for Token Classification / NER
+No dedicated papers found on MoE architectures specifically for NER/token classification. This is notable — it means:
+1. **The Privacy Filter is unusually positioned** — a production MoE model doing token classification is rare
+2. Most MoE research focuses on generative LLMs (Mixtral, Switch Transformer, GLaM, DeepSeek)
+3. The closest analogue is BERT-style token classifiers, but those are dense
+**This is actually good for us** — less competition, the architecture is proven (OpenAI shipped it in production for PII), and the fine-tuning path is already validated by the v2→v7 label space expansion (8→24 categories).
+---
+## Summary: Practical Checklist for Arcspan Fine-Tuning
+1. **Keep auxiliary load balancing loss** (α=0.01) during fine-tuning
+2. **Add router z-loss** if not already present (check Privacy Filter code)
+3. **Batch size 32-64** — do not go larger
+4. **Learning rate 2e-4** (existing default is good), sweep 1e-4 to 5e-4
+5. **Full fine-tuning** (not LoRA) — model is small enough
+6. **Monitor expert utilization** during training — log fraction of tokens per expert
+7. **Consider freezing router** if dataset is small (<5K examples), let it adapt if dataset is large
+8. **Warm-start output head** from existing PII weights where label semantics overlap (the codebase already supports this)
+## Open Questions
+- Does the Privacy Filter codebase already include auxiliary load balancing loss? Need to check `vendor/privacy-filter/` code.
+- What is the current routing distribution on PII data? Would be useful to visualize before fine-tuning.
+- How many experts are actually active for typical text? If only 20-30 of 128 see meaningful traffic, freezing the rest is viable.
+- Should we add expert utilization logging to the training loop?
+---
+## Key References
+- [How MoE Models Actually Learn: Auxiliary Losses and Expert Balancing](https://medium.com/@chris.p.hughes10/how-moe-models-actually-learn-a-guide-to-auxiliary-losses-and-expert-balancing-293084e3f600)
+- [HuggingFace MoE Explained](https://huggingface.co/blog/moe)
+- [Auxiliary-Loss-Free Load Balancing (DeepSeek)](https://arxiv.org/abs/2408.15664)
+- [On the Representation Collapse of Sparse MoE](https://arxiv.org/abs/2204.09179)
+- [Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning](https://arxiv.org/html/2407.01906v1)
+- [LoRA Without Regret](https://thinkingmachines.ai/blog/lora/)
+- [LoRA vs Full Fine-tuning: An Illusion of Equivalence](https://arxiv.org/abs/2410.21228)
+- [ICLR 2024: MoE Parameter-Efficient Fine-Tuning](https://proceedings.iclr.cc/paper_files/paper/2024/file/6d00071564ec447466fc4577743cf1b3-Paper-Conference.pdf)
+- [MoE Load Balance Review](https://huggingface.co/blog/NormalUhr/moe-balance)

research/notes/progress/2026-04-24-50-r7-data-pipeline-plan.md ADDED Viewed

	@@ -0,0 +1,262 @@

+# R7 Data Pipeline Plan — Dataset Acquisition & Mapping
+**Date:** 2026-04-24
+**Status:** Research complete, ready for implementation
+---
+## Current Baseline
+| Metric | Value |
+|---|---|
+| Training sentences | 17,954 (5-class) |
+| Total sentences (train+test+valid) | 25,468 |
+| Total entities | 42,159 |
+| Entity breakdown | Malware: 16,122 · Organization: 15,236 · System: 4,647 · Vulnerability: 3,287 · Indicator: 2,867 |
+| Existing sources | cyberner_stix (7,922), cyner2 (4,952), dnrti (3,187), cyner (1,893) |
+Target: Competitors with 500K+ spans achieve 84.8% F1. We need substantially more data.
+---
+## Dataset Availability Assessment
+### 1. ✅ APTNER (Standalone) — **HIGH PRIORITY, NEW DATA**
+- **Repo:** https://github.com/wangxuren/APTNER
+- **Status:** ✅ Downloaded and verified
+- **Format:** CoNLL-style, **BIOES** (matches our architecture!)
+- **Files:** APTNERtrain.txt (6,745 sentences), APTNERdev.txt, APTNERtest.txt
+- **Overlap:** 3,628 sentences already in our data → **3,117 genuinely new sentences**
+- **Total entities (train):** ~6,867 (B- tags)
+**Entity types and mapping:**
+| APTNER Label | Count | → Our Label | Notes |
+|---|---|---|---|
+| IDTY (Identity) | 1,392 | **Organization** | People/groups mentioned in CTI |
+| ACT (Action) | 1,109 | **DROP** | Attack actions, not named entities |
+| MAL (Malware) | 845 | **Malware** | Direct match |
+| LOC (Location) | 822 | **DROP** | Geographic locations |
+| APT (APT Group) | 770 | **Organization** | Threat actor groups |
+| TIME | 753 | **DROP** | Temporal expressions |
+| TOOL | 405 | **System** | Attacker tools/software |
+| FILE | 358 | **Indicator** | File names/paths as IOCs |
+| SECTEAM | 338 | **Organization** | Security research teams |
+| VULNAME | 26 | **Vulnerability** | Vulnerability names |
+| OS | 15 | **System** | Operating systems |
+| PROT (Protocol) | 6 | **DROP** | Too few, not in our schema |
+| VULID | 5 | **Vulnerability** | CVE IDs |
+| URL | 3 | **Indicator** | URLs |
+| IP | 3 | **Indicator** | IP addresses |
+| ENCR | 2 | **DROP** | Encryption methods |
+| EMAIL | 1 | **Indicator** | Email addresses |
+| SHA2 | 1 | **Indicator** | Hash |
+| MD5 | — | **Indicator** | Hash |
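+**Estimated new entities from 3,117 new sentences:** ~3,000 mappable entities (roughly half of the 6,867 total, proportional to new sentence fraction, minus DROPs). NOTE: the DROP labels in the table above (ACT, LOC, TIME, PROT, ENCR) account for ~2,700 of the ~6,900 tags (~39%), so scaling the remaining ~4,200 mappable entities by the ~46% new-sentence fraction gives closer to ~1,900. Treat ~3,000 as an upper bound until the converter reports actual counts.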
+**Quality notes:**
+- BIOES format — no conversion needed for tag scheme
+- Some noisy tags: `S-APT/APT34`, `S-MAL:hash_value` — need regex cleanup to extract base type
+- A few multi-tag errors: `IDTY I-TOOL E-IDTY` — need to skip/fix these (~10 rows)
+### 2. ⚠️ CyberNER Combined STIX CSV — **LOW PRIORITY (mostly redundant)**
+- **Repo:** https://github.com/yasirech-chammakhy/CyberNER
+- **Status:** ✅ Downloaded
+- **Format:** CSV with columns: Word, Tag, Sentence_ID, STIX_Tag, Source
+- **Size:** 10,042 sentences, 57,300 entities (STIX tags)
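+- **Overlap:** 8,518/10,042 sentences already in our data → **only 42 new sentences, 18 new entities** (NOTE: 10,042 − 8,518 = 1,524, so these two figures do not reconcile as written; presumably the remaining ~1,480 sentences were excluded as near-duplicates or contained no mappable entities — TODO confirm and record the breakdown)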
+- **Verdict:** Almost entirely redundant with our existing data.
+**Potential secondary value:** The STIX_Tag column provides an alternative STIX-aligned annotation layer on sentences we already have. The STIX schema maps cleanly to our 5 classes. This could be used for label validation/correction on existing data but is NOT a source of new training examples.
+### 3. ⚠️ DNRTI Dataset — **SKIP (already included)**
+- **Repo:** https://github.com/LiuPeiP-CS/NER4CTI/tree/main/DNRTI_Dataset
+- **Status:** ✅ Downloaded
+- **Format:** CoNLL BIO (no S/E tags — would need BIOES conversion)
+- **Size:** 5,251 train sentences
+- **Overlap:** 5,145/5,251 already in our data → **only 106 new sentences**
+- **Entity types:** HackOrg, Tool, Area, OffAct, Idus, Time, SamFile, Org, Exp, SecTeam, Way, Features, Purp
+- **Verdict:** Skip — already incorporated.
+### 4. ⚠️ bnsapa/cybersecurity-ner (HuggingFace) — **SKIP (already included)**
+- **HF:** `bnsapa/cybersecurity-ner`
+- **Status:** ✅ Downloaded
+- **Format:** HuggingFace Dataset, BIO tags, 5 classes (Malware, Indicator, Organization, System, Vulnerability) — exact match to ours
+- **Size:** 2,664 train + 717 test + 785 valid = 4,166 total
+- **Overlap:** This IS our CyNER source data. Completely redundant.
+- **Verdict:** Skip.
+### 5. ✅ Stucco Auto-Labeled Corpus — **MEDIUM PRIORITY, LARGE but NVD-focused**
+- **Repo:** https://github.com/stucco/auto-labeled-corpus
+- **Status:** ✅ Downloaded
+- **Format:** JSON with token-level BIO annotations, keyed by CVE ID
+- **Subcorpora:**
+  - NVD: 15,192 docs, 685K tokens, 147K+ entities
+  - MS-Bulletin: 230 docs, 120K tokens
+  - Metasploit: 356 docs, 26K tokens
+- **Total:** 15,778 docs
+**Entity types and mapping:**
+| Stucco Label | NVD Count | → Our Label | Notes |
+|---|---|---|---|
+| relevant_term | 71,565 | **DROP** | Generic terms like "remote", "allows" — not named entities |
+| version | 29,022 | **DROP** | Version numbers alone |
+| application | 18,774 | **System** | Software names — direct match |
+| vendor | 10,460 | **Organization** | Software vendors |
+| update | 4,206 | **DROP** | Patch/update identifiers |
+| os | 3,487 | **System** | Operating systems |
+| file | 3,197 | **Indicator** | File paths/names |
+| cve id | 3,141 | **Vulnerability** | CVE identifiers — direct match |
+| function | 1,445 | **DROP** | Function names (too generic) |
+| parameter | 650 | **DROP** | API parameters |
+| hardware | 585 | **System** | Hardware devices |
+| edition | 578 | **DROP** | Software editions |
+| programming language | 163 | **DROP** | Language names |
+| method | 163 | **DROP** | Method names |
+| language | 7 | **DROP** | Natural languages |
+**Estimated usable entities (NVD only):**
+- System (application + os + hardware): ~22,846
+- Organization (vendor): ~10,460
+- Indicator (file): ~3,197
+- Vulnerability (cve id): ~3,141
+- **Total mappable: ~39,644 entities across 15,192 docs**
+**Quality concerns:**
+- Auto-labeled (not human-annotated) — expect noise
+- NVD descriptions are formulaic ("X before version Y allows...") — domain-narrow
+- `relevant_term` is the majority label and must be DROPped (it's not NER)
+- BIO format only → needs BIOES conversion
+- No Malware entities — this corpus is about vulnerabilities, not threat actors
+### 6. ❌ MalwareDB (in NER4CTI repo) — **SKIP (wrong domain)**
+- **Format:** CoNLL BIO
+- **Labels:** Action, Entity, Modifier — too generic, not cybersecurity NER
+- **Size:** ~8,525 train entities
+- **Verdict:** Skip — labels don't map to our schema meaningfully.
+---
+## Priority-Ordered Incorporation Plan
+### Priority 1: APTNER (New sentences only) — Est. +3,000 entities
+**Why first:** BIOES format (zero tag-scheme conversion), 3,117 genuinely new sentences, diverse CTI content, human-annotated.
+**Conversion steps:**
+1. Parse CoNLL-style BIOES from `APTNERtrain.txt` / `APTNERdev.txt` / `APTNERtest.txt`
+2. Clean noisy tags: strip `/` suffixes (`S-APT/APT34` → `S-APT`), strip `:` hash suffixes (`S-MAL:abc123` → `S-MAL`), fix multi-tag rows
+3. Apply label mapping (see table above)
+4. Reconstruct text from tokens, compute character offsets for spans
+5. Convert to our JSONL format: `{"text": "...", "spans": {"Malware: name": [[start, end]]}, "info": {...}}`
+6. Deduplicate against existing data (fuzzy match on text[:80])
+7. Split: merge new train into train, keep dev/test separate for validation
+### Priority 2: Stucco NVD Corpus — Est. +39,000 entities
+**Why second:** Massive volume but auto-labeled (noisy), NVD-domain-only (no Malware class), needs BIO→BIOES conversion.
+**Conversion steps:**
+1. Parse JSON token arrays from `full_corpus.json` → NVD subcorpus
+2. Apply label mapping (drop `relevant_term`, `version`, `update`, `function`, `parameter`, `edition`, `programming language`, `method`, `language`)
+3. Convert BIO → BIOES: add S- for single-token entities, add E- for final token of multi-token entities
+4. Reconstruct text, compute character offsets
+5. Convert to our JSONL format
+6. Deduplicate against existing NVD entries (we have ~26 + 2,790 LLM-annotated NVD entries)
+7. Quality filter: sample 100 random entries, manually check annotation quality before bulk incorporation
+**Risk:** Auto-labeled data may hurt more than help if noisy. Recommend incorporating in batches (e.g., 5K docs first) and evaluating impact on val F1 before adding more.
+### Priority 3: Defanged IOC Augmentation — Est. +2,000–5,000 synthetic entities
+**Why:** Our training data lacks defanged notation (`hxxp://`, `hxxps://`, `[.]`, `[@]`, etc.) which is extremely common in real CTI reports. This is a gap that will hurt recall on real-world data.
+**Approach:**
+1. **Identify clean indicators** in existing training data: URLs, domains, IPs, email addresses
+2. **Apply defanging transforms** probabilistically (50% rate):
+   - `http` → `hxxp`, `https` → `hxxps`
+   - `.` in domains/IPs → `[.]`
+   - `@` in emails → `[@]`
+   - Full URL defanging: `hxxps://example[.]com/path`
+3. **Generate new examples** by duplicating existing sentences containing indicators and replacing the indicator text with defanged version
+4. **Adjust character offsets** to account for changed string lengths
+5. **Add as augmented training examples** with `source: "defang_augmented"`
+**Implementation detail:** A Python function that takes a clean IOC string and returns all plausible defanged variants. Apply to training data to create ~1 augmented copy per indicator-bearing sentence.
+---
+## Concrete Label Mapping Summary
+| Source Dataset | → Malware | → Indicator | → System | → Organization | → Vulnerability | DROP |
+|---|---|---|---|---|---|---|
+| **APTNER** | MAL | FILE, URL, IP, EMAIL, SHA2, MD5 | TOOL, OS | IDTY, APT, SECTEAM | VULNAME, VULID | ACT, LOC, TIME, PROT, ENCR |
+| **Stucco NVD** | — | file | application, os, hardware | vendor | cve id | relevant_term, version, update, function, parameter, edition, prog lang, method, language |
+| **Defang Aug** | — | (augmented copies) | — | — | — | — |
+---
+## Estimated Total Data After R7
+| Source | New Sentences | New Entities | Quality |
+|---|---|---|---|
+| Current data | 25,468 | 42,159 | Human-annotated, mixed |
+| APTNER (new only) | ~3,100 | ~3,000 | Human-annotated, BIOES |
+| Stucco NVD | ~15,000 | ~39,600 | Auto-labeled, BIO |
+| Defang augmentation | ~1,500 | ~3,000 | Synthetic |
+| **Total** | **~45,000** | **~87,800** | Mixed |
+This roughly **doubles** our entity count. Still well short of 500K, but a meaningful step.
+---
+## Remaining Class Imbalance
+After incorporation, estimated distribution:
+- Malware: ~17,000 (existing 16K + 845 APTNER MAL)
+- Organization: ~28,000 (existing 15K + 2,500 APTNER + 10,460 Stucco vendor)
+- System: ~28,000 (existing 4.6K + 420 APTNER + 22,846 Stucco app/os/hw)
+- Vulnerability: ~6,500 (existing 3.3K + 31 APTNER + 3,141 Stucco CVE)
+- Indicator: ~6,400 (existing 2.9K + 366 APTNER + 3,197 Stucco file)
+**System and Organization will be overrepresented** due to Stucco NVD. Consider downsampling Stucco or using class-weighted loss.
+---
+## Open Questions / Blockers
+1. **Stucco quality gate:** Need to manually inspect ~100 random Stucco NVD annotations before bulk incorporation. Auto-labeled data could introduce systematic errors.
+2. **APTNER tag noise:** ~10 rows have corrupted multi-tag annotations (e.g., `IDTY I-TOOL E-IDTY`). Strategy: skip these rows (trivial loss).
+3. **Class imbalance after Stucco:** Stucco adds heavily to System/Organization but zero to Malware. Options:
+   - Downsample Stucco to ~5K docs
+   - Use class-weighted loss during training
+   - Or both
+4. **NVD overlap with our LLM-annotated NVD:** We have 2,790 LLM-annotated NVD entries. Need to deduplicate by CVE ID against Stucco's NVD docs (keyed by CVE ID — straightforward).
+5. **Defanging scope:** Should we defang indicators in ALL datasets or only in our existing data? Stucco NVD descriptions probably don't contain defanged IOCs in practice.
+6. **Additional data sources not yet explored:**
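+   - SemEval-2018 Task 8 (MalwareTextDB) — focused on malware, could help balance. NOTE: this appears to be the same Action/Entity/Modifier corpus listed as "MalwareDB" and marked SKIP in the Dataset Availability Assessment above; confirm whether it offers any labels that map to our schema before pursuing.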
+   - CASIE dataset — cybersecurity event extraction
+   - Generating synthetic CTI text with LLMs + our label space
+---
+## Implementation Order
+1. **APTNER converter** — ~2 hours of scripting. Parse BIOES, clean tags, map labels, deduplicate, emit JSONL.
+2. **Stucco NVD converter** — ~2 hours. Parse JSON, map labels, BIO→BIOES, emit JSONL. Quality-gate before merge.
+3. **Defanging augmenter** — ~1 hour. Regex-based IOC defanging + offset adjustment.
+4. **Aggregation script update** — merge new sources into `aggregated_5class_*.jsonl`, recompute stats.
+5. **Retrain + evaluate** — compare F1 before/after each data addition to measure impact.

research/notes/progress/2026-04-24-51-audit-ioc-coverage.md ADDED Viewed

	@@ -0,0 +1,62 @@

+# Audit: IOC Label Coverage in Training Data
+**Date:** 2026-04-24
+**Files audited:**
+- `enriched_5class_train.jsonl` (ORIGINAL — 31,510 examples)
+- `enriched_5class_train_cleaned.jsonl` (CLEANED — 27,666 examples)
+## Method
+Regex scan for 6 IOC types (IPv4, MD5, SHA1, SHA256, URL, Domain) across all examples, checking whether each match overlaps with any existing span annotation.
+## Results Summary
+| Metric | ORIGINAL | CLEANED | Delta |
+|--------|----------|---------|-------|
+| Total examples | 31,510 | 27,666 | -3,844 |
+| Examples with IOCs | 4,313 | 4,073 | -240 |
+| Total IOC matches | 16,967 | 16,715 | -252 |
+| Labeled IOCs | 15,629 (92.1%) | 16,638 (99.5%) | **+7.4pp** |
+| Unlabeled IOCs | 1,338 | 77 | **-94.2%** |
+## Per-Type Breakdown (CLEANED)
+| IOC Type | Found | Labeled | Unlabeled | Coverage |
+|----------|-------|---------|-----------|----------|
+| IPv4 | 4,829 | 4,828 | 1 | 100.0% |
+| MD5 | 1,244 | 1,244 | 0 | 100.0% |
+| SHA1 | 1,377 | 1,377 | 0 | 100.0% |
+| SHA256 | 1,693 | 1,693 | 0 | 100.0% |
+| URL | 1,926 | 1,926 | 0 | 100.0% |
+| Domain | 5,646 | 5,570 | 76 | 98.7% |
+## Analysis of Remaining 77 "Unlabeled" IOCs
+**All are false positives from the regex, not genuinely missed labels:**
+1. **1 IPv4 false positive:** `18.0.0.324` — an Adobe Flash version number (octet `.324` > 255, not a valid IP).
+2. **76 Domain false positives:** Almost entirely Android package names and reverse-domain app identifiers that happen to end in real TLDs:
+   - `au.com.nab.mobile`, `au.com.bankwest.mobile` (package names)
+   - `btc.org.freewallet.app`, `eth.org.freewallet.app` (crypto app packages)
+   - `jp.co.sagawa.SagawaOfficialApp` (Japanese carrier app)
+   - `com.hua.ru.quan` (package name with `.ru` TLD match)
+   These are **not IOC domains** — they are software identifiers that regex incorrectly flags. No labeling action needed.
+## Verdict
+**The cleaning step was highly effective.** IOC coverage went from 92.1% → 99.5%, with all remaining "gaps" being regex false positives rather than genuine unlabeled indicators. The cleaned dataset has **zero genuine unlabeled IOCs** across all six categories.
+### Before vs After (ORIGINAL → CLEANED)
+- IPv4: 94.9% → 100.0% (258 gaps closed)
+- MD5: 92.7% → 100.0% (91 gaps closed)
+- SHA1: 97.6% → 100.0% (33 gaps closed)
+- SHA256: 77.2% → 100.0% (386 gaps closed — biggest improvement)
+- URL: 99.3% → 100.0% (14 gaps closed)
+- Domain: 90.2% → 98.7% (480 gaps closed, 76 remaining are false positives)
+## Open Questions
+None — IOC coverage is complete. The dataset is ready for training from an IOC labeling perspective.

research/notes/progress/2026-04-24-53-audit-label-consistency.md ADDED Viewed

	@@ -0,0 +1,147 @@

+# Label Quality Audit: enriched_5class_train_cleaned.jsonl
+**Date:** 2026-04-24
+**Dataset:** `/data/processed/enriched_5class_train_cleaned.jsonl`
+**Records:** 27,666 | **Span annotations:** 75,691
+## Label Distribution
+| Label | Count |
+|---|---|
+| Indicator | 19,490 |
+| Malware | 18,700 |
+| Organization | 16,892 |
+| System | 12,597 |
+| Vulnerability | 8,012 |
+---
+## 1. Same Surface Form, Different Labels
+**574 entities** appear with multiple labels across examples. Most are low-noise (dominant label >99%), but several warrant attention:
+### Top 20 Multi-Label Entities (by total frequency)
+| Entity | Total | Distribution | Severity |
+|---|---|---|---|
+| `backdoor` | 2,493 | Malware:2484, Indicator:9 | LOW — 9 mislabels |
+| `windows` | 975 | System:973, Malware:2 | LOW |
+| `oracle` | 880 | Org:877, System:3 | LOW |
+| `android` | 848 | System:839, Malware:8, Org:1 | LOW |
+| `google` | 683 | Org:679, Malware:1, System:3 | LOW |
+| `java` | 580 | System:579, Org:1 | LOW |
+| `microsoft` | 555 | Org:554, Malware:1 | LOW |
+| `linux` | 435 | System:434, Malware:1 | LOW |
+| `malware` | 386 | Malware:371, Indicator:15 | LOW — generic word labeled as Indicator in 15 cases |
+| `kaspersky` | 279 | Org:278, Malware:1 | LOW |
+| `exploit` | 238 | Vuln:221, Malware:17 | **MEDIUM** — genuine ambiguity (exploit kit vs vulnerability) |
+| `juniper` | 235 | System:5, Org:230 | LOW — System cases may be correct (Juniper devices vs company) |
+| `github` | 196 | Org:109, System:86, Malware:1 | **MEDIUM** — Org vs System is genuinely ambiguous (company vs platform) |
+| `python` | 189 | System:188, Malware:1 | LOW |
+| `ios` | 164 | System:163, Malware:1 | LOW |
+| `carbanak` | 154 | Malware:64, Indicator:3, Vuln:87 | **HIGH** — Carbanak is a Malware/APT group, NOT a Vulnerability. 87 mislabels. |
+| `zero-day` | 143 | Vuln:142, Malware:1 | LOW |
+| `trojan` | 139 | Malware:135, Vuln:3, Indicator:1 | LOW |
+| `wordpress` | 137 | Org:2, System:135 | LOW |
+| `facebook` | 127 | System:57, Org:70 | **MEDIUM** — genuine ambiguity (platform vs company) |
+### Action Items
+- **Carbanak as Vulnerability (87 instances):** This is clearly wrong. Carbanak is a malware family/APT group. Likely a systematic source-data error. **Should be corrected to Malware.**
+- **`exploit` as Malware (17):** Some may be valid (exploit kits), but most are likely label noise.
+- **`github`/`facebook`:** Genuinely dual-natured. Could accept as-is or pick a canonical label.
+---
+## 2. Span Boundary Issues
+| Check | Count | Verdict |
+|---|---|---|
+| Zero-length spans | 0 | PASS |
+| Beyond text bounds | 0 | PASS |
+| Mid-word boundaries | 33 | **NEEDS REVIEW** |
+| Overlapping spans | 0 | PASS |
+| Surface/offset mismatches | 0 | PASS |
+### Mid-Word Boundary Details (33 cases)
+These are spans where the character offset splits a compound word. Examples:
+- `Intel` extracted from "at **Intel**lig" (rec 19562) — annotation grabs "Intel" from "Intelligence"
+- `Disco` extracted from "**Disco**ver" (recs 20859, 20873) — annotation grabs "Disco" from "Discover"
+- `Access` from "**Access**Tok" (rec 21709) — from "AccessToken"
+- `Exchange` from "MSExchangeIS" (rec 21711) — partial match inside compound
+- `API` from "bleAPISer" (rec 21711) — "API" inside "DisableAPIService"
+- `Native API` extending into "APIs" (rec 21767)
+**Root cause:** These appear to be substring-match annotation artifacts where an entity name (Intel, API, Exchange, etc.) was matched inside a larger token. **33 cases out of 75,691 spans = 0.04%** — very low rate.
+**Action:** Fix the 33 cases or filter them out. Most are from MITRE ATT&CK technique records (recs 21xxx).
+---
+## 3. Span Offset Consistency
+**0 mismatches** between span key surface text and extracted text at offsets. The data is fully consistent — every `"Label: surface_text"` key matches `text[start:end]` exactly.
+---
+## 4. Manual Inspection (30 Random Samples)
+### Issues Found
+1. **Trailing punctuation in spans** (cyner2 source):
+   - rec 2081: `[Malware: NewPosThings.]` — trailing period included
+   - rec 2081: `[Malware: variant,]` — trailing comma included
+   - rec 2156: multiple alias lists labeled as single backdoor span (not actually wrong, but noisy)
+2. **Generic words labeled as entities** (cyberner_stix source):
+   - rec 9227: `[Vulnerability: high turnover of staff]` and `[Vulnerability: difficult to ensure all staff have cybersecurity training]` — these are NOT vulnerabilities, they're descriptions of organizational challenges. Systematic labeling error in cyberner_stix source.
+   - rec 11735: `[Indicator: Microsoft Word attachment]` — not an IOC
+   - rec 11595: `[Indicator: Filensfer]` — this is actually a malware family name, mislabeled as Indicator
+3. **MITRE/kernel records are noisy:**
+   - rec 21605: `[Malware: attrib]` — attrib is a legitimate Windows utility, not malware (though in MITRE context it's a "tool used by adversaries")
+   - rec 22902, 22916, 24772: Linux kernel vulnerability descriptions with very sparse annotations — mostly just `[System: Linux kernel]` and `[System: QEMU]`. These are fine but low entity density.
+   - rec 22916: `[Indicator: a533010b71dab205ad2f507188ce8c82203b0254]` — this is a git commit hash inside a kernel log, not a malware indicator. Likely auto-matched as a SHA hash.
+4. **Synthetic records (synth_v2) look clean** — well-formed, entities are correct and complete. Good quality.
+### Quality by Source
+| Source | Quality | Notes |
+|---|---|---|
+| cyner_train | Good | Some trailing punctuation in spans |
+| cyner2_train | Fair | Trailing punctuation, some alias-list noise |
+| cyberner_stix | Poor | Worst quality — generic phrases as Vuln, mislabeled entities |
+| MITRE/kernel | Fair | Git hashes as Indicators, tools-as-malware edge cases |
+| synth_v2 | Excellent | Clean, well-formed, accurate |
+---
+## 5. Bare File Extensions as Indicator
+**0 found.** The cleaning step successfully removed all bare file extension annotations (.dll, .exe, .pdf, .doc, .zip, .bat, .ps1, .vbs, .js).
+---
+## Summary & Recommendations
+### Data Quality Score: **B+ (Good, with fixable issues)**
+| Issue | Count | Priority |
+|---|---|---|
+| Carbanak mislabeled as Vulnerability | 87 | **HIGH** — systematic, fixable |
+| cyberner_stix generic phrases as Vulnerability | ~50+ est. | **HIGH** — source-level issue |
+| Mid-word boundary splits | 33 | MEDIUM — 0.04%, easy to filter |
+| Trailing punctuation in spans | ~20+ est. | MEDIUM — cyner2 source |
+| Git commit hashes as Indicators | ~10 est. | LOW |
+| `exploit` label ambiguity | 17 | LOW — may be acceptable |
+| Bare file extensions | 0 | RESOLVED |
+### Recommended Next Steps
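+1. **Fix Carbanak labels:** Bulk relabel Carbanak spans from `Vulnerability` → `Malware` (87 instances, matched case-insensitively). Change only the label prefix of each span key; keep the surface text and offsets exactly as they appear in the record so keys still match `text[start:end]`.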
+2. **Audit cyberner_stix Vulnerability labels:** Many are generic descriptions, not actual CVEs or vulnerability names. Consider filtering Vulnerability spans from this source that don't match CVE patterns or known vuln names.
+3. **Strip trailing punctuation** from span boundaries (regex: strip `[.,;:!?]` from span ends, adjust offsets)
+4. **Remove mid-word boundary spans** (33 cases) or expand them to word boundaries
+5. **Filter git commit hashes** from Indicator labels in kernel vulnerability records

research/notes/progress/2026-04-24-59-aptner-held-out-test.md ADDED Viewed

	@@ -0,0 +1,37 @@

+# APTNER Independent Held-Out Test Set
+**Date:** 2026-04-24
+**File:** `data/processed/aptner_5class_test_clean.jsonl`
+## Summary
+Built an independent test set from the APTNER test split (APT threat intelligence reports). **Zero leakage** found — no exact or prefix-80 matches against any of our 4 training files (28,675 unique training texts).
+## Stats
+| Metric | Value |
+|---|---|
+| Examples | 172 |
+| Total entity mentions | 340 |
+| Avg entities/example | 2.0 |
+| Malware | 102 |
+| Organization | 91 |
+| System | 87 |
+| Indicator | 55 |
+| Vulnerability | 5 |
+## Entity Memorization
+57.3% of test entity surface forms (90/157) also appear in training data. This is expected — common malware names and organizations recur across cybersecurity corpora. The model must still detect spans in novel contexts, so this doesn't invalidate the benchmark.
+## Independence vs. Enriched Test
+Our 3 existing test sets (enriched, CyNER, SB2) share 96–98% text overlap because they derive from the same annotation pipelines. APTNER has **0% text overlap** with training data and originates from a completely separate annotation effort on APT reports. This makes it a genuinely independent benchmark.
+## Weakness
+Vulnerability class has only 5 mentions — too few for reliable per-class metrics on that label. The other 4 classes are well-represented.
+## Source
+Original data: `data/raw/APTNER/APTNERtest.txt` → converted via `scripts/convert_aptner.py`

research/notes/progress/2026-04-24-cyner-deep-dive-and-datasets.md ADDED Viewed

	@@ -0,0 +1,408 @@

+# CyNER Deep Dive & Cybersecurity NER Dataset Catalog
+**Date:** 2026-04-24
+**Purpose:** Deep analysis of CyNER benchmark competitor + exhaustive dataset discovery for Arcspan training
+---
+## 1. CyNER Analysis
+### 1.1 Architecture
+- **Backbone:** XLM-RoBERTa-large (560M params) — best performer among tested models
+- **Head:** Single linear layer on top of transformer hidden representations for token classification
+- **Training framework:** T-NER library (Ushio & Camacho-Collados, 2021) + HuggingFace Transformers
+- **Sequence length:** 128 tokens max
+- **Optimizer:** AdamW, LR 5e-6 (large models) / 1e-5 (base models)
+- **Batch size:** 32
+- **Epochs:** 20
+- **Hardware:** Single Nvidia Tesla V100
+### 1.2 Entity Label Schema (5 classes)
+| Class | Definition | Examples |
+|-------|-----------|----------|
+| **Malware** | Viruses, trojans, ransomware, etc. | FluBot, DroidJack RAT |
+| **Indicator** | IOCs: domain, URL, IP, filename, hash, email, port | SHA256 hashes, IPs |
+| **System** | OS, software, hardware | Android, Windows, Adobe Flash |
+| **Organization** | Companies, groups, institutions | Proofpoint, Kaspersky |
+| **Vulnerability** | CVE IDs and exploit mentions | CVE-2012-2825, "master key vulnerability" |
+**Format:** BIO tagging (B-Entity, I-Entity, O) in CoNLL 2003 format.
+**Notable exclusions:** Location, Person — delegated to off-the-shelf models (Flair/SpaCy).
+### 1.3 Training Corpus
+- ~60 threat intelligence reports from MITRE ATT&CK software category
+- Reports from Kaspersky, Symantec, McAfee (2018-2021)
+- Manually cleaned text (not raw HTML/PDF)
+- Annotated with BRAT annotation tool by trained graduate students
+- **Total:** 106,991 tokens, 4,530 tagged entity spans
+| Split | Malware | Indicator | System | Organization | Vulnerability |
+|-------|---------|-----------|--------|-------------|---------------|
+| Train (40 docs) | 703 | 1,021 | 837 | 284 | 48 |
+| Dev (10 docs) | 254 | 208 | 182 | 92 | 9 |
+| Test (10 docs) | 242 | 261 | 248 | 131 | 10 |
+**Key observation:** Vulnerability class is extremely small (48 train / 10 test). Organization also underrepresented.
+### 1.4 Benchmark Results (span micro-F1 via seqeval)
+**Overall model comparison:**
+| Model | Precision | Recall | F1 |
+|-------|-----------|--------|-----|
+| BERT-base-uncased | 69.67 | 69.88 | 69.77 |
+| BERT-large-uncased | 72.69 | 73.45 | 73.07 |
+| RoBERTa-base | 37.22 | 42.50 | 39.69 |
+| RoBERTa-large | 34.76 | 44.18 | 38.91 |
+| XLM-RoBERTa-base | 74.57 | 77.23 | 75.88 |
+| **XLM-RoBERTa-large** | **75.30** | **78.07** | **76.66** |
+**Per-class results (XLM-RoBERTa-large):**
+| Class | Precision | Recall | F1 |
+|-------|-----------|--------|-----|
+| Malware | 79.82 | 75.11 | 77.39 |
+| Indicator | 78.34 | 86.62 | 82.27 |
+| System | 70.36 | 79.93 | 74.84 |
+| Organization | 70.64 | 60.16 | **64.98** |
+| Vulnerability | 100.0 | 80.0 | 88.89 |
+### 1.5 Hybrid Pipeline (Priority-Based Merging)
+CyNER combines four extraction sources with configurable priority:
+1. **Heuristic (H):** Regex patterns for IOCs — SHA256, SHA1, CVE, IPv4, email, filepath. Highest default priority.
+2. **Transformer (T):** XLM-RoBERTa-large fine-tuned on cybersecurity corpus.
+3. **Flair (F):** Pre-trained Flair NER model for generic entities (PER, LOC, ORG, MISC).
+4. **SpaCy (S):** SpaCy NER for additional generic entities.
+Default priority: **HTFS** — Heuristic > Transformer > Flair > SpaCy. When entity spans overlap, the higher-priority source wins.
+**Key design insight:** Regex gets priority for IOC indicators because they don't need context. Transformer handles semantic entities (malware names, systems). Flair/SpaCy catch generic entities (locations, people) that CyNER deliberately excludes from its own training.
+### 1.6 Evaluation Methodology
+- **Metric:** Span-level micro-F1 via seqeval library
+- **Test set:** 10 documents with 892 entity spans
+- **No cross-validation reported** — single train/dev/test split
+- **No evaluation of the hybrid pipeline end-to-end** — only transformer component benchmarked in paper
+### 1.7 Strengths
+1. Clean, modular design — easy to use as a library
+2. Hybrid approach is pragmatic: regex for structured IOCs, ML for semantic entities
+3. XLM-RoBERTa backbone handles multilingual threat reports
+4. Open source with pretrained models available
+### 1.8 Weaknesses & Limitations
+1. **Small training data:** Only 4,530 entity spans from 60 documents. Vulnerability class has only 48 training examples.
+2. **76.66% F1 is mediocre.** Organization class at 64.98% F1 is particularly weak.
+3. **Short context window:** 128 tokens max — misses long-range dependencies in threat reports.
+4. **No BIOES tagging** — uses BIO only, losing boundary precision.
+5. **RoBERTa models perform terribly** (39% F1) — suspicious, possibly a training bug.
+6. **No evaluation of hybrid pipeline** — paper only benchmarks transformer component.
+7. **Android malware bias** — the corpus is focused on Android malware and may not generalize to network intrusions, APTs, etc.
+8. **Indicator class is a catch-all** — lumps URLs, IPs, hashes, domains, emails together. No sub-type distinction at the model level (regex handles sub-types separately).
+9. **No Viterbi/CRF decoding** — just a linear head, no structured prediction.
+### 1.9 Implications for Arcspan
+Our advantages over CyNER:
+- **Finer entity types** — we can distinguish IP, URL, hash, domain, CVE at model level
+- **BIOES tagging** with Viterbi decoding — better boundary detection
+- **MoE architecture** — 50M active params vs CyNER's 560M dense params (~11x fewer parameters active per token)
+- **Bidirectional token classifier** — same paradigm but more modern architecture
+- Their 76.66% F1 is very beatable with better data and architecture
+---
+## 2. PRISM Benchmark
+### 2.1 Overview
+PRISM (Froudakis et al., 2025) is the first openly available, expert-validated benchmark for IoC extraction from threat reports.
+- **Published:** ACSAC 2025, arXiv:2506.11325
+- **Code/Data:** https://github.com/EvanFr/LANCE (GPL-3.0)
+### 2.2 Scope & Entity Types
+PRISM focuses on **4 IoC types only:**
+| Type | Definition |
+|------|-----------|
+| **IP** | IPv4/IPv6 addresses |
+| **Domain** | Domain names |
+| **URL** | Full URLs |
+| **Hash** | File hashes (MD5, SHA1, SHA256) |
+**Explicitly excluded:** Bitcoin addresses, email addresses, file paths, registry keys — "harder to verify at scale."
+### 2.3 Dataset Size
+| Indicator Type | BAP | GAP | Total (non-unique instances) |
+|---------------|-----|-----|-----|
+| IP | 177 | 112 | 289 |
+| Domain | 694 | 729 | 1,423 |
+| URL | 426 | 445 | 871 |
+| Hash | 962 | 758 | 1,720 |
+| **Total** | **2,259** | **2,044** | **4,303** |
+**Unique labeled indicators:** 1,774 total (1,401 IoC + 373 nonIoC)
+- 50 real-world threat reports from ORKL (Apr 2023 - Nov 2024)
+- Sources: Kaspersky, Palo Alto Networks, Microsoft (via AlienVault)
+### 2.4 Annotation Methodology
+- **LANCE:** LLM-Assisted Notation and Classification Engine (ChatGPT-4o based)
+- **Human-in-the-loop:** 5 junior analysts (PhD cybersecurity students) + 1 senior analyst (7yr experience)
+- **Two-phase annotation:**
+  - BAP (Baseline Annotation Pass): Analysts label without LLM assistance
+  - GAP (Guided Annotation Pass): Analysts see LANCE labels + justifications
+- **Dispute resolution:** Senior analyst resolves disagreements
+- **Binary classification:** Each indicator labeled IoC or nonIoC
+### 2.5 Baseline Results
+**LANCE (ChatGPT-4o) performance on PRISM:**
+- Overall F1: **97.6%**
+- IP: F1 ~1.00
+- Hash: F1 ~1.00
+- URL: F1 ~0.98-0.99
+- Domain: F1 ~0.87 (lowest — context-dependent benign vs malicious)
+**Other methods on PRISM:**
+- RegEx + Whitelist: High recall, low precision (many FPs)
+- AlienVault: Low recall (~25% for URLs), coverage gaps
+- VirusTotal (threshold=1): F1 ~86%
+- VirusTotal (threshold=5): Higher precision but low recall
+- Naive ChatGPT prompting: F1 = 66.9%
+**Cross-LLM generalization (on BAP subset):**
+- GPT-4o: F1 = 0.98
+- Gemma 3 27b: F1 = 0.92
+- Gemini 2.0 Flash: F1 = 0.98
+- Llama 3.3 70b: F1 = 0.83
+- Nvidia Nemotron 70b: F1 = 0.85
+### 2.6 Key Differences from CyNER
+| Aspect | CyNER | PRISM |
+|--------|-------|-------|
+| Task | NER (span extraction) | IoC classification (binary: IoC/nonIoC) |
+| Entity types | 5 semantic types | 4 indicator types |
+| Approach | Token classification | Document-level indicator labeling |
+| Granularity | Token-level BIO tags | Indicator-level binary labels |
+**Important:** PRISM is NOT a NER dataset in the traditional sense. It's an IoC classification benchmark. Indicators are first extracted by regex, then labeled as malicious/benign. This is complementary to, not a replacement for, NER training data.
+### 2.7 Data Format & Download
+- **Repository:** https://github.com/EvanFr/LANCE
+- **Dataset location:** `PRISM/GT.json` (ground truth), `PRISM/ReportsJSON/`, `PRISM/ReportsPDF/`
+- **Format:** JSON (indicator-level labels, not token-level BIO/BIOES)
+- **License:** GPL-3.0
+- **Convertibility to BIOES:** Would require re-aligning the indicator-level labels to token level. The indicator boundaries are known from regex extraction, so conversion is feasible but requires mapping each indicator back to its position in the document text.
+---
+## 3. Dataset Catalog
+### 3.1 Summary Table
+| # | Dataset | Size | Entity Types | Format | License | Download | Convertible to BIOES? |
+|---|---------|------|-------------|--------|---------|----------|----------------------|
+| 1 | **CyNER MITRE corpus** | 107K tokens, 4,530 spans, 60 docs | Malware, Indicator, System, Organization, Vulnerability | CoNLL BIO | Open (GitHub) | `github.com/aiforsec/CyNER/dataset/mitre/` | Yes — BIO→BIOES is trivial |
+| 2 | **PRISM** | 1,774 unique indicators, 50 reports | IP, Domain, URL, Hash (binary IoC/nonIoC) | JSON | GPL-3.0 | `github.com/EvanFr/LANCE/PRISM/` | Partial — need token-level realignment |
+| 3 | **bnsapa/cybersecurity-ner** (HuggingFace) | 4,166 rows reported (listed splits 2,660 train / 785 val / 717 test sum to 4,162 — verify) | ~10 types: File, Malware, Organization, Application, URL/Domain, Malware variant, Company, Product version | Token tags (numeric) | Apache 2.0 | `load_dataset("bnsapa/cybersecurity-ner")` | Yes — map numeric tags to BIOES |
+| 4 | **Universal-NER/Pile-NER-type** | 45,889 passages, 13K+ entity types | Superset includes cybersecurity-relevant types (needs filtering) | Conversational JSON | CC-BY-NC-4.0 | `load_dataset("Universal-NER/Pile-NER-type")` | Needs extraction + filtering |
+| 5 | **MITRE ATT&CK STIX** | 700+ techniques, 130+ groups, 600+ software | Threat actors, malware, techniques, tools, campaigns | STIX 2.1 JSON | Open | `github.com/mitre/cti` | Synthetic NER generation possible |
+| 6 | **MITRE CVE/NVD** | 200K+ CVEs | Vulnerability IDs, affected software, versions | JSON | Public domain | `nvd.nist.gov/developers` | Gazetteer, not NER training |
+### 3.2 Detailed Notes Per Dataset
+#### 3.2.1 CyNER MITRE Corpus
+- **Location:** `github.com/aiforsec/CyNER/tree/main/dataset/mitre/` — `train.txt`, `valid.txt`, `test.txt`
+- **Format:** CoNLL 2003 (token\tBIO-tag per line, blank lines between sentences)
+- **Conversion:** BIO→BIOES is deterministic: a B tag that is also the last token of its entity (single-token entity) becomes S, and the final I tag of a multi-token entity becomes E
+- **Quality:** Manually annotated by trained graduate students, BRAT tool
+- **Limitation:** Small (4,530 spans), biased toward Android malware
+#### 3.2.2 PRISM
+- **Best use:** Evaluation benchmark for IoC extraction, NOT primary training data
+- **Conversion path:** Extract indicator spans from GT.json, map back to report text, generate token-level BIOES annotations. Feasible but engineering effort required.
+#### 3.2.3 bnsapa/cybersecurity-ner (HuggingFace)
+- **Appears to be derived from CyNER** — similar content (DroidJack RAT, FakeSpy examples), similar size
+- **388 kB total** — very small
+- **Has numeric labels** — need to verify exact mapping to entity type names
+- Pre-trained model available: `yasserrmd/bert_cyber_ner`
+#### 3.2.4 Pile-NER-type
+- **Massive:** 45.9K passages with 13K+ entity types generated by GPT-3.5-turbo
+- **Cybersecurity filtering strategy:**
+  - Filter conversation content for cybersecurity keywords (malware, CVE, vulnerability, exploit, threat, etc.)
+  - Filter by entity type names containing: IP_address, hash, malware, vulnerability, CVE, domain, URL, threat_actor, etc.
+  - Expected yield: ~500-2000 relevant passages (rough estimate)
+- **Quality concern:** GPT-3.5-turbo generated, not human-validated
+- **License:** CC-BY-NC-4.0 (non-commercial only)
+#### 3.2.5 MITRE ATT&CK STIX
+- **Download:** `git clone https://github.com/mitre/cti.git`
+- **Contains:** Enterprise ATT&CK, Mobile ATT&CK, ICS ATT&CK
+- **NER conversion:** Can extract (entity_name, entity_type) pairs from structured data, then use them as:
+  - Gazetteer for regex-based tagging of raw text
+  - Seed entities for distant supervision / weak labeling
+  - Entity dictionaries for data augmentation
+- **Entity types extractable:** Threat groups (APT28, Lazarus), malware (Emotet, Cobalt Strike), tools (Mimikatz, PsExec), techniques (T1059, Credential Dumping)
+### 3.3 Datasets NOT Found / Not Publicly Available
+| Dataset | Status |
+|---------|--------|
+| **DNRTI** | Referenced in some papers but no public download found |
+| **SecureNLP** | No specific dataset found under this name |
+| **APTNER** | No HuggingFace dataset found |
+| **iACE dataset** | Paper mentions 1,500 IoC + 3,000 nonIoC but code/data not available |
+| **Long et al. dataset** | 69,032 samples mentioned in PRISM survey but not available |
+---
+## 4. Proposed Label Taxonomy
+### 4.1 Recommended Unified Label Set
+Based on cross-dataset analysis, we propose a **two-tier taxonomy** for Arcspan:
+#### Tier 1: Core Labels (model-predicted, BIOES tagged)
+| Label | Definition | Rationale |
+|-------|-----------|-----------|
+| **MALWARE** | Malware family names, variants | Appears in CyNER, bnsapa, Pile-NER. High value. Clear boundaries. |
+| **THREAT_ACTOR** | APT groups, threat actor names | Extractable from ATT&CK. High value for attribution. |
+| **TOOL** | Legitimate tools used in attacks | Mimikatz, Cobalt Strike, PsExec. Distinct from malware. |
+| **VULNERABILITY** | CVE IDs and named vulnerabilities | Clear boundaries (CVE-XXXX-XXXXX). High value. |
+| **SYSTEM** | OS, software, hardware platforms | CyNER class. Useful for affected-product extraction. |
+| **ORGANIZATION** | Companies, institutions | CyNER class. Context for attribution and targeting. |
+#### Tier 2: IOC Indicators (regex-extracted, optionally model-confirmed)
+| Label | Definition | Rationale |
+|-------|-----------|-----------|
+| **IP_ADDRESS** | IPv4/IPv6 addresses | Regex handles well. Model confirms context (malicious vs benign). |
+| **DOMAIN** | Domain names | Regex extracts, model disambiguates. |
+| **URL** | Full URLs | Regex extracts, model disambiguates. |
+| **HASH** | MD5, SHA1, SHA256 file hashes | Regex handles perfectly. |
+| **EMAIL** | Email addresses | Regex handles well. |
+| **CVE_ID** | CVE identifiers specifically | Regex: `CVE-\d{4}-\d{4,}`. Overlap with VULNERABILITY is intentional. |
+| **FILEPATH** | File paths (Windows/Unix) | Regex-extractable. |
+#### Excluded from model labeling (handled by regex only)
+| Type | Reason for exclusion |
+|------|---------------------|
+| Registry keys | Too long, complex boundaries |
+| Bitcoin addresses | Very rare, regex-perfect |
+| Port numbers | Usually just integers, no NER value |
+| MITRE technique IDs | Fixed pattern `T\d{4}(\.\d{3})?` (dot escaped, sub-technique suffix optional), pure regex |
+### 4.2 Rationale
+1. **Tier 1 entities** require semantic understanding — a malware name looks like any other proper noun. These benefit from learned representations and BIOES tagging.
+2. **Tier 2 entities** have structural patterns that regex handles well. The model's role is **confirmation/disambiguation** (is this IP malicious or benign in context?), similar to PRISM's approach.
+3. **Cross-dataset coverage:**
+   - CyNER maps to: MALWARE, SYSTEM, ORGANIZATION, VULNERABILITY + Tier 2 (Indicator → split into specific types)
+   - PRISM maps to: IP_ADDRESS, DOMAIN, URL, HASH (binary IoC/nonIoC overlay)
+   - bnsapa maps to: MALWARE, ORGANIZATION, APPLICATION (≈SYSTEM)
+   - ATT&CK provides: THREAT_ACTOR, MALWARE, TOOL gazetteers
+4. **6 Tier 1 + 7 Tier 2 = 13 total types.** For initial training, we could start with just Tier 1 (6 types) and use regex for Tier 2.
+---
+## 5. Data Acquisition Plan
+### 5.1 Immediate Downloads
+| Priority | Dataset | Command | Estimated Effort |
+|----------|---------|---------|-----------------|
+| P0 | CyNER MITRE corpus | `git clone https://github.com/aiforsec/CyNER.git` → `dataset/mitre/` | 30 min to convert BIO→BIOES JSONL |
+| P0 | bnsapa/cybersecurity-ner | `load_dataset("bnsapa/cybersecurity-ner")` | 1 hr to map labels and convert |
+| P1 | PRISM | `git clone https://github.com/EvanFr/LANCE.git` → `PRISM/` | 4 hr to re-align to token-level BIOES |
+| P1 | MITRE ATT&CK | `git clone https://github.com/mitre/cti.git` | 2 hr to extract entity gazetteers |
+| P2 | Pile-NER cyber subset | `load_dataset("Universal-NER/Pile-NER-type")` + filter | 4 hr to filter, extract, convert |
+### 5.2 Conversion Pipeline
+1. **CyNER BIO → BIOES JSONL:**
+   - Read CoNLL format
+   - Convert BIO to BIOES (B on a single-token entity → S, last I of a multi-token entity → E)
+   - Split "Indicator" class into sub-types using regex on the entity text
+   - Output: `{"tokens": [...], "labels": [...]}` JSONL
+2. **bnsapa → BIOES JSONL:**
+   - Map numeric tags to named types
+   - Already tokenized, just need format conversion
+3. **PRISM → BIOES JSONL:**
+   - Parse GT.json for indicator spans + labels
+   - Load report text from ReportsJSON/
+   - Tokenize reports, align indicator spans to token boundaries
+   - Generate BIOES tags (all Tier 2 types)
+   - This gives us **context-aware IoC/nonIoC labels** — unique training signal
+4. **ATT&CK → Gazetteers + Distant Supervision:**
+   - Extract all (name, type) pairs from STIX JSON
+   - Use as entity dictionaries for weak labeling of unlabeled text
+   - Can generate synthetic NER training data via string matching on large CTI corpus
+5. **Pile-NER → BIOES JSONL:**
+   - Filter passages containing cybersecurity content
+   - Extract entity spans from conversational format
+   - Map entity types to our taxonomy
+   - Convert to BIOES
+### 5.3 Estimated Total Training Data After Conversion
+| Source | Estimated Spans | Quality |
+|--------|----------------|---------|
+| CyNER | 4,530 | High (human annotated) |
+| bnsapa | ~4,000 | Medium (possibly CyNER derivative) |
+| PRISM | ~1,774 | High (expert validated, but IoC types only) |
+| Pile-NER filtered | ~2,000-5,000 | Low-medium (GPT-3.5 generated) |
+| ATT&CK distant supervision | ~10,000-50,000 | Low (noisy distant supervision) |
+| **Total** | **~22,000-65,000** | Mixed |
+**Key gap:** We have good coverage for Tier 2 IOC types but limited training data for Tier 1 semantic types (THREAT_ACTOR, TOOL). MITRE ATT&CK distant supervision is crucial to fill this gap.
+---
+## 6. Sources
+### Papers
+- Alam et al. (2022). "CyNER: A Python Library for Cybersecurity Named Entity Recognition." arXiv:2204.05754. https://arxiv.org/abs/2204.05754
+- Froudakis et al. (2025). "Revealing the True Indicators: Understanding and Improving IoC Extraction From Threat Reports." arXiv:2506.11325, ACSAC 2025. https://arxiv.org/abs/2506.11325
+### Repositories & Datasets
+- CyNER GitHub: https://github.com/aiforsec/CyNER
+- CyNER dataset: https://github.com/aiforsec/CyNER/tree/main/dataset/mitre
+- LANCE/PRISM GitHub: https://github.com/EvanFr/LANCE
+- PRISM dataset: https://github.com/EvanFr/LANCE/tree/main/PRISM
+- bnsapa/cybersecurity-ner: https://huggingface.co/datasets/bnsapa/cybersecurity-ner
+- Universal-NER/Pile-NER-type: https://huggingface.co/datasets/Universal-NER/Pile-NER-type
+- MITRE ATT&CK STIX: https://github.com/mitre/cti
+- MITRE CVE: https://cve.mitre.org/
+- NVD: https://nvd.nist.gov/
+### Other References
+- MALOnt2.0: Christian et al. (2021), ACM CCS
+- MALOnt: Rastogi et al. (2020), SIGKDD Workshop
+- T-NER: Ushio & Camacho-Collados (2021), EACL
+- seqeval: https://pypi.org/project/seqeval/
+- BRAT annotation tool: Stenetorp et al. (2012)
+- ORKL threat report repository: orkl.eu

research/notes/progress/2026-04-24-landscape-research-opus.md ADDED Viewed

	@@ -0,0 +1,390 @@

+# Landscape Analysis: Lightweight Span/Entity Detection for Arcspan
+**Date:** 2026-04-24
+**Researcher:** Claude (automated research session)
+---
+## 1. Executive Summary
+**The biggest opportunity is cybersecurity IOC extraction from threat intelligence reports.** It sits at a unique intersection: (1) data is highly sensitive (can't send to cloud APIs), (2) current tools are either regex-only (miss context-dependent IOCs) or require BERT-large/GPT (too heavy for inline/edge use), (3) entity types are well-defined and short-span (IPs, hashes, CVEs, domains, malware names), (4) labeled datasets exist (CyNER, PRISM benchmark, Pile-NER subsets), and (5) the 257-token attention window is more than sufficient for the surrounding context needed. The gap between "regex that catches 70% of IOCs" and "BERT-large that catches 95% but requires a GPU" is exactly where a 50M-active-param model shines.
+Close runners-up: **clinical/medical de-identification** (privacy-critical, well-funded, abundant datasets) and **developer tooling** (secret scanning, TODO detection — huge TAM, runs in CI/CD pipelines on CPU).
+**Honest caveat:** The landscape is more crowded than expected at the ~100M param level (GLiNER, bert-base-NER, SpaCy transformers). Arcspan's edge is not raw accuracy — it's the combination of (a) extreme efficiency (50M active params via MoE vs 110M dense BERT), (b) trivial label-space reconfiguration via JSON, and (c) proven data efficiency (F1 0.962 with 10% of data). The play is "faster to customize, cheaper to run" rather than "better F1 on standard benchmarks."
+---
+## 2. Existing Tools Landscape
+### 2.1 Comparison Table
+| Tool/Model | Params | Size on Disk | Domains | Local/Cloud | Fine-tunable | Key Gaps |
+|---|---|---|---|---|---|---|
+| **SpaCy en_core_web_sm** | ~4M (CNN) | ~40 MB | General (18 OntoNotes types) | Local | Yes (spacy train) | Low F1 (~85-86), no domain-specific types, CNN-based |
+| **SpaCy en_core_web_md** | ~4M + vectors | ~140 MB | General (18 types) | Local | Yes | Marginal improvement over sm, bulky vectors |
+| **SpaCy en_core_web_lg** | ~4M + vectors | ~600-800 MB | General (18 types) | Local | Yes | Large disk footprint for minimal NER gain (~86-87 F1) |
+| **SpaCy en_core_web_trf** | ~110M (RoBERTa) | ~400 MB | General (18 types) | Local | Yes (slow) | Slow inference, GPU-preferred, 110M dense params |
+| **dslim/bert-base-NER** | 110M | ~440 MB | General (4 CoNLL types: PER/ORG/LOC/MISC) | Local | Yes | Only 4 entity types, 110M dense, F1=91.3 on CoNLL |
+| **GLiNER-S** | 50M (DeBERTa-v3-small) | ~200 MB | Zero-shot any type | Local | Limited | Zero-shot avg F1=52.7, needs many entity types at once |
+| **GLiNER-M** | 90M (DeBERTa-v3-base) | ~360 MB | Zero-shot any type | Local | Limited | Avg F1=55.4 zero-shot, no BIOES, span-matching only |
+| **GLiNER-L** | 300M (DeBERTa-v3-large) | ~1.2 GB | Zero-shot any type | Local | Limited | 300M params, F1=60.9 zero-shot avg, too large for edge |
+| **GLiNER-BioMed** | Multiple scales | Varies | Biomedical zero-shot NER | Local | Yes (distilled) | Domain-specific, 5.96% F1 improvement over baselines |
+| **StanfordAIMI deidentifier** | ~110M (PubMedBERT) | ~440 MB | Medical PHI/PII | Local | Yes | Medical-only, 110M dense, F1=97.9-99.6 on radiology |
+| **Microsoft Presidio** | Varies (SpaCy backend) | ~100 MB+ | PII (credit cards, SSN, names, etc.) | Local | Extensible | Relies on SpaCy + regex + checksums; limited ML depth |
+| **Google Cloud DLP** | Unknown (proprietary) | N/A | PII (50+ types) | Cloud only | No | Cloud-only, can't run locally, expensive at scale |
+| **AWS Comprehend** | Unknown (proprietary) | N/A | General NER + PII | Cloud only | Custom models possible | Cloud-only, latency, cost per API call |
+| **Flair NER (english-fast)** | ~50M (Flair embeddings) | ~250 MB | General (4 CoNLL types) | Local | Yes | Sequential LSTM, slower than transformers, limited types |
+| **CyNER** | ~560M (XLM-RoBERTa-large) | ~2 GB | Cybersecurity IOCs | Local | Yes | Very large model, combines regex+transformer+Flair+SpaCy |
+| **d4data/biomedical-ner-all** | 66M | ~265 MB | Biomedical (multiple entity types) | Local | Yes | Small community, limited benchmarking |
+| **Arcspan (this project)** | 50M active (1.5B total MoE) | TBD | Any (JSON config) | Local/Edge/Browser | Yes (very data-efficient) | Unproven on non-PII tasks, 257-token window |
+### 2.2 Key Observations
+**GLiNER is the closest competitor** to what Arcspan could become. Key differences:
+- GLiNER uses span-matching in latent space (dot product of entity-type embeddings and span embeddings). Arcspan uses BIOES sequence labeling with Viterbi decoding.
+- GLiNER-S at 50M params achieves avg F1 of 52.7 in zero-shot. This is the best zero-shot reference point we have for this parameter budget, not a proven ceiling.
+- GLiNER's strength is arbitrary entity types without retraining. Arcspan's strength would be higher F1 after minimal fine-tuning (F1 0.962 with 10% data).
+- **Critical differentiator:** GLiNER requires entity type descriptions at inference time (added to context window). Arcspan's labels are baked in at training — no context-window tax.
+**SpaCy dominates the "just works" space** but is stuck on 18 OntoNotes entity types for NER. Custom NER requires full retraining with `spacy train`. The CNN-based models (sm/md/lg) are fast but weak on edge cases; the transformer model (trf) is accurate but as heavy as BERT.
+**The 110M-param dense BERT remains the workhorse.** dslim/bert-base-NER has 1.86M downloads. Most domain-specific NER models on HuggingFace are fine-tuned BERT-base variants (~110M params). This is the bar Arcspan must beat on efficiency while matching on accuracy.
+---
+## 3. Vertical-by-Vertical Analysis
+### 3.1 Cybersecurity — IOC Extraction from Threat Intelligence
+**Current State:**
+- **CyNER** (Alam et al., 2022): Python library combining XLM-RoBERTa-large (~560M params) + regex + Flair + SpaCy. Transformer entity classes: Malware, Indicator, System, Organization, Vulnerability (no dedicated threat-actor or attack-type class).
+- **PRISM benchmark** (Froudakis et al., 2025): 1,791 labeled IOCs from 50 real-world threat reports. First high-quality ground truth for IOC extraction.
+- Most SOC teams use regex-based YARA rules + manual extraction. Heavy tools like BERT are used in research papers but not deployed inline.
+- OTuHunt framework proposes NLP-based IOC extraction for OT/ICS environments using MITRE ATT&CK mapping.
+**Gaps:**
+- CyNER is 560M+ params — can't run in a SIEM plugin or browser extension
+- Regex catches structured IOCs (IPs, hashes) well but misses: malware family names in context, CVE references without standard format, threat actor aliases, attack technique descriptions
+- No lightweight (<100M) production-ready cybersecurity NER model exists
+- Privacy concern: threat reports often contain internal network details that can't be sent to cloud APIs
+**Opportunity Assessment: HIGH**
+- Entity types are well-bounded and short-span (perfect for 257-token window)
+- PRISM + CyNER training data exists
+- Clear "this is done with 560M params but could be done with 50M active" opportunity
+- Data efficiency matters: new threat types emerge constantly, need fast retraining
+- Commercial market: every SIEM vendor needs this
+**Available Datasets:**
+- Pile-NER (Universal-NER): 45,889 passages, 240k entity spans, 13k entity types — filter for cybersecurity subset
+- PRISM: 1,791 labeled IOCs from 50 threat reports
+- CyNER training corpus (publicly available)
+- MITRE ATT&CK technique descriptions (structured, convertible)
+### 3.2 Medical/Clinical — De-identification and Entity Extraction
+**Current State:**
+- **StanfordAIMI de-identifier**: PubMedBERT-based (~110M), F1 97.9-99.6 on radiology reports. Gold standard for medical PII.
+- **i2b2 2006/2014 shared tasks**: Standard benchmarks for clinical NER and de-identification.
+- **GLiNER-BioMed** (Yazdani et al., 2025): Domain-adapted GLiNER for biomedical NER, 5.96% F1 improvement over baselines in zero-shot.
+- **d4data/biomedical-ner-all**: 66M param model, 91.5k downloads.
+- Discontinuous NER is a known challenge in medical text (Chen & Lin, 2024 — ensemble of 5 SOTA + ChatGPT on CADEC, ShARe13/14 datasets).
+**Gaps:**
+- Clinical NER models are mostly 110M+ params (BERT-base or larger)
+- HIPAA compliance requires local processing — cloud APIs are a non-starter for many hospitals
+- Edge deployment on medical devices (e.g., bedside monitors parsing clinical notes) needs <50M models
+- Drug-drug interaction entities, adverse event mentions, and dosage spans are context-dependent (regex fails)
+- Non-English clinical NER is severely underserved
+**Opportunity Assessment: MEDIUM-HIGH**
+- Strong need for local, lightweight processing (HIPAA)
+- But: medical NER demands very high recall (missed entities = patient safety risk)
+- The 257-token window may be limiting for long clinical documents
+- StanfordAIMI at 110M with F1 99.6 is a high bar
+- Better angle: Arcspan for quick fine-tuning to new clinical entity types (new drugs, new conditions) where the data-efficiency advantage shines
+**Available Datasets:**
+- i2b2 2006: ~870 clinical records, PHI entities (DUA required)
+- i2b2 2014: ~1,304 records, more comprehensive PHI types (DUA required)
+- CADEC: Adverse drug event corpus
+- ShARe/CLEF 2013/2014: Clinical disorder mentions
+- BC5CDR: Chemical-disease relations, ~1,500 PubMed articles
+- JNLPBA: Gene/protein NER from MEDLINE abstracts
+- NCBI Disease: Disease name recognition, 793 PubMed abstracts
+- MedMentions: 4,392 PubMed abstracts, UMLS concepts
+### 3.3 Financial — Entities in SEC Filings, Reports, and News
+**Current State:**
+- **SEC-filings NER dataset**: CC-BY 3.0, financial entity annotations
+- Most financial NER is done with general-purpose models (BERT-base) or LLMs via API
+- Bloomberg, Reuters, and financial data providers use proprietary NER systems
+- Ticker symbol detection is largely regex-based but fails for ambiguous tickers (e.g., "META" as word vs ticker, "CASH" the ticker vs the word)
+**Gaps:**
+- No widely-adopted lightweight financial NER model exists
+- Monetary amounts with complex formatting (ranges, currencies, percentages in context)
+- Risk indicators and sentiment-bearing financial terms are context-dependent
+- Compliance teams need local processing for material non-public information (MNPI)
+**Opportunity Assessment: MEDIUM**
+- Financial text is relatively structured — regex handles more than in other domains
+- But context-dependent ambiguity (ticker vs. word, amount vs. reference) is real
+- MNPI sensitivity argues for local processing
+- Smaller market than cybersecurity or medical
+### 3.4 Legal — Contract Analysis
+**Current State:**
+- German court decision segmentation dataset: 251,038 decisions (Darji et al., 2026)
+- LegalEval@SemEval2023: Rhetorical role prediction in legal opinions
+- Most legal NER is done with BERT-based models or LLMs
+- Contract analysis tools (Kira Systems, Luminance, Ironclad) use proprietary models
+**Gaps:**
+- No public lightweight legal NER model for contract entities (parties, obligations, dates, defined terms)
+- Legal language is highly domain-specific with nested references
+- Clause boundary detection is not standard NER but is a span detection task
+- Confidential contracts can't go to cloud APIs
+**Opportunity Assessment: MEDIUM**
+- Clear privacy need (contracts are confidential)
+- But legal entities are often long spans (entire clauses) — may exceed the 257-token window
+- Defined terms are often multi-word and require document-level context (cross-reference to definitions section)
+- Would need significant labeled data creation
+### 3.5 Developer Tools — Secret Scanning, TODO/FIXME Detection
+**Current State:**
+- GitHub secret scanning uses regex patterns for known secret formats
+- Tools like `detect-secrets` (Yelp), `truffleHog`, `gitleaks` use regex + entropy-based detection
+- TODO/FIXME detection is purely regex (`grep -r "TODO"`)
+- License detection tools (ScanCode, licensee) use text matching
+**Gaps:**
+- Regex-based secret scanning has known false positive/negative issues:
+  - High-entropy strings that aren't secrets (base64-encoded data, UUIDs)
+  - Secrets in non-standard formats (custom API key patterns)
+  - Test/example keys vs. real keys (context matters: `example_key = "sk_test_..."` vs `API_KEY = "sk_live_..."`)
+- TODO/FIXME detection misses natural language equivalents ("we need to fix this later", "temporary workaround")
+- No context-aware lightweight code annotation detector exists
+**Opportunity Assessment: HIGH**
+- Massive TAM (every development team, every CI/CD pipeline)
+- Runs in CI/CD where speed and CPU-only inference matter
+- Short spans with clear contextual cues — perfect for 257-token window
+- False positive reduction in secret scanning is commercially valuable
+- Data can be synthetically generated from code repositories
+### 3.6 Scientific Literature — Chemical/Gene/Material Entities
+**Current State:**
+- Biomedical NER is well-served (see section 3.2)
+- Chemical NER: ChemNER, CHEMDNER dataset
+- Gene NER: JNLPBA, BioCreative
+- Material science NER: relatively underserved, mostly custom BERT models
+**Opportunity Assessment: LOW-MEDIUM**
+- Well-served by existing BERT-based models
+- Scientific entities are often long, complex terms (chemical formulas, gene names with variants)
+- Domain expertise needed for label schema design
+- Niche market
+### 3.7 Energy/Power Systems — Equipment and Fault Entities
+**Current State:**
+- **No publicly available NER model or dataset specifically for power systems exists.** This was confirmed by extensive searching.
+- Maintenance logs, SCADA alarm descriptions, and protection relay settings contain equipment IDs, fault codes, and protection settings
+- Currently handled by regex patterns or manual extraction
+- Some proprietary systems within ABB, Siemens, GE Vernova use custom NLP
+**Gaps:**
+- Equipment naming is highly inconsistent across utilities (e.g., "Breaker 101" vs "CB-101" vs "52-1" for the same physical device)
+- Fault descriptions mix technical jargon with natural language ("phase A-to-ground fault on 138kV bus" — multiple entities interleaved)
+- Protection settings are contextual ("Zone 1 reach set to 80% of line impedance" — "80%" is meaningless without context)
+**Opportunity Assessment: LOW (for now)**
+- No training data exists publicly
+- Tiny market (hundreds of utilities vs. millions of developers)
+- Would require partnerships with utilities to get labeled data
+- The human's domain expertise is relevant but shouldn't force this choice
+- Could revisit if a utility partner materializes
+### 3.8 Supply Chain/Manufacturing — Part Numbers and Defects
+**Current State:**
+- Part number extraction is largely regex-based
+- Defect description NER is underserved
+- Quality management systems use proprietary text analysis
+**Opportunity Assessment: LOW-MEDIUM**
+- Part numbers are highly structured (good for regex)
+- Defect descriptions need context but data is proprietary
+- No public datasets
+### 3.9 Education — Learning Objectives and Prerequisites
+**Current State:**
+- Minimal NER work in education domain
+- Curriculum analysis is mostly manual or LLM-based
+**Opportunity Assessment: LOW**
+- Entities are often full sentences (learning objectives) — too long for span detection
+- Small market
+- No data
+### 3.10 HR/Recruiting — Resume Entities
+**Current State:**
+- Resume parsing is a mature field (Sovren, HireEZ, etc.)
+- Most use regex + rule-based systems or cloud NER APIs
+- Privacy concerns are significant (resume data is PII-heavy)
+**Opportunity Assessment: MEDIUM**
+- Clear privacy need (resumes contain PII)
+- Skills, certifications, and experience spans are well-suited entity types
+- But: commercial resume parsers are entrenched
+- Resume NER datasets exist on HuggingFace (Chinese Resume NER dataset used in research)
+---
+## 4. The Regex Ceiling — Where Rules Fail and ML Is Needed
+### 4.1 Concrete Examples
+**1. Cybersecurity: Malware Family Names**
+- Regex can match structured identifiers such as `CVE-\d{4}-\d{4,}` but cannot identify "the Lazarus group deployed a new variant of their DreamJob toolkit" — "Lazarus group" (threat actor) and "DreamJob" (malware family) require contextual understanding.
+**2. Financial: Ticker Symbol Ambiguity**
+- "META" appears in: "Meta Platforms (META) reported earnings" vs. "the meta-analysis of clinical trials" vs. "META filed a 10-K." Regex matches all; only context resolves.
+**3. Medical: Drug Names in Clinical Notes**
+- "Patient was started on aspirin" is easy. "Pt was started on ASA 81 daily" requires knowing ASA=aspirin. "Started on baby aspirin" — "baby" modifies the entity but isn't part of the drug name.
+**4. Secret Scanning: Context-Dependent Sensitivity**
+- `password = "correct horse battery staple"` — is this a real password or a reference to the famous XKCD comic? Context (file path, variable name patterns, test file vs. production) matters.
+- `API_KEY = "sk_test_abc123"` — test key, low risk. `API_KEY = "sk_live_abc123"` — production key, high risk. Regex treats both the same.
+**5. Legal: Party References**
+- "Acme Corp (hereinafter 'the Company')" — after this point, "the Company" refers to Acme Corp. Regex can't resolve coreference.
+**6. Energy: Equipment References**
+- "Open breaker at Station A" vs. "The breaker opened due to a fault at Station A" — "breaker" is equipment in both, but the action and causality differ. More critically: "The 138kV line from Station A to Station B" — the entity is the entire phrase, not just "138kV" or "Station A."
+### 4.2 The Pattern
+Regex fails when:
+1. **The same surface form has different meanings** depending on context (META, ASA, "the Company")
+2. **Entity boundaries are context-dependent** ("baby aspirin" — is "baby" part of the entity?)
+3. **Entity types require understanding surrounding text** (a string is only a "secret" if it's in a production config, not a test file)
+4. **Entities are described in natural language** rather than following a pattern ("the threat actor known as...")
+5. **Domain-specific abbreviations** resolve differently in context (ASA = aspirin vs. ASA = American Standards Association)
+This is precisely the sweet spot for a lightweight ML model: the entities are short spans, the context window needed is small (usually within a sentence or two), and the contextual cues are learnable.
+---
+## 5. Top 5 Opportunity Areas (Ranked)
+### Rank 1: Cybersecurity IOC Extraction
+- **Impact:** High (every SOC team, every SIEM vendor)
+- **Feasibility:** High (datasets exist, entity types are well-defined, short spans)
+- **Gap in market:** Large (CyNER is 560M+ params, regex misses 30%+ of contextual IOCs)
+- **Arcspan advantage:** 50M active params for inline/edge use, data-efficient retraining for new threat types
+- **Score: 9/10**
+### Rank 2: Developer Tools — Context-Aware Secret & Annotation Scanning
+- **Impact:** Very High (millions of developers, every CI/CD pipeline)
+- **Feasibility:** High (code is structured, synthetic training data possible, short spans)
+- **Gap in market:** Moderate (regex tools exist but high false positive rate; no ML-based lightweight alternative)
+- **Arcspan advantage:** CPU-only inference in CI/CD, browser extension for code review, JSON-configurable for custom secret patterns
+- **Score: 8.5/10**
+### Rank 3: Clinical/Medical De-identification
+- **Impact:** High (HIPAA compliance, patient safety)
+- **Feasibility:** Medium-High (i2b2 datasets exist but DUA-restricted, entity types well-defined)
+- **Gap in market:** Moderate (StanfordAIMI exists at 110M but nothing at 50M; edge medical devices need smaller)
+- **Arcspan advantage:** ~2x fewer active parameters than BERT-base (50M active vs. 110M dense; total parameter count is larger), data-efficient for new institution adaptation
+- **Score: 7.5/10**
+### Rank 4: PII Detection (Improved Presidio Backend)
+- **Impact:** High (GDPR/CCPA compliance is universal)
+- **Feasibility:** Very High (model is already trained for PII, just extend/adapt)
+- **Gap in market:** Moderate (Presidio exists but uses SpaCy CNN or full BERT; Arcspan could be a drop-in replacement with better accuracy/speed)
+- **Arcspan advantage:** Native capability (the model was designed for PII), MoE efficiency, already proven
+- **Score: 7/10**
+### Rank 5: Financial Entity Extraction
+- **Impact:** Medium-High (compliance, MNPI handling)
+- **Feasibility:** Medium (SEC-filings dataset exists, but financial text is semi-structured)
+- **Gap in market:** Moderate (no lightweight financial NER, but regex covers a lot)
+- **Arcspan advantage:** Context-dependent disambiguation (ticker vs. word), local processing for MNPI
+- **Score: 6/10**
+---
+## 6. Available Datasets
+| Dataset | Domain | Size | Entity Types | Source/Link | BIOES-Convertible? |
+|---|---|---|---|---|---|
+| **CoNLL-2003** | General news | 14,987 sentences, 203K tokens | PER, ORG, LOC, MISC | Standard benchmark | Yes (natively BIO, trivial BIOES conversion) |
+| **OntoNotes 5** | Mixed | ~1.7M tokens | 18 types (PERSON, ORG, GPE, etc.) | LDC2013T19 | Yes |
+| **Pile-NER** | Multi-domain | 44,889 passages, 240K spans, 13K entity types | Open/diverse | [HuggingFace](https://huggingface.co/datasets/Universal-NER/Pile-NER-type) | Needs conversion from span format |
+| **PRISM** | Cybersecurity IOCs | 1,791 IOCs from 50 threat reports | IP, hash, domain, CVE, malware name | arxiv:2506.11325 | Needs conversion |
+| **CyNER corpus** | Cybersecurity | Unknown size (publicly available) | Malware, threat actor, attack type, IOC | [GitHub](https://github.com/aiforsec/CyNER) | Needs conversion |
+| **i2b2 2006** | Clinical PHI | ~870 records | PHI types (name, date, location, etc.) | DUA required | Yes (natively BIO) |
+| **i2b2 2014** | Clinical PHI | ~1,304 records | Expanded PHI types | DUA required | Yes |
+| **BC5CDR** | Biomedical | ~1,500 PubMed articles | Chemical, Disease | [BioCreative](https://biocreative.bioinformatics.udel.edu/) | Yes (BIO format) |
+| **JNLPBA** | Biomedical | 2,404 MEDLINE abstracts | Protein, DNA, RNA, cell line, cell type | JNLPBA shared task | Yes (natively BIO) |
+| **NCBI Disease** | Biomedical | 793 PubMed abstracts | Disease names | [NCBI](https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/) | Yes |
+| **SEC-filings** | Financial | Unknown size | Financial entities | CC-BY 3.0 | Likely convertible |
+| **WikiNER** | General (multilingual) | ~170K sentences (French), similar for other langs | PER, ORG, LOC, MISC | Multiple HuggingFace repos | Yes |
+| **WNUT17** | Social media | ~5K tweets | Person, Location, Corporation, Product, Creative Work, Group | CC-BY 4.0 | Yes (BIO format) |
+| **Few-NERD** | General (fine-grained) | 188K sentences | 66 fine-grained types in 8 coarse types | [HuggingFace](https://huggingface.co/datasets/DFKI-SLT/few-nerd) | Yes |
+| **conll2025-ner** | General | 144K examples | Multiple types | [HuggingFace](https://huggingface.co/datasets/boltuix/conll2025-ner) | Likely yes |
+| **MedMentions** | Biomedical | 4,392 PubMed abstracts | UMLS concepts | PubMed | Needs conversion from UMLS |
+| **pasteproof-pii-dataset** | PII | 150K examples | PII entity types | [HuggingFace](https://huggingface.co/datasets/joneauxedgar/pasteproof-pii-dataset-v2) | Likely yes |
+---
+## 7. Sources
+### Papers
+- Zaratiana et al. (2023). "GLiNER: Generalist Model for Named Entity Recognition using Bidirectional Transformer." arXiv:2311.08526. https://arxiv.org/abs/2311.08526
+- Yazdani et al. (2025). "GLiNER-biomed: A Suite of Efficient Models for Open Biomedical Named Entity Recognition." arXiv:2504.00676. https://arxiv.org/abs/2504.00676
+- Yoo et al. (2025). "ReProCon: Scalable and Resource-Efficient Few-Shot Biomedical NER." arXiv:2508.16833
+- Xu et al. (2024). "GoalBERT: A Lightweight Named-Entity Recognition Model Based on Multiple Fusion." Applied Sciences 14(23):11003
+- Alam et al. (2022). "CyNER: A Python Library for Cybersecurity Named Entity Recognition." arXiv:2204.05754. https://arxiv.org/abs/2204.05754
+- Froudakis et al. (2025). "Revealing the True Indicators: Understanding and Improving IoC Extraction From Threat Reports." arXiv:2506.11325. https://arxiv.org/abs/2506.11325
+- Arikkat et al. (2023). "Discerning Reliable Cyber Threat Indicators for Timely Cyber Threat Intelligence." arXiv:2306.16087
+- Chen & Lin (2024). "On Fusing ChatGPT and Ensemble Learning in Discontinuous NER in Health Corpora." arXiv:2412.16976
+- Darji et al. (2026). "Segmentation and Processing of German Court Decisions from Open Legal Data." arXiv:2601.01449
+- Belfathi et al. (2023). "Enhancing Pre-Trained Language Models with Sentence Position Embeddings for Rhetorical Roles Recognition in Legal Opinions." arXiv:2310.05276
+- Atuhurra et al. (2024). "NERsocial: Efficient NER Dataset Construction for HRI Utilizing RapidNER." arXiv:2412.09634
+### Models & Tools (HuggingFace / GitHub)
+- dslim/bert-base-NER: https://huggingface.co/dslim/bert-base-NER (110M params, F1=91.3 CoNLL)
+- GLiNER models: https://huggingface.co/urchade/gliner_medium-v2.1 (50M/90M/300M variants)
+- StanfordAIMI de-identifier: https://huggingface.co/StanfordAIMI/stanford-deidentifier-base (F1=97.9-99.6)
+- d4data/biomedical-ner-all: https://huggingface.co/d4data/biomedical-ner-all (66M params)
+- CyNER: https://github.com/aiforsec/CyNER
+- Microsoft Presidio: https://github.com/microsoft/presidio
+- GLiNER repo: https://github.com/urchade/GLiNER
+- Entity recognition datasets list: https://github.com/juand-r/entity-recognition-datasets
+- SpaCy models: https://spacy.io/models/en
+### Web Sources
+- Edge AI & Vision Alliance (2026). "On-Device LLMs in 2026: What Changed, What Matters." https://www.edge-ai-vision.com/2026/01/on-device-llms-in-2026-what-changed-what-matters-whats-next/
+- ACM Queue. "Generative AI at the Edge: Challenges and Opportunities." https://queue.acm.org/detail.cfm?id=3733702
+- Facebook Research. MobileLLM. https://github.com/facebookresearch/MobileLLM
+- HuggingFace token-classification models: https://huggingface.co/models?pipeline_tag=token-classification&sort=downloads
+- HuggingFace token-classification datasets: https://huggingface.co/datasets?task_categories=token-classification&sort=downloads
+### Research Limitations
+- Could not access OpenAI's PII redaction blog post (403 error) — technical details about the base model architecture sourced from the user's project description
+- PapersWithCode redirected to HuggingFace (302) — could not access benchmark leaderboards directly
+- Brave Search MCP was intermittently returning no results for many queries
+- Paper search engine returned many irrelevant results for domain-specific queries (NER search polluted with unrelated papers)
+- GLiNER paper PDF could not be parsed by WebFetch — extracted text via paper-search CLI instead

research/notes/progress/2026-04-24-ner-recall-improvement-techniques.md ADDED Viewed

	@@ -0,0 +1,61 @@

+# NER Recall Improvement Techniques Research
+**Date:** 2026-04-24
+**Context:** Cybersecurity NER, ~63% span F1, recall bottleneck (40-60%), 1.5B sparse MoE (50M active), BIOES + Viterbi
+## Ranked Recommendations (implementable in 1-2 days)
+### Tier 1: Highest impact, lowest effort
+1. **Lower the entity confidence threshold / bias the O-class down**
+   Viterbi decoding uses transition + emission scores. Add a negative bias to the "O" emission logit (e.g., -0.5 to -2.0) to make the model less eager to predict outside-entity. Tune on val set. Zero training cost.
+   - Source: calibration paper https://ar5iv.labs.arxiv.org/html/2004.04361
+2. **Self-training / pseudo-labeling on unlabeled cyber text**
+   Run current model on large unlabeled corpus (CVE descriptions, threat reports), keep only high-confidence spans (>0.9), add to training data, retrain. SeqUST framework shows this matches 3-8x more labeled data.
+   - Source: https://arxiv.org/abs/2302.08659 (AAAI), https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0246310
+3. **Negative sampling / entity ratio rebalancing**
+   If training data is dominated by O tokens (typically >80%), downsample all-O sentences or upsample entity-rich sentences. Alternatively, use focal loss or a class-weighted cross-entropy that down-weights the O class.
+### Tier 2: Medium effort, strong evidence
+4. **Label taxonomy harmonization (CyberNER insight)**
+   CyberNER paper showed harmonizing disparate tag schemas onto STIX 2.1 improved F1 from 0.569 to 0.736 (30% relative). Review if our label definitions have overlapping/ambiguous categories.
+   - Source: https://arxiv.org/html/2510.26499v1
+5. **Two-stage: binary entity detection then classification**
+   Train first with binary BIOES (entity vs not-entity), then fine-tune with full label set. Addresses recall by simplifying the initial detection task. Common in biomedical NER.
+6. **Auxiliary binary "is-entity" head**
+   Add a parallel binary classification head alongside the BIOES head during training. Multi-task signal forces the model to first learn entity boundaries.
+### Tier 3: MoE-specific and architecture tweaks
+7. **Reduce auxiliary load-balancing loss coefficient during fine-tuning**
+   Default load-balancing loss may fight task adaptation. Reduce coefficient (e.g., 0.01 → 0.001) to let experts specialize for cyber entities.
+   - Source: https://apxml.com/courses/mixture-of-experts-advanced-implementation/chapter-3-training-large-scale-moes/fine-tuning-pretrained-moe
+8. **Freeze router, fine-tune experts only (or vice versa)**
+   BOND-MoE paper shows document-level expert routing helps NER. Try freezing router weights for first N epochs, then unfreezing.
+   - Source: https://arxiv.org/pdf/2404.19192
+9. **Data augmentation: entity mention replacement**
+   Replace entity mentions with synonyms/variants (e.g., swap malware names, IP formats). Preserves context while diversifying entity surface forms. Low effort with regex + lookup tables.
+### Tier 4: Worth trying if time permits
+10. **Iterative label re-correction**
+    Use model predictions to find missed annotations in the training data (the model predicts an entity where gold says O). Manually review the top disagreements — these often reveal annotation errors.
+    - Source: https://pmc.ncbi.nlm.nih.gov/articles/PMC8170952/
+## Key Cyber NER Benchmarks
+- CyberNER (RoBERTa): 73.6% F1 on unified STIX corpus
+- Fine-tuned LLMs (Mistral 7B etc.): ~74% F1
+- BERT fine-tuned on clean single-domain: up to 96% F1 (narrow scope)
+- Zero-shot self-improving: 61-75% F1 depending on dataset
+## Open Questions
+- What is our current O-token ratio in training data?
+- Are we using any load-balancing loss? What coefficient?
+- Have we tried threshold tuning on Viterbi emission scores?

research/notes/progress/2026-04-26-01-feasibility-check-approach.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# Feasibility Check: Is the Arcspan Cyber-NER Approach Viable?
+**Date:** 2026-04-26
+**Status:** External validation pass after R8/R9 local work
+## What we found
+Three external facts materially support the current direction:
+1. **OpenAI Privacy Filter is explicitly designed for domain adaptation and precision/recall control.**
+   The model card states that the model can be adapted through fine-tuning, that runtime decoding can control precision/recall tradeoffs, and that even small fine-tuning sets can yield large gains on shifted distributions.
+2. **CyberNER validates the harmonization thesis.**
+   The CyberNER paper argues that naive concatenation of cyber NER datasets degrades performance, while principled STIX-based harmonization yields about a **30% relative F1 improvement** over naive merging. This directly supports the repo’s effort to normalize CyNER, DNRTI, APTNER, and related sources before training.
+3. **SecureBERT 2.0 is strong evidence of where the performance ceiling sits, but it reaches that ceiling mostly through domain pretraining rather than supervised NER fine-tuning alone.**
+   The SecureBERT 2.0 paper reports pretraining on **13B+ cybersecurity text tokens** plus **53M code tokens** and then achieving state-of-the-art benchmark performance. This is a useful target, but not a fair apples-to-apples baseline for the current Arcspan setup, which is adapting a general span model rather than a cyber-native encoder.
+Also relevant:
+4. **MoE for NER is plausible, but evidence is about robustness to noisy labels more than guaranteed domain transfer.**
+   BOND-MoE supports the general idea that MoE can help NER under noisy or weak supervision, but it does not by itself prove that the current OPF routing behavior is optimal for cyber CTI extraction.
+## Why it matters
+The current approach is **feasible**, but only under the right claim:
+- Feasible claim: "A compact general-purpose span detector can be retargeted into a useful cyber NER model with careful schema harmonization, leakage control, and targeted data augmentation."
+- Not yet supported claim: "This approach will match or beat cyber-native models like SecureBERT 2.0 on top-end benchmark F1."
+The repo’s current bottleneck is no longer whether the base OPF model can learn the task at all. R8 already shows it can. The bottleneck is whether data representation and decoding can close the Organization/System recall gap enough to produce a competitive and useful model.
+## Source
+- OpenAI Privacy Filter model card:
+  https://cdn.openai.com/pdf/c66281ed-b638-456a-8ce1-97e9f5264a90/OpenAI-Privacy-Filter-Model-Card.pdf
+- CyberNER:
+  https://arxiv.org/abs/2510.26499
+- SecureBERT 2.0:
+  https://arxiv.org/abs/2510.00240
+- BOND-MoE:
+  https://arxiv.org/abs/2404.19192
+## Open questions
+- Can strict R9 training plus decode calibration move exact-match APTNER F1 from ~0.50 into the low 0.60s?
+- Is the propagated dataset genuinely useful, or does it mainly inject memorization and ambiguous false positives?
+- Would the highest-leverage next step be better decoding/calibration, or another round of targeted Org/System data creation?

research/notes/progress/2026-04-26-02-cyner-exact-match-and-gap-analysis.md ADDED Viewed

	@@ -0,0 +1,85 @@

+# R8 Exact-Match on CyNER + Gap Analysis vs APTNER
+**Date:** 2026-04-26
+**Status:** Completed locally on saved R8 checkpoint
+## What we found
+Ran the exact-match evaluator on the saved R8 checkpoint against the converted original CyNER test set:
+- Checkpoint: `checkpoints/r8_5class/epoch_4`
+- Test data: `data/processed/cyner_test.jsonl`
+- Output JSON: `results/r8_cyner_exact_match.json`
+### CyNER exact-match results
+| Metric | Value |
+|---|---:|
+| Micro precision | 0.4540 |
+| Micro recall | 0.3655 |
+| **Micro F1** | **0.4050** |
+For comparison, containment on the same run was:
+| Metric | Value |
+|---|---:|
+| Micro precision | 0.5097 |
+| Micro recall | 0.4283 |
+| **Micro F1** | **0.4655** |
+Per-class exact-match F1 on CyNER:
+| Class | Precision | Recall | F1 | Support |
+|---|---:|---:|---:|---:|
+| Malware | 0.5847 | 0.5702 | **0.5774** | 242 |
+| Indicator | 0.5181 | 0.1648 | **0.2500** | 261 |
+| Organization | 0.2875 | 0.3511 | **0.3162** | 131 |
+| System | 0.4120 | 0.3871 | **0.3992** | 248 |
+| Vulnerability | 0.5000 | 0.3000 | **0.3750** | 10 |
+### Test-distribution comparison
+Span distribution by class:
+| Class | R8 train | R9 train | CyNER test | APTNER test |
+|---|---:|---:|---:|---:|
+| Indicator | 32.1% | 23.8% | 29.3% | 16.2% |
+| Malware | 22.0% | 24.1% | 27.1% | 30.0% |
+| Organization | 19.3% | 24.5% | 14.7% | 26.8% |
+| System | 17.3% | 18.5% | 27.8% | 25.6% |
+| Vulnerability | 9.3% | 9.1% | 1.1% | 1.5% |
+### Main gap signals
+1. **CyNER is the hardest Indicator benchmark** for the current model.
+   The repo’s audit already found that CyNER indicators are often defanged, partial, or unconventional: package names, registry paths, and odd multi-token indicators. R8 exact-match confirms this with Indicator F1 collapsing to **0.25**.
+2. **APTNER and CyNER stress different weaknesses.**
+   - APTNER exposes **Organization/System contextual recall** problems in APT-report text.
+   - CyNER exposes **Indicator format coverage** problems plus some boundary issues.
+3. **R9 is better aligned for APTNER than for CyNER.**
+   R9 meaningfully increases Organization share (19.3% → 24.5%) and slightly increases System share (17.3% → 18.5%), which should help APTNER more than CyNER. It does **not** directly solve the CyNER-specific defanged/unconventional Indicator problem.
+## Why it matters
+This clarifies the anti-benchmaxxing position:
+- Improving APTNER requires **APT-style Org/System data**.
+- Improving CyNER requires **CyNER-style Indicator coverage**.
+Those are related, but not identical, objectives. A generalized model needs both. If we only optimize for one benchmark distribution, we will regress on the other.
+## Source
+- Result file: `results/r8_cyner_exact_match.json`
+- Evaluator: `scripts/eval_exact_match.py`
+- Prior audits:
+  - `research/notes/progress/2026-04-24-45-data-quality-audit.md`
+  - `research/notes/progress/2026-04-24-54-audit-train-test-leakage.md`
+## Open questions
+- Should we split future data work into two explicit buckets: `APT-style Org/System` and `CyNER-style Indicator`?
+- Is a separate Indicator-format normalization pass more valuable than adding more generic cybersecurity spans?
+- Should R9 strict be followed immediately by an `R9+indicator-coverage` dataset branch rather than a single monolithic next round?

research/paper/outline.md ADDED Viewed

	@@ -0,0 +1,150 @@

+# Paper Outline: Arcspan — Efficient Cybersecurity Entity Extraction via Sparse Mixture-of-Experts
+**Working title — will refine later**
+---
+## Abstract (draft skeleton)
+We present Arcspan, a cybersecurity entity extraction system built by fine-tuning a 1.5B-parameter sparse Mixture-of-Experts (MoE) bidirectional token classifier with only 50M active parameters per token. Starting from OpenAI's Privacy Filter architecture, we replace the PII label space with cybersecurity-specific entity types and fine-tune on [datasets TBD]. On [benchmark TBD], Arcspan achieves [F1 TBD] compared to CyNER's [F1 TBD] while using 11x fewer active parameters, running [Nx] faster on CPU, and requiring as few as [N] labeled examples to reach [F1] accuracy. Our results demonstrate that sparse MoE architectures offer a compelling efficiency–accuracy tradeoff for domain-specific entity extraction, enabling deployment in resource-constrained environments where dense transformers are impractical.
+---
+## 1. Introduction
+- The need for automated IOC/entity extraction from threat intelligence reports
+- Current approaches: regex (fast but misses contextual entities), dense transformers (accurate but heavy), hybrid (CyNER)
+- The opportunity: sparse MoE models offer large parameter capacity with small active compute
+- Our contribution: first application of sparse MoE token classification to cybersecurity NER
+- Preview of results
+## 2. Related Work
+### 2.1 Cybersecurity NER
+- CyNER (Alam et al., 2022) — hybrid regex + XLM-RoBERTa-large + SpaCy
+- PRISM benchmark (Froudakis et al., 2025) — IOC extraction evaluation
+- Regex-based tools (YARA, Sigma rules, IOC parsers)
+- OTuHunt and other OT/ICS-specific NLP
+### 2.2 Lightweight NER
+- GLiNER (Zaratiana et al., 2023) — zero-shot span detection
+- SpaCy models — CNN and transformer variants
+- BERT-base-NER (dslim) — the dense baseline
+- Flair, Stanza, and other lightweight alternatives
+### 2.3 Sparse Mixture-of-Experts for NLP
+- MoE in language modeling (Switch Transformer, GShard)
+- MoE for classification tasks (less explored)
+- The Privacy Filter architecture as an MoE token classifier
+## 3. Architecture
+### 3.1 Base Model
+- OpenAI Privacy Filter: 1.5B total / 50M active params
+- Architecture: 8-layer pre-norm transformer encoder, 128 experts (top-4 routing), GQA (14 Q / 2 KV heads), banded bidirectional attention (257-token window)
+- Pretraining: autoregressive on gpt-oss pipeline, then converted to bidirectional classifier
+- Decoding: constrained Viterbi with BIOES transition enforcement and 6 tunable transition biases
+### 3.2 Label Space Adaptation
+- Custom label-space JSON → automatic output head rebuild with warm-start row copying
+- Our cybersecurity label taxonomy: [entity types TBD]
+- BIOES encoding: 1 + N×4 token-level classes
+### 3.3 Fine-tuning Procedure
+- Full-model fine-tuning (all parameters, not just head)
+- AdamW optimizer, hyperparameters: [TBD, will tune]
+- Windowed training for long documents
+## 4. Experimental Setup
+### 4.1 Datasets
+- Training data: [TBD — CyNER corpus + PRISM + Pile-NER subset]
+- Evaluation data: [held-out test split from same, plus cross-dataset generalization]
+- Data conversion pipeline: source format → BIOES JSONL
+### 4.2 Baselines
+1. **CyNER** (Alam et al., 2022) — 560M params, hybrid pipeline (our primary comparison)
+2. **BERT-base-NER** fine-tuned on same data — 110M dense params (standard NER baseline)
+3. **GLiNER-M** zero-shot — 90M params (zero-shot ceiling)
+4. **Regex-only** — pattern matching for structured IOCs (lower bound)
+5. **SpaCy en_core_web_trf** — 110M params, general NER (out-of-domain baseline)
+### 4.3 Evaluation Protocol
+- **Primary metric:** Span-level F1, precision, recall (exact match)
+- **Secondary metrics:** Token-level F1, partial span overlap F1
+- **Per-entity-type breakdown** — critical for understanding where MoE wins/loses
+- **Statistical significance:** bootstrap confidence intervals on F1
+- All on same held-out test set, 5 random seed runs for variance estimation
+### 4.4 Efficiency Metrics
+- Inference latency: mean ± std ms per document (CPU single-thread, batch size 1)
+- Throughput: documents per second (CPU, various batch sizes)
+- Memory: peak RSS during inference
+- Model size on disk (bf16)
+- All measured on same hardware (document specs: CPU model, RAM, OS)
+## 5. Results
+### 5.1 Main Results (Table 1)
+- Arcspan vs. all baselines: span-level F1, precision, recall, latency, model size
+- Per-entity-type F1 breakdown (Table 2)
+### 5.2 Data Efficiency (Figure 1 — the money chart)
+- Learning curve: F1 at {1%, 5%, 10%, 25%, 50%, 100%} of training data
+- Compare Arcspan curve vs. BERT-base fine-tuned on same fractions
+- This is where the MoE pretraining advantage should be most visible
+### 5.3 Efficiency–Accuracy Tradeoff (Figure 2)
+- Scatter plot: F1 vs. active parameters for all models
+- Scatter plot: F1 vs. inference latency (CPU)
+- Arcspan should be Pareto-optimal or near-Pareto
+### 5.4 Viterbi Decoding Analysis
+- Impact of Viterbi vs. per-token argmax on span-level F1
+- Precision–recall curves at different Viterbi operating points
+- This is unique to our architecture — no other lightweight NER has this
+### 5.5 Error Analysis
+- Where does Arcspan fail vs. CyNER?
+- Structured IOCs (IPs, hashes) — how much do regex pre-filters help?
+- Context-dependent entities (malware names, threat actors) — the ML sweet spot
+- Obfuscated/defanged indicators — known hard case
+### 5.6 Ablation Studies
+- Full model fine-tuning vs. head-only fine-tuning
+- Effect of number of active experts (top-2 vs top-4 vs top-8)
+- Effect of Viterbi transition biases
+- Effect of banded attention window size (if modifiable)
+## 6. Discussion
+- When does sparse MoE beat dense transformers for NER? (our hypothesis: when domain adaptation is needed and data is limited)
+- Practical deployment considerations (CPU inference, edge devices, browser)
+- Limitations: 257-token window, English-centric, static label space
+- The label-space reconfigurability as a platform feature (same base model, different verticals)
+## 7. Conclusion
+- Summary of contribution
+- Open-source release: fine-tuned checkpoint, label space config, evaluation scripts
+- Future work: additional verticals (energy, medical), multilingual cybersecurity NER
+---
+## Key Experiments We MUST Run
+1. **Main comparison table** — Arcspan vs CyNER vs BERT-base vs GLiNER vs regex (span F1, latency, size)
+2. **Data efficiency curve** — F1 vs training data fraction (the killer chart)
+3. **Per-entity-type breakdown** — where MoE wins/loses vs dense
+4. **Viterbi vs argmax** — unique architectural advantage
+5. **Ablation: experts_per_token** — the `OPF_EXPERTS_PER_TOKEN` env var lets us test top-2 vs top-4
+## What We Need Before We Can Run Experiments
+- [ ] Finalized label taxonomy (from CyNER deep dive)
+- [ ] Training data in BIOES JSONL format
+- [ ] CyNER installed and runnable as baseline
+- [ ] BERT-base fine-tuning pipeline (HuggingFace Trainer)
+- [ ] GLiNER inference pipeline for zero-shot baseline
+- [ ] Evaluation harness that runs all models on same test set
+- [ ] Hardware specs documented

research/securebert2/.gitignore ADDED Viewed

	@@ -0,0 +1,12 @@

+doc embedding/attack_bert_cont_eval.py
+doc embedding/CrossEncoder_infer.py
+mlm/code_mlm_eval_secure_bert.py
+mlm/code_mlm_eval.py
+mlm/dataset.py
+mlm/primus_load.py
+mlm/run_modernbert.py
+mlm/SecureBERT_mlm_eval.py
+mlm/code_mlm_eval_secure_bert.py
+ner/dataset.py
+vuln_classification/CodeVuln_infer.py
+ner/NER_infer.py

research/securebert2/CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,132 @@

+# Contributor Covenant Code of Conduct
+## Our Pledge
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+## Our Standards
+Examples of behavior that contributes to a positive environment for our
+community include:
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+Examples of unacceptable behavior include:
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Enforcement Responsibilities
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+## Scope
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official email address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+[oss-conduct@cisco.com](mailto:oss-conduct@cisco.com). All complaints will be reviewed and investigated
+promptly and fairly.
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+## Enforcement Guidelines
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+### 1. Correction
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+### 2. Warning
+**Community Impact**: A violation through a single incident or series of
+actions.
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+### 3. Temporary Ban
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+### 4. Permanent Ban
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior,  harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder][Mozilla CoC].
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations

research/securebert2/CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,58 @@

+# How to Contribute
+Thanks for your interest in contributing to `securebert2`! Here are a few
+general guidelines on contributing and reporting bugs that we ask you to review.
+Following these guidelines helps to communicate that you respect the time of the
+contributors managing and developing this open source project. In return, they
+should reciprocate that respect in addressing your issue, assessing changes, and
+helping you finalize your pull requests. In that spirit of mutual respect, we
+endeavor to review incoming issues and pull requests within 10 days, and will
+close any lingering issues or pull requests after 60 days of inactivity.
+Please note that all of your interactions in the project are subject to our
+[Code of Conduct](/CODE_OF_CONDUCT.md). This includes creation of issues or pull
+requests, commenting on issues or pull requests, and extends to all interactions
+in any real-time space e.g., Slack, Discord, etc.
+## Reporting Issues
+Before reporting a new issue, please ensure that the issue was not already
+reported or fixed by searching through our [issues
+list](https://github.com/cisco-ai-defense/securebert2/issues).
+When creating a new issue, please be sure to include a **title and clear
+description**, as much relevant information as possible, and, if possible, a
+test case.
+**If you discover a security bug, please do not report it through GitHub.
+Instead, please see security procedures in [SECURITY.md](/SECURITY.md).**
+## Sending Pull Requests
+Before sending a new pull request, take a look at existing pull requests and
+issues to see if the proposed change or fix has been discussed in the past, or
+if the change was already implemented but not yet released.
+We expect new pull requests to include tests for any affected behavior, and, as
+we follow semantic versioning, we may reserve breaking changes until the next
+major version release.
+## Other Ways to Contribute
+We welcome anyone that wants to contribute to `securebert2` to triage and
+reply to open issues to help troubleshoot and fix existing bugs. Here is what
+you can do:
+- Help ensure that existing issues follow the recommendations from the
+  _[Reporting Issues](#reporting-issues)_ section, providing feedback to the
+  issue's author on what might be missing.
+- Review and update the existing content of our
+  [Wiki](https://github.com/cisco-ai-defense/securebert2) with up-to-date
+  instructions and code samples.
+- Review existing pull requests, and test patches against real existing
+  applications that use `securebert2`.
+- Write a test, or add a missing test case to an existing test.
+Thanks again for your interest in contributing to `securebert2`!
+:heart:

research/securebert2/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [2025] [CISCO - AI DEFENSE]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

research/securebert2/MAINTAINERS.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Maintainers
2	+ - [ai-threat-intel@cisco.com](mailto:ai-threat-intel@cisco.com)

research/securebert2/README.md ADDED Viewed

	@@ -0,0 +1,231 @@

+# SecureBERT 2.0: Advanced Domain-Specific Language Model for Cybersecurity Intelligence
+## About The Project
+[**SecureBERT 2.0**](https://arxiv.org/pdf/2510.00240) is **Cisco AI**'s officially released, domain-adapted encoder-based language model for cybersecurity and threat intelligence. Built on the ModernBERT architecture, it incorporates hierarchical encoding and long-context modeling, enabling effective processing of complex cybersecurity documents, source code, and threat intelligence reports. Pretrained on a massive, multi-modal corpus—including over 13 billion text tokens and 53 million code tokens—SecureBERT 2.0 achieves state-of-the-art performance in semantic search, named entity recognition, code vulnerability detection, and threat analysis. With this release, Cisco aims to advance research in cybersecurity and AI by promoting transparency, enabling collaboration, and empowering practitioners, researchers, and organizations to build upon this work, accelerate innovation, and strengthen defenses against emerging cyber threats.
+---
+## Key Features
+- **Domain-Specific Pretraining**: Extensive cybersecurity corpus, including threat reports, vulnerability advisories, technical blogs, and source code.
+- **Multi-Modal Understanding**: Integrates natural language and code for advanced vulnerability detection and threat intelligence.
+- **Hierarchical & Long-Context Modeling**: Captures both fine-grained and high-level structures across extended documents.
+- **Optimized for Cybersecurity Tasks**:
+  - Semantic search and document retrieval
+  - Named entity recognition (NER)
+  - Code vulnerability detection
+  - Threat intelligence analysis
+---
+## Pretraining Dataset
+| Dataset Category             | Code Tokens | Text Tokens |
+|-------------------------------|------------|------------|
+| Seed corpus                   | 9,406,451  | 256,859,788 |
+| Large-scale web text          | 268,993    | 12,231,942,693 |
+| Reasoning-focused data        | --         | 3,229,293 |
+| Instruction-tuning data       | 61,590     | 2,336,218 |
+| Code vulnerability corpus     | 2,146,875  | -- |
+| Cybersecurity dialogue data   | 41,503,749 | 56,871,556 |
+| Original baseline dataset     | --         | 1,072,798,637 |
+| **Total**                     | 53,387,658 | 13,623,037,185 |
+---
+## MLM Evaluation (Masked Language Modeling)
+SecureBERT 2.0 demonstrates strong domain-specific understanding:
+| Top-n | Objects (Nouns) | Verbs (Actions) | Code Tokens |
+|-------|----------------|----------------|-------------|
+| 1     | 56.20%         | 45.02%         | 39.27%      |
+| 5     | 82.72%         | 74.12%         | 55.41%      |
+| 10    | 88.80%         | 81.64%         | 60.03%      |
+> Outperforms general-purpose models in predicting cybersecurity-specific terms and code elements.
+---
+## Downstream Tasks
+### 1. Document Embedding
+**Cross-Encoder Results**
+| Model                | mAP   | R@1   | NDCG@10 | MRR@10 |
+|----------------------|-------|-------|---------|--------|
+| ms-marco-TinyBERT-L2 | 0.920 | 0.849 | 0.964   | 0.955  |
+| **SecureBERT 2.0**   | 0.955 | 0.948 | 0.986   | 0.983  |
+**Bi-Encoder Results**
+| Model                    | mAP   | R@1   | MRR@10 |
+|--------------------------|-------|-------|--------|
+| all-MiniLM-L12-v2        | 0.912 | 0.924 | 0.945  |
+| **SecureBERT 2.0**       | 0.951 | 0.984 | 0.989  |
+> Demonstrates high precision in semantic search and scalable retrieval.
+---
+### 2. Named Entity Recognition (NER)
+| Model               | F1    | Recall | Precision |
+|--------------------|-------|--------|-----------|
+| CyBERT              | 0.351 | 0.281  | 0.467     |
+| SecureBERT          | 0.734 | 0.759  | 0.717     |
+| **SecureBERT 2.0**  | 0.945 | 0.965  | 0.927     |
+> Near-perfect recognition of cybersecurity entities such as Malware, Vulnerability, System, Indicator, and Organization.
+---
+### 3. Code Vulnerability Detection
+| Model        | Accuracy | F1    | Recall | Precision |
+|-------------|----------|-------|--------|-----------|
+| CodeBERT     | 0.627    | 0.372 | 0.241  | 0.821     |
+| CyBERT       | 0.459    | 0.630 | 1.000  | 0.459     |
+| **SecureBERT 2.0** | 0.655    | 0.616 | 0.602  | 0.630     |
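+> Balanced detection performance: the highest accuracy in the table, with a far better precision/recall trade-off than prior models. CyBERT's marginally higher F1 (0.630 vs. 0.616) comes with a recall of 1.000 and a precision equal to its accuracy (0.459), i.e., many false positives.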
+**All models are available on Hugging Face.**
+## Hugging Face Model Paths
+| Task | Model Path |
+|------|------------|
+| SecureBERT 2.0 | `cisco-ai/SecureBERT2.0-base` |
+| Cross Encoder | `cisco-ai/SecureBERT2.0-cross_encoder` |
+| Bi-Encoder | `cisco-ai/SecureBERT2.0-biencoder` |
+| Named Entity Recognition (NER) | `cisco-ai/SecureBERT2.0-NER` |
+| Vulnerability Classification | `cisco-ai/SecureBERT2.0-code-vuln-detection` |
+# Getting Started
+This repository provides the full framework for pretraining, fine-tuning, and evaluating SecureBERT 2.0 across key cybersecurity tasks.
+Repository Structure
+```
+.
+├── mlm/                       # Model pretraining (Masked Language Modeling)
+│   ├── train.py                # Pretraining script for MLM
+│   └── SecureBERT_mlm_eval.py # MLM evaluation script
+├── vuln_classification/        # Code vulnerability detection
+│   ├── CodeVuln_train.py       # Fine-tuning SecureBERT for vulnerability detection
+│   └── CodeVuln_eval.py        # Evaluation on code vulnerability datasets
+├── rt2/ner/                    # Named Entity Recognition (NER) tasks
+│   ├── NER_train.py            # Fine-tuning SecureBERT for cybersecurity NER
+│   └── NER_eval.py             # Evaluation script for NER models
+├── doc_embedding/              # Document embedding tasks
+│   ├── BiEncoder_train.py      # Bi-encoder training for semantic search
+│   ├── CrossEncoder_train.py   # Cross-encoder training for fine-grained ranking
+│   ├── BiEncoder_eval.py       # Bi-encoder evaluation
+│   └── CrossEncoder_eval.py    # Cross-encoder evaluation
+├── opensource_data/            # Preprocessed datasets
+│   ├── data_vuln_dataset.parquet
+│   ├── data_vuln_dataset_test.parquet
+│   ├── data_NER_train.json
+│   ├── data_NER_test.json
+│   ├── data_sentence_pairs.parquet
+│   ├── data_sentence_pairs_test.parquet
+│   └── data_pretrain.parquet
+├── dataset.py                  # Dataset loading and preprocessing utilities
+├── requirements.txt            # Python dependencies
+├── LICENSE
+├── README.md
+├── CODE_OF_CONDUCT.md
+├── CONTRIBUTING.md
+├── SECURITY.md
+├── MAINTAINERS.md
+└── .gitignore
+```
+## Requirements
+- Python 3.10+
+- PyTorch 2.1+ with CUDA
+- Hugging Face Transformers
+- Lightning Fabric
+- tqdm
+## Installation
+1.  **Clone the repository:**
+    ```bash
+    git clone https://github.com/cisco-ai-defense/securebert2.git
+    ```
+2.  **Create a virtual environment (recommended):**
+    ```bash
+    python -m venv venv
+    source venv/bin/activate # On Windows: `venv\Scripts\activate`
+    ```
+3.  **Install the required Python packages:**
+    ```bash
+    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 # Adjust cu121 for your CUDA version
+    pip install transformers datasets sentence-transformers lightning tqdm pandas pyarrow
+    ```
+    *Note: Ensure your `torch` installation matches your CUDA version. The example above is for CUDA 12.1.*
+4.  **Ensure that `dataset.py` and the datasets under `opensource_data/` are available.**
+## Train and Evaluate
+Every directory contains `train` and `eval` files. Make sure to customize them with your desired model or dataset path.
+```bash
+cd mlm
+```
+By default, the dataset is set to `ModernBertDataset()` from `dataset.py`, and the model is set to `answerdotai/ModernBERT-base`.
+To start training on a single GPU, simply run:
+```bash
+python train.py
+```
+For a multi-GPU setup, run:
+```bash
+torchrun --nproc_per_node=8 train.py
+```
+For evaluation, provide a list of Hugging Face model IDs along with the evaluation dataset. Below is an example format for the MLM task.
+```python
+    sentences = [
+        "The attacker gained access through a [MASK] vulnerability.",
+        "Users should always enable [MASK] authentication for better security.",
+        "The malicious [MASK] was detected by the intrusion detection system.",
+        "The ransomware encrypted all [MASK] on the server.",
+        "A strong [MASK] policy helps prevent brute-force attacks."
+    ]
+    ground_truths = ["software", "multi-factor", "payload", "files", "password"]
+    model_ids = [
+        "cisco-ai/SecureBERT2.0-base",
+        "answerdotai/ModernBERT-base",
+        "ehsanaghaei/SecureBERT",
+    ]
+```
+Similar to training, simply run:
+```bash
+python SecureBERT2_mlm_eval.py
+```
+# Contribution
+We welcome contributions to improve SecureBERT 2.0, including:
+* New datasets and pretraining corpora
+* Additional downstream cybersecurity tasks
+* Model architecture enhancements
+* Optimized evaluation pipelines
+Please review CONTRIBUTING.md for guidelines.

research/securebert2/SECURITY.md ADDED Viewed

	@@ -0,0 +1,57 @@

+# Security Policies and Procedures
+This document outlines security procedures and general policies for the
+`securebert2` project.
+- [Disclosing a security issue](#disclosing-a-security-issue)
+- [Vulnerability management](#vulnerability-management)
+- [Suggesting changes](#suggesting-changes)
+## Disclosing a security issue
+The `securebert2` maintainers take all security issues in the project
+seriously. Thank you for improving the security of `securebert2`. We
+appreciate your dedication to responsible disclosure and will make every effort
+to acknowledge your contributions.
+`securebert2` leverages GitHub's private vulnerability reporting.
+To learn more about this feature and how to submit a vulnerability report,
+review [GitHub's documentation on private reporting](https://docs.github.com/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability).
+Here are some helpful details to include in your report:
+- a detailed description of the issue
+- the steps required to reproduce the issue
+- versions of the project that may be affected by the issue
+- if known, any mitigations for the issue
+A maintainer will acknowledge the report within three (3) business days, and
+will send a more detailed response within an additional three (3) business days
+indicating the next steps in handling your report.
+If you've been unable to successfully draft a vulnerability report via GitHub
+or have not received a response during the allotted response window, please
+reach out via the [Cisco Open security contact email](mailto:oss-security@cisco.com).
+After the initial reply to your report, the maintainers will endeavor to keep
+you informed of the progress towards a fix and full announcement, and may ask
+for additional information or guidance.
+## Vulnerability management
+When the maintainers receive a disclosure report, they will assign it to a
+primary handler.
+This person will coordinate the fix and release process, which involves the
+following steps:
+- confirming the issue
+- determining affected versions of the project
+- auditing code to find any potential similar problems
+- preparing fixes for all releases under maintenance
+## Suggesting changes
+If you have suggestions on how this process could be improved, please submit an
+issue or pull request.

research/securebert2/dataset.py ADDED Viewed

	@@ -0,0 +1,179 @@

+# Copyright 2025 Cisco Systems, Inc. and its affiliates
+#
+# SPDX-License-Identifier: Apache-2.0
+from datasets import Dataset
+import re
+from transformers import AutoTokenizer
+import json
+import torch
+import pandas as pd
+from sentence_transformers import InputExample
class ModernBertDataset(Dataset):
    """Text corpus loaded from a single parquet file and tokenized on access.

    Every row of the "text" column is cleaned with :meth:`clean_text` once, at
    construction time; tokenization happens lazily in :meth:`__getitem__`.

    NOTE(review): ``Dataset`` is imported from the Hugging Face ``datasets``
    package at the top of this file, and its constructor is never invoked
    here. Presumably a plain map-style (torch) dataset was intended -- confirm.
    """

    def __init__(self, parquet_path="./opensource_data/data_pretrain.parquet", n=None):
        """
        Args:
            parquet_path (str): Path to the single .parquet file. It must
                contain a column named "text".
            n (int, optional): If provided and smaller than the number of rows,
                randomly sample n rows (fixed seed 42); otherwise keep all rows.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

        df = pd.read_parquet(parquet_path)

        # Sub-sample only when it actually shrinks the frame.
        if n is not None and n < len(df):
            df = df.sample(n=n, random_state=42)

        # Values are cast to str so non-string cells do not break the regexes.
        self.txt_data = [self.clean_text(t) for t in df["text"].astype(str).tolist()]

    def clean_text(self, text):
        """Strip common markdown markup from ``text`` and normalize whitespace.

        Args:
            text (str): Raw document text.

        Returns:
            str: The text with fenced code blocks and links removed, heading
            markers / bold / italic / inline-code markup unwrapped, runs of
            newlines collapsed to one newline, other runs of two or more
            whitespace characters collapsed to one space, and surrounding
            whitespace stripped.
        """
        # Fenced code blocks must be removed BEFORE inline code is unwrapped:
        # the inline pattern would otherwise consume the ``` fences themselves
        # (as an empty `` pair plus a single-backtick span) and the block body
        # would survive in the output.
        text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)   # code blocks -> remove
        text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)   # remove markdown headings
        text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)             # bold -> plain
        text = re.sub(r"\*(.*?)\*", r"\1", text)                 # italic -> plain
        text = re.sub(r"\[.*?\]\(.*?\)", "", text)               # links -> remove
        text = re.sub(r"`([^`]*)`", r"\1", text)                 # inline code -> plain
        text = re.sub(r"\n+", "\n", text)                        # collapse multiple newlines
        text = re.sub(r"\s{2,}", " ", text)                      # collapse extra spaces
        return text.strip()

    def __getitem__(self, idx):
        """Tokenize the cleaned text at ``idx``.

        Returns:
            dict: ``"input_ids"`` and ``"attention_mask"`` tensors, padded or
            truncated to 1024 tokens, with the leading dimension squeezed out.
        """
        curr_text = self.txt_data[idx]
        encoded = self.tokenizer(
            curr_text,
            padding="max_length",
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        }

    def __len__(self):
        """Return the number of cleaned texts held in memory."""
        return len(self.txt_data)
class ContrastiveLearningDataset:
    """Sentence pairs for contrastive training, read from a parquet file.

    The file must provide the columns "sentence1" and "sentence2"; each item
    is the ``(sentence1, sentence2)`` tuple of one row.
    """

    def __init__(self, parquet_path="./opensource_data/data_sentence_pairs.parquet"):
        frame = pd.read_parquet(parquet_path, engine="pyarrow")
        left_col = frame["sentence1"]
        right_col = frame["sentence2"]
        # Materialize the pairs once so indexing is a plain list lookup.
        self.txt_data = list(zip(left_col, right_col))

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.txt_data)

    def __getitem__(self, idx):
        """Return the stored pair at ``idx`` unchanged (already a tuple)."""
        pair = self.txt_data[idx]
        return pair
+class Eval_ContrastiveDataset:
+    def __init__(self, parquet_path="./opensource_data/data_sentence_pairs_test.parquet"):
+        df = pd.read_parquet(parquet_path, engine="pyarrow")
+        self.txt_data = list(zip(df["sentence1"], df["sentence2"]))
+    def __getitem__(self, idx):
+        return self.txt_data[idx]
+    def __len__(self):
+        return len(self.txt_data)
+class Mrr_ContrastiveLearningDataset:
+    def __init__(self, parquet_path="./opensource_data/data_sentence_pairs.parquet"):
+        df = pd.read_parquet(parquet_path, engine="pyarrow")
+        self.txt_data = list(zip(df["sentence1"], df["sentence2"]))
+    def __getitem__(self, idx):
+        curr_txt = self.txt_data[idx]
+        return InputExample(texts=[curr_txt[0], curr_txt[1]])
+    def __len__(self):
+        return len(self.txt_data)
+class NerDataset():
+    def __init__(self, data_path="./opensource_data/data_NER_test.json", mode="train"):
+        self.txt_data = list()
+        self.ner_tags = list()
+        self.load_data(data_path)
+        assert len(self.ner_tags) == len(self.txt_data)
+        # Run tokenization step separately
+        self.tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
+        self.tokenized_inputs, self.labels = self.tokenize_and_align_labels()
+    def tokenize_and_align_labels(self):
+        """Tokenizes txt_data and aligns NER tags with subword tokens."""
+        tokenized_inputs = self.tokenizer(
+            self.txt_data,
+            is_split_into_words=True,
+            truncation=True,
+            max_length=1024,
+        )
+        labels = []
+        # Align labels with word pieces
+        for i, ner_tags_for_example in enumerate(self.ner_tags):
+            word_ids = tokenized_inputs.word_ids(batch_index=i)
+            current_labels = []
+            previous_word_idx = None
+            for word_idx in word_ids:
+                if word_idx is None:
+                    current_labels.append(-100)
+                elif word_idx != previous_word_idx:
+                    current_labels.append(ner_tags_for_example[word_idx])
+                else:
+                    current_labels.append(-100)
+                previous_word_idx = word_idx
+            labels.append(current_labels)
+        return tokenized_inputs, labels
+    def __getitem__(self, idx):
+        return {
+            'input_ids': torch.tensor(self.tokenized_inputs['input_ids'][idx], dtype=torch.long),
+            'attention_mask': torch.tensor(self.tokenized_inputs['attention_mask'][idx], dtype=torch.long),
+            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
+        }
+    def __len__(self):
+        return len(self.txt_data)
+    def save_data(self, path, n=None):
+        """Save txt_data, ner_tags, and label metadata to JSON file."""
+        if not n:
+            n = len(self.txt_data)
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump({
+                "txt_data": self.txt_data[:n],
+                "ner_tags": self.ner_tags[:n],
+                "num_labels": self.num_labels
+            }, f)
+    def load_data(self, path):
+        """Load txt_data, ner_tags, and label metadata from JSON file."""
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        self.txt_data = data["txt_data"]
+        self.ner_tags = data["ner_tags"]
+        self.num_labels = data["num_labels"]
+class SentimentVulnerabilityDataset():
+    def __init__(self, parquet_path="./opensource_data/data_vuln_dataset.parquet"):
+        self.txt_data = list()
+        df = pd.read_parquet(parquet_path, engine="pyarrow")
+        self.txt_data = list(zip(df["code"], df["label"]))
+    def __getitem__(self, idx):
+        curr_txt = self.txt_data[idx]
+        return curr_txt[0], curr_txt[1]
+    def __len__(self):
+        return len(self.txt_data)
+class Eval_SentimentVulnerabilityDataset():
+    def __init__(self, parquet_path="./opensource_data/data_vuln_dataset_test.parquet"):
+        self.txt_data = list()
+        df = pd.read_parquet(parquet_path, engine="pyarrow")
+        self.txt_data = list(zip(df["code"], df["label"]))
+    def __getitem__(self, idx):
+        curr_txt = self.txt_data[idx]
+        return curr_txt[0], curr_txt[1]
+    def __len__(self):
+        return len(self.txt_data)