chairulridjal committed on
Commit
a0b4998
·
verified ·
1 Parent(s): bf013b2

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +11 -0
  2. data/processed/backup/enriched_13class_train.jsonl +3 -0
  3. data/processed/backup/enriched_5class_train.jsonl +3 -0
  4. data/processed/enriched_13class_train.jsonl +3 -0
  5. data/processed/enriched_5class_train.jsonl +3 -0
  6. data/processed/enriched_5class_train_cleaned.jsonl +3 -0
  7. data/processed/enriched_5class_train_cleaned_deleaked.jsonl +3 -0
  8. data/processed/enriched_5class_train_cleaned_trimmed.jsonl +3 -0
  9. data/processed/r7_5class_train.jsonl +3 -0
  10. data/processed/r8_5class_train.jsonl +3 -0
  11. data/processed/r8_5class_train_propagated.jsonl +3 -0
  12. data/processed/r9_5class_train.jsonl +3 -0
  13. research/decisions/ADR-001-use-case-cybersecurity.md +45 -0
  14. research/decisions/ADR-002-strict-r9-and-benchmark-portfolio.md +57 -0
  15. research/notes/class_balance_audit_2026-04-24.md +30 -0
  16. research/notes/progress/2026-04-24-12-r8-dataset-build.md +27 -0
  17. research/notes/progress/2026-04-24-16-baseline-eval-script.md +33 -0
  18. research/notes/progress/2026-04-24-20-paper-direction-decided.md +24 -0
  19. research/notes/progress/2026-04-24-24-cyner2-baseline-discovered.md +30 -0
  20. research/notes/progress/2026-04-24-25-competitor-landscape-deep-dive.md +257 -0
  21. research/notes/progress/2026-04-24-26-dataset-aggregation-plan.md +99 -0
  22. research/notes/progress/2026-04-24-29-final-llm-merge-complete.md +43 -0
  23. research/notes/progress/2026-04-24-30-data-quality-audit.md +245 -0
  24. research/notes/progress/2026-04-24-32-round4-training-overfitting.md +25 -0
  25. research/notes/progress/2026-04-24-36-dapt-research.md +230 -0
  26. research/notes/progress/2026-04-24-37-training-tricks-research.md +259 -0
  27. research/notes/progress/2026-04-24-39-class-weighting-data-scaling-research.md +296 -0
  28. research/notes/progress/2026-04-24-40-round4b-killed-no-checkpoint.md +29 -0
  29. research/notes/progress/2026-04-24-44-r5a-baseline-results.md +48 -0
  30. research/notes/progress/2026-04-24-45-data-quality-audit.md +175 -0
  31. research/notes/progress/2026-04-24-46-competitor-deep-dive.md +261 -0
  32. research/notes/progress/2026-04-24-49-moe-finetuning-research.md +189 -0
  33. research/notes/progress/2026-04-24-50-r7-data-pipeline-plan.md +262 -0
  34. research/notes/progress/2026-04-24-51-audit-ioc-coverage.md +62 -0
  35. research/notes/progress/2026-04-24-53-audit-label-consistency.md +147 -0
  36. research/notes/progress/2026-04-24-59-aptner-held-out-test.md +37 -0
  37. research/notes/progress/2026-04-24-cyner-deep-dive-and-datasets.md +408 -0
  38. research/notes/progress/2026-04-24-landscape-research-opus.md +390 -0
  39. research/notes/progress/2026-04-24-ner-recall-improvement-techniques.md +61 -0
  40. research/notes/progress/2026-04-26-01-feasibility-check-approach.md +48 -0
  41. research/notes/progress/2026-04-26-02-cyner-exact-match-and-gap-analysis.md +85 -0
  42. research/paper/outline.md +150 -0
  43. research/securebert2/.gitignore +12 -0
  44. research/securebert2/CODE_OF_CONDUCT.md +132 -0
  45. research/securebert2/CONTRIBUTING.md +58 -0
  46. research/securebert2/LICENSE +201 -0
  47. research/securebert2/MAINTAINERS.md +2 -0
  48. research/securebert2/README.md +231 -0
  49. research/securebert2/SECURITY.md +57 -0
  50. research/securebert2/dataset.py +179 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/processed/r8_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ data/processed/enriched_13class_train.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ data/processed/enriched_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ data/processed/enriched_5class_train_cleaned_trimmed.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ data/processed/r8_5class_train_propagated.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ data/processed/enriched_5class_train_cleaned.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ data/processed/r7_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
43
+ data/processed/r9_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ data/processed/enriched_5class_train_cleaned_deleaked.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ data/processed/backup/enriched_13class_train.jsonl filter=lfs diff=lfs merge=lfs -text
46
+ data/processed/backup/enriched_5class_train.jsonl filter=lfs diff=lfs merge=lfs -text
data/processed/backup/enriched_13class_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5d1e4f4d9bd3a414fcd81d05242c1f913f575a553b3233adb15a8ae51740ecf
3
+ size 24261655
data/processed/backup/enriched_5class_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c6603e997c5844aec40404907210816886f6a56bce2acc5f27577b2d7f9469
3
+ size 21643218
data/processed/enriched_13class_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f723221d386fe83d06916f9c1b0885e52327750bcfa4d9ccac36d0143b79d410
3
+ size 21203019
data/processed/enriched_5class_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae602b2b8c89136ac80c49061c23fc0b41edeeb677a56888580feea5476dd21a
3
+ size 19573553
data/processed/enriched_5class_train_cleaned.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0f49bff19319f8210cc3e1ecfbc18488ef60d73682f318ced1f314afbe44297
3
+ size 18736010
data/processed/enriched_5class_train_cleaned_deleaked.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a047e2b5d5d8197d2601e5ed575082026876be6861eed46cb9ace034213c6d0
3
+ size 16417825
data/processed/enriched_5class_train_cleaned_trimmed.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f59710bfdd33666816cb79282c812ae8db62052478d3b740b46ec827fcad71e8
3
+ size 18097434
data/processed/r7_5class_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:772a78f8c6fce9a7f0b81120082e0f5579c89b6abd2dbb4c62ccebbd182b3508
3
+ size 19579089
data/processed/r8_5class_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5172ffe17d2f266d1aa9f0e815a7f03045afab7b42b9d2fdaf95555cb23fbd8
3
+ size 18041668
data/processed/r8_5class_train_propagated.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04d647cb77ac4a81db2d69e11cca934baed8fd6562488e99c67bd292e016118c
3
+ size 22410416
data/processed/r9_5class_train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8d4b825c5eab6b6ac91915aac239ead3b5622844d69163fe51af4b8f9826d7f
3
+ size 14918444
research/decisions/ADR-001-use-case-cybersecurity.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ADR-001: Use Case Selection — Cybersecurity IOC/Entity Extraction
2
+
3
+ **Date:** 2026-04-24
4
+ **Status:** Accepted
5
+ **Deciders:** Human (lead) + Claude (research partner)
6
+
7
+ ## Context
8
+
9
+ We evaluated 10+ verticals for repurposing OpenAI's Privacy Filter (50M active MoE bidirectional token classifier) beyond PII detection. The Opus research agent conducted a comprehensive landscape analysis covering existing tools, market gaps, available datasets, and architectural fit.
10
+
11
+ ## Options Considered
12
+
13
+ 1. **Cybersecurity IOC extraction** — Score 9/10
14
+ 2. **Developer tools (secret/annotation scanning)** — Score 8.5/10
15
+ 3. **Clinical de-identification** — Score 7.5/10
16
+ 4. **Improved PII (Presidio backend)** — Score 7/10
17
+ 5. **Financial entity extraction** — Score 6/10
18
+ 6. **Energy/power systems** — Low (no data, tiny market)
19
+ 7. Several others scored lower (legal, scientific, education, supply chain, HR)
20
+
21
+ ## Decision
22
+
23
+ **Cybersecurity IOC/entity extraction from threat intelligence reports**, with CyNER (560M params) as the primary benchmark competitor.
24
+
25
+ ## Reasoning
26
+
27
+ - **Biggest efficiency gap:** CyNER uses 560M dense params; we use 50M active (MoE). 11x compute reduction is the clearest "same accuracy, fraction of the cost" story.
28
+ - **Architecture fit:** Cybersecurity entities (IPs, hashes, CVEs, malware names, threat actors) are short-to-medium spans with clear boundaries — ideal for BIOES + Viterbi.
29
+ - **257-token window is sufficient:** IOC context is almost always within 1-2 sentences.
30
+ - **Data exists:** PRISM benchmark, CyNER corpus, Pile-NER cybersecurity subset, MITRE ATT&CK structured data.
31
+ - **Privacy argument is strong:** Threat reports contain internal network topology, so they can't be sent to cloud APIs.
32
+ - **Publishable:** "Sparse MoE vs. dense transformer for cybersecurity NER" is a clean research question.
33
+ - **Practical tool:** Every SOC team, every SIEM vendor needs lightweight local IOC extraction.
34
+
35
+ ## Deliverables
36
+
37
+ 1. **Research paper** — Rigorous comparison of Arcspan vs. CyNER (and other baselines)
38
+ 2. **Open-source tool** — Fine-tuned checkpoint + CLI/library for cybersecurity entity extraction
39
+
40
+ ## Consequences
41
+
42
+ - Need to acquire and convert multiple cybersecurity NER datasets to BIOES JSONL format
43
+ - Need to design a unified label taxonomy across datasets
44
+ - Need reproducible experimental setup (fixed seeds, documented hyperparameters, held-out test sets)
45
+ - Energy/power systems remains a potential future vertical once the platform is proven
research/decisions/ADR-002-strict-r9-and-benchmark-portfolio.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ADR-002: Strict R9 Dataset and Multi-Benchmark Evaluation
2
+
3
+ **Date:** 2026-04-26
4
+ **Status:** Accepted
5
+ **Deciders:** Human (lead) + Codex (research partner, replacing Claude)
6
+
7
+ ## Context
8
+
9
+ R8 proved that OpenAI Privacy Filter can learn the 5-class cyber NER task, but the honest exact-match results show different benchmark weaknesses:
10
+
11
+ - APTNER exact-match micro F1: 0.4982
12
+ - CyNER exact-match micro F1: 0.4050
13
+
14
+ APTNER mainly exposes APT-report-style Organization/System recall gaps. CyNER mainly exposes Indicator boundary and format coverage gaps, especially defanged or unusual IOCs.
15
+
16
+ The entity-propagated R8 file is now available, but its audit found 156,929 added spans on top of 76,824 base spans, including many generic or ambiguous surfaces. Including it in the next run would make any result hard to interpret.
17
+
18
+ ## Options Considered
19
+
20
+ 1. **Strict R9 only:** Train on R8 + deleaked CyberNER_harmonized + deleaked DNRTI, with validation/test overlap removed before deduplication.
21
+ 2. **R9 plus propagated R8:** Add the full propagated dataset immediately to maximize recall.
22
+ 3. **Delay R9 for a larger data rebuild:** Wait until we harvest much more targeted APT-style and CyNER-style data.
23
+
24
+ ## Decision
25
+
26
+ Run **strict R9** next.
27
+
28
+ Do **not** include propagated R8 in strict R9. Treat propagation as a separate future experiment only after filtering/auditing.
29
+
30
+ Report R9 with a benchmark portfolio:
31
+
32
+ - APTNER exact-match as the independent APT-report benchmark
33
+ - CyNER exact-match as the original CyNER benchmark comparison
34
+ - Enriched 5-class and SecureBERT2 5-class as supplementary continuity checks
35
+ - OPF containment metrics as diagnostics only, not the primary paper-comparable score
36
+
37
+ ## Reasoning
38
+
39
+ - Strict R9 is leakage-clean after the readiness gate: zero exact and zero prefix-80 train overlap with validation, enriched test, CyNER, SecureBERT2, and APTNER.
40
+ - The propagated dataset is too noisy for the next controlled experiment. It would likely improve some recall numbers while injecting false positives and benchmark memorization risk.
41
+ - A multi-benchmark protocol is necessary because improving APTNER and improving CyNER are not the same task. A single benchmark can be overfit unintentionally even with honest intent.
42
+ - Strict R9 gives a clean signal before larger data scaling. If it helps APTNER but not CyNER, the next branch should target Indicators. If it helps neither, we revisit training/decoding rather than blindly adding data.
43
+
44
+ ## Consequences
45
+
46
+ - R9 may score lower than a noisy propagation-boosted run, but its result will be interpretable.
47
+ - Future data work should split into two explicit tracks:
48
+ - **Track A:** APT-report-style Organization/System examples.
49
+ - **Track B:** CyNER-style Indicator examples, including defanged domains/IPs/URLs, file paths, registry paths, package names, and odd multi-token indicators.
50
+ - Decode calibration should happen after strict R9, using validation data only; the calibrated settings are then evaluated unchanged across the benchmark portfolio.
51
+
52
+ ## Source
53
+
54
+ - R9 readiness audit: `results/r9_readiness_audit.md`
55
+ - Propagation audit: `results/entity_propagation_audit.md`
56
+ - R8 CyNER exact-match note: `research/notes/progress/2026-04-26-02-cyner-exact-match-and-gap-analysis.md`
57
+ - R9 readiness note: `research/notes/progress/2026-04-26-03-r9-readiness-and-propagation-audit.md`
research/notes/class_balance_audit_2026-04-24.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Arcspan NER Dataset Class Balance Audit (2026-04-24)
2
+
3
+ ## Summary
4
+ Analyzed 5 fixed/deleaked training files comprising **54,139 examples** and **152,941 entity spans** across 5 security NER classes.
5
+
6
+ | Dataset | Examples | All-O % | Total Spans | Imbalance |
7
+ |---------|----------|---------|-------------|-----------|
8
+ | **enriched_trimmed** | 25,127 | 10.0% | 75,677 | 2.46x |
9
+ | **enriched_deleaked** | 24,339 | 19.3% | 63,831 | 2.77x |
10
+ | **aptner_deleaked** | 3,078 | 33.1% | 4,627 | 16.77x |
11
+ | **securebert2_deleaked** | 316 | 47.5% | 344 | 11.42x |
12
+ | **defanged_augmented** | 1,279 | 0.0% | 8,462 | 11.41x |
13
+ | **COMBINED** | **54,139** | **15.5%** | **152,941** | **2.94x** |
14
+
15
+ ## Entity Distribution (Combined)
16
+ - **Indicator**: 44,282 (28.9%) — most common
17
+ - **Malware**: 35,646 (23.3%)
18
+ - **Organization**: 31,946 (20.9%)
19
+ - **System**: 25,995 (17.0%)
20
+ - **Vulnerability**: 15,072 (9.8%) — least common
21
+
22
+ ## Key Findings
23
+ 1. **Enriched files dominate**: `enriched_trimmed` + `enriched_deleaked` = 49.5k examples (91% of dataset)
24
+ 2. **Moderate imbalance**: 2.94x ratio within acceptable range for sequence labeling
25
+ 3. **All-O distribution**: 15.5% negative examples (reasonable for NER)
26
+ 4. **Defanged boost**: Augmentation adds ~8.5k spans (8,462), particularly boosting Indicator class
27
+ 5. **Smaller sources volatile**: `aptner` and `securebert2` show high imbalance (11–17x) but together contribute only ~6% of examples (~3% of spans)
28
+
29
+ ## Recommendation
30
+ **Dataset is well-balanced for training.** The 2.94x imbalance is healthy—Vulnerability's underrepresentation (9.8%) is acceptable given domain scarcity. Enriched files provide stable foundation; defanged augmentation adds diversity without distorting class ratios.
research/notes/progress/2026-04-24-12-r8-dataset-build.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # R8 Dataset Build
2
+
3
+ ## What we found
4
+ Built the R8 (likely final) cybersecurity NER training dataset from deleaked sources:
5
+ - **26,079 train** examples, **76,824 entities**, 12.0% all-O rate
6
+ - **2,999 valid** examples, **5,927 entities**, 12.3% all-O rate
7
+ - Sources: enriched (deleaked), APTNER (deleaked), SecureBERT2 (deleaked), defanged augmented
8
+ - Stucco excluded (too noisy)
9
+ - Trimmed all-O from 20% down to 12% by random subsampling negative examples
10
+
11
+ ## Entity distribution (train)
12
+ - Indicator: 24,685
13
+ - Malware: 16,887
14
+ - Organization: 14,815
15
+ - System: 13,320
16
+ - Vulnerability: 7,117
17
+
18
+ ## Leakage verification
19
+ - **Zero exact matches** against all 4 test sets
20
+ - Prefix-80 matches are false positives (different texts sharing common openings)
21
+
22
+ ## Why it matters
23
+ This is the (likely final) clean dataset for training. All known leakage issues are resolved.
24
+
25
+ ## Open questions
26
+ - Entity propagation (cross-document) running — will it meaningfully boost recall?
27
+ - Vulnerability class is smallest (7K) — may be the hardest to learn
research/notes/progress/2026-04-24-16-baseline-eval-script.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Baseline Evaluation Script Created
2
+
3
+ **Date:** 2026-04-24
4
+
5
+ ## What we built
6
+
7
+ `src/arcspan/eval/run_baselines.py` — evaluates HF NER models against our CyNER test data (748 examples, 5 entity types) with span-level exact-match P/R/F1.
8
+
9
+ Two baselines wired up:
10
+ 1. **SecureBERT2.0-NER** (`cisco-ai/SecureBERT2.0-NER`) — TF-based, BIO, 5 entity types matching ours directly
11
+ 2. **SecureModernBERT-NER** (`attack-vector/SecureModernBERT-NER`) — PyTorch, 22 entity types mapped to our 5-class space
12
+
13
+ ## Key findings from 20-example smoke test
14
+
15
+ | Model | Overall P | Overall R | Overall F1 |
16
+ |---|---|---|---|
17
+ | SecureBERT2.0-NER | 14.8% | 40.0% | 21.6% |
18
+ | SecureModernBERT-NER | 55.0% | 55.0% | 55.0% |
19
+
20
+ - SecureBERT2.0 is very noisy — over-predicts spans (low precision), includes trailing punctuation and non-entity text
21
+ - SecureModernBERT is substantially better on exact match; cleaner span boundaries
22
+ - Both models produce offsets with leading whitespace; we strip it in post-processing
23
+ - The first 20 examples contain no Malware or Indicator entities (those types appear later in the dataset), so neither model was tested on them in this smoke test
24
+
25
+ ## Why it matters
26
+
27
+ These are the baselines our fine-tuned Arcspan model will be measured against. The script is modular (`BASELINES` registry dict) so adding more models is trivial.
28
+
29
+ ## Open questions
30
+
31
+ - Need to run full 748-example eval for real numbers
32
+ - Should we add a "relaxed match" mode (overlapping spans count as partial credit)?
33
+ - The 20-example sample is Organization-heavy; full eval will give better per-type coverage
research/notes/progress/2026-04-24-20-paper-direction-decided.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paper Direction Decided + Experimental Framework
2
+
3
+ ## Decision
4
+ Both a **publishable research paper** and an **open-source tool**. If Arcspan matches or surpasses CyNER (560M dense) at 50M active params, it's a genuine contribution.
5
+
6
+ ## Paper Thesis
7
+ "Sparse MoE token classifiers, fine-tuned with minimal data, can match dense transformer NER models at 1/11th the active compute for cybersecurity entity extraction."
8
+
9
+ ## The Five Key Experiments
10
+ 1. **Main comparison table:** Arcspan vs CyNER vs BERT-base vs GLiNER-zero-shot vs regex-only
11
+ 2. **Data efficiency curve (Figure 1 — the money chart):** F1 at 1%/5%/10%/25%/50%/100% of data
12
+ 3. **Per-entity-type breakdown:** Where does MoE win vs lose?
13
+ 4. **Viterbi vs argmax:** Our unique architectural advantage
14
+ 5. **Expert routing ablation:** top-2 vs top-4 via OPF_EXPERTS_PER_TOKEN
15
+
16
+ ## Baselines to Implement
17
+ - CyNER (560M) — primary competitor
18
+ - BERT-base fine-tuned on same data (110M) — standard NER baseline
19
+ - GLiNER-M zero-shot (90M) — zero-shot ceiling
20
+ - Regex-only — lower bound
21
+ - SpaCy trf (110M) — out-of-domain baseline
22
+
23
+ ## What's Blocking Progress
24
+ Waiting on Opus agent for: CyNER exact label schema, dataset locations, PRISM benchmark details. Once we have those, we can design the label space JSON and start data conversion.
research/notes/progress/2026-04-24-24-cyner2-baseline-discovered.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CyNER 2.0 DeBERTa-v3-base — New Baseline Discovered
2
+
3
+ ## Key Facts
4
+ - **Model**: DeBERTa-v3-base (200M params, dense)
5
+ - **F1**: 91.88% (self-reported, needs verification)
6
+ - **Training data**: 11,074 examples (7,751 train) — augmented from bnsapa/cybersecurity-ner + AlienVault/OpenCTI
7
+ - **Label space**: 8 entity types (original 5 + Date, Location, ThreatGroup)
8
+ - **Training**: lr=2e-5, 3 epochs, batch_size=8, weight_decay=0.01
9
+ - **License**: MIT
10
+ - **HuggingFace**: https://huggingface.co/PranavaKailash/CyNER-2.0-DeBERTa-v3-base
11
+
12
+ ## Critical Observations
13
+
14
+ 1. **91.88% F1 is likely on their own augmented test set** — NOT on the original CyNER test set. This makes direct comparison tricky. We need to eval them on the same test set.
15
+ 2. **They use 8 entity types** vs CyNER's 5 — added Date, Location, ThreatGroup. Not apples-to-apples.
16
+ 3. **11K examples (7,751 train) vs our 2.8K train** — they have ~3x more training data (7,751 vs 2,811) from augmentation.
17
+ 4. **LR = 2e-5** — 10x lower than our first run (2e-4). This is a strong hint for our hyperparameter tuning.
18
+ 5. **DeBERTa-v3-base is 200M dense params** — 4x our active params (50M).
19
+
20
+ ## What We Can Use From This
21
+
22
+ - **Their augmented dataset** (MIT license) — we should download and convert it. 7,751 training examples is much better than our 2,811.
23
+ - **LR = 2e-5 as reference point** — our 2e-4 was too aggressive, confirmed.
24
+ - **As a baseline** — run their model on our CyNER test set for fair comparison.
25
+ - **Their additional entity types** (ThreatGroup, Date) overlap with our planned Tier 1 expansion.
26
+
27
+ ## Source
28
+ - Model: https://huggingface.co/PranavaKailash/CyNER-2.0-DeBERTa-v3-base
29
+ - Dataset: https://huggingface.co/datasets/PranavaKailash/CyNER2.0_augmented_dataset
30
+ - GitHub: https://github.com/Pranava-Kailash/CyNER_2.0_API
research/notes/progress/2026-04-24-25-competitor-landscape-deep-dive.md ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cybersecurity NER Competitor Landscape Deep Dive
2
+
3
+ **Date:** 2026-04-24
4
+ **Purpose:** Map the competitive landscape for cybersecurity NER to inform Arcspan's positioning, baselines, and related work section.
5
+
6
+ ---
7
+
8
+ ## Summary Comparison Table
9
+
10
+ | Model | Architecture | Active Params | Entity Types | Overall F1 | Dataset | Weights Public? | License | Runnable Baseline? |
11
+ |---|---|---|---|---|---|---|---|---|
12
+ | **SecureBERT 2.0 NER** | ModernBERT-base (22L, 768d) | ~149M | 5 (Malware, Indicator, Vulnerability, System, Organization) | **0.945** | Cisco internal (3,400 train / 717 test) | Yes ([HF](https://huggingface.co/cisco-ai/SecureBERT2.0-NER)) | Apache 2.0 | **Yes** |
13
+ | **SecureModernBERT-NER** | ModernBERT-large | ~395M | 22 fine-grained (MALWARE, THREAT-ACTOR, CVE, IPV4, IPV6, DOMAIN, URL, HASHES, EMAIL, REGISTRY-KEYS, ORG, PRODUCT, PLATFORM, SERVICE, SECTOR, LOC, FILEPATH, MITRE-TACTIC, TOOL, CAMPAIGN, ...) | **0.848** | 502,726 curated spans from real-world CTI | Yes ([HF](https://huggingface.co/attack-vector/SecureModernBERT-NER)) | MIT | **Yes** |
14
+ | **CyberLLaMA** | LLaMA-3.2-3B + BiLSTM + CRF | ~3B | BIO-tagged cybersecurity terms (4,788 unique) | **0.989** | 42,404 articles (newspapers, blogs, official sites) | No (paper only) | Unknown | **No** |
15
+ | **XLNet-CRF** | XLNet-base + CRF | ~110M | CTI entities (malware, IP, URL, hash, etc.) | **0.974** (CTI-Reports), **0.887** (MalwareTextDB) | CTI-Reports, MalwareTextDB | Code on GitHub (no pretrained weights) | Unknown | **Partial** (retrain needed) |
16
+ | **BERT-CRF for CTI** | BERT-base + CRF | ~110M | 13 types (DNRTI), malware/IP/URL/hash (CTI-Reports) | **0.900** (DNRTI), **0.773** (CTI-Reports) | DNRTI (182K words), CTI-Reports (310K records), MalwareTextDB | Code on [GitHub](https://github.com/stwater20/NER-BERT-CRF-for-CTI) | Unknown | **Partial** |
17
+ | **CyNER** | Transformer + heuristics ensemble | Varies | Malware, threat actors, indicators, vulnerabilities | ~0.74 (on CyberNER harmonized) | CyNER dataset | Yes ([GitHub](https://github.com/aiforsec/CyNER)) | Unknown | **Yes** |
18
+ | **CyberNER (Harmonized)** | RoBERTa/SecureBERT/CySecBERT + CRF | ~125M | 21 STIX 2.1 entity types | **0.736** (RoBERTa best) | 610K tokens, 23,477 sentences from 4 merged datasets | Dataset public, code public | Unknown | **Yes** (benchmark) |
19
+ | **SecLMNER** | LLM (generative) + SecureBERT (encoder) | <10B + 110M | Multi-source cybersecurity entities | SecureBERT +6-17% F1 | 5 cybersecurity text sources | No (paper only) | Unknown | **No** |
20
+
21
+ ---
22
+
23
+ ## Detailed Model Profiles
24
+
25
+ ### 1. SecureBERT 2.0 NER (Cisco AI)
26
+
27
+ **Architecture:** ModernBERT-base with 22 hidden layers, 768 hidden size, 12 attention heads, max 8,192 tokens. Fine-tuned for token classification.
28
+
29
+ **Training data:** Cisco's internal hand-labeled NER corpus: 3,400 training samples, 717 test samples. The base model (SecureBERT 2.0) was pretrained on 13B+ text tokens and 53M code tokens from cybersecurity sources.
30
+
31
+ **Performance:**
32
+ | Model | F1 | Recall | Precision |
33
+ |---|---|---|---|
34
+ | CyBERT | 0.351 | 0.281 | 0.467 |
35
+ | SecureBERT 1.0 | 0.734 | 0.759 | 0.717 |
36
+ | **SecureBERT 2.0** | **0.945** | **0.965** | **0.927** |
37
+
38
+ Per-entity breakdown not publicly reported (only aggregate). Entity types: Malware, Indicator, Vulnerability, System, Organization (5 categories with 11 labels via BIO).
39
+
40
+ **Availability:**
41
+ - HuggingFace: `cisco-ai/SecureBERT2.0-NER` (Apache 2.0)
42
+ - GitHub: `cisco-ai-defense/securebert2`
43
+ - Uses TF model (`TFAutoModelForTokenClassification`) -- note TensorFlow dependency
44
+
45
+ **Paper:** Aghaei, E. et al. "SecureBERT 2.0: Advanced Language Model for Cybersecurity Intelligence." arXiv:2510.00240 (2025). https://arxiv.org/abs/2510.00240
46
+
47
+ **Baseline verdict:** **PRIMARY BASELINE.** Directly downloadable and runnable. The 0.945 F1 is on their own dataset with only 5 entity types -- important caveat. We must either (a) eval on their dataset with their labels, or (b) eval on a shared benchmark.
48
+
49
+ ---
50
+
51
+ ### 2. SecureModernBERT-NER (attack-vector)
52
+
53
+ **Architecture:** ModernBERT-large (answerdotai/ModernBERT-large), ~395M params, fine-tuned for token classification.
54
+
55
+ **Training data:** 502,726 manually curated text spans from real-world threat reports, vulnerability advisories, and incident analyses. Max sequence length 128 tokens during training.
56
+
57
+ **Performance:**
58
+ - Precision: 0.847, Recall: 0.848, F1: 0.848, Accuracy: 0.959
59
+ - Strong per-label: CVE (0.9995), SHA256 (0.9874), URL (0.9801), LOC (0.9557)
60
+ - Weaker: IPV6, EMAIL (rare types)
61
+
62
+ **Entity types (22):** MALWARE, THREAT-ACTOR, CVE, IPV4, IPV6, DOMAIN, URL, MD5, SHA1, SHA256, EMAIL, REGISTRY-KEYS, ORG, PRODUCT, PLATFORM, SERVICE, SECTOR, LOC, FILEPATH, MITRE-TACTIC, TOOL, CAMPAIGN
63
+
64
+ **Availability:**
65
+ - HuggingFace: `attack-vector/SecureModernBERT-NER` (MIT license)
66
+ - PyTorch model, standard `pipeline("token-classification")` inference
67
+
68
+ **Paper:** No academic paper. Community model card only.
69
+
70
+ **Baseline verdict:** **STRONG BASELINE.** 22 entity types makes this the most comprehensive label space. The 0.848 F1 across 22 types is arguably more impressive than SecureBERT 2.0's 0.945 across only 5 types. Directly runnable. MIT license is ideal.
71
+
72
+ ---
73
+
74
+ ### 3. CyMapNER / CyNER
75
+
76
+ **Note:** "CyMapNER" does not appear to be a real model. The actual model is **CyNER** — an open-source Python library from `aiforsec`.
77
+
78
+ **Architecture:** Ensemble approach combining transformer-based models, heuristics for IOC extraction, and publicly available NER models.
79
+
80
+ **Training data:** Custom cybersecurity corpus; integrates with MALOnt2.0 ontology.
81
+
82
+ **Performance:** On the CyberNER harmonized benchmark, transformer models trained on CyNER data achieve ~0.74 F1.
83
+
84
+ **Availability:**
85
+ - GitHub: `aiforsec/CyNER`
86
+ - arXiv: https://arxiv.org/abs/2204.05754
87
+
88
+ **Paper:** Alam, M.T. et al. "CyNER: A Python Library for Cybersecurity Named Entity Recognition." arXiv:2204.05754 (2022).
89
+
90
+ **Baseline verdict:** **SECONDARY BASELINE.** Useful as a reference point for the CyberNER harmonized benchmark. Older (2022) and lower performance.
91
+
92
+ ---
93
+
94
+ ### 4. CTI-BERT / BERT-CRF for CTI
95
+
96
+ **Architecture:** BERT-base-uncased + CRF layer. Also evaluated with secBERT (domain-adapted BERT).
97
+
98
+ **Training data:** Three public datasets:
99
+ - DNRTI: 182,452 words, 300+ threat reports, 13 entity classes
100
+ - CTI-Reports: 310,406 records (malware, IP, URL, hash)
101
+ - MalwareTextDB: malware text samples
102
+
103
+ **Performance:**
104
+ - DNRTI: 90.02% F1
105
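+ - CTI-Reports: 77.29% F1 (high precision 98.37%, low recall 74.10%) — NOTE: these P/R values imply F1 ≈ 84.5%, so at least one figure is likely mis-transcribed; verify against the paper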
106
+ - MalwareTextDB: 58.57% F1
107
+ - Real-world OSINT: 82.64% accuracy
108
+
109
+ **Entity types:** 13 types on DNRTI (hacker groups, attacks, tools, vulnerabilities, methods); 4 types on CTI-Reports (malware, IP, URL, hash).
110
+
111
+ **Availability:**
112
+ - GitHub: `stwater20/NER-BERT-CRF-for-CTI`
113
+ - No pretrained weights hosted; training code available
114
+
115
+ **Paper:** Authors from NYCU. Published as a conference/workshop paper. PDF: https://speed.cs.nycu.edu.tw/~ydlin/Enhancing%20Cyber%20Threat%20Intelligence%20with%20Named%20Entity%20Recognition%20using%20BERT-CRF.pdf
116
+
117
+ **Baseline verdict:** **REFERENCE ONLY.** No hosted weights. Useful as a literature comparison point for DNRTI/CTI-Reports benchmarks.
118
+
119
+ ---
120
+
121
+ ### 5. LANCE
122
+
123
+ **Note:** No model called "LANCE" was found in the cybersecurity NER literature. This may be a confusion with:
124
+ - **LanG** — a governance-aware agentic AI platform (unrelated)
125
+ - **SecLMNER** — the LLM+encoder pipeline framework
126
+ - **TTPrompt** — the retrieval-to-reasoning CTI NER framework
127
+
128
+ The closest match to "LLM-based pipeline using GPT-4o/Llama on PRISM benchmark" is the **CyberBench** evaluation, which tested GPT-4 and Llama-2 on cybersecurity tasks. **PRISM** appears to be a GLM model variant, not a cybersecurity benchmark.
129
+
130
+ **CyberBench results (AAAI-24 Workshop):**
131
+ - GPT-4: 69.6 average across all tasks
132
+ - GPT-3.5-Turbo: 62.6
133
+ - Llama-2-13B: 54.1
134
+ - CyberInstruct-13B (fine-tuned Llama-2): 70.4
135
+ - For NER specifically: BERT-based models outperformed generative LLMs
136
+
137
+ **Baseline verdict:** **NOT A REAL COMPETITOR.** "LANCE" likely doesn't exist as described. CyberBench/CyberInstruct results confirm that generative LLMs underperform specialized encoder models on NER.
138
+
139
+ ---
140
+
141
+ ### 6. Additional High-Performers
142
+
143
+ #### CyberLLaMA (2025)
144
+ - **Architecture:** LLaMA-3.2-3B + BiLSTM + CRF
145
+ - **F1: 98.88%** — but this is on their own custom dataset (42,404 articles, 4,788 terms). No cross-benchmark validation.
146
+ - **Paper:** Zhang, H. et al. "CyberLLaMA: A fine-tuned large language model for cybersecurity named entity recognition." Knowledge-Based Systems 328:114183 (2025).
147
+ - **Weights:** NOT public. Paper only.
148
+ - **Baseline verdict:** **NOT USABLE.** No weights, no shared benchmark. The 98.88% F1 is likely inflated by narrow label space and custom eval. Include in related work, not in experiments.
149
+
150
+ #### XLNet-CRF (2025)
151
+ - **Architecture:** XLNet-base + CRF
152
+ - **F1: 97.43%** on CTI-Reports, 88.65% on MalwareTextDB
153
+ - **Paper:** Wang, T. et al. "XLNet-CRF: Efficient Named Entity Recognition for Cyber Threat Intelligence with Permutation Language Modeling." Electronics 14(15):3034 (2025). https://www.mdpi.com/2079-9292/14/15/3034
154
+ - **Code:** GitHub (training code, no pretrained weights)
155
+ - **Baseline verdict:** **REFERENCE ONLY.** We can cite their numbers on CTI-Reports/MalwareTextDB. Could retrain if we use those datasets.
156
+
157
+ #### CyberNER Harmonized Benchmark (2025)
158
+ - **Architecture:** Various (RoBERTa+CRF best at 0.736 F1)
159
+ - **21 STIX 2.1 entity types**, 610K tokens
160
+ - **Paper:** Ech-Chammakhy, Y. et al. "CyberNER: A Harmonized STIX Corpus for Cybersecurity Named Entity Recognition." arXiv:2510.26499 (2025).
161
+ - **Data + code:** Publicly available
162
+ - **Baseline verdict:** **USE AS BENCHMARK.** This is the most principled evaluation framework -- STIX-aligned, multi-dataset, public. Best baseline F1 is only 0.736, leaving huge room for Arcspan to demonstrate value.
163
+
164
+ #### SecLMNER (2025)
165
+ - **Architecture:** Two-stage: generative LLM (<10B params) reformats text, then SecureBERT does NER
166
+ - **Performance:** +6-17% F1 over SecureBERT alone
167
+ - **Paper:** Zhang, Y. et al. "SecLMNER: A framework for enhanced NER in multi-source cybersecurity data using LLMs." Expert Systems with Applications 271:126651 (2025).
168
+ - **Weights:** NOT public.
169
+ - **Baseline verdict:** **REFERENCE ONLY.** Interesting architecture comparison (two-stage LLM+encoder vs. our single-pass approach).
170
+
171
+ ---
172
+
173
+ ## Key Observations
174
+
175
+ 1. **The field is fragmented.** No single benchmark dominates. Everyone evaluates on different datasets with different label spaces, making direct comparison nearly impossible.
176
+
177
+ 2. **CyberNER harmonized benchmark is the best shared eval.** 21 STIX entity types, public data+code, multiple baselines. Best result is only 0.736 F1 -- enormous headroom.
178
+
179
+ 3. **SecureBERT 2.0's 0.945 F1 is on only 5 coarse entity types** with a small private dataset. Impressive but not directly comparable to models handling 20+ types.
180
+
181
+ 4. **SecureModernBERT-NER is our closest competitor** in terms of practical utility (22 types, MIT license, public weights, standard inference). Its 0.848 F1 is the number to beat.
182
+
183
+ 5. **The claimed 97-99% F1 scores (CyberLLaMA 98.88%, XLNet-CRF 97.43%) are on narrow/custom benchmarks** and weights are not public. Not practically threatening.
184
+
185
+ 6. **Arcspan's architectural advantages:** 50M active params (vs. 149-395M for competitors), 128K context window (vs. 128-8192 for competitors), single-pass Viterbi decoding (vs. pipeline approaches), BIOES scheme (vs. BIO).
186
+
187
+ ---
188
+
189
+ ## Recommended Baselines for Our Paper
190
+
191
+ ### Tier 1: Must Include (runnable, public weights)
192
+
193
+ | Model | How to Run | What to Report |
194
+ |---|---|---|
195
+ | **SecureBERT 2.0 NER** | `pip install transformers tensorflow`; load `cisco-ai/SecureBERT2.0-NER`; standard NER pipeline. **Note:** TF model, may need `TFAutoModelForTokenClassification`. | F1 on our dataset + their dataset if we can get it |
196
+ | **SecureModernBERT-NER** | `pip install transformers`; load `attack-vector/SecureModernBERT-NER`; `pipeline("token-classification")`. PyTorch, straightforward. | F1 on our dataset (22 entity types, map to our label space) |
197
+ | **CyNER** | `pip install cyner`; GitHub `aiforsec/CyNER`. Ensemble approach. | F1 on CyberNER benchmark |
198
+
199
+ ### Tier 2: Benchmark Comparison (shared datasets)
200
+
201
+ | Benchmark | Source | Best Published F1 | Our Target |
202
+ |---|---|---|---|
203
+ | **CyberNER (STIX harmonized)** | arXiv:2510.26499, public | 0.736 (RoBERTa+CRF) | >0.80 |
204
+ | **DNRTI** | Public, 13 entity types | 0.900 (BERT-CRF) | >0.90 |
205
+ | **CTI-Reports** | Public, 4 entity types | 0.974 (XLNet-CRF) | Competitive |
206
+
207
+ ### Tier 3: Literature Comparison (cite numbers, can't re-run)
208
+
209
+ | Model | Reported F1 | Notes |
210
+ |---|---|---|
211
+ | CyberLLaMA | 0.989 | Custom dataset, no weights, 3B params |
212
+ | XLNet-CRF | 0.974 | CTI-Reports only, no pretrained weights |
213
+ | SecLMNER | SecureBERT +6-17% | Two-stage pipeline, no weights |
214
+ | BERT-CRF for CTI | 0.900 (DNRTI) | Can retrain from code if needed |
215
+
216
+ ### Practical Instructions
217
+
218
+ ```bash
219
+ # SecureBERT 2.0 NER
220
+ pip install transformers tensorflow
221
+ python -c "
222
+ from transformers import AutoTokenizer, TFAutoModelForTokenClassification, pipeline
223
+ model = TFAutoModelForTokenClassification.from_pretrained('cisco-ai/SecureBERT2.0-NER')
224
+ tokenizer = AutoTokenizer.from_pretrained('cisco-ai/SecureBERT2.0-NER')
225
+ nlp = pipeline('ner', model=model, tokenizer=tokenizer)
226
+ print(nlp('APT29 exploited CVE-2024-1234 using Cobalt Strike against Microsoft Exchange.'))
227
+ "
228
+
229
+ # SecureModernBERT-NER
230
+ pip install transformers torch
231
+ python -c "
232
+ from transformers import pipeline
233
+ nlp = pipeline('token-classification', model='attack-vector/SecureModernBERT-NER', aggregation_strategy='first')
234
+ print(nlp('APT29 exploited CVE-2024-1234 using Cobalt Strike against Microsoft Exchange.'))
235
+ "
236
+
237
+ # CyNER
238
+ pip install cyner
239
+ python -c "
240
+ import cyner
241
+ model = cyner.CyNER()
242
+ print(model.get_entities('APT29 exploited CVE-2024-1234 using Cobalt Strike.'))
243
+ "
244
+ ```
245
+
246
+ ---
247
+
248
+ ## Arcspan Positioning
249
+
250
+ Our key differentiators vs. the field:
251
+ 1. **Roughly 3-8x smaller active footprint** (50M vs. 149-395M) -- crucial for edge/SOC deployment
252
+ 2. **128K context window** -- can process entire threat reports in one pass (competitors max at 512-8192)
253
+ 3. **Constrained Viterbi decoding with BIOES** -- structurally guaranteed valid spans (competitors use BIO + greedy/CRF)
254
+ 4. **Single-pass architecture** -- no two-stage LLM preprocessing (vs. SecLMNER)
255
+ 5. **MoE efficiency** -- 1.5B total params but only 50M active per token
256
+
257
+ The CyberNER harmonized benchmark (0.736 best F1, 21 STIX types) is our ideal proving ground. If Arcspan can hit >0.80 F1 on that benchmark with 50M active params, the story writes itself.
research/notes/progress/2026-04-24-26-dataset-aggregation-plan.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cybersecurity NER Dataset Aggregation — Research & Results
2
+
3
+ ## What We Built
4
+
5
+ A master aggregation pipeline (`src/arcspan/data/aggregate_datasets.py`) that combines 4 public cybersecurity NER datasets into a unified 13-class and 5-class OPF BIOES JSONL format, with deduplication.
6
+
7
+ ## Datasets Found & Status
8
+
9
+ ### ✅ Successfully Aggregated (4 datasets)
10
+
11
+ | Dataset | Source | Format | Raw Types | Sentences | Spans | Notes |
12
+ |---------|--------|--------|-----------|-----------|-------|-------|
13
+ | **CyNER original** | `data/raw/CyNER/dataset/mitre/` | CoNLL BIO | 5 | 4,372 | 3,040 | Baseline dataset |
14
+ | **CyNER 2.0 augmented** | `data/raw/cyner2_augmented/hf_dataset/` | HF datasets | 8 | 11,074 | 15,036 | Adds ThreatActor, Date, Location |
15
+ | **CyberNER harmonized** | `data/raw/CyberNER_harmonized/` (GitHub: yasirech-chammakhy/CyberNER) | CSV w/ STIX tags | 21 | 10,042 | 42,329 | **Best single source** — harmonizes CyNER+DNRTI+APTNER+Attacker onto STIX 2.1 |
16
+ | **DNRTI** | `data/raw/DNRTI/DNRTI_Dataset/` (GitHub: LiuPeiP-CS/NER4CTI) | CoNLL BIO | 13 | 6,577 | 12,974 | Chinese-origin CTI dataset, 13 cybersec types |
17
+
18
+ ### ❌ Not Usable for NER Training
19
+
20
+ | Dataset | Why Not |
21
+ |---------|---------|
22
+ | **SecureModernBERT-NER** | Only the *model* is on HuggingFace (attack-vector/SecureModernBERT-NER). Training data (502K spans, 22 classes) is NOT published. Model card describes the data but doesn't share it. |
23
+ | **PRISM** | IOC *classification* (IoC vs nonIoC per indicator), not span-level NER annotations. Already at `data/raw/LANCE/PRISM/GT.json`. |
24
+ | **CTI-Reports** | Behind IEEE DataPort download wall. XML format with IOC extractions, not token-level NER. |
25
+ | **MalwareTextDB** | Requires manual download from statnlp.org (link may be dead). Only has generic "Entity" labels — no typed NER. |
26
+ | **bnsapa/cybersecurity-ner** | Just a distilBERT fine-tune on original CyNER MITRE data — same data we already have. |
27
+ | **Pile-NER cybersecurity subset** | General-purpose GPT-3.5-generated NER, not cybersecurity-specific. Would need heavy filtering and label mapping. Low quality. |
28
+ | **MITRE ATT&CK STIX data** | Structured KB, not annotated text. Useful for distant supervision / data augmentation but not direct NER training. |
29
+
30
+ ### 🔍 Notable: CyberNER Already Subsumes Multiple Sources
31
+
32
+ The CyberNER harmonized corpus (arXiv:2510.26499) already harmonizes CyNER, DNRTI, APTNER, and Attacker datasets. This means our aggregation has **significant overlap** between CyberNER and the individual CyNER/DNRTI datasets. The deduplication step removed ~3,766 duplicates (exact text match), but some paraphrased overlap likely remains. This is acceptable — the STIX-harmonized labels from CyberNER are higher quality than the raw source labels.
33
+
34
+ ## Final Aggregated Stats
35
+
36
+ After deduplication:
37
+
38
+ | Split | Sentences | Spans |
39
+ |-------|-----------|-------|
40
+ | Train | 20,436 | 52,331 |
41
+ | Valid | 3,966 | 8,229 |
42
+ | Test | 3,897 | 7,903 |
43
+ | **Total** | **28,299** | **68,463** |
44
+
45
+ ### 13-Class Label Distribution (train)
46
+
47
+ | Label | Count | % |
48
+ |-------|-------|---|
49
+ | MALWARE | 12,537 | 24.0% |
50
+ | ORGANIZATION | 12,036 | 23.0% |
51
+ | THREAT_ACTOR | 11,589 | 22.1% |
52
+ | TOOL | 7,459 | 14.3% |
53
+ | SYSTEM | 3,672 | 7.0% |
54
+ | VULNERABILITY | 2,709 | 5.2% |
55
+ | FILEPATH | 1,764 | 3.4% |
56
+ | DOMAIN | 298 | 0.6% |
57
+ | IP_ADDRESS | 168 | 0.3% |
58
+ | URL | 71 | 0.1% |
59
+ | EMAIL | 28 | <0.1% |
60
+ | CVE_ID | 0 | 0% |
61
+ | HASH | 0 | 0% |
62
+
63
+ ## Unified Label Mapping
64
+
65
+ ### CyNER (5 types) → 13-class
66
+ - Malware → MALWARE
67
+ - System → SYSTEM
68
+ - Organization → ORGANIZATION
69
+ - Vulnerability → VULNERABILITY
70
+ - Indicator → **dropped** (mixed IOC types, can't reliably split)
71
+
72
+ ### CyNER 2.0 (8 types) → 13-class
73
+ - Malware → MALWARE, ThreatActor → THREAT_ACTOR, System → SYSTEM, Organization → ORGANIZATION, Vulnerability → VULNERABILITY
74
+ - Indicator → dropped, Date → dropped, Location → dropped
75
+
76
+ ### CyberNER STIX (21 types) → 13-class
77
+ - Malware → MALWARE, Threat-Actor → THREAT_ACTOR, Intrusion-Set → THREAT_ACTOR
78
+ - Tool → TOOL, Software → SYSTEM, Infrastructure → SYSTEM
79
+ - Identity → ORGANIZATION, Vulnerability → VULNERABILITY
80
+ - Domain-Name → DOMAIN, IPv4-Addr → IP_ADDRESS, URL → URL, Email-Addr → EMAIL, File → FILEPATH
81
+ - Dropped: Campaign, Attack-Pattern, Course-of-Action, Indicator, Location, Observed-Data, Malware-Analysis, Network-Traffic
82
+
83
+ ### DNRTI (13 types) → 13-class
84
+ - HackOrg → THREAT_ACTOR, SamFile → MALWARE, Tool → TOOL
85
+ - SecTeam → ORGANIZATION, Org → ORGANIZATION, Exp → VULNERABILITY
86
+ - Dropped: OffAct, Time, Purp, Area, Idus, Way, Features
87
+
88
+ ## Open Questions
89
+
90
+ 1. **CVE_ID and HASH have zero examples.** Need regex-based distant supervision from MITRE ATT&CK or synthetic generation to populate these.
91
+ 2. **IOC classes are severely underrepresented** (IP, DOMAIN, URL, EMAIL, FILEPATH total ~2,329 in train). Consider augmenting with regex-extracted IOCs from CTI reports.
92
+ 3. **CyberNER overlap with CyNER/DNRTI.** We deduplicate by exact text, but the same sentences appear with different tokenizations. Could do fuzzy dedup but risk losing valid data.
93
+ 4. **SecureModernBERT training data** (502K spans) would be transformative if released. Worth reaching out to the authors.
94
+
95
+ ## Sources
96
+ - CyberNER: https://github.com/yasirech-chammakhy/CyberNER | arXiv:2510.26499
97
+ - DNRTI: https://github.com/LiuPeiP-CS/NER4CTI
98
+ - CyNER 2.0: HuggingFace PranavaKailash/CyNER2.0_augmented_dataset
99
+ - SecureModernBERT-NER: https://huggingface.co/attack-vector/SecureModernBERT-NER (model only)
research/notes/progress/2026-04-24-29-final-llm-merge-complete.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Final LLM Annotation Merge — All 8 Sources Complete
2
+
3
+ ## Enriched Dataset Stats
4
+ - **enriched_13class_train**: 22,052 examples (20,436 aggregated + 1,616 LLM)
5
+ - **enriched_5class_train**: 21,891 examples
6
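+ - Total LLM spans: 6,060 across all 13 entity types (sum of the per-label table below; the per-source table sums to 6,459, so the two counts still need to be reconciled)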
7
+
8
+ ## LLM Span Distribution (all sources combined)
9
+ | Label | Count |
10
+ |-------|-------|
11
+ | MALWARE | 1,638 |
12
+ | THREAT_ACTOR | 959 |
13
+ | SYSTEM | 796 |
14
+ | CVE_ID | 485 |
15
+ | VULNERABILITY | 425 |
16
+ | TOOL | 325 |
17
+ | ORGANIZATION | 315 |
18
+ | DOMAIN | 271 |
19
+ | IP_ADDRESS | 248 |
20
+ | HASH | 248 |
21
+ | FILEPATH | 234 |
22
+ | URL | 69 |
23
+ | EMAIL | 47 |
24
+
25
+ ## Sources (8 annotation agents)
26
+ | Source | Examples | Spans |
27
+ |--------|----------|-------|
28
+ | MITRE ATT&CK | 954 | 2,750 |
29
+ | NVD CVEs | 339 | 990 |
30
+ | Synthetic | 100 | 752 |
31
+ | Vendor blogs | 67 | 446 |
32
+ | News articles | 51 | 362 |
33
+ | CISA advisories | 40 | 400 |
34
+ | AlienVault OTX | 40 | 295 |
35
+ | Malware reports | 25 | 464 |
36
+
37
+ ## Why It Matters
38
+ - Zero-count entity classes eliminated (CVE_ID: 0→485, HASH: 0→248); FILEPATH, which already had 1,764 spans in the aggregated train set, gains another 234
39
+ - 8% more training data for Round 5
40
+ - Diverse sources = better generalization
41
+
42
+ ## Next
43
+ Round 4 training in progress (~1h36m). Round 5 script staged and ready.
research/notes/progress/2026-04-24-30-data-quality-audit.md ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Quality Audit — LLM-Annotated Cybersecurity NER Data
2
+
3
+ **Date:** 2026-04-24
4
+ **Auditor:** Automated script + manual review
5
+ **Scope:** All 13 files in `data/processed/`, 17,516 total records
6
+
7
+ ---
8
+
9
+ ## Executive Summary
10
+
11
+ | Issue | Count | Severity | Action Required |
12
+ |-------|-------|----------|-----------------|
13
+ | Offset errors | **0** | — | None |
14
+ | Duplicate texts | **1,727 unique** (4,408 records) | HIGH | Deduplicate before training |
15
+ | Short texts (<20 chars) | **71** | MEDIUM | Remove — too short for meaningful NER |
16
+ | Mislabeled entities | **~10,854** | CRITICAL | See breakdown — most are label-space design issues |
17
+ | Overlapping spans | **1,060** | HIGH | Fix or pick longest-match |
18
+ | Garbage text (real HTML) | **~471** | MEDIUM | Strip HTML markup |
19
+ | Repetitive entities (50+) | **100 entities** | MEDIUM | Review for template artifacts |
20
+ | Empty spans (no annotations) | **942** | LOW-MEDIUM | Decide: keep as negatives or remove |
21
+
22
+ **Overall data health: FAIR.** Offsets are clean (big win), but label consistency, overlaps, and duplicates need remediation before training.
23
+
24
+ ---
25
+
26
+ ## 1. Offset Errors: 0 ✅
27
+
28
+ All `text[start:end]` slices match their declared entity text across all 17,516 records. The annotation pipeline produced correct character offsets.
29
+
30
+ ---
31
+
32
+ ## 2. Duplicate Texts: 1,727 unique texts appear 2+ times (4,408 total records)
33
+
34
+ **Within-file duplicates:** 78 unique texts
35
+ **Cross-file duplicates:** 1,649 unique texts
36
+
37
+ ### Worst offenders:
38
+ - `"Ransomware."` — **44 copies** in `llm_annotated_apt.jsonl`
39
+ - `"Ransomware"` — 7 copies in same file
40
+ - Many MITRE descriptions appear in **both** `llm_annotated_mitre.jsonl` AND `llm_annotated_mitre_v2.jsonl` AND `llm_annotated_apt.jsonl` (3-4 copies each)
41
+ - Oracle NVD boilerplate descriptions appear 4-6 times in `llm_annotated_nvd_v2.jsonl`
42
+
43
+ ### Root cause:
44
+ - `mitre` and `mitre_v2` are overlapping dataset versions that were both kept
45
+ - `apt` dataset ingested MITRE descriptions alongside its own data
46
+ - Very short texts like "Ransomware." are degenerate entries from APT descriptions
47
+
48
+ ### Recommendation:
49
+ **Deduplicate globally.** Keep the version with the best annotations when spans differ. Priority: `mitre_v2` > `mitre`, `nvd_v2` > `nvd`.
50
+
51
+ ---
52
+
53
+ ## 3. Short Texts (<20 chars): 71
54
+
55
+ All 71 are from `llm_annotated_apt.jsonl`. Examples:
56
+ - `"WebShell."` (9 chars) — 2 occurrences
57
+ - `"Ransomware."` (11 chars) — 44+ occurrences
58
+ - `"Keylogger."` (10 chars)
59
+ - `"PyVil RAT"` (9 chars)
60
+
61
+ These are malware "descriptions" that are just a single word. They have no spans (empty annotations) and provide zero training signal.
62
+
63
+ ### Recommendation:
64
+ **Remove all records with text <20 chars.** They cannot produce useful span examples.
65
+
66
+ ---
67
+
68
+ ## 4. Mislabeled Entities: ~10,854 flagged
69
+
70
+ This is the highest-count issue but most are **label-space design disagreements**, not random errors. Breakdown:
71
+
72
+ ### 4a. Security vendors labeled as SYSTEM instead of ORGANIZATION (200 instances)
73
+
74
+ | Entity | Count |
75
+ |--------|-------|
76
+ | ESET | 37 |
77
+ | Trend Micro | 25 |
78
+ | Kaspersky | 16 |
79
+ | Symantec | 11 |
80
+ | SentinelOne | 8 |
81
+ | Avast | 7 |
82
+ | Fortinet | 7 |
83
+ | Bitdefender | 3 |
84
+ | Sophos | 2 |
85
+ | Palo Alto | 2 |
86
+ | McAfee | 1 |
87
+
88
+ **Analysis:** The LLM annotator confused security product names with their parent companies. "Kaspersky" the company vs "Kaspersky" the antivirus product. This is genuinely ambiguous, but for cybersecurity NER, these should be **ORGANIZATION**.
89
+
90
+ **Severity: HIGH.** These are real errors that will confuse the model. Fix by relabeling.
91
+
92
+ ### 4b. CVE_ID vs VULNERABILITY label (30 instances)
93
+
94
+ CVE identifiers (e.g., `CVE-2023-1389`) are labeled as `CVE_ID` but the audit expected `VULNERABILITY`.
95
+
96
+ **Analysis:** This is actually a **label-space design question**. If the label space includes both `CVE_ID` and `VULNERABILITY`, then CVE IDs should indeed be `CVE_ID`. Check if `CVE_ID` is in the intended label space.
97
+
98
+ **Severity: LOW** if `CVE_ID` is a valid label. **HIGH** if it's not in the final label space.
99
+
100
+ ### 4c. URL and HASH labeled as their own types instead of INDICATOR (51 instances)
101
+
102
+ URLs labeled `URL`, hashes labeled `HASH` — audit expected `INDICATOR`.
103
+
104
+ **Analysis:** Same as 4b — depends on label-space design. If `URL`, `HASH`, `IP_ADDRESS`, `DOMAIN`, `EMAIL` are all valid labels (they appear in the label distribution), then these are **correct**. The audit's expectation of a single `INDICATOR` class was wrong.
105
+
106
+ **Severity: NOT AN ISSUE** — the data uses fine-grained IOC labels which is actually better for cybersecurity NER.
107
+
108
+ ### 4d. Revised mislabel count
109
+
110
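+ Of the categories itemized above, only 4a is a genuine labeling error: excluding the label-space design issues (4b, 4c), the **real mislabel count is ~200** (security vendors as SYSTEM). This is much more manageable. Note that 4a-4c together account for only ~281 of the ~10,854 flagged entities; the remaining flags are not broken down in this section.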
111
+
112
+ ---
113
+
114
+ ## 5. Overlapping Spans: 1,060
115
+
116
+ ### Dominant patterns:
117
+
118
+ 1. **"Google Play" triple overlap** (~100+ instances):
119
+ - `ORGANIZATION: Google [26:32]`
120
+ - `SYSTEM: Google Play [26:37]`
121
+ - `MALWARE: Play [33:37]` ← **This is wrong** — "Play" (as in Google Play) is not malware
122
+
123
+ 2. **Nested entity annotations** (e.g., `SYSTEM: Cisco` inside `ORGANIZATION: Cisco Talos`)
124
+
125
+ 3. **Partial overlaps** (e.g., `SYSTEM: Android` overlapping `SYSTEM: Android operating system`)
126
+
127
+ ### Root cause:
128
+ The LLM annotator is producing **all possible readings** of ambiguous spans instead of picking one. The BIOES tagging scheme used by the model **cannot represent overlapping spans** — the Viterbi decoder produces exactly one label per token.
129
+
130
+ ### Recommendation:
131
+ **Resolve all overlaps before training.** Strategy:
132
+ - For nested spans: keep the **longest** span
133
+ - For `Google Play`: annotate as `SYSTEM: Google Play` only (not three separate entities)
134
+ - For `MALWARE: Play`: **remove** — this is a false annotation. "Play" in "Google Play" is not the Play ransomware group
135
+ - General rule: prefer the span that covers the full entity mention
136
+
137
+ ---
138
+
139
+ ## 6. Garbage Text / HTML Artifacts: ~471 records with real HTML
140
+
141
+ Of 1,119 records flagged for HTML-like patterns:
142
+ - **~471** contain actual HTML markup tags (`<p>`, `<code>`, `<a>`, etc.)
143
+ - **~648** contain legitimate code references (`<script>`, `<EXEC>`, `<guid>`) that are valid cybersecurity text
144
+
145
+ The real HTML artifacts are concentrated in:
146
+ - `llm_annotated_apt.jsonl` — MITRE technique descriptions with residual HTML
147
+ - `llm_annotated_nvd_v2.jsonl` — NVD descriptions with markup
148
+
149
+ 1 record has encoding issues (high non-ASCII ratio).
150
+
151
+ ### Recommendation:
152
+ **Strip HTML tags** from the ~471 affected records (careful not to remove code references). Re-run annotation on cleaned text since offsets will shift.
153
+
154
+ ---
155
+
156
+ ## 7. Repetitive Entities
157
+
158
+ ### Legitimate high-frequency entities (expected):
159
+ - `SYSTEM: Windows` (1,011), `SYSTEM: Linux` (465), `SYSTEM: Linux kernel` (1,262)
160
+ - `ORGANIZATION: Microsoft` (431), `ORGANIZATION: Google` (297)
161
+ - `TOOL: PowerShell` (229), `TOOL: Metasploit` (171)
162
+ - `THREAT_ACTOR: APT29` (149)
163
+
164
+ ### Suspicious / problematic:
165
+ | Entity | Count | Issue |
166
+ |--------|-------|-------|
167
+ | `TOOL: at` | 495 | **FALSE POSITIVE** — English word "at" (as in "since at least 2020") labeled as the Unix `at` command |
168
+ | `FILEPATH: /01/2014` | 155 | **FALSE POSITIVE** — date substrings from Linux kernel commit references labeled as file paths |
169
+ | `VULNERABILITY: phishing` | 240 | **Debatable** — phishing is an attack technique, not a vulnerability |
170
+ | `SYSTEM: .NET` | 225 | **Debatable** — .NET is a framework, could be SYSTEM or TOOL |
171
+ | `SYSTEM: QEMU` | 196 | Correct for NVD kernel data |
172
+ | `SYSTEM: Python` | 177 | **Debatable** — Python is a language, not a system |
173
+
174
+ ### Critical fix needed:
175
+ - **`TOOL: at`** — 495 false positives will heavily poison the model. The word "at" appears thousands of times in text; labeling it as a tool will cause massive false positive rates at inference. **Must remove all instances and re-annotate only genuine uses of the `at` command.**
176
+ - **`FILEPATH: /01/2014`** — 155 false positives from date strings in kernel changelogs. **Must remove.**
177
+
178
+ ---
179
+
180
+ ## 8. Empty Spans: 942 records (5.4% of data)
181
+
182
+ | File | Empty records | Total records | % Empty |
183
+ |------|--------------|---------------|---------|
184
+ | llm_annotated_apt.jsonl | 517 | 4,554 | 11.4% |
185
+ | llm_annotated_mitre_v2.jsonl | 244 | 1,984 | 12.3% |
186
+ | llm_annotated_nvd_v2.jsonl | 175 | 3,000 | 5.8% |
187
+ | llm_annotated_news.jsonl | 3 | 51 | 5.9% |
188
+ | llm_annotated_vendor_blogs.jsonl | 3 | 67 | 4.5% |
189
+
190
+ **Analysis:** These are texts where the LLM annotator found no entities. Some are legitimate (generic descriptions without named entities), others are short/degenerate texts that should have been filtered.
191
+
192
+ ### Recommendation:
193
+ - **Keep ~50%** as negative examples (texts with no entities are useful for training the model to predict `O` tags)
194
+ - **Remove** the ones that are short (<50 chars) or degenerate ("Ransomware.", "WebShell.")
195
+ - Cap negatives at ~5% of training data to avoid class imbalance
196
+
197
+ ---
198
+
199
+ ## 9. Label Distribution
200
+
201
+ | Label | Count | % of all spans |
202
+ |-------|-------|----------------|
203
+ | SYSTEM | 13,085 | 20.1% |
204
+ | FILEPATH | 8,012 | 12.3% |
205
+ | MALWARE | 7,821 | 12.0% |
206
+ | VULNERABILITY | 7,617 | 11.7% |
207
+ | THREAT_ACTOR | 4,028 | 6.2% |
208
+ | IP_ADDRESS | 3,994 | 6.1% |
209
+ | ORGANIZATION | 3,734 | 5.7% |
210
+ | TOOL | 3,683 | 5.7% |
211
+ | HASH | 3,322 | 5.1% |
212
+ | URL | 3,180 | 4.9% |
213
+ | DOMAIN | 2,658 | 4.1% |
214
+ | EMAIL | 2,106 | 3.2% |
215
+ | CVE_ID | 1,417 | 2.2% |
216
+
217
+ **13 label types total.** Distribution is reasonably balanced. `SYSTEM` dominates (inflated by the `Linux kernel` repetitions in NVD data). `CVE_ID` and `EMAIL` are underrepresented.
218
+
219
+ ---
220
+
221
+ ## Priority Remediation Plan
222
+
223
+ ### P0 — Must fix before any training
224
+ 1. **Remove `TOOL: at` false positives** (495 instances) — will poison the model
225
+ 2. **Remove `FILEPATH: /01/2014` false positives** (155 instances)
226
+ 3. **Resolve all 1,060 overlapping spans** — model architecture cannot handle overlaps
227
+ 4. **Deduplicate** across files (especially mitre/mitre_v2/apt overlaps)
228
+
229
+ ### P1 — Should fix
230
+ 5. **Relabel security vendors** (ESET, Kaspersky, etc.) from SYSTEM → ORGANIZATION (200 instances)
231
+ 6. **Remove `MALWARE: Play`** false annotations from Google Play contexts
232
+ 7. **Remove records with text <20 chars** (71 records)
233
+ 8. **Strip real HTML tags** from ~471 records and re-align offsets
234
+
235
+ ### P2 — Nice to have
236
+ 9. Review `VULNERABILITY: phishing` label consistency
237
+ 10. Decide on `SYSTEM` vs `TOOL` for Python, .NET
238
+ 11. Cap empty-span records at 5% of training data
239
+ 12. Review `SYSTEM: QEMU` concentration from NVD kernel data (may cause domain bias)
240
+
241
+ ---
242
+
243
+ ## Audit Script
244
+
245
+ The full audit script is at `scripts/audit_data_quality.py`. Detailed JSON results at `scripts/audit_results.json`.
research/notes/progress/2026-04-24-32-round4-training-overfitting.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Round 4 Training Launched + Overfitting Analysis
2
+
3
+ ## Round 4 Configuration
4
+ - **Data**: 20,436 aggregated examples (5-class + 13-class)
5
+ - **LR**: 5e-5, **Epochs**: 15, **BS**: 1, **Grad accum**: 8
6
+ - **GPU**: RTX 5090 32GB on Vast.ai
7
+ - **Script**: `/root/run_train_v4c.sh` in tmux session `arcspan`
8
+
9
+ ## Overfitting Pattern (epoch-by-epoch)
10
+ | Epoch | Train Loss | Val Loss | Val Acc |
11
+ |-------|-----------|----------|---------|
12
+ | 1 | 0.248 | 0.164 | 95.1% |
13
+ | 2 | 0.156 | 0.136 | 96.0% |
14
+ | 3 | 0.120 | **0.126** | 96.4% |
15
+ | **4** | 0.097 | **0.126** ← best | 96.5% |
16
+ | 5 | 0.078 | 0.134 ↑ | 96.7% |
17
+ | 9 | 0.042 | 0.165 ↑↑ | 96.9% |
18
+
19
+ ## Key Insight
20
+ Best checkpoint at epoch 3-4. Classic overfitting after that — train loss keeps dropping but val loss climbs. The model learns fast but runs out of new signal in 20K examples. More data should push the sweet spot later.
21
+
22
+ ## Why This Matters
23
+ - Confirms data scaling is the bottleneck, not model capacity
24
+ - 15 epochs is too many for 20K data with this LR
25
+ - Round 5 with 32K enriched data should allow deeper training
research/notes/progress/2026-04-24-36-dapt-research.md ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Domain-Adaptive Pretraining (DAPT) for Arcspan Cybersecurity NER
2
+
3
+ **Date:** 2026-04-24
4
+ **Status:** Research complete — decision needed on whether to pursue DAPT
5
+
6
+ ---
7
+
8
+ ## 1. How DAPT Works for Token Classifiers — Literature Review
9
+
10
+ ### "Don't Stop Pretraining" (Gururangan et al., ACL 2020)
11
+ - **Core finding:** Continued MLM pretraining on in-domain unlabeled text before fine-tuning improves downstream task performance across all four tested domains (biomed, CS, news, reviews).
12
+ - **DAPT + TAPT stacking:** Domain-adaptive pretraining → Task-adaptive pretraining (on the unlabeled task corpus itself) → supervised fine-tuning yields the best results. Sequential DAPT→TAPT is optimal.
13
+ - **Critical caveat:** Adapting on domain-*irrelevant* corpora **degrades** performance. Domain match matters.
14
+ - **Gains:** Consistent improvements across high- and low-resource settings. Typical improvements are 1-3 F1 points on classification tasks; larger gains in low-resource scenarios.
15
+ - **Source:** https://aclanthology.org/2020.acl-main.740/
16
+
17
+ ### SecureBERT (Aghaei et al., 2022) → SecureBERT 2.0 (2025)
18
+ - **Original SecureBERT:** Continued pretraining of RoBERTa on 1.1B words (98K cybersecurity documents). Custom BPE tokenizer added 17,673 cyber-specific tokens. Used Gaussian noise injection (μ=0, σ=0.01) on embeddings to prevent overfitting.
19
+ - NER on MalwareTextDB: F1=86.65 (vs RoBERTa-base 86.20) — modest +0.45 gain
20
+ - **SecureBERT 2.0:** Built on ModernBERT, pretrained on **13B+ text tokens + 53M code tokens**. 13x more data than v1.
21
+ - NER F1=**0.945** (vs original SecureBERT 0.734, vs CyBERT 0.351)
22
+ - The massive DAPT corpus is the primary differentiator — same NER fine-tuning approach, dramatically better results.
23
+ - **Lesson:** Scale of DAPT corpus matters enormously for cybersecurity NER. 1B tokens → marginal gains. 13B tokens → transformative gains.
24
+ - **Sources:** https://huggingface.co/cisco-ai/SecureBERT2.0-NER, https://arxiv.org/abs/2510.00240
25
+
26
+ ### CySecBERT (Bayer et al., 2022)
27
+ - Domain-adapted BERT using diverse sub-corpora: blogs, arXiv papers, NVD data, Twitter.
28
+ - **Source:** https://arxiv.org/pdf/2212.02974
29
+
30
+ ### CyLLM-DAP (2025) — Efficient DAPT
31
+ - Achieved competitive cybersecurity performance with only **118.8M tokens** (vs 2.77B in comparable models).
32
+ - Shows that careful curation can substitute for raw scale.
33
+ - **Source:** https://arxiv.org/html/2507.02964v1
34
+
35
+ ---
36
+
37
+ ## 2. Can We Do DAPT with This MoE Architecture?
38
+
39
+ ### Architecture Recap
40
+ - 1.5B total params, 50M active (top-4 of 128 experts per token)
41
+ - 8 transformer layers, d_model=640, banded attention (257-token window)
42
+ - Output head: Linear(640, num_labels) — this is the NER classification head
43
+ - Embedding: `nn.Embedding(vocab_size, 640)` — standard token embeddings
44
+ - Custom tiktoken tokenizer
45
+
46
+ ### What DAPT Would Require
47
+ For MLM continued pretraining, we need to:
48
+ 1. **Replace the output head** with an MLM head: `Linear(640, vocab_size)` instead of `Linear(640, num_labels)`. The current unembedding is a small NER head (33 or 97 classes). MLM needs vocab_size (~100K+) output.
49
+ 2. **Implement MLM masking:** Randomly mask 15% of input tokens, predict original tokens.
50
+ 3. **Write a custom training loop** — the existing `opf train` runner only does supervised token classification (cross-entropy on label predictions). It cannot do MLM.
51
+
52
+ ### MoE-Specific Risks
53
+
54
+ **Catastrophic Forgetting in MoE:**
55
+ - Recent research (DES-MoE, EMNLP 2025) shows MoE models are particularly susceptible to catastrophic forgetting during domain adaptation, due to cross-domain interference across expert routing.
56
+ - DES-MoE proposed: (1) adaptive router with distillation, (2) real-time expert-domain correlation mapping, (3) three-phase progressive freezing. Achieved 89% reduction in forgetting.
57
+ - **For our case:** We're adapting to a *single* new domain (cyber), not multi-domain. This is simpler. The main risk is degrading the model's general English token-classification ability.
58
+ - **Source:** https://arxiv.org/abs/2509.16882
59
+
60
+ **Mitigation strategies:**
61
+ - **Replay mixing:** Include some general English text (e.g., Wikipedia) alongside cyber text during DAPT (SaulLM approach). Ratio: ~80% domain, 20% general.
62
+ - **Lower learning rate:** Use 1/10th to 1/5th of original pretraining LR for continued pretraining.
63
+ - **No warmup:** Research shows warmup during continued pretraining causes regressions.
64
+ - **Progressive expert freezing:** Optionally freeze router weights after initial adaptation.
65
+
66
+ ### Critical Question: Does the Tokenizer Limit Us?
67
+ - The model uses a custom tiktoken encoding. Cybersecurity terms like "CVE-2024-12345", "Emotet", "mimikatz" may be over-tokenized into subwords.
68
+ - SecureBERT added 17,673 custom tokens to handle this. **We cannot easily do this** — adding tokens would require resizing the embedding matrix and invalidating pretrained weights.
69
+ - **Mitigation:** The existing tokenizer likely handles most cyber terms via subword composition (BPE). The model can still learn representations for multi-token entities via DAPT, just less efficiently than with a custom tokenizer.
70
+
71
+ ---
72
+
73
+ ## 3. Available Raw Cybersecurity Text
74
+
75
+ ### What We Already Have (~200K documents)
76
+ | Source | Count | Est. Tokens |
77
+ |--------|-------|-------------|
78
+ | NVD CVE descriptions | 193K | ~50-80M |
79
+ | APT reports (CyberCorpus) | 4.5K | ~20-40M |
80
+ | Exploit-DB entries | 4.3K | ~5-10M |
81
+ | MITRE ATT&CK descriptions | 2K | ~2-3M |
82
+ | **Total existing** | **~204K** | **~80-130M** |
83
+
84
+ ### Freely Available Additional Sources
85
+
86
+ **Large-scale pretraining corpora:**
87
+ | Dataset | Size | Access |
88
+ |---------|------|--------|
89
+ | **Alpha-Root** (Common Crawl cyber extraction) | 3B tokens, 2.8M webpages | HuggingFace |
90
+ | **PRIMUS / Primus-FineWeb** | 2.57B tokens, 3.38M examples | HuggingFace (filtered FineWeb) |
91
+ | **STUCCO auto-labeled corpus** | Unknown size | GitHub (github.com/stucco/auto-labeled-corpus) |
92
+
93
+ **Specialized text sources:**
94
+ | Source | Content | Access |
95
+ |--------|---------|--------|
96
+ | MITRE CVE full database | 200K+ CVE records with descriptions | cve.org bulk download |
97
+ | MITRE CWE descriptions | ~900 weakness types | mitre.org |
98
+ | MITRE CAPEC | ~500 attack pattern descriptions | mitre.org |
99
+ | SecurityFocus/BugTraq archives | Vulnerability advisories | Web archives |
100
+ | APTnotes (GitHub) | ~500+ APT report PDFs | github.com/aptnotes |
101
+ | Malpedia descriptions | Malware family descriptions | malpedia.caad.fkie.fraunhofer.de |
102
+ | NIST SP 800-series | Security standards/guides | nist.gov |
103
+ | RFC security-related docs | Protocol security specs | ietf.org |
104
+
105
+ ### Realistic Corpus Assembly
106
+ - **Quick win (minimal effort):** Use Alpha-Root or PRIMUS from HuggingFace → 2.5-3B tokens, ready to use.
107
+ - **Medium effort:** Our existing 80-130M tokens + Alpha-Root 3B tokens = ~3.1B tokens.
108
+ - **Maximum effort:** Curate custom corpus from all sources above → potentially 5-10B tokens but requires significant data engineering.
109
+
110
+ **Recommendation:** Use **Alpha-Root (3B tokens)** as the primary DAPT corpus. It's already curated, cyber-specific, and available on HuggingFace. Supplement with our existing ~80-130M tokens for domain specificity.
111
+
112
+ ---
113
+
114
+ ## 4. How Much DAPT Is Enough?
115
+
116
+ ### Empirical Benchmarks from Literature
117
+
118
+ | Model | DAPT Tokens | NER Gain | Notes |
119
+ |-------|-------------|----------|-------|
120
+ | SecureBERT v1 | ~1.3B | +0.45 F1 | Modest gain on MalwareTextDB |
121
+ | SecureBERT 2.0 | 13B | +21 F1 points | Transformative — but also changed base arch |
122
+ | CyLLM-DAP | 118.8M | Competitive | Careful curation compensated for scale |
123
+ | "Don't Stop Pretraining" | Varied | +1-3 F1 | Across 4 domains |
124
+
125
+ ### Guidelines for Our Model (50M active params)
126
+
127
+ In active parameters per token, our model is **much smaller** than BERT-base (110M) or SecureBERT 2.0 (ModernBERT, ~150M), even though its total parameter count is 1.5B. With only 50M active parameters:
128
+
129
+ - **Minimum viable DAPT:** ~100-500M tokens (1-3 epochs over our existing corpus + supplements). Expect marginal gains (+0.5-2 F1).
130
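+ - **Sweet spot:** ~1-3B tokens. This is where Alpha-Root fits perfectly. For a 50M active param model, 3B tokens represents ~60 tokens per active parameter — about 3× the ~20 tokens-per-parameter Chinchilla "compute-optimal" heuristic, which was derived for from-scratch pretraining and is only a rough guide for continued pretraining.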
131
+ - **Diminishing returns:** Beyond 3-5B tokens for this model size.
132
+
133
+ ### Learning Rate Schedule
134
+ - **Start LR:** 1/10th of original pretraining LR. If original was ~3e-4, use ~3e-5 for DAPT.
135
+ - **Schedule:** Cosine decay to 1/100th of max LR (i.e., 3e-7).
136
+ - **No warmup** — research shows warmup hurts continued pretraining.
137
+ - **Epochs:** 1-2 passes over the corpus if <1B tokens; single pass if >1B tokens.
138
+
139
+ ### Masking Rate
140
+ - Standard 15% masking for MLM. Some research suggests dynamic masking schedules, but standard 15% is robust.
141
+
142
+ ---
143
+
144
+ ## 5. Implementation Feasibility
145
+
146
+ ### What Exists (in `vendor/privacy-filter/`)
147
+ - `opf/_train/runner.py`: Full supervised fine-tuning loop with AdamW, gradient accumulation, best-epoch checkpointing, safetensors output. **Well-structured, 920 lines.**
148
+ - `opf/_model/model.py`: `Transformer` class with `embedding → blocks → norm → unembedding` pipeline. Clean forward pass.
149
+ - The model loads from checkpoint via `Transformer.from_checkpoint()`.
150
+
151
+ ### What We'd Need to Build
152
+
153
+ **1. MLM Head Swap (Easy — ~20 lines)**
154
+ ```python
155
+ # Replace unembedding (NER head) with MLM head (vocab prediction)
156
+ mlm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
157
+ # Optionally tie weights with embedding: mlm_head.weight = model.embedding.weight
158
+ model.unembedding = mlm_head
159
+ ```
160
+
161
+ **2. MLM Data Pipeline (Medium — ~100 lines)**
162
+ - Load raw text → tokenize with tiktoken → chunk into 257-token windows (matching the banded attention window)
163
+ - Apply random 15% masking: replace with [MASK] token (80%), random token (10%), keep (10%)
164
+ - Yield (masked_tokens, original_tokens, mask_positions) batches
165
+
166
+ **3. MLM Training Loop (Medium — ~150 lines)**
167
+ - Can largely copy from `_train_one_epoch` in runner.py
168
+ - Replace label-based loss with: `CrossEntropyLoss()(logits[mask_positions], original_tokens[mask_positions])`
169
+ - Add cosine LR scheduler
170
+ - Add checkpoint saving (reuse `save_named_tensors`)
171
+
172
+ **4. Head Swap Back + Fine-tune (Trivial)**
173
+ - After DAPT: save the backbone weights (everything except unembedding)
174
+ - Load backbone → attach new NER head → run normal `opf train`
175
+
176
+ ### Estimated Implementation Effort
177
+ - **~300-400 lines of new Python code** for a complete DAPT pipeline
178
+ - **~2-3 days of engineering** for a clean implementation
179
+ - **Training time:** With 3B tokens on a single A100 (80GB), ~50M active params, batch size 64, expect ~12-24 hours of DAPT training
180
+
181
+ ### Key Risk: The `num_labels` Config Entanglement
182
+ The `config.json` ties `num_labels` to the unembedding size. During DAPT we'd set `num_labels=vocab_size` (or bypass this field). After DAPT, we restore the NER config. This requires careful checkpoint management but is straightforward.
183
+
184
+ ---
185
+
186
+ ## 6. Recommendation
187
+
188
+ ### Should We Do DAPT?
189
+
190
+ **Arguments FOR:**
191
+ - SecureBERT 2.0 proves massive gains from cyber DAPT on NER (0.734 → 0.945 F1)
192
+ - Alpha-Root provides 3B ready-to-use cyber tokens on HuggingFace
193
+ - Implementation is tractable (~300 lines, ~2-3 days)
194
+ - Our model's general-English pretraining likely has poor coverage of cyber vocabulary
195
+ - DAPT + TAPT + supervised fine-tuning is the empirically optimal pipeline
196
+
197
+ **Arguments AGAINST:**
198
+ - Our model is tiny (50M active params) — may not have capacity to absorb much domain knowledge
199
+ - MoE routing adds catastrophic forgetting risk
200
+ - Custom tokenizer may not handle cyber terms well regardless of DAPT
201
+ - Engineering effort competes with other priorities (more labeled data, better label space design)
202
+ - SecureBERT 2.0's gains may be confounded by the architecture change (RoBERTa → ModernBERT)
203
+
204
+ **Verdict: Worth trying, but AFTER establishing a supervised-only baseline.**
205
+
206
+ The recommended pipeline is:
207
+ 1. **Phase 1 (now):** Fine-tune on labeled NER data → establish baseline F1
208
+ 2. **Phase 2 (if baseline is weak):** Do DAPT with Alpha-Root (3B tokens) → re-fine-tune → measure delta
209
+ 3. **Phase 3 (optional):** TAPT on our unlabeled cyber corpus → re-fine-tune → measure delta
210
+
211
+ This way we know exactly how much DAPT buys us, with minimal wasted effort if the supervised-only baseline is already strong.
212
+
213
+ ---
214
+
215
+ ## Key Sources
216
+ - Gururangan et al. 2020 — "Don't Stop Pretraining": https://aclanthology.org/2020.acl-main.740/
217
+ - SecureBERT 2.0: https://arxiv.org/abs/2510.00240
218
+ - SecureBERT 2.0 NER model card: https://huggingface.co/cisco-ai/SecureBERT2.0-NER
219
+ - DES-MoE (catastrophic forgetting in MoE): https://arxiv.org/abs/2509.16882
220
+ - CyLLM-DAP (efficient DAPT): https://arxiv.org/html/2507.02964v1
221
+ - Alpha-Root (3B cyber tokens): https://arxiv.org/html/2602.22218
222
+ - "Reuse, Don't Retrain" (LR schedules for continued pretraining): https://arxiv.org/html/2407.07263v1
223
+ - CySecBERT: https://arxiv.org/pdf/2212.02974
224
+
225
+ ## Open Questions
226
+ 1. What is the actual vocab size of the OPF tokenizer? (Determines MLM head size and memory)
227
+ 2. Does Alpha-Root overlap significantly with our existing NVD/ATT&CK data?
228
+ 3. Can we tie MLM head weights to the embedding matrix to save memory?
229
+ 4. Should we freeze the MoE router during DAPT to prevent expert collapse?
230
+ 5. What's the [MASK] token ID in the OPF tiktoken encoding? (Or do we use a span corruption objective instead?)
research/notes/progress/2026-04-24-37-training-tricks-research.md ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training Tricks Research for Arcspan NER Fine-Tuning
2
+
3
+ **Date:** 2026-04-24
4
+ **Context:** We use flat LR=5e-5, AdamW, no warmup, no decay. Overfitting at epoch 3-4 on 20K examples. Goal: squeeze maximum F1 from our 50M-active-param MoE model.
5
+
6
+ ---
7
+
8
+ ## 1. LR Scheduling: Warmup + Decay
9
+
10
+ ### Key Findings
11
+
12
+ **Linear warmup is essential for transformer fine-tuning.** Without it, early gradient estimates are noisy and can push the model into bad basins. Standard recommendation: **warmup for 6-10% of total training steps**, linear ramp from 0 to peak LR.
13
+
14
+ **Best schedules for NER fine-tuning (ranked):**
15
+
16
+ 1. **Linear warmup + cosine decay** — The workhorse. Used in most BERT/RoBERTa NER papers. Peak LR 2e-5 to 5e-5, warmup 6-10%, then cosine anneal to ~0.
17
+ 2. **Warmup-Stable-Decay (WSD)** — Emerging best practice from 2024-2025. Three phases: warmup (1-2%), stable plateau at peak LR (60-80%), then decay (10-25%). The long plateau maximizes exploration. Sqrt decay shape slightly outperforms cosine in recent benchmarks (+0.2-0.5% relative).
18
+ 3. **Linear warmup + linear decay** — Simpler, nearly as good. HuggingFace default.
19
+
20
+ **Concrete recommendation for our setup:**
21
+ - With ~20K examples, BS=8, ~5 epochs → ~12,500 steps
22
+ - Warmup: 750-1250 steps (6-10%)
23
+ - Schedule: cosine decay (simplest to implement via HF `get_scheduler`)
24
+ - This alone should help significantly vs. flat LR — flat LR keeps updating aggressively in later epochs when the model should be settling
25
+
26
+ **AdamW β₂ tuning:** Setting β₂=0.98 (vs default 0.999) can improve stability during decay phase. Worth trying.
27
+
28
+ **Sources:**
29
+ - WSD scheduling: https://www.emergentmind.com/topics/warmup-stable-decay-wsd-learning-rate-scheduling
30
+ - Advanced fine-tuning techniques: https://towardsdatascience.com/advanced-techniques-for-fine-tuning-transformers-82e4e61e16e/
31
+ - EMNLP 2024 LR transitions: https://aclanthology.org/2024.findings-emnlp.954.pdf
32
+
33
+ ---
34
+
35
+ ## 2. Weight Decay
36
+
37
+ ### Key Findings
38
+
39
+ **Standard value: 0.01** for fine-tuning. This is the BERT/RoBERTa default and works well across NER benchmarks.
40
+
41
+ **Parameter group strategy (important):**
42
+ - Weight matrices: weight_decay=0.01
43
+ - Bias terms: weight_decay=0.0
44
+ - LayerNorm params: weight_decay=0.0
45
+
46
+ This is standard practice but we should verify our training code implements it. Regularizing biases and norms is counterproductive.
47
+
48
+ **For our overfitting problem:** Could try 0.05-0.1 (higher decay = stronger regularization). But LR scheduling will likely have a bigger impact than weight decay tuning.
49
+
50
+ **Key insight:** In AdamW, effective regularization = lr × weight_decay. As LR decays via schedule, regularization automatically weakens — this is desirable as it lets the model settle into fine-grained adjustments late in training.
51
+
52
+ **Sources:**
53
+ - https://aicompetence.org/fine-tuning-with-adamw/
54
+ - https://mbrenndoerfer.com/writing/adamw-optimizer-decoupled-weight-decay
55
+
56
+ ---
57
+
58
+ ## 3. Curriculum Learning
59
+
60
+ ### Key Findings
61
+
62
+ **Yes, curriculum learning helps NER**, especially in low-resource settings. The Dual-Stage Curriculum Learning (DCL) framework (2024) shows consistent gains.
63
+
64
+ **How to define difficulty for NER samples:**
65
+ 1. **Model uncertainty (best):** Train a teacher model for a few epochs, then score samples by prediction uncertainty (Monte Carlo dropout or simple softmax entropy). High uncertainty = hard.
66
+ 2. **Top-N Least Confidence:** Average confidence of the N least-confident tokens in a sentence.
67
+ 3. **Sentence length:** Longer = harder. Crude but effective as a baseline.
68
+ 4. **Entity density:** More entities per sentence = harder (our intuition is correct).
69
+ 5. **Entity type rarity:** Sentences with rare entity types are harder.
70
+
71
+ **Practical implementation:**
72
+ 1. Train for 1-2 epochs on full data (or use a pretrained checkpoint)
73
+ 2. Score all training samples by model uncertainty
74
+ 3. Sort easy → hard
75
+ 4. Start training on easiest 30%, gradually add harder examples using a root scheduling function
76
+ 5. Full dataset incorporated by ~60% of training
77
+
78
+ **Expected gains:** +0.5-2.0 F1 points depending on dataset difficulty distribution. The DCL paper reports 25% faster convergence with improved final performance.
79
+
80
+ **Relevance for Arcspan:** Medium-high. Our cybersecurity NER data likely has a wide difficulty range (simple "CVE-2024-1234" mentions vs. complex nested vulnerability descriptions). Worth implementing after we nail LR scheduling.
81
+
82
+ **Sources:**
83
+ - DCL framework: https://arxiv.org/html/2402.13534
84
+ - Low-resource NER + curriculum: https://www.researchgate.net/publication/353753078
85
+
86
+ ---
87
+
88
+ ## 4. Label Smoothing
89
+
90
+ ### Key Findings
91
+
92
+ **Standard label smoothing (ε=0.1) has mixed results for token classification.** The issue is that BIOES tags have strict structural meaning — smoothing uniformly across O/B/I/E/S tags can confuse the Viterbi decoder.
93
+
94
+ **Boundary Smoothing (NER-specific, ACL 2022):** A much better approach. Instead of uniform smoothing, it redistributes probability mass specifically to adjacent spans. E.g., if "tokens 3-5" is annotated as an entity, boundary smoothing gives small probability to "tokens 2-5" and "tokens 3-6". This:
95
+ - Mitigates over-confidence at entity boundaries
96
+ - Improves calibration
97
+ - Produces flatter loss landscapes
98
+ - Achieves SOTA on 8 NER benchmarks
99
+
100
+ **Practical recommendation:**
101
+ - **Don't use standard label smoothing** with our BIOES + Viterbi setup — it would smooth probability toward invalid tag transitions
102
+ - **Boundary smoothing is architecturally compatible** with our approach but requires custom implementation
103
+ - **Priority: LOW for now.** The gains are real but implementation effort is non-trivial given our Viterbi constraint. Revisit after higher-impact changes.
104
+
105
+ **Sources:**
106
+ - Boundary Smoothing for NER (ACL 2022): https://aclanthology.org/2022.acl-long.490/
107
+ - Paper: https://arxiv.org/abs/2204.12031
108
+
109
+ ---
110
+
111
+ ## 5. Stochastic Weight Averaging (SWA) / Checkpoint Averaging
112
+
113
+ ### Key Findings
114
+
115
+ **This is a near-free lunch. High priority.**
116
+
117
+ **Simple checkpoint averaging:** Average the weights of the last N checkpoints (e.g., last 3-5 epochs). This is trivially implementable and consistently yields +0.3-1.0 F1 improvement over picking the single best checkpoint.
118
+
119
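+ **Why it works:** Checkpoints from the same run tend to sit at different points around the edge of one low-loss basin. Averaging them moves toward the flatter centre of that basin, which generalizes better. Izmailov et al. (2018) showed SWA finds wider optima. (Averaging only works for checkpoints that share a basin — e.g. from one training run — not for independently trained models.)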
120
+
121
+ **SWA (more sophisticated):**
122
+ - After normal training, continue for extra steps with constant or cyclic LR
123
+ - Maintain running average of weights
124
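+ - Apply batch normalization update at the end (only needed for models with BatchNorm layers; transformers that use LayerNorm/RMSNorm have no running statistics to refresh)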
125
+ - Typically uses a lower constant LR (e.g., 2e-6)
126
+
127
+ **Practical recommendation for Arcspan:**
128
+ 1. **Immediate win: Average top-3 checkpoints by val F1.** Just load checkpoints, average state_dicts, evaluate. Zero training cost.
129
+ 2. **Next: Average last 3-5 epoch checkpoints** (regardless of individual val scores). Often works even better than cherry-picking.
130
+ 3. **Later: Full SWA** with cyclical LR for the last 25% of training.
131
+
132
+ **Implementation is trivial:**
133
+ ```python
134
+ import torch
135
+ ckpts = [torch.load(f"epoch_{i}.pt") for i in range(3, 6)]
136
+ avg = {k: sum(c[k] for c in ckpts) / len(ckpts) for k in ckpts[0]}
137
+ torch.save(avg, "averaged.pt")
138
+ ```
139
+
140
+ **Sources:**
141
+ - Izmailov et al. 2018: https://arxiv.org/abs/1803.05407
142
+ - Checkpoint ensembles: https://arxiv.org/abs/1710.03282
143
+ - SWA applied to Arabic NER: https://aclanthology.org/2023.arabicnlp-1.86/
144
+
145
+ ---
146
+
147
+ ## 6. Discriminative Fine-Tuning / Layer-wise LR Decay (LLRD)
148
+
149
+ ### Key Findings
150
+
151
+ **LLRD assigns different learning rates to different layers**, with lower LR for earlier (more general) layers and higher LR for later (more task-specific) layers + classification head.
152
+
153
+ **Standard approach:**
154
+ - Classification head: base_lr (e.g., 5e-5)
155
+ - Layer N (top): base_lr × decay^1
156
+ - Layer N-1: base_lr × decay^2
157
+ - ...
158
+ - Embeddings: base_lr × decay^(N+1)
159
+ - **Typical decay factor: 0.85-0.95 per layer**
160
+
161
+ **For our 8-layer model:**
162
+ - Head: 5e-5
163
+ - Layer 7: 4.25e-5 (×0.85)
164
+ - Layer 6: 3.6e-5
165
+ - Layer 5: 3.1e-5
166
+ - Layer 4: 2.6e-5
167
+ - Layer 3: 2.2e-5
168
+ - Layer 2: 1.9e-5
169
+ - Layer 1: 1.6e-5
170
+ - Layer 0: 1.36e-5 (×0.85⁸); Embeddings: 1.16e-5 (×0.85⁹)
171
+
172
+ **Evidence:** Originally from ULMFiT (Howard & Ruder, 2018). Widely adopted in NLP fine-tuning. The "Advanced Techniques for Fine-Tuning Transformers" guide recommends it with decay=0.9 per layer. Gains are typically +0.3-0.8 F1 for NER.
173
+
174
+ **Relevance for Arcspan:** Our model is only 8 layers, so the range between top and bottom is smaller than for BERT-24. Still worth trying — the MoE routing layers in early blocks encode general token representations that benefit from conservative updates.
175
+
176
+ **Implementation:** Requires custom param groups in the optimizer. Moderate effort.
177
+
178
+ **Sources:**
179
+ - Howard & Ruder, ULMFiT (2018): https://arxiv.org/abs/1801.06146
180
+ - Advanced fine-tuning: https://towardsdatascience.com/advanced-techniques-for-fine-tuning-transformers-82e4e61e16e/
181
+
182
+ ---
183
+
184
+ ## 7. Data Augmentation for NER
185
+
186
+ ### Key Findings
187
+
188
+ **Token-level Mixup is NOT straightforward for NER** because labels are tied to specific token positions. Standard Mixup (interpolating two input embeddings) destroys token-label alignment.
189
+
190
+ **Effective NER augmentation techniques:**
191
+
192
+ 1. **Entity replacement / mention swapping:** Replace entity mentions with other entities of the same type from a gazetteer. E.g., swap "CVE-2024-1234" with "CVE-2023-5678". Preserves label structure. **High priority for cybersecurity NER** — we can generate synthetic CVEs, IPs, hashes easily.
193
+
194
+ 2. **Synonym replacement (non-entity tokens only):** Replace context words with synonyms. Preserves entity spans. Helps model learn that entities are context-independent.
195
+
196
+ 3. **Random token dropout:** Randomly mask/drop non-entity tokens. Forces model to rely on entity-internal patterns rather than context.
197
+
198
+ 4. **Sentence cropping:** Take sub-spans of long sentences as new training examples. Effectively increases dataset size.
199
+
200
+ 5. **Back-translation:** Translate to another language and back. Paraphrases context while (ideally) preserving entities. Noisy but effective.
201
+
202
+ 6. **LLM-generated synthetic data:** Use an LLM to generate new sentences containing target entity types. **Already in our pipeline — this is our primary augmentation strategy.**
203
+
204
+ **Priority for Arcspan:** Entity replacement is the highest-value technique we're not already doing. For cybersecurity entities (CVEs, IPs, domains, malware names), we can build simple gazetteers and do systematic replacement augmentation.
205
+
206
+ ---
207
+
208
+ ## 8. Batch Size and Gradient Accumulation
209
+
210
+ ### Key Findings
211
+
212
+ **Current setup:** BS=4, grad_accum=2 → effective BS=8.
213
+
214
+ **General principles:**
215
+ - For fine-tuning transformers, **effective BS of 16-32 is the sweet spot** for most NER tasks
216
+ - Larger BS → more stable gradients → can use higher LR (linear scaling rule: double BS → double LR)
217
+ - But for small datasets, **smaller BS provides implicit regularization** through gradient noise
218
+ - Diminishing returns above BS=32 for NER; some papers report degradation above BS=64
219
+
220
+ **Recommendation:**
221
+ - Try grad_accum=4 (effective BS=16) with proportionally higher LR
222
+ - Try grad_accum=8 (effective BS=32) as upper bound
223
+ - With 20K examples and BS=32: ~625 steps/epoch, ~3125 total steps for 5 epochs — still enough for meaningful LR scheduling
224
+
225
+ **Important interaction:** Larger batch size + LR warmup work synergistically. Warmup is MORE important with larger batches because the proportionally higher LR makes the initial updates more aggressive.
226
+
227
+ **The "linear scaling rule" (Goyal et al., 2017):** When multiplying batch size by k, multiply LR by k. So BS=8 at LR=5e-5 → BS=32 at LR=2e-4. But for fine-tuning (vs. pretraining), be more conservative — try BS=32 at LR=1e-4.
228
+
229
+ ---
230
+
231
+ ## Priority-Ranked Implementation Plan
232
+
233
+ Based on expected impact vs. implementation effort:
234
+
235
+ ### Tier 1: Do Immediately (high impact, low effort)
236
+ 1. **LR warmup + cosine decay** — Expected to directly address our overfitting. 10% warmup, cosine to 0. Implementation: 2 lines of code change.
237
+ 2. **Checkpoint averaging** — Average top-3 or last-3 checkpoints. Zero training cost, +0.3-1.0 F1. Implementation: 10 lines of post-processing.
238
+
239
+ ### Tier 2: Do Next (high impact, moderate effort)
240
+ 3. **Weight decay param groups** — Exclude bias/LayerNorm from decay. Set decay=0.01-0.05. Implementation: refactor optimizer setup.
241
+ 4. **Increase effective batch size** — Try BS=16 and BS=32 with proportional LR scaling. Implementation: change config values.
242
+ 5. **Layer-wise LR decay** — Decay=0.9 per layer. Implementation: custom param groups.
243
+
244
+ ### Tier 3: Experiment After Basics (moderate impact, higher effort)
245
+ 6. **Entity replacement augmentation** — Build cybersecurity gazetteers, systematic replacement. Implementation: data pipeline changes.
246
+ 7. **Curriculum learning** — Score samples by model uncertainty, train easy→hard. Implementation: training loop refactor.
247
+
248
+ ### Tier 4: Later Investigation (uncertain impact, high effort)
249
+ 8. **Boundary smoothing** — NER-specific label smoothing. Needs custom loss + Viterbi interaction analysis.
250
+ 9. **Full SWA** — Cyclical LR + weight averaging in final training phase.
251
+
252
+ ---
253
+
254
+ ## Open Questions
255
+
256
+ - Does our Viterbi decoding interact with LR scheduling in unexpected ways? (The transition constraints should be independent of training dynamics, but worth verifying.)
257
+ - Our MoE routing: should expert selection layers have their own LR group? The routing weights are crucial — too-aggressive updates could destabilize expert assignment.
258
+ - For checkpoint averaging: do we average the full model including MoE routing weights, or exclude them? Averaging routing weights from different checkpoints might create inconsistent expert assignments.
259
+ - β₂=0.98 vs 0.999: worth a quick ablation.
research/notes/progress/2026-04-24-39-class-weighting-data-scaling-research.md ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Class Weighting, Data Scaling & Error Analysis for NER
2
+
3
+ **Date:** 2026-04-24
4
+ **Context:** Arcspan cybersecurity NER — severe class imbalance (Indicator ~3% F1, Vulnerability 0% in early rounds). Enriched dataset: ~32K examples, ~79K spans. Model: OpenAI Privacy Filter (50M active params, BIOES + Viterbi).
5
+
6
+ ---
7
+
8
+ ## 1. Class-Weighted Cross-Entropy for BIOES Token Classification
9
+
10
+ ### How to Compute Weights
11
+
12
+ Two strategies, combinable:
13
+
14
+ **A. Weight by tag frequency (inverse frequency):**
15
+ - Count occurrences of each of the `1 + N*4` BIOES classes across training tokens
16
+ - Weight_c = total_tokens / (num_classes * count_c), or simpler: Weight_c = 1 / freq_c, normalized
17
+ - The O tag (often 85-95% of tokens) gets weight ~0.05-0.1; rare B-Vulnerability tags get weight ~5-20
18
+
19
+ **B. Weight by entity type (group BIOES tags):**
20
+ - All B/I/E/S tags for the same entity type share a base weight derived from entity-type frequency (the O tag belongs to no entity type and keeps its own frequency-based weight)
21
+ - Then optionally further scale B and S tags higher (they're critical for span detection)
22
+
23
+ ### Practical Implementation
24
+ ```python
25
+ # Inverse-frequency weights
26
+ from collections import Counter
27
+ counts = Counter(all_training_tags)
28
+ total = sum(counts.values())
29
+ weights = {tag: total / (len(counts) * counts[tag]) for tag in counts}
30
+ # Clamp to avoid extreme values
31
+ weights = {t: min(w, 20.0) for t, w in weights.items()}
32
+ ```
33
+
34
+ ### Key Findings
35
+ - Standard approach in NER literature; used in spaCy, HuggingFace token classifiers
36
+ - **Risk**: Extreme weights destabilize training. Clamp at 10-20x max.
37
+ - **Recommendation**: Start with sqrt(inverse frequency) — less aggressive than raw inverse frequency but still effective. Monitor whether model stops predicting O entirely (overcorrection).
38
+
39
+ **Sources:**
40
+ - https://datascience.stackexchange.com/questions/94021/imbalance-classes-in-named-entity-recognition
41
+ - https://stats.stackexchange.com/questions/63635/named-entity-recognition-and-class-imbalance
42
+
43
+ ---
44
+
45
+ ## 2. Focal Loss for NER
46
+
47
+ ### Formula
48
+ `FL(p_t) = -α_t * (1 - p_t)^γ * log(p_t)`
49
+
50
+ ### Does It Help with O-Tag Dominance?
51
+ **Yes, directly.** O-tag tokens are "easy" examples (model quickly learns to predict them with high confidence). Focal loss down-weights these easy examples via the (1-p_t)^γ factor, automatically focusing gradient on hard tokens (entity boundaries, rare types).
52
+
53
+ ### Recommended Gamma Values
54
+ | Source | Recommended γ | Context |
55
+ |--------|--------------|---------|
56
+ | Lin et al. (original paper) | **2.0** | Object detection (analogous imbalance) |
57
+ | NLP practitioners | **0.5 - 1.5** | Text classification, moderate imbalance |
58
+ | Extreme imbalance | **2.0 - 5.0** | When majority class >90% |
59
+
60
+ **For our case** (O-tag likely 85-92%): Start with **γ=2.0**, sweep {1.0, 2.0, 3.0, 5.0}.
61
+
62
+ ### Focal vs. Class-Weighted CE
63
+ - Focal loss is **adaptive** — it re-weights each token by the model's current confidence, so hard (low-confidence) tokens contribute more to the gradient
64
+ - Class-weighted CE is **static** — same weight regardless of model confidence
65
+ - Can combine both: weighted focal loss (α per class + γ focusing)
66
+ - **Recommendation**: Try focal loss alone first (γ=2), then weighted focal if needed
67
+
68
+ ### MoM (Majority or Minority) Learning — Novel Alternative
69
+ Paper: "Majority or Minority: Data Imbalance Learning Method for NER" (arXiv 2401.11431, 2024):
70
+ - Adds auxiliary loss computed only on O-class samples
71
+ - Single hyperparameter λ instead of per-class weights
72
+ - Outperformed focal loss and dice loss on CoNLL2003, OntoNotes5.0, KWDLC
73
+ - **Worth investigating** for our use case
74
+
75
+ **Sources:**
76
+ - https://arxiv.org/html/2401.11431
77
+ - https://stats.stackexchange.com/questions/567859/how-to-choose-gamma-parameter-in-focal-loss
78
+ - https://aicompetence.org/tuning-gamma-in-focal-loss/
79
+
80
+ ---
81
+
82
+ ## 3. Dice Loss / F1-Oriented Loss
83
+
84
+ ### Key Insight
85
+ Dice coefficient ≡ F1 score mathematically: `2TP / (2TP + FP + FN)`
86
+
87
+ Dice loss = 1 - Dice coefficient. Directly optimizes the metric we care about.
88
+
89
+ ### Paper: "Dice Loss for Data-imbalanced NLP Tasks" (Li et al., 2020)
90
+ - Proposed replacing CE with Dice loss for NER, text classification, MRC
91
+ - Dice loss **attaches equal importance to FP and FN**, making it robust to class imbalance
92
+ - On NER benchmarks: improved F1 for minority classes without hurting majority
93
+
94
+ ### Implementation for Token Classification
95
+ - Compute per-class soft Dice over all tokens in a batch
96
+ - Average across entity classes (optionally excluding O)
97
+ - Self-adjusting Dice (DSC) variant scales each token's probability by a `(1 - p)` factor to down-weight easy examples, in addition to the smoothing term
98
+
99
+ ### Tradeoffs
100
+ | Aspect | Dice Loss | CE Loss |
101
+ |--------|-----------|---------|
102
+ | Class imbalance handling | Excellent (inherent) | Requires weighting |
103
+ | Gradient quality | Can be noisy for rare classes | Stable |
104
+ | Optimization landscape | Non-convex even in the logits | Convex in the logits (the full network is still non-convex) |
105
+ | Common practice | Growing adoption | Standard default |
106
+
107
+ ### Recommendation
108
+ **Use as secondary experiment.** Dice loss is promising but the non-convex optimization can cause training instability. Try: (1) CE baseline, (2) focal loss γ=2, (3) Dice loss, and compare.
109
+
110
+ **Sources:**
111
+ - https://www.semanticscholar.org/paper/Dice-Loss-for-Data-imbalanced-NLP-Tasks-Li-Sun/5487dadb5b4b8b240ab4ae28705acc0b9f138db0
112
+ - https://www.researchgate.net/publication/343302480_Dice_Loss_for_Data-imbalanced_NLP_Tasks
113
+
114
+ ---
115
+
116
+ ## 4. Data Scaling — Expected Gains from 3-5x More Training Data
117
+
118
+ ### Current State
119
+ - Using ~3K of 193K NVD records, ~1.7K of ~2K MITRE records
120
+ - Total: ~32K examples, ~79K spans
121
+ - Potential: 100K+ examples with more NVD sampling
122
+
123
+ ### Scaling Laws for NER
124
+ NER follows **power-law scaling**: performance ∝ D^α where α is typically 0.1-0.3 for token classification tasks with pre-trained models.
125
+
126
+ **What this means concretely:**
127
+ - Going from 32K → 100K examples (3x) would yield roughly **3^0.2 ≈ 1.25x** improvement factor
128
+ - If current F1 gap from ceiling is 20 points, expect ~5 point gain
129
+ - Going from 32K → 160K (5x) → **5^0.2 ≈ 1.38x** → ~7-8 point gain
130
+
131
+ ### Key Findings from Literature
132
+ 1. **Pre-trained models reduce data hunger**: Since our model is fine-tuned (not trained from scratch), we're in the "transfer learning" regime where gains from more data are more modest
133
+ 2. **Diminishing returns are real**: Most gains come in the first 10-50K examples for pre-trained models
134
+ 3. **Per-entity scaling matters more than total**: Adding 10K more examples with zero Vulnerability mentions won't help Vulnerability F1. **Data scaling should target underrepresented entity types**
135
+ 4. **Quality > quantity past a threshold**: At 100K+ examples, annotation noise dominates gains
136
+
137
+ ### Recommendation
138
+ - **High priority**: Scale Vulnerability and Indicator examples specifically (these are our failure modes)
139
+ - **Medium priority**: Go from 3K → 15K NVD records (5x) — expect meaningful but not transformative gains
140
+ - **Track learning curves**: Train on 10%, 25%, 50%, 100% of data and plot F1 per entity type to find actual diminishing returns point for OUR data
141
+
142
+ **Sources:**
143
+ - https://arxiv.org/abs/2001.08361 (Scaling Laws for Neural Language Models)
144
+ - https://pmc.ncbi.nlm.nih.gov/articles/PMC11228526/ (Explaining Neural Scaling Laws)
145
+
146
+ ---
147
+
148
+ ## 5. Active Learning for NER
149
+
150
+ ### Does It Help?
151
+ **Yes — up to 66% annotation savings** compared to random sampling (PMC4934373).
152
+
153
+ ### Best Strategy: Uncertainty Sampling
154
+ 1. Train initial model on small seed set
155
+ 2. Run inference on unlabeled pool
156
+ 3. Select examples where model is **least confident** (highest token-level entropy, especially on entity tokens)
157
+ 4. Annotate those, retrain, repeat
158
+
159
+ ### Specific Methods (ranked by effectiveness)
160
+ 1. **Entity Entropy**: Sum entropy only over tokens the model predicts as entity-start tags (B-*, plus S-* under our BIOES scheme, since single-token entities never get a B- tag). Best for targeting entity-specific uncertainty.
161
+ 2. **N-best Sequence Entropy**: Entropy over top-N label sequences (N=3 typical)
162
+ 3. **Least Confidence**: 1 - P(best sequence)
163
+ 4. **Margin Sampling**: Difference between top-2 predictions
164
+
165
+ ### For Our Use Case
166
+ Since we have 193K NVD records, of which ~190K are still unused and unlabeled:
167
+ - Train initial model on current 32K
168
+ - Score all 190K unused NVD records by uncertainty
169
+ - Select top 5-10K with highest entity entropy
170
+ - Generate spans for those (via our LLM pipeline)
171
+ - This targets exactly the examples our model struggles with
172
+
173
+ ### Practical Concern
174
+ Our annotation is LLM-based (not human), so "annotation cost" is API cost, not human time. Active learning still helps by **ensuring we generate training data for the hardest cases** rather than easy/redundant ones.
175
+
176
+ **Sources:**
177
+ - https://pmc.ncbi.nlm.nih.gov/articles/PMC4934373/
178
+ - https://journals.sagepub.com/doi/10.3233/IDT-200048
179
+
180
+ ---
181
+
182
+ ## 6. Oversampling Rare Entity Types
183
+
184
+ ### Approaches
185
+ 1. **Simple duplication**: Repeat examples containing rare entities N times in training data
186
+ 2. **Sentence-level oversampling**: Duplicate entire sentences, not just the entity tokens
187
+ 3. **Context variation**: Use the rare entity in different synthetic contexts (LLM-generated)
188
+ 4. **Entity replacement augmentation**: Replace common entities with rare type entities in existing sentences
189
+
190
+ ### Risks
191
+ - **Overfitting**: Duplicated examples reduce effective dataset diversity; model memorizes specific contexts
192
+ - **Distribution shift**: Training distribution no longer matches real-world frequency
193
+ - **Boundary artifacts**: If always seeing the same token patterns around rare entities, model learns spurious boundary cues
194
+
195
+ ### Better Alternatives (ranked)
196
+ 1. **Loss weighting** (focal/class-weighted) — addresses imbalance without altering data distribution
197
+ 2. **Targeted data generation** — generate NEW diverse examples for rare types via LLM, don't just duplicate
198
+ 3. **Moderate oversampling (2-3x)** combined with loss weighting — lower overfitting risk than aggressive oversampling
199
+ 4. **SMOTE-like approaches** — don't translate well to NER (token sequences aren't continuous vectors)
200
+
201
+ ### Recommendation
202
+ **Don't oversample more than 3x.** Prefer generating genuinely new examples for rare types (we already have the LLM pipeline for this). Combine with focal loss γ=2 for remaining imbalance.
203
+
204
+ ---
205
+
206
+ ## 7. Error Analysis Methodology
207
+
208
+ ### Span-Level Error Taxonomy
209
+ Standard NER error categories:
210
+
211
+ | Error Type | Description | Example |
212
+ |-----------|-------------|---------|
213
+ | **Missing** | Entity in gold, not predicted | Failed to detect CVE-2024-1234 |
214
+ | **Spurious** | Predicted entity not in gold | Marked "version" as Software |
215
+ | **Type error** | Correct span, wrong label | Detected "log4j" but labeled as Indicator instead of Software |
216
+ | **Boundary error (left)** | Span starts too early/late | "Apache Log4j" → only detected "Log4j" |
217
+ | **Boundary error (right)** | Span ends too early/late | "CVE-2024-1234" → detected "CVE-2024" |
218
+ | **Boundary + Type** | Both span and label wrong | Combined error |
219
+
220
+ ### Tools
221
+ 1. **seqeval** (Python): Standard NER evaluation, entity-level P/R/F1 per type. `pip install seqeval`
222
+ 2. **nervaluate** (Python): Supports partial match scoring (exact, partial, type, overlap). `pip install nervaluate`
223
+ 3. **Custom span-diff scripts**: Compare gold vs pred span lists, categorize each error
224
+ 4. **Confusion matrices**: Build entity-type confusion matrix (what gets confused with what)
225
+ 5. **Length-stratified analysis**: Group spans by token length (1, 2-3, 4+) and compute F1 per bucket
226
+
227
+ ### Recommended Error Analysis Pipeline for Arcspan
228
+ ```
229
+ 1. Run inference on eval set
230
+ 2. Extract (gold_spans, pred_spans) per example
231
+ 3. Align spans using IoU overlap
232
+ 4. Categorize each error: missing/spurious/type/boundary
233
+ 5. Aggregate by entity type → identify systematic patterns
234
+ 6. Stratify by span length, position in sentence, entity density
235
+ 7. Sample 50 worst errors per type for qualitative review
236
+ ```
237
+
238
+ ### Key Diagnostic Questions
239
+ - Which entity types have high recall but low precision? (spurious predictions)
240
+ - Which have low recall? (model can't detect them — need more data or better features)
241
+ - Are boundary errors concentrated on multi-token spans? (common with BIOES)
242
+ - Does the model confuse specific type pairs? (e.g., Indicator vs. Software)
243
+
244
+ ---
245
+
246
+ ## 8. Ensemble Methods for NER with Viterbi Decoding
247
+
248
+ ### Standard Approaches
249
+ 1. **Logit averaging**: Average pre-softmax logits from N models, then run single Viterbi decode
250
+ 2. **Probability averaging**: Average softmax probabilities, then Viterbi decode
251
+ 3. **Span-level voting**: Each model produces spans independently; take majority vote on span boundaries + types
252
+ 4. **Stacking**: Train a meta-model on concatenated features from N base models
253
+
254
+ ### Compatibility with Viterbi Decoding
255
+
256
+ **Logit/probability averaging + Viterbi is the cleanest approach:**
257
+ - Each model produces per-token logits over BIOES classes
258
+ - Average the logits (or log-probs) across models
259
+ - Run Viterbi on the averaged emission scores with the standard BIOES transition constraints
260
+ - This preserves valid BIOES sequences while benefiting from ensemble diversity
261
+
262
+ **Span-level voting is simpler but loses Viterbi benefits:**
263
+ - Each model independently runs Viterbi → produces span list
264
+ - Merge spans: keep spans where ≥K of N models agree (on type + boundary overlap ≥50%)
265
+ - Risk: may produce fewer spans (conservative) or need tie-breaking rules
266
+
267
+ ### Expected Gains
268
+ - Typical NER ensemble gains: **+0.5 to 2.0 F1 points** over best single model
269
+ - Gains are larger when individual models have diverse errors (different seeds, different data subsets)
270
+ - Diminishing returns beyond 3-5 models
271
+
272
+ ### Practical Considerations for Arcspan
273
+ - Our model is small (50M active params) → training 3-5 variants is cheap
274
+ - Diversity sources: different random seeds, different data subsets (bagging), different loss functions (CE vs focal vs dice)
275
+ - **Recommendation**: Train 3 models (different seeds + CE/focal/dice), average logits before Viterbi
276
+ - Inference cost: 3x for ensemble, but base model is already fast (50M params)
277
+
278
+ ---
279
+
280
+ ## Summary: Priority-Ordered Action Plan
281
+
282
+ | Priority | Action | Expected Impact | Effort |
283
+ |----------|--------|----------------|--------|
284
+ | **P0** | Implement focal loss (γ=2) | +3-5 F1 on rare types | Low |
285
+ | **P0** | Class-weighted CE as baseline comparison | Baseline improvement | Low |
286
+ | **P0** | Error analysis pipeline | Diagnostic (guides all other work) | Medium |
287
+ | **P1** | Scale Vulnerability/Indicator training data specifically | +5-10 F1 on those types | Medium |
288
+ | **P1** | Learning curve analysis (10/25/50/100% data) | Know where we are on scaling curve | Medium |
289
+ | **P1** | Active learning for NVD selection | Better data efficiency | Medium |
290
+ | **P2** | Dice loss experiment | May outperform focal for rare types | Low |
291
+ | **P2** | MoM loss (arxiv 2401.11431) | Novel, shown to beat focal+dice | Medium |
292
+ | **P2** | 3-model ensemble (logit averaging + Viterbi) | +1-2 F1 overall | Medium |
293
+ | **P3** | Moderate oversampling (2-3x) of rare types | Small gains, risk of overfitting | Low |
294
+
295
+ ### Key Insight
296
+ **Loss function changes (P0) are highest ROI**: they require minimal code changes and directly address our core problem. Data scaling (P1) is next — but should be targeted at underperforming entity types, not uniform scaling. Ensembles (P2) are a reliable but incremental improvement for after the fundamentals are solid.
research/notes/progress/2026-04-24-40-round4b-killed-no-checkpoint.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Round 4b (13-class) Training Results
2
+
3
+ ## Configuration
4
+ - **Data**: 20,436 aggregated examples, 13-class (53 token labels)
5
+ - **LR**: 5e-5, **Epochs**: 15, **BS**: 1, **Grad accum**: 8
6
+ - **GPU**: RTX 5090 32GB
7
+
8
+ ## Epoch-by-Epoch Val Loss
9
+ | Epoch | Train Loss | Val Loss | Val Acc |
10
+ |-------|-----------|----------|---------|
11
+ | 1 | 0.360 | 0.237 | 92.8% |
12
+ | 2 | 0.237 | 0.196 | 94.1% |
13
+ | 3 | 0.188 | 0.179 | 94.7% |
14
+ | **4** | **0.155** | **0.178** ← best | **94.8%** |
15
+ | 5 | 0.129 | 0.178 | 95.1% |
16
+ | 6 | 0.109 | 0.183 ↑ | 95.3% |
17
+ | 7 | 0.092 | 0.198 ↑↑ | 95.3% |
18
+
19
+ Killed at epoch 7 due to clear overfitting (3 consecutive epochs without val_loss improvement: flat at epoch 5, rising at epochs 6 and 7, while train loss kept falling).
20
+
21
+ ## Key Finding
22
+ - **No checkpoint saved** — `opf train` saves best checkpoint at end of training, not per-epoch. Killing mid-run = no checkpoint.
23
+ - Same overfitting pattern as R4a (5-class): best at epoch 3-4, then degradation
24
+ - 13-class converges to a higher best val_loss (0.178 vs 0.126 for 5-class) at a similar epoch — expected with 53 vs 21 token classes
25
+ - **Lesson**: Need to let training finish naturally or implement per-epoch checkpoint saving
26
+
27
+ ## Impact
28
+ - R4b eval results lost. R5b on enriched 32K data will supersede this anyway.
29
+ - Confirms early stopping at patience=3 is correct strategy for R5.
research/notes/progress/2026-04-24-44-r5a-baseline-results.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # R5a Baseline Results — 5-class Enriched (Epoch 3 Best)
2
+
3
+ ## Training Summary
4
+ - **Config**: flat LR=5e-5, CE loss, no warmup, no LLRD, BS=4 + grad_accum=2
5
+ - **Best epoch**: 3 (val_loss=0.145), early stopping killed at epoch 6 (val_loss=0.160)
6
+ - **Overfitting**: train_loss dropped to 0.032 by epoch 6, val_loss plateau at 0.145-0.147
7
+
8
+ ## Span F1 Results
9
+ | Test Set | Span F1 | Span Precision | Span Recall |
10
+ |----------|---------|----------------|-------------|
11
+ | Enriched test (3853 ex) | **0.6315** | 0.7776 | 0.5316 |
12
+ | CyNER test (748 ex) | **0.5153** | 0.8261 | 0.3744 |
13
+ | SecureBERT2 test (200 ex) | **0.4654** | 0.8421 | 0.3216 |
14
+
15
+ ## Per-Class Span F1 (Enriched test)
16
+ | Class | Precision | Recall | F1 |
17
+ |-------|-----------|--------|----|
18
+ | Vulnerability | 0.816 | 0.674 | 0.738 |
19
+ | Malware | 0.817 | 0.561 | 0.665 |
20
+ | Organization | 0.780 | 0.526 | 0.628 |
21
+ | Indicator | 0.531 | 0.635 | 0.579 |
22
+ | System | 0.768 | 0.364 | 0.494 |
23
+
24
+ ## Key Findings
25
+ 1. **Precision >> Recall everywhere** — model is conservative, under-predicts spans
26
+ 2. **Indicator collapses on CyNER/SB2** — 4-5% recall vs 64% on enriched test. Distribution mismatch.
27
+ 3. **System weakest class** — 36% recall, struggles with entity boundaries
28
+ 4. **Vulnerability strongest** — best F1 at 0.738, probably cleanest label boundaries
29
+ 5. **S- (single-token) tags severely underperformed** — S-Malware F1=0.26, S-Indicator F1=0.00
30
+
31
+ ## Gap Analysis (target: 80% span F1)
32
+ Current: 63% → Need: 80% = **17 point gap**
33
+
34
+ Levers to close it:
35
+ - **Focal loss** (γ=2): Should boost recall by down-weighting easy O tokens
36
+ - **Cosine LR + warmup**: Better convergence, less overfitting
37
+ - **LLRD** (0.9): Preserve pretrained features in lower layers
38
+ - **Viterbi tuning**: Free +1-3 F1 from transition bias calibration
39
+ - **Checkpoint averaging**: Smooth out noise
40
+
41
+ ## Inference Speed
42
+ - Enriched: 7,757 tok/s (longer sequences = better batching)
43
+ - CyNER: 4,043 tok/s
44
+ - SB2: 3,577 tok/s
45
+
46
+ ## Source
47
+ - Checkpoint: `checkpoints/r5a_enriched_5class/epoch_3/`
48
+ - Logs: `eval_r5a_enriched.log`, `eval_r5a_cyner.log`, `eval_r5a_sb2.log`
research/notes/progress/2026-04-24-45-data-quality-audit.md ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Quality Audit: enriched_5class Training Data
2
+
3
+ **Date:** 2026-04-24
4
+ **Context:** Model trained on this data has precision=0.78 but recall=0.53, suggesting systematic missed-entity problems in training annotations.
5
+
6
+ ---
7
+
8
+ ## 1. Dataset Overview
9
+
10
+ | Split | Examples | Empty (no entities) | Entity char density |
11
+ |-------|---------|-------------------|-------------------|
12
+ | Train | 31,510 | 5,183 (16.4%) | 9.7% |
13
+ | Enriched Test | 3,853 | 1,242 (32.2%) | 7.4% |
14
+ | CyNER Test | 748 | 317 (42.4%) | 10.5% |
15
+
16
+ **Train sources** (17 sources, highly heterogeneous):
17
+
18
+ | Source | Examples | Entity density |
19
+ |--------|---------|---------------|
20
+ | cyberner_stix_train | 7,922 | 9.1% |
21
+ | cyner2_train | 4,952 | 7.0% |
22
+ | **exploitdb** | **4,344** | **78.6%** |
23
+ | dnrti_train | 3,187 | 5.2% |
24
+ | nvd_v2 | 2,661 | 2.8% |
25
+ | apt_reports | 2,295 | 6.7% |
26
+ | **synthetic_v2** | **2,000** | **36.3%** |
27
+ | cyner_train | 1,893 | 4.2% |
28
+ | mitre_attack_v2 | 1,544 | 3.5% |
29
+ | synthetic_ioc | 100 | 40.7% |
30
+ | + 7 smaller sources | 512 | varies |
31
+
32
+ Duplicates: only 2 exact-duplicate texts found. Not a major issue.
33
+
34
+ ---
35
+
36
+ ## 2. CRITICAL: Massive Unlabeled Indicators (Root Cause of Low Recall)
37
+
38
+ **Finding:** In the first 5,000 training examples alone, **367 obvious IP addresses and hashes appear in the text but are NOT labeled as Indicator entities.** Extrapolating to the full 31.5K examples, there are likely **2,000+ unlabeled IOCs** in the training data.
39
+
40
+ **Examples of missed labels:**
41
+
42
+ - `"MD5 : 69b4b32e4636f1981841cbbe3b927560"` — hash appears in text immediately after "MD5 :" but is not labeled (cyner_train)
43
+ - `"b45defca452a640b303288131eb64c485f442aae0682a3c56489d24d59439b47 d96017..."` — a line full of hashes, none labeled (cyner_train)
44
+ - `"Indicators of Compromise SHA256 Package App label 332e68d8..."` — IOC listing section where multiple hashes go unlabeled
45
+
46
+ **Impact:** This is the single biggest driver of low recall. The model learns from these examples that hashes/IPs in text should be tagged `O`, directly suppressing Indicator detection. This creates a **false-negative training signal** that explains both:
47
+ - Low overall recall (0.53)
48
+ - Catastrophic CyNER Indicator recall (4%)
49
+
50
+ ---
51
+
52
+ ## 3. S-tag (Single-Token Entity) Analysis
53
+
54
+ Train span counts and single-token fractions:
55
+
56
+ | Class | Total spans | Single-token | % Single |
57
+ |-------|-----------|-------------|---------|
58
+ | Indicator | 17,964 | 17,621 | **98.1%** |
59
+ | Malware | 18,700 | 15,243 | 81.5% |
60
+ | Organization | 16,892 | 10,894 | 64.5% |
61
+ | System | 16,444 | 7,946 | 48.3% |
62
+ | Vulnerability | 11,854 | 4,738 | 40.0% |
63
+
64
+ **S-tag representation is actually abundant in training.** The problem is NOT insufficient S-tag examples. Rather:
65
+
66
+ - **S-Indicator F1=0.00** is caused by the massive missed-entity problem (Section 2) — the model learns to suppress Indicator detection.
67
+ - **S-Malware F1=0.26** is unlikely to be a data-scarcity problem: 81.5% of Malware spans are single-token, which is actually very high. It may reflect boundary confusion, but the more likely cause is label noise (see Section 7).
68
+
69
+ ---
70
+
71
+ ## 4. Indicator Distribution Mismatch (Enriched vs CyNER)
72
+
73
+ ### Type distribution comparison:
74
+
75
+ | Indicator type | Train | CyNER Test | Enriched Test |
76
+ |---------------|-------|-----------|--------------|
77
+ | Domain | 6,356 (35%) | 139 (53%) | similar |
78
+ | IP/partial-IP | 4,507 (25%) | 7 (3%) | similar |
79
+ | Hash | 3,692 (21%) | 23 (9%) | similar |
80
+ | URL | 1,907 (11%) | 0 (0%) | similar |
81
+ | Filename | 590 (3%) | 23 (9%) | similar |
82
+ | Filepath | 43 (<1%) | 17 (7%) | minimal |
83
+ | Other | 819 (5%) | 51 (20%) | minimal |
84
+
85
+ **Key mismatches:**
86
+ 1. **CyNER is domain-heavy (53%)** while train has more IPs and hashes. But more critically, CyNER's "domains" are often **defanged** (`uyghurapps [ .`, `negg.ddns [ .`) while training data mostly contains clean indicators.
87
+ 2. **CyNER has many "other" indicators (20%)** including registry paths (`Software\Microsoft\Windows\CurrentVersion\Run`), package names (`jp.naver.line.android`, `com.whatsapp`), and tool names (`DroidVPN`). These types are rare in training.
88
+ 3. **CyNER uses defanged notation** (`hxxp`, `[ .`, `[.]`) in 38/748 examples (5%). Training has defanged indicators at a similar rate — 1,588/31,510 examples (5%) — but even then they're handled inconsistently.
89
+
90
+ **This explains the 64% enriched Indicator recall vs 4% CyNER Indicator recall:** the enriched test set was drawn from the same distribution as training (clean IOCs, standard formats), while CyNER contains defanged indicators, partial IPs (`222.139.212 [ .`), and unconventional indicator types that the model never learned to recognize.
91
+
92
+ ---
93
+
94
+ ## 5. ExploitDB Source: Degenerate Data Distribution
95
+
96
+ The `exploitdb` source (4,344 examples = 13.8% of training) has **78.6% entity density** — nearly the entire text is entities. These examples are exploit titles:
97
+
98
+ ```
99
+ TEXT: "Android - ashmem Readonly Bypasses via remap_file_pages() and ASHMEM_UNPIN"
100
+ SPANS: System: "Android", Vulnerability: "ashmem Readonly Bypasses via remap_file_pages()..."
101
+
102
+ TEXT: "FLEX 1080 < 1085 Web 1.6.0 - Denial of Service"
103
+ SPANS: System: "FLEX 1080 < 1085 Web 1.6.0", Vulnerability: "Denial of Service"
104
+ ```
105
+
106
+ **Problems:**
107
+ 1. **Not real sentences** — these are structured titles with a `System - Vulnerability` pattern. The model learns a degenerate heuristic: "everything before the dash is System, everything after is Vulnerability."
108
+ 2. **Inflates System and Vulnerability counts** with formulaic patterns, hurting generalization to natural text.
109
+ 3. **System spans include version numbers** (e.g., "FLEX 1080 < 1085 Web 1.6.0") which creates artificially long, oddly-bounded System entities.
110
+
111
+ ### Synthetic data (synthetic_v2, synthetic_ioc): 2,100 examples, 36-41% density
112
+
113
+ Similarly templated:
114
+ ```
115
+ "FireEye published a threat intelligence report linking Velvet Tempest to a new
116
+ campaign exploiting CVE-2021-10425 in Apache Struts."
117
+ ```
118
+
119
+ All follow identical sentence structure. While less degenerate than ExploitDB, these teach the model template-matching rather than genuine entity recognition.
120
+
121
+ ---
122
+
123
+ ## 6. Span Length Distribution Issues
124
+
125
+ **Vulnerability spans are often multi-token, with a long tail:**
126
+ - Train: mean 2.1 tokens, but many 3-token spans (2,756 instances). Common pattern: `"Denial of Service"`, `"Remote Code Execution"`, `"SQL Injection"`.
127
+ - ExploitDB contributes very long vulnerability descriptions as single spans: `"ashmem Readonly Bypasses via remap_file_pages() and ASHMEM_UNPIN"` (8+ tokens).
128
+
129
+ **Indicator spans are almost always single-token (98.1%):**
130
+ This makes sense — IPs, hashes, and URLs are typically single whitespace-delimited tokens. But it means multi-token Indicator spans in CyNER (like `"% APPDATA % /myupd/gen/ % Y % m..."` at 55 tokens!) are completely out-of-distribution.
131
+
132
+ ---
133
+
134
+ ## 7. Label Consistency Issues (Spot-Check)
135
+
136
+ Examining specific examples:
137
+
138
+ 1. **"Apple" labeled inconsistently:** In line 21 it's `System: Apple` ("Apple devices"), in line 22 it's `Organization: Apple` ("Apple phishing site"). Both are arguably correct in context, but this teaches the model conflicting signals for the same surface form.
139
+
140
+ 2. **File extensions labeled as Indicator:** `.dll`, `.exe`, `.NET`, `.PDF` etc. appear as Indicator spans (67 total). These are not IOCs — they're generic file types. This adds noise to the Indicator class.
141
+
142
+ 3. **Generic terms as Indicator:** `".NET"` appears as both Indicator and as part of malware names. The class is polluted with non-IOC content.
143
+
144
+ 4. **Vulnerability spans too vague:** `"security vulnerabilities"` (CyNER test line 9) is labeled as Vulnerability — this is a generic phrase, not a specific vulnerability.
145
+
146
+ ---
147
+
148
+ ## 8. Summary of Root Causes for Low Recall
149
+
150
+ Ranked by estimated impact:
151
+
152
+ | # | Issue | Impact | Affected classes |
153
+ |---|-------|--------|-----------------|
154
+ | 1 | **~2,000+ unlabeled IOCs in training** | **CRITICAL** — teaches model to suppress Indicator detection | Indicator (primary), overall recall |
155
+ | 2 | **Indicator distribution mismatch** — defanged notation, package names, registry paths absent from training | **HIGH** — explains 64%→4% CyNER Indicator recall drop | Indicator |
156
+ | 3 | **ExploitDB degenerate patterns** — 4,344 title-format examples with 78.6% entity density | **MEDIUM** — inflates System/Vuln counts with non-generalizable patterns | System, Vulnerability |
157
+ | 4 | **Synthetic template monotony** — 2,100 examples from identical sentence templates | **MEDIUM** — model memorizes template rather than entity features | All classes |
158
+ | 5 | **Label inconsistency** — same surface form gets different labels (Apple=System vs Organization) | **LOW-MEDIUM** — creates conflicting gradients | Organization, System |
159
+ | 6 | **Indicator class pollution** — file extensions, protocol names labeled as Indicator | **LOW** — 76 noisy instances out of 17,964 | Indicator |
160
+
161
+ ---
162
+
163
+ ## 9. Recommendations
164
+
165
+ 1. **Fix unlabeled IOCs** — Run regex-based annotation over training data to label all IPs, hashes, URLs, and domains that currently have no span annotation. This is the highest-ROI fix.
166
+
167
+ 2. **Add defanged indicator examples** — Either add CyNER-style defanged IOCs to training, or normalize both train and test to the same format.
168
+
169
+ 3. **Downweight or remove ExploitDB** — These 4,344 title-format examples are hurting generalization. Either remove them or downweight by 5-10x during training.
170
+
171
+ 4. **Diversify synthetic templates** — The 2,000 synthetic_v2 examples use identical structure. Rewrite with varied sentence patterns.
172
+
173
+ 5. **Add CyNER-style indicator types to training** — Package names, registry paths, and file paths are underrepresented. Add examples from CyNER train or synthesize them.
174
+
175
+ 6. **Clean Indicator class** — Remove bare file extensions (`.dll`, `.exe`) from Indicator labels. These aren't IOCs.
research/notes/progress/2026-04-24-46-competitor-deep-dive.md ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Competitor Deep-Dive: Cybersecurity NER Models
2
+
3
+ **Date:** 2026-04-24
4
+ **Context:** Our Arcspan baseline gets 63% span F1 on cybersecurity NER. Understanding exactly what the top models did to reach their numbers.
5
+
6
+ ---
7
+
8
+ ## 1. SecureBERT 2.0 (Cisco AI) — F1: 94.5%
9
+
10
+ **Paper:** Aghaei et al., "SecureBERT 2.0: Advanced Language Model for Cybersecurity Intelligence" (arXiv:2510.00240, Sep 2025)
11
+ **HuggingFace:** [cisco-ai/SecureBERT2.0-NER](https://huggingface.co/cisco-ai/SecureBERT2.0-NER)
12
+ **GitHub:** [cisco-ai-defense/securebert2](https://github.com/cisco-ai-defense/securebert2)
13
+
14
+ ### Architecture
15
+ - **Base model:** ModernBERT (custom pretrained) — 22 layers, d_model=768, 12 heads
16
+ - **Max seq length:** 8192 (fine-tuned on 1024)
17
+ - **Vocab:** 50,368
18
+
19
+ ### Training Recipe
20
+ | Setting | Value |
21
+ |---------|-------|
22
+ | Optimizer | AdamW |
23
+ | LR | **1e-5** |
24
+ | Scheduler | Linear |
25
+ | Weight Decay | 0.001 |
26
+ | Batch Size | 8 per GPU (8x A100) |
27
+ | Epochs | **20** |
28
+ | Max Seq Len | 1024 |
29
+ | Grad Clip | 1.0 |
30
+ | Precision | fp16 |
31
+ | Loss | **Token-wise Cross Entropy** (no CRF, no class weights mentioned) |
32
+
33
+ ### Data Pipeline
34
+ - **Train:** 3,400 samples / **Test:** 717 samples
35
+ - **Source:** Manually annotated threat intelligence documents by domain experts
36
+ - **Preprocessing:** Subword tokenization with label alignment to subword tokens
37
+ - No augmentation mentioned
38
+
39
+ ### Label Space
40
+ - **5 entity types:** Malware, Indicator, System, Organization, Vulnerability
41
+ - **BIO scheme** (not BIOES): B-X, I-X, O → 11 classes total
42
+ - Same label space as the CyNER dataset
43
+
44
+ ### Special Techniques
45
+ - **None documented** — no CRF, no class weighting, no focal loss, no ensemble
46
+ - The key weapon is **domain-adaptive pretraining**: SecureBERT 2.0 base was pretrained on 13.6B cybersecurity tokens (13x more than v1)
47
+
48
+ ### Per-Class Results
49
+ - **Not published per-class** — only aggregate F1=0.945, R=0.965, P=0.927
50
+
51
+ ### What They Say Matters
52
+ > "Domain-adaptive pretraining and fine-tuning on cybersecurity corpora dramatically improves NER performance."
53
+
54
+ **Key insight for us:** The jump from SecureBERT v1 (73.4%) to v2 (94.5%) came primarily from **massive domain pretraining** (13.6B tokens), NOT from NER-specific tricks. The NER fine-tuning recipe is completely vanilla.
55
+
56
+ ---
57
+
58
+ ## 2. SecureBERT v1 (Original) — F1: 73.4%
59
+
60
+ **Paper:** Aghaei et al., "SecureBERT: A Domain-Specific Language Model for Cybersecurity" (SecureComm 2022, Springer)
61
+ **HuggingFace:** [ehsanaghaei/SecureBERT](https://huggingface.co/ehsanaghaei/SecureBERT)
62
+
63
+ ### Architecture
64
+ - **Base model:** RoBERTa-base, further pretrained on cybersecurity corpus via MLM
65
+ - Standard RoBERTa-base size (~125M params)
66
+
67
+ ### Training Recipe (NER fine-tuning)
68
+ - Details not fully published in model card; paper behind Springer paywall
69
+ - Same CyNER dataset and label space as v2
70
+ - Standard token classification head
71
+
72
+ ### What We Know
73
+ - Pretrained on a cybersecurity corpus (much smaller than v2's 13.6B tokens)
74
+ - NER F1 of 73.4% on same eval set as v2
75
+ - The 21-point gap to v2 is almost entirely attributable to the improved pretraining
76
+
77
+ ---
78
+
79
+ ## 3. SecureModernBERT-NER (attack-vector) — F1: 84.8%
80
+
81
+ **HuggingFace:** [attack-vector/SecureModernBERT-NER](https://huggingface.co/attack-vector/SecureModernBERT-NER)
82
+ **No paper** — community model, documented only on HF model card.
83
+
84
+ ### Architecture
85
+ - **Base model:** answerdotai/ModernBERT-large (NOT domain-pretrained, just the general ModernBERT)
86
+ - Standard token classification head
87
+
88
+ ### Training Recipe
89
+ | Setting | Value |
90
+ |---------|-------|
91
+ | Optimizer | AdamW (torch) |
92
+ | LR | **5e-5** |
93
+ | Scheduler | **Cosine** |
94
+ | Batch Size | **128** |
95
+ | Epochs | **5** |
96
+ | Max Seq Len | **128** |
97
+ | Precision | fp16 |
98
+ | Grad Accum | 1 |
99
+ | Hardware | Single L40S |
100
+
101
+ ### Data Pipeline
102
+ - **502,726 labeled spans** — one of the largest CTI NER datasets
103
+ - Sources: real-world threat reports, vulnerability advisories, incident analyses
104
+ - Manually curated + automated heuristic conflict resolution
105
+ - Span distribution: ORG ~198k, PRODUCT ~79k, MALWARE ~67k, PLATFORM ~57k, THREAT-ACTOR ~49k, CVE ~41k
106
+
107
+ ### Label Space
108
+ - **22 entity types** — much broader than SecureBERT's 5
109
+ - **BIO scheme**
110
+ - Types: URL, ORG, SERVICE, SECTOR, FILEPATH, DOMAIN, PLATFORM, THREAT-ACTOR, PRODUCT, MALWARE, LOC, CVE, TOOL, IPV4, MITRE-TACTIC, MD5, CAMPAIGN, SHA1, SHA256, EMAIL, IPV6, REGISTRY-KEYS
111
+ - This is 45 classes (22 × 2 BIO tags + O)
112
+
113
+ ### Per-Class Results (accuracy, not F1)
114
+ | Entity | Accuracy |
115
+ |--------|----------|
116
+ | CVE | 0.9995 |
117
+ | SHA256 | 0.9874 |
118
+ | URL | 0.9801 |
119
+ | IPV4 | 0.9631 |
120
+ | (others not published individually) |
121
+
122
+ - Overall: P=0.8468, R=0.8484, **F1=0.8476**
123
+ - Macro accuracy across all 22 types: 0.8776
124
+
125
+ ### Special Techniques
126
+ - **None** — completely vanilla token classification
127
+ - No CRF, no class weighting, no augmentation documented
128
+
129
+ ### What They Say Matters
130
+ > "combining the state-of-the-art architecture of ModernBERT with one of the largest and most diverse CTI-labelled NER corpora ever built"
131
+
132
+ **Key insight for us:** This model wins through **data scale** (500k+ spans) not architecture tricks. ModernBERT-large is general-purpose, not domain-pretrained. The 84.8% F1 with 22 classes is impressive but note: structured indicators (CVE, SHA256, URL, IPV4) are easy — regex could catch those. The harder classes (ORG, THREAT-ACTOR, TOOL) likely drag the average down.
133
+
134
+ ---
135
+
136
+ ## 4. CyNER (Alam et al., 2022) — F1: 76.7%
137
+
138
+ **Paper:** "CyNER: A Python Library for Cybersecurity Named Entity Recognition" (arXiv:2204.05754)
139
+ **GitHub:** [aiforsec/CyNER](https://github.com/aiforsec/CyNER)
140
+
141
+ ### Architecture
142
+ - **Best model:** XLM-RoBERTa-large (~560M params)
143
+ - Standard token classification head
144
+ - Also integrates heuristic (regex) extractors and Flair/spaCy NER
145
+
146
+ ### Training Recipe
147
+ - Standard Hugging Face token classification fine-tuning
148
+ - Specific LR/epochs not documented in detail in the paper
149
+
150
+ ### Data Pipeline
151
+ - **~60 threat intelligence reports** from MITRE ATT&CK (Android malware focus)
152
+ - **106,000+ tokens**, **4,530 tagged entities**
153
+ - Annotated using BRAT tool by cybersecurity-trained annotators
154
+ - Very small dataset by modern standards
155
+
156
+ ### Label Space
157
+ - **5 entity types:** Malware, Indicator, System, Organization, Vulnerability
158
+ - **BIO scheme** (B-I-O tagging)
159
+ - 11 classes total (same as SecureBERT's eval set — this IS the canonical benchmark)
160
+
161
+ ### Per-Class Results (XLM-RoBERTa-large)
162
+ | Entity | Precision | Recall | F1 |
163
+ |--------|-----------|--------|-----|
164
+ | **Malware** | 79.82% | 75.11% | **77.39%** |
165
+ | **Indicator** | 78.34% | 86.62% | **82.27%** |
166
+ | **System** | 70.36% | 79.93% | **74.84%** |
167
+ | **Organization** | 70.64% | 60.16% | **64.98%** |
168
+ | **Vulnerability** | 100.0% | 80.0% | **88.89%** |
169
+ | **Overall** | — | — | **76.66%** |
170
+
171
+ ### Special Techniques
172
+ - Multi-approach fusion: transformer + regex heuristics + generic NER (Flair/spaCy)
173
+ - Configurable priority between approaches
174
+
175
+ ### Hardest Classes
176
+ 1. **Organization** (64.98% F1) — low recall (60.16%), ambiguous entities
177
+ 2. **System** (74.84% F1) — broad category, confusable with products
178
+ 3. **Malware** (77.39% F1) — novel names hard to catch
179
+
180
+ ### What They Say Matters
181
+ - Strong large-scale pretraining helps even without domain specialization (XLM-RoBERTa-large wasn't cybersec-specific but still won)
182
+ - Combining transformer + heuristic extractors for IOCs
183
+ - Small dataset is the main bottleneck
184
+
185
+ ---
186
+
187
+ ## 5. CyberNER (Harmonized STIX Corpus, 2025) — Reference Dataset
188
+
189
+ **Paper:** arXiv:2510.26499 — "CyberNER: A Harmonized STIX Corpus for Cybersecurity Named Entity Recognition"
190
+ **GitHub:** [yasirech-chammakhy/CyberNER](https://github.com/yasirech-chammakhy/CyberNER)
191
+
192
+ - Unifies CyNER + DNRTI + APTNER + Attacker datasets onto STIX 2.1 standard
193
+ - ~610k tokens, 23,477 sentences, 21 entity types
194
+ - Best model: RoBERTa → F1=0.736
195
+ - ~30% relative improvement over naive dataset concatenation
196
+ - Important as a future benchmark to consider
197
+
198
+ ---
199
+
200
+ ## Comparative Summary
201
+
202
+ | Model | F1 | Params | Pretraining | NER Data | Labels | Scheme | Special |
203
+ |-------|-----|--------|-------------|----------|--------|--------|---------|
204
+ | SecureBERT 2.0 | **94.5%** | ~150M (ModernBERT-base-sized: 22 layers, d_model=768) | 13.6B cybersec tokens | 3.4k samples | 5 types | BIO | None — pure domain pretraining |
205
+ | SecureModernBERT-NER | **84.8%** | ~395M (ModernBERT-large) | General | 502k spans | 22 types | BIO | None — data scale |
206
+ | CyNER | **76.7%** | ~560M (XLM-R-large) | General multilingual | 4.5k entities | 5 types | BIO | Regex + transformer fusion |
207
+ | SecureBERT v1 | **73.4%** | ~125M (RoBERTa-base) | Cybersec MLM | 3.4k samples | 5 types | BIO | None |
208
+ | **Arcspan (ours)** | **63%** | **50M active** (MoE) | General | ? | ? | **BIOES** | Viterbi decoding |
209
+
210
+ ---
211
+
212
+ ## Key Takeaways for Closing Our Gap
213
+
214
+ ### 1. Domain pretraining is the #1 lever (SecureBERT v1→v2: +21 points)
215
+ We can't do this — we have 50M active params and a fixed pretrained model. This means we need to compensate with other levers.
216
+
217
+ ### 2. Data scale is the #2 lever (SecureModernBERT: 500k spans → 84.8%)
218
+ Our most actionable lever. SecureBERT 2.0 achieved 94.5% on just 3.4k samples, but it had massive domain pretraining. Without that, SecureModernBERT needed 500k spans for 84.8%. We likely need **significantly more training data**.
219
+
220
+ ### 3. Nobody uses CRF or class weighting
221
+ All competitors use vanilla cross-entropy + BIO + standard token classification head. No CRF layers, no focal loss, no class weighting, no curriculum learning. This is notable — the field hasn't found these necessary.
222
+
223
+ ### 4. All competitors use BIO, we use BIOES
224
+ Our BIOES scheme with Viterbi decoding is architecturally different. BIOES is theoretically richer (encodes span boundaries better), but all competitors do fine with BIO. Our Viterbi constraint should be an advantage if the model learns the patterns correctly.
225
+
226
+ ### 5. Organization is universally the hardest class
227
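+ CyNER: Organization=65% F1 vs Vulnerability=89% F1 — a 24-point gap. CyNER is the only competitor in this comparison that publishes per-class F1, so whether the gap holds across models is unverified. Still, if our per-class breakdown shows Organization dragging us down, that's expected.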
228
+
229
+ ### 6. Small base models can do well with enough signal
230
+ SecureBERT v1 at ~125M params got 73.4%. Our model at 50M active params getting 63% is not catastrophically behind — we're in the right ballpark for a model of our size without domain pretraining. A ~10-point gap to match SecureBERT v1's 73.4% (or ~14 points to match CyNER's 76.7%) seems very achievable.
231
+
232
+ ### 7. Immediate action items
233
+ - **More training data** — consider CyberNER harmonized corpus (610k tokens), DNRTI, APTNER datasets
234
+ - **Longer training** — SecureBERT 2.0 used 20 epochs; are we training long enough?
235
+ - **LR tuning** — competitors use 1e-5 to 5e-5; our default is 2e-4 which may be too high
236
+ - **Sequence length** — SecureBERT 2.0 uses 1024; SecureModernBERT uses 128. What are we using?
237
+ - **Evaluate per-class** — identify our weakest categories to target
238
+
239
+ ---
240
+
241
+ ## Open Questions
242
+
243
+ 1. What is our current training data size? How does it compare to the 3.4k samples SecureBERT used?
244
+ 2. Are we using the CyNER dataset specifically? If so, same train/test split?
245
+ 3. What is our learning rate? The 2e-4 default may be too aggressive for NER fine-tuning.
246
+ 4. Per-class breakdown of our 63% F1 — which classes are we failing on?
247
+ 5. Should we try BIO instead of BIOES to match the evaluation setup of competitors?
248
+ 6. Can we incorporate the CyberNER harmonized corpus for more training data?
249
+
250
+ ---
251
+
252
+ ## Sources
253
+
254
+ - SecureBERT 2.0 paper: https://arxiv.org/abs/2510.00240
255
+ - SecureBERT 2.0 NER model: https://huggingface.co/cisco-ai/SecureBERT2.0-NER
256
+ - SecureBERT v1: https://huggingface.co/ehsanaghaei/SecureBERT
257
+ - SecureModernBERT-NER: https://huggingface.co/attack-vector/SecureModernBERT-NER
258
+ - CyNER paper: https://arxiv.org/abs/2204.05754
259
+ - CyNER repo: https://github.com/aiforsec/CyNER
260
+ - CyberNER corpus: https://arxiv.org/html/2510.26499v1
261
+ - SecureBERT 2.0 GitHub: https://github.com/cisco-ai-defense/securebert2
research/notes/progress/2026-04-24-49-moe-finetuning-research.md ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MoE Fine-Tuning Best Practices for Arcspan
2
+
3
+ **Date:** 2026-04-24
4
+ **Context:** We're fine-tuning OpenAI's Privacy Filter (1.5B params, 50M active, 128 experts top-4, 8 layers, d_model=640) for cybersecurity NER. This note collects best practices for fine-tuning sparse MoE models on downstream tasks.
5
+
6
+ ---
7
+
8
+ ## 1. Load Balancing & Auxiliary Loss
9
+
10
+ **The core problem:** Without intervention, routers develop a positive feedback loop — a few "favorite" experts receive most tokens, others starve of gradient updates (**expert collapse**).
11
+
12
+ ### Switch Transformer Auxiliary Loss (standard approach)
13
+
14
+ ```
15
+ L_balance = N × Σ(fᵢ × Pᵢ)
16
+ L_total = L_task + α × L_balance
17
+ ```
18
+
19
+ - `N` = number of experts; `fᵢ` = fraction of tokens routed to expert i (hard assignment)
20
+ - `Pᵢ` = mean router probability for expert i (soft probability)
21
+ - **α typically 0.01** — highly sensitive, task-dependent
22
+ - Too small → expert collapse; too large → hurts task performance
23
+
24
+ **Key finding:** Keep auxiliary loss enabled during fine-tuning, even when freezing expert parameters. It prevents overfitting and improves performance.
25
+
26
+ ### DeepSeek V3 Auxiliary-Loss-Free Alternative
27
+
28
+ - Apply expert-wise bias to routing scores before top-K selection
29
+ - Dynamically update bias based on recent load (outside backprop)
30
+ - Use biased scores for selection, unbiased scores for gate weights
31
+ - Gradients flow through unbiased weights → task loss uncontaminated
32
+
33
+ ### Router Z-Loss (ST-MoE)
34
+
35
+ ```
36
+ L_z = mean(logsumexp(router_logits)²)
37
+ ```
38
+
39
+ Penalizes large router logits, prevents numerical instability and training spikes. Recommended for stability, especially with FP16/mixed precision.
40
+
41
+ **Source:** [How MoE Models Actually Learn](https://medium.com/@chris.p.hughes10/how-moe-models-actually-learn-a-guide-to-auxiliary-losses-and-expert-balancing-293084e3f600), [HuggingFace MoE Blog](https://huggingface.co/blog/moe)
42
+
43
+ ---
44
+
45
+ ## 2. Expert Freezing Strategies
46
+
47
+ ### Expert-Specialized Fine-Tuning (ESFT)
48
+
49
+ Paper: ["Let the Expert Stick to His Last"](https://arxiv.org/html/2407.01906v1) — Expert-Specialized Fine-Tuning for Sparse Architectural LLMs.
50
+
51
+ - Different tasks activate different experts at different magnitudes
52
+ - Experts do specialize meaningfully across tasks
53
+ - This specialization transfers to unseen tasks (zero-shot generalization)
54
+ - Strategy: identify which experts are most relevant to your task, freeze the rest
55
+
56
+ ### Practical guidance for our model
57
+
58
+ With 128 experts and top-4 routing, most experts will be underutilized for cybersecurity NER. Options:
59
+
60
+ 1. **Full fine-tune all experts** — simplest, works well if data is sufficient
61
+ 2. **Freeze experts + fine-tune router + head** — prevents catastrophic forgetting, good for small datasets
62
+ 3. **Identify active experts on cyber NER data → fine-tune only those** — best of both worlds but requires analysis pass first
63
+
64
+ **Keep auxiliary loss on even when freezing experts** — it regularizes and improves downstream performance.
65
+
66
+ ---
67
+
68
+ ## 3. Learning Rate for MoE vs Dense
69
+
70
+ - **MoE models tolerate higher learning rates** than dense counterparts (e.g., ~3e-4 for MoE vs ~1e-5 typical for dense fine-tuning)
71
+ - The Privacy Filter's existing `opf train` uses **lr=2e-4 with AdamW** — this is in the right ballpark
72
+ - **Batch size is critical:** Smaller batch sizes (32-64) are important. Larger batches (256+) cause experts to collapse during fine-tuning
73
+ - MoE models benefit more from instruction tuning / multi-task fine-tuning than dense models
74
+
75
+ ### Recommendation for Arcspan
76
+
77
+ - Start with lr=2e-4 (existing default), sweep 1e-4 to 5e-4
78
+ - Use batch size 32-64, not larger
79
+ - Warmup + cosine decay schedule
80
+
81
+ ---
82
+
83
+ ## 4. Router Behavior During Fine-Tuning
84
+
85
+ - Router patterns **do change** during fine-tuning — experts re-specialize for the new task
86
+ - On small datasets, routing can become inconsistent/unstable
87
+ - **Option: freeze routing during fine-tuning** to prevent inconsistent routing on small datasets, while keeping load balancing loss as regularization
88
+ - With sufficient data, allowing router to adapt improves performance
89
+
90
+ ### Representation Collapse
91
+
92
+ Beyond routing collapse, hidden representations can cluster around expert centroids. Mitigations:
93
+ - Dimension reduction before routing
94
+ - L2 normalization of token representations and expert embeddings
95
+ - Learnable temperature parameter τ in gating function
96
+
97
+ **Source:** [On the Representation Collapse of Sparse MoE](https://arxiv.org/abs/2204.09179)
98
+
99
+ ---
100
+
101
+ ## 5. MoE-Specific Regularization
102
+
103
+ | Technique | Purpose | Recommendation |
104
+ |---|---|---|
105
+ | Auxiliary load balancing loss (α=0.01) | Prevent expert collapse | Always use during fine-tuning |
106
+ | Router z-loss | Prevent logit explosion / training instability | Use if training in mixed precision |
107
+ | Small batch size (32-64) | Prevent expert collapse | Critical for fine-tuning |
108
+ | Expert dropout | Regularization | Not widely adopted; try if overfitting |
109
+ | Router temperature | Control expert activation sharpness | Learnable τ can help |
110
+
111
+ ---
112
+
113
+ ## 6. Full Fine-Tune vs LoRA/Adapter for MoE
114
+
115
+ ### LoRA for MoE — Key Findings
116
+
117
+ Source: [LoRA Without Regret](https://thinkingmachines.ai/blog/lora/), [LoRA vs Full Fine-tuning: An Illusion of Equivalence](https://arxiv.org/abs/2410.21228)
118
+
119
+ **When LoRA matches full fine-tuning:**
120
+ - LoRA applied to **all layers** (MLP/MoE + attention), not just attention
121
+ - Dataset size roughly matches number of trainable LoRA parameters (1.1:1 to 1.5:1 ratio)
122
+ - Learning rate **10× higher** than full fine-tuning equivalent
123
+ - Separate LoRA adapters per expert, rank scaled by number of active experts
124
+
125
+ **LoRA advantages:**
126
+ - ~10× fewer accelerators needed
127
+ - ~2/3 FLOPs per pass
128
+ - Multiple adapters can serve simultaneously
129
+
130
+ **LoRA caveats:**
131
+ - Accesses different parts of solution space than full fine-tuning ("intruder dimensions" in SVD)
132
+ - Different generalization behavior out-of-distribution
133
+ - Less tolerant of large batch sizes
134
+
135
+ ### Recommendation for Arcspan (50M active params)
136
+
137
+ **Full fine-tuning is the right default.** Reasons:
138
+ - 50M active parameters is tiny — full fine-tuning is cheap
139
+ - The existing `opf train` already does full fine-tuning with AdamW
140
+ - No memory pressure justifying LoRA's complexity
141
+ - Token classification benefits from full adaptation of all expert representations
142
+
143
+ LoRA would only make sense if we need multi-domain adapters (e.g., cyber NER + medical NER switching at inference).
144
+
145
+ ---
146
+
147
+ ## 7. MoE for Token Classification / NER
148
+
149
+ No dedicated papers found on MoE architectures specifically for NER/token classification. This is notable — it means:
150
+
151
+ 1. **The Privacy Filter is unusually positioned** — a production MoE model doing token classification is rare
152
+ 2. Most MoE research focuses on generative LLMs (Mixtral, Switch-T, GLaM, DeepSeek)
153
+ 3. The closest analogue is BERT-style token classifiers, but those are dense
154
+
155
+ **This is actually good for us** — less competition, the architecture is proven (OpenAI shipped it in production for PII), and the fine-tuning path is already validated by the v2→v7 label space expansion (8→24 categories).
156
+
157
+ ---
158
+
159
+ ## Summary: Practical Checklist for Arcspan Fine-Tuning
160
+
161
+ 1. **Keep auxiliary load balancing loss** (α=0.01) during fine-tuning
162
+ 2. **Add router z-loss** if not already present (check Privacy Filter code)
163
+ 3. **Batch size 32-64** — do not go larger
164
+ 4. **Learning rate 2e-4** (existing default is good), sweep 1e-4 to 5e-4
165
+ 5. **Full fine-tuning** (not LoRA) — model is small enough
166
+ 6. **Monitor expert utilization** during training — log fraction of tokens per expert
167
+ 7. **Consider freezing router** if dataset is small (<5K examples), let it adapt if dataset is large
168
+ 8. **Warm-start output head** from existing PII weights where label semantics overlap (the codebase already supports this)
169
+
170
+ ## Open Questions
171
+
172
+ - Does the Privacy Filter codebase already include auxiliary load balancing loss? Need to check `vendor/privacy-filter/` code.
173
+ - What is the current routing distribution on PII data? Would be useful to visualize before fine-tuning.
174
+ - How many experts are actually active for typical text? If only 20-30 of 128 see meaningful traffic, freezing the rest is viable.
175
+ - Should we add expert utilization logging to the training loop?
176
+
177
+ ---
178
+
179
+ ## Key References
180
+
181
+ - [How MoE Models Actually Learn: Auxiliary Losses and Expert Balancing](https://medium.com/@chris.p.hughes10/how-moe-models-actually-learn-a-guide-to-auxiliary-losses-and-expert-balancing-293084e3f600)
182
+ - [HuggingFace MoE Explained](https://huggingface.co/blog/moe)
183
+ - [Auxiliary-Loss-Free Load Balancing (DeepSeek)](https://arxiv.org/abs/2408.15664)
184
+ - [On the Representation Collapse of Sparse MoE](https://arxiv.org/abs/2204.09179)
185
+ - [Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning](https://arxiv.org/html/2407.01906v1)
186
+ - [LoRA Without Regret](https://thinkingmachines.ai/blog/lora/)
187
+ - [LoRA vs Full Fine-tuning: An Illusion of Equivalence](https://arxiv.org/abs/2410.21228)
188
+ - [ICLR 2024: MoE Parameter-Efficient Fine-Tuning](https://proceedings.iclr.cc/paper_files/paper/2024/file/6d00071564ec447466fc4577743cf1b3-Paper-Conference.pdf)
189
+ - [MoE Load Balance Review](https://huggingface.co/blog/NormalUhr/moe-balance)
research/notes/progress/2026-04-24-50-r7-data-pipeline-plan.md ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # R7 Data Pipeline Plan — Dataset Acquisition & Mapping
2
+
3
+ **Date:** 2026-04-24
4
+ **Status:** Research complete, ready for implementation
5
+
6
+ ---
7
+
8
+ ## Current Baseline
9
+
10
+ | Metric | Value |
11
+ |---|---|
12
+ | Training sentences | 17,954 (5-class) |
13
+ | Total sentences (train+test+valid) | 25,468 |
14
+ | Total entities | 42,159 |
15
+ | Entity breakdown | Malware: 16,122 · Organization: 15,236 · System: 4,647 · Vulnerability: 3,287 · Indicator: 2,867 |
16
+ | Existing sources | cyberner_stix (7,922), cyner2 (4,952), dnrti (3,187), cyner (1,893) |
17
+
18
+ Target: Competitors with 500K+ spans achieve 84.8% F1. We need substantially more data.
19
+
20
+ ---
21
+
22
+ ## Dataset Availability Assessment
23
+
24
+ ### 1. ✅ APTNER (Standalone) — **HIGH PRIORITY, NEW DATA**
25
+
26
+ - **Repo:** https://github.com/wangxuren/APTNER
27
+ - **Status:** ✅ Downloaded and verified
28
+ - **Format:** CoNLL-style, **BIOES** (matches our architecture!)
29
+ - **Files:** APTNERtrain.txt (6,745 sentences), APTNERdev.txt, APTNERtest.txt
30
+ - **Overlap:** 3,628 sentences already in our data → **3,117 genuinely new sentences**
31
+ - **Total entities (train):** ~6,867 (B- tags)
32
+
33
+ **Entity types and mapping:**
34
+
35
+ | APTNER Label | Count | → Our Label | Notes |
36
+ |---|---|---|---|
37
+ | IDTY (Identity) | 1,392 | **Organization** | People/groups mentioned in CTI |
38
+ | ACT (Action) | 1,109 | **DROP** | Attack actions, not named entities |
39
+ | MAL (Malware) | 845 | **Malware** | Direct match |
40
+ | LOC (Location) | 822 | **DROP** | Geographic locations |
41
+ | APT (APT Group) | 770 | **Organization** | Threat actor groups |
42
+ | TIME | 753 | **DROP** | Temporal expressions |
43
+ | TOOL | 405 | **System** | Attacker tools/software |
44
+ | FILE | 358 | **Indicator** | File names/paths as IOCs |
45
+ | SECTEAM | 338 | **Organization** | Security research teams |
46
+ | VULNAME | 26 | **Vulnerability** | Vulnerability names |
47
+ | OS | 15 | **System** | Operating systems |
48
+ | PROT (Protocol) | 6 | **DROP** | Too few, not in our schema |
49
+ | VULID | 5 | **Vulnerability** | CVE IDs |
50
+ | URL | 3 | **Indicator** | URLs |
51
+ | IP | 3 | **Indicator** | IP addresses |
52
+ | ENCR | 2 | **DROP** | Encryption methods |
53
+ | EMAIL | 1 | **Indicator** | Email addresses |
54
+ | SHA2 | 1 | **Indicator** | Hash |
55
+ | MD5 | — | **Indicator** | Hash |
56
+
57
+ **Estimated new entities from 3,117 new sentences:** ~1,900 mappable entities (new sentences are ~46% of the 6,745 train sentences, i.e. ~3,170 of the 6,867 entities; the DROP labels ACT/LOC/TIME/PROT/ENCR make up ~39% of entities, leaving ~1,900)
58
+
59
+ **Quality notes:**
60
+ - BIOES format — no conversion needed for tag scheme
61
+ - Some noisy tags: `S-APT/APT34`, `S-MAL:hash_value` — need regex cleanup to extract base type
62
+ - A few multi-tag errors: `IDTY I-TOOL E-IDTY` — need to skip/fix these (~10 rows)
63
+
64
+ ### 2. ⚠️ CyberNER Combined STIX CSV — **LOW PRIORITY (mostly redundant)**
65
+
66
+ - **Repo:** https://github.com/yasirech-chammakhy/CyberNER
67
+ - **Status:** ✅ Downloaded
68
+ - **Format:** CSV with columns: Word, Tag, Sentence_ID, STIX_Tag, Source
69
+ - **Size:** 10,042 sentences, 57,300 entities (STIX tags)
70
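+ - **Overlap:** 8,518/10,042 sentences already in our data → **only 42 new sentences, 18 new entities** reported by the dedup pass (NOTE: 10,042 − 8,518 = 1,524, so these two figures do not reconcile — re-check the overlap count or the new-sentence count before relying on either)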
71
+ - **Verdict:** Almost entirely redundant with our existing data.
72
+
73
+ **Potential secondary value:** The STIX_Tag column provides an alternative STIX-aligned annotation layer on sentences we already have. The STIX schema maps cleanly to our 5 classes. This could be used for label validation/correction on existing data but is NOT a source of new training examples.
74
+
75
+ ### 3. ⚠️ DNRTI Dataset — **SKIP (already included)**
76
+
77
+ - **Repo:** https://github.com/LiuPeiP-CS/NER4CTI/tree/main/DNRTI_Dataset
78
+ - **Status:** ✅ Downloaded
79
+ - **Format:** CoNLL BIO (no S/E tags — would need BIOES conversion)
80
+ - **Size:** 5,251 train sentences
81
+ - **Overlap:** 5,145/5,251 already in our data → **only 106 new sentences**
82
+ - **Entity types:** HackOrg, Tool, Area, OffAct, Idus, Time, SamFile, Org, Exp, SecTeam, Way, Features, Purp
83
+ - **Verdict:** Skip — already incorporated.
84
+
85
+ ### 4. ⚠️ bnsapa/cybersecurity-ner (HuggingFace) — **SKIP (already included)**
86
+
87
+ - **HF:** `bnsapa/cybersecurity-ner`
88
+ - **Status:** ✅ Downloaded
89
+ - **Format:** HuggingFace Dataset, BIO tags, 5 classes (Malware, Indicator, Organization, System, Vulnerability) — exact match to ours
90
+ - **Size:** 2,664 train + 717 test + 785 valid = 4,166 total
91
+ - **Overlap:** This IS our CyNER source data. Completely redundant.
92
+ - **Verdict:** Skip.
93
+
94
+ ### 5. ✅ Stucco Auto-Labeled Corpus — **MEDIUM PRIORITY, LARGE but NVD-focused**
95
+
96
+ - **Repo:** https://github.com/stucco/auto-labeled-corpus
97
+ - **Status:** ✅ Downloaded
98
+ - **Format:** JSON with token-level BIO annotations, keyed by CVE ID
99
+ - **Subcorpora:**
100
+ - NVD: 15,192 docs, 685K tokens, 147K+ entities
101
+ - MS-Bulletin: 230 docs, 120K tokens
102
+ - Metasploit: 356 docs, 26K tokens
103
+ - **Total:** 15,778 docs
104
+
105
+ **Entity types and mapping:**
106
+
107
+ | Stucco Label | NVD Count | → Our Label | Notes |
108
+ |---|---|---|---|
109
+ | relevant_term | 71,565 | **DROP** | Generic terms like "remote", "allows" — not named entities |
110
+ | version | 29,022 | **DROP** | Version numbers alone |
111
+ | application | 18,774 | **System** | Software names — direct match |
112
+ | vendor | 10,460 | **Organization** | Software vendors |
113
+ | update | 4,206 | **DROP** | Patch/update identifiers |
114
+ | os | 3,487 | **System** | Operating systems |
115
+ | file | 3,197 | **Indicator** | File paths/names |
116
+ | cve id | 3,141 | **Vulnerability** | CVE identifiers — direct match |
117
+ | function | 1,445 | **DROP** | Function names (too generic) |
118
+ | parameter | 650 | **DROP** | API parameters |
119
+ | hardware | 585 | **System** | Hardware devices |
120
+ | edition | 578 | **DROP** | Software editions |
121
+ | programming language | 163 | **DROP** | Language names |
122
+ | method | 163 | **DROP** | Method names |
123
+ | language | 7 | **DROP** | Natural languages |
124
+
125
+ **Estimated usable entities (NVD only):**
126
+ - System (application + os + hardware): ~22,846
127
+ - Organization (vendor): ~10,460
128
+ - Indicator (file): ~3,197
129
+ - Vulnerability (cve id): ~3,141
130
+ - **Total mappable: ~39,644 entities across 15,192 docs**
131
+
132
+ **Quality concerns:**
133
+ - Auto-labeled (not human-annotated) — expect noise
134
+ - NVD descriptions are formulaic ("X before version Y allows...") — domain-narrow
135
+ - `relevant_term` is the most frequent label (~49% of NVD entities) and must be DROPped (it's not NER)
136
+ - BIO format only → needs BIOES conversion
137
+ - No Malware entities — this corpus is about vulnerabilities, not malware or threat actors
138
+
139
+ ### 6. ❌ MalwareDB (in NER4CTI repo) — **SKIP (wrong domain)**
140
+
141
+ - **Format:** CoNLL BIO
142
+ - **Labels:** Action, Entity, Modifier — too generic, not cybersecurity NER
143
+ - **Size:** ~8,525 train entities
144
+ - **Verdict:** Skip — labels don't map to our schema meaningfully.
145
+
146
+ ---
147
+
148
+ ## Priority-Ordered Incorporation Plan
149
+
150
+ ### Priority 1: APTNER (New sentences only) — Est. +3,000 entities
151
+
152
+ **Why first:** BIOES format (zero tag-scheme conversion), 3,117 genuinely new sentences, diverse CTI content, human-annotated.
153
+
154
+ **Conversion steps:**
155
+ 1. Parse CoNLL-style BIOES from `APTNERtrain.txt` / `APTNERdev.txt` / `APTNERtest.txt`
156
+ 2. Clean noisy tags: strip `/` suffixes (`S-APT/APT34` → `S-APT`), strip `:` hash suffixes (`S-MAL:abc123` → `S-MAL`), fix multi-tag rows
157
+ 3. Apply label mapping (see table above)
158
+ 4. Reconstruct text from tokens, compute character offsets for spans
159
+ 5. Convert to our JSONL format: `{"text": "...", "spans": {"Malware: name": [[start, end]]}, "info": {...}}`
160
+ 6. Deduplicate against existing data (fuzzy match on text[:80])
161
+ 7. Split: merge new train into train, keep dev/test separate for validation
162
+
163
+ ### Priority 2: Stucco NVD Corpus — Est. +39,000 entities
164
+
165
+ **Why second:** Massive volume but auto-labeled (noisy), NVD-domain-only (no Malware class), needs BIO→BIOES conversion.
166
+
167
+ **Conversion steps:**
168
+ 1. Parse JSON token arrays from `full_corpus.json` → NVD subcorpus
169
+ 2. Apply label mapping (drop `relevant_term`, `version`, `update`, `function`, `parameter`, `edition`, `programming language`, `method`, `language`)
170
+ 3. Convert BIO → BIOES: add S- for single-token entities, add E- for final token of multi-token entities
171
+ 4. Reconstruct text, compute character offsets
172
+ 5. Convert to our JSONL format
173
+ 6. Deduplicate against existing NVD entries (we have ~26 + 2,790 LLM-annotated NVD entries)
174
+ 7. Quality filter: sample 100 random entries, manually check annotation quality before bulk incorporation
175
+
176
+ **Risk:** Auto-labeled data may hurt more than help if noisy. Recommend incorporating in batches (e.g., 5K docs first) and evaluating impact on val F1 before adding more.
177
+
178
+ ### Priority 3: Defanged IOC Augmentation — Est. +2,000–5,000 synthetic entities
179
+
180
+ **Why:** Our training data lacks defanged notation (`hxxp://`, `hxxps://`, `[.]`, `[@]`, etc.) which is extremely common in real CTI reports. This is a gap that will hurt recall on real-world data.
181
+
182
+ **Approach:**
183
+ 1. **Identify clean indicators** in existing training data: URLs, domains, IPs, email addresses
184
+ 2. **Apply defanging transforms** probabilistically (50% rate):
185
+ - `http` → `hxxp`, `https` → `hxxps`
186
+ - `.` in domains/IPs → `[.]`
187
+ - `@` in emails → `[@]`
188
+ - Full URL defanging: `hxxps://example[.]com/path`
189
+ 3. **Generate new examples** by duplicating existing sentences containing indicators and replacing the indicator text with defanged version
190
+ 4. **Adjust character offsets** to account for changed string lengths
191
+ 5. **Add as augmented training examples** with `source: "defang_augmented"`
192
+
193
+ **Implementation detail:** A Python function that takes a clean IOC string and returns all plausible defanged variants. Apply to training data to create ~1 augmented copy per indicator-bearing sentence.
194
+
195
+ ---
196
+
197
+ ## Concrete Label Mapping Summary
198
+
199
+ | Source Dataset | → Malware | → Indicator | → System | → Organization | → Vulnerability | DROP |
200
+ |---|---|---|---|---|---|---|
201
+ | **APTNER** | MAL | FILE, URL, IP, EMAIL, SHA2, MD5 | TOOL, OS | IDTY, APT, SECTEAM | VULNAME, VULID | ACT, LOC, TIME, PROT, ENCR |
202
+ | **Stucco NVD** | — | file | application, os, hardware | vendor | cve id | relevant_term, version, update, function, parameter, edition, prog lang, method, language |
203
+ | **Defang Aug** | — | (augmented copies) | — | — | — | — |
204
+
205
+ ---
206
+
207
+ ## Estimated Total Data After R7
208
+
209
+ | Source | Sentences | Entities | Quality |
210
+ |---|---|---|---|
211
+ | Current data | 25,468 | 42,159 | Human-annotated, mixed |
212
+ | APTNER (new only) | ~3,100 | ~3,000 | Human-annotated, BIOES |
213
+ | Stucco NVD | ~15,000 | ~39,600 | Auto-labeled, BIO |
214
+ | Defang augmentation | ~1,500 | ~3,000 | Synthetic |
215
+ | **Total** | **~45,000** | **~87,800** | Mixed |
216
+
217
+ This roughly **doubles** our entity count. Still well short of 500K, but a meaningful step.
218
+
219
+ ---
220
+
221
+ ## Remaining Class Imbalance
222
+
223
+ After incorporation, estimated distribution:
224
+ - Malware: ~17,000 (existing 16K + 845 APTNER MAL)
225
+ - Organization: ~27,000 (existing 15K + 2,500 APTNER + 10,460 Stucco vendor)
226
+ - System: ~28,000 (existing 4.6K + 420 APTNER + 22,846 Stucco app/os/hw)
227
+ - Vulnerability: ~6,500 (existing 3.3K + 31 APTNER + 3,141 Stucco CVE)
228
+ - Indicator: ~6,400 (existing 2.9K + 366 APTNER + 3,197 Stucco file)
229
+
230
+ **System and Organization will be overrepresented** due to Stucco NVD. Consider downsampling Stucco or using class-weighted loss.
231
+
232
+ ---
233
+
234
+ ## Open Questions / Blockers
235
+
236
+ 1. **Stucco quality gate:** Need to manually inspect ~100 random Stucco NVD annotations before bulk incorporation. Auto-labeled data could introduce systematic errors.
237
+
238
+ 2. **APTNER tag noise:** ~10 rows have corrupted multi-tag annotations (e.g., `IDTY I-TOOL E-IDTY`). Strategy: skip these rows (trivial loss).
239
+
240
+ 3. **Class imbalance after Stucco:** Stucco adds heavily to System/Organization but zero to Malware. Options:
241
+ - Downsample Stucco to ~5K docs
242
+ - Use class-weighted loss during training
243
+ - Or both
244
+
245
+ 4. **NVD overlap with our LLM-annotated NVD:** We have 2,790 LLM-annotated NVD entries. Need to deduplicate by CVE ID against Stucco's NVD docs (keyed by CVE ID — straightforward).
246
+
247
+ 5. **Defanging scope:** Should we defang indicators in ALL datasets or only in our existing data? Stucco NVD descriptions probably don't contain defanged IOCs in practice.
248
+
249
+ 6. **Additional data sources not yet explored:**
250
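+ - SemEval-2018 Task 8 (MalwareTextDB) — focused on malware, could help balance; first confirm it is not the same Action/Entity/Modifier-labeled corpus as the MalwareDB set skipped in item 6 above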
251
+ - CASIE dataset — cybersecurity event extraction
252
+ - Generating synthetic CTI text with LLMs + our label space
253
+
254
+ ---
255
+
256
+ ## Implementation Order
257
+
258
+ 1. **APTNER converter** — ~2 hours of scripting. Parse BIOES, clean tags, map labels, deduplicate, emit JSONL.
259
+ 2. **Stucco NVD converter** — ~2 hours. Parse JSON, map labels, BIO→BIOES, emit JSONL. Quality-gate before merge.
260
+ 3. **Defanging augmenter** — ~1 hour. Regex-based IOC defanging + offset adjustment.
261
+ 4. **Aggregation script update** — merge new sources into `aggregated_5class_*.jsonl`, recompute stats.
262
+ 5. **Retrain + evaluate** — compare F1 before/after each data addition to measure impact.
research/notes/progress/2026-04-24-51-audit-ioc-coverage.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audit: IOC Label Coverage in Training Data
2
+
3
+ **Date:** 2026-04-24
4
+ **Files audited:**
5
+ - `enriched_5class_train.jsonl` (ORIGINAL — 31,510 examples)
6
+ - `enriched_5class_train_cleaned.jsonl` (CLEANED — 27,666 examples)
7
+
8
+ ## Method
9
+
10
+ Regex scan for 6 IOC types (IPv4, MD5, SHA1, SHA256, URL, Domain) across all examples, checking whether each match overlaps with any existing span annotation.
11
+
12
+ ## Results Summary
13
+
14
+ | Metric | ORIGINAL | CLEANED | Delta |
15
+ |--------|----------|---------|-------|
16
+ | Total examples | 31,510 | 27,666 | -3,844 |
17
+ | Examples with IOCs | 4,313 | 4,073 | -240 |
18
+ | Total IOC matches | 16,967 | 16,715 | -252 |
19
+ | Labeled IOCs | 15,629 (92.1%) | 16,638 (99.5%) | **+7.4pp** |
20
+ | Unlabeled IOCs | 1,338 | 77 | **-94.2%** |
21
+
22
+ ## Per-Type Breakdown (CLEANED)
23
+
24
+ | IOC Type | Found | Labeled | Unlabeled | Coverage |
25
+ |----------|-------|---------|-----------|----------|
26
+ | IPv4 | 4,829 | 4,828 | 1 | 100.0% |
27
+ | MD5 | 1,244 | 1,244 | 0 | 100.0% |
28
+ | SHA1 | 1,377 | 1,377 | 0 | 100.0% |
29
+ | SHA256 | 1,693 | 1,693 | 0 | 100.0% |
30
+ | URL | 1,926 | 1,926 | 0 | 100.0% |
31
+ | Domain | 5,646 | 5,570 | 76 | 98.7% |
32
+
33
+ ## Analysis of Remaining 77 "Unlabeled" IOCs
34
+
35
+ **All are false positives from the regex, not genuinely missed labels:**
36
+
37
+ 1. **1 IPv4 false positive:** `18.0.0.324` — an Adobe Flash version number (octet `.324` > 255, not a valid IP).
38
+
39
+ 2. **76 Domain false positives:** Almost entirely Android package names and reverse-domain app identifiers that happen to end in real TLDs:
40
+ - `au.com.nab.mobile`, `au.com.bankwest.mobile` (package names)
41
+ - `btc.org.freewallet.app`, `eth.org.freewallet.app` (crypto app packages)
42
+ - `jp.co.sagawa.SagawaOfficialApp` (Japanese carrier app)
43
+ - `com.hua.ru.quan` (package name with `.ru` TLD match)
44
+
45
+ These are **not IOC domains** — they are software identifiers that regex incorrectly flags. No labeling action needed.
46
+
47
+ ## Verdict
48
+
49
+ **The cleaning step was highly effective.** IOC coverage went from 92.1% → 99.5%, with all remaining "gaps" being regex false positives rather than genuine unlabeled indicators. The cleaned dataset has **zero genuine unlabeled IOCs** across all six categories.
50
+
51
+ ### Before vs After (ORIGINAL → CLEANED)
52
+
53
+ - IPv4: 94.9% → 100.0% (258 gaps closed)
54
+ - MD5: 92.7% → 100.0% (91 gaps closed)
55
+ - SHA1: 97.6% → 100.0% (33 gaps closed)
56
+ - SHA256: 77.2% → 100.0% (386 gaps closed — biggest improvement)
57
+ - URL: 99.3% → 100.0% (14 gaps closed)
58
+ - Domain: 90.2% → 98.7% (480 gaps closed, 76 remaining are false positives)
59
+
60
+ ## Open Questions
61
+
62
+ None — IOC coverage is complete. The dataset is ready for training from an IOC labeling perspective.
research/notes/progress/2026-04-24-53-audit-label-consistency.md ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Label Quality Audit: enriched_5class_train_cleaned.jsonl
2
+
3
+ **Date:** 2026-04-24
4
+ **Dataset:** `/data/processed/enriched_5class_train_cleaned.jsonl`
5
+ **Records:** 27,666 | **Span annotations:** 75,691
6
+
7
+ ## Label Distribution
8
+
9
+ | Label | Count |
10
+ |---|---|
11
+ | Indicator | 19,490 |
12
+ | Malware | 18,700 |
13
+ | Organization | 16,892 |
14
+ | System | 12,597 |
15
+ | Vulnerability | 8,012 |
16
+
17
+ ---
18
+
19
+ ## 1. Same Surface Form, Different Labels
20
+
21
+ **574 entities** appear with multiple labels across examples. Most are low-noise (dominant label >99%), but several warrant attention:
22
+
23
+ ### Top 20 Most Ambiguous
24
+
25
+ | Entity | Total | Distribution | Severity |
26
+ |---|---|---|---|
27
+ | `backdoor` | 2,493 | Malware:2484, Indicator:9 | LOW — 9 mislabels |
28
+ | `windows` | 975 | System:973, Malware:2 | LOW |
29
+ | `oracle` | 880 | Org:877, System:3 | LOW |
30
+ | `android` | 848 | System:839, Malware:8, Org:1 | LOW |
31
+ | `google` | 683 | Org:679, Malware:1, System:3 | LOW |
32
+ | `java` | 580 | System:579, Org:1 | LOW |
33
+ | `microsoft` | 555 | Org:554, Malware:1 | LOW |
34
+ | `linux` | 435 | System:434, Malware:1 | LOW |
35
+ | `malware` | 386 | Malware:371, Indicator:15 | LOW — generic word labeled as Indicator in 15 cases |
36
+ | `kaspersky` | 279 | Org:278, Malware:1 | LOW |
37
+ | `exploit` | 238 | Vuln:221, Malware:17 | **MEDIUM** — genuine ambiguity (exploit kit vs vulnerability) |
38
+ | `juniper` | 235 | System:5, Org:230 | LOW — System cases may be correct (Juniper devices vs company) |
39
+ | `github` | 196 | Org:109, System:86, Malware:1 | **MEDIUM** — Org vs System is genuinely ambiguous (company vs platform) |
40
+ | `python` | 189 | System:188, Malware:1 | LOW |
41
+ | `ios` | 164 | System:163, Malware:1 | LOW |
42
+ | `carbanak` | 154 | Malware:64, Indicator:3, Vuln:87 | **HIGH** — Carbanak is a Malware/APT group, NOT a Vulnerability. 87 mislabels. |
43
+ | `zero-day` | 143 | Vuln:142, Malware:1 | LOW |
44
+ | `trojan` | 139 | Malware:135, Vuln:3, Indicator:1 | LOW |
45
+ | `wordpress` | 137 | Org:2, System:135 | LOW |
46
+ | `facebook` | 127 | System:57, Org:70 | **MEDIUM** — genuine ambiguity (platform vs company) |
47
+
48
+ ### Action Items
49
+ - **Carbanak as Vulnerability (87 instances):** This is clearly wrong. Carbanak is malware/APT group. Likely a systematic source-data error. **Should be corrected to Malware.**
50
+ - **`exploit` as Malware (17):** Some may be valid (exploit kits), but most are likely label noise.
51
+ - **`github`/`facebook`:** Genuinely dual-natured. Could accept as-is or pick a canonical label.
52
+
53
+ ---
54
+
55
+ ## 2. Span Boundary Issues
56
+
57
+ | Check | Count | Verdict |
58
+ |---|---|---|
59
+ | Zero-length spans | 0 | PASS |
60
+ | Beyond text bounds | 0 | PASS |
61
+ | Mid-word boundaries | 33 | **NEEDS REVIEW** |
62
+ | Overlapping spans | 0 | PASS |
63
+ | Surface/offset mismatches | 0 | PASS |
64
+
65
+ ### Mid-Word Boundary Details (33 cases)
66
+
67
+ These are spans where the character offset splits a compound word. Examples:
68
+
69
+ - `Intel` extracted from "at **Intel**lig" (rec 19562) — annotation grabs "Intel" from "Intelligence"
70
+ - `Disco` extracted from "**Disco**ver" (recs 20859, 20873) — annotation grabs "Disco" from "Discover"
71
+ - `Access` from "**Access**Tok" (rec 21709) — from "AccessToken"
72
+ - `Exchange` from "MSExchangeIS" (rec 21711) — partial match inside compound
73
+ - `API` from "bleAPISer" (rec 21711) — "API" inside "DisableAPIService"
74
+ - `Native API` extending into "APIs" (rec 21767)
75
+
76
+ **Root cause:** These appear to be substring-match annotation artifacts where an entity name (Intel, API, Exchange, etc.) was matched inside a larger token. **33 cases out of 75,691 spans = 0.04%** — very low rate.
77
+
78
+ **Action:** Fix the 33 cases or filter them out. Most are from MITRE ATT&CK technique records (recs 21xxx).
79
+
80
+ ---
81
+
82
+ ## 3. Span Offset Consistency
83
+
84
+ **0 mismatches** between span key surface text and extracted text at offsets. The data is fully consistent — every `"Label: surface_text"` key matches `text[start:end]` exactly.
85
+
86
+ ---
87
+
88
+ ## 4. Manual Inspection (30 Random Samples)
89
+
90
+ ### Issues Found
91
+
92
+ 1. **Trailing punctuation in spans** (cyner2 source):
93
+ - rec 2081: `[Malware: NewPosThings.]` — trailing period included
94
+ - rec 2081: `[Malware: variant,]` — trailing comma included
95
+ - rec 2156: multiple alias lists labeled as single backdoor span (not actually wrong, but noisy)
96
+
97
+ 2. **Generic words labeled as entities** (cyberner_stix source):
98
+ - rec 9227: `[Vulnerability: high turnover of staff]` and `[Vulnerability: difficult to ensure all staff have cybersecurity training]` — these are NOT vulnerabilities, they're descriptions of organizational challenges. Systematic labeling error in cyberner_stix source.
99
+ - rec 11735: `[Indicator: Microsoft Word attachment]` — not an IOC
100
+ - rec 11595: `[Indicator: Filensfer]` — this is actually a malware family name, mislabeled as Indicator
101
+
102
+ 3. **MITRE/kernel records are noisy:**
103
+ - rec 21605: `[Malware: attrib]` — attrib is a legitimate Windows utility, not malware (though in MITRE context it's a "tool used by adversaries")
104
+ - rec 22902, 22916, 24772: Linux kernel vulnerability descriptions with very sparse annotations — mostly just `[System: Linux kernel]` and `[System: QEMU]`. These are fine but low entity density.
105
+ - rec 22916: `[Indicator: a533010b71dab205ad2f507188ce8c82203b0254]` — this is a git commit hash inside a kernel log, not a malware indicator. Likely auto-matched as a SHA hash.
106
+
107
+ 4. **Synthetic records (synth_v2) look clean** — well-formed, entities are correct and complete. Good quality.
108
+
109
+ ### Quality by Source
110
+
111
+ | Source | Quality | Notes |
112
+ |---|---|---|
113
+ | cyner_train | Good | Some trailing punctuation in spans |
114
+ | cyner2_train | Fair | Trailing punctuation, some alias-list noise |
115
+ | cyberner_stix | Poor | Worst quality — generic phrases as Vuln, mislabeled entities |
116
+ | MITRE/kernel | Fair | Git hashes as Indicators, tools-as-malware edge cases |
117
+ | synth_v2 | Excellent | Clean, well-formed, accurate |
118
+
119
+ ---
120
+
121
+ ## 5. Bare File Extensions as Indicator
122
+
123
+ **0 found.** The cleaning step successfully removed all bare file extension annotations (.dll, .exe, .pdf, .doc, .zip, .bat, .ps1, .vbs, .js).
124
+
125
+ ---
126
+
127
+ ## Summary & Recommendations
128
+
129
+ ### Data Quality Score: **B+ (Good, with fixable issues)**
130
+
131
+ | Issue | Count | Priority |
132
+ |---|---|---|
133
+ | Carbanak mislabeled as Vulnerability | 87 | **HIGH** — systematic, fixable |
134
+ | cyberner_stix generic phrases as Vulnerability | ~50+ est. | **HIGH** — source-level issue |
135
+ | Mid-word boundary splits | 33 | MEDIUM — 0.04%, easy to filter |
136
+ | Trailing punctuation in spans | ~20+ est. | MEDIUM — cyner2 source |
137
+ | Git commit hashes as Indicators | ~10 est. | LOW |
138
+ | `exploit` label ambiguity | 17 | LOW — may be acceptable |
139
+ | Bare file extensions | 0 | RESOLVED |
140
+
141
+ ### Recommended Next Steps
142
+
143
+ 1. **Fix Carbanak labels:** Bulk relabel `Vulnerability: carbanak` → `Malware: Carbanak` (87 instances)
144
+ 2. **Audit cyberner_stix Vulnerability labels:** Many are generic descriptions, not actual CVEs or vulnerability names. Consider filtering Vulnerability spans from this source that don't match CVE patterns or known vuln names.
145
+ 3. **Strip trailing punctuation** from span boundaries (regex: strip `[.,;:!?]` from span ends, adjust offsets)
146
+ 4. **Remove mid-word boundary spans** (33 cases) or expand them to word boundaries
147
+ 5. **Filter git commit hashes** from Indicator labels in kernel vulnerability records
research/notes/progress/2026-04-24-59-aptner-held-out-test.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # APTNER Independent Held-Out Test Set
2
+
3
+ **Date:** 2026-04-24
4
+ **File:** `data/processed/aptner_5class_test_clean.jsonl`
5
+
6
+ ## Summary
7
+
8
+ Built an independent test set from the APTNER test split (APT threat intelligence reports). **Zero leakage** found — no exact or prefix-80 matches against any of our 4 training files (28,675 unique training texts).
9
+
10
+ ## Stats
11
+
12
+ | Metric | Value |
13
+ |---|---|
14
+ | Examples | 172 |
15
+ | Total entity mentions | 340 |
16
+ | Avg entities/example | 2.0 |
17
+ | Malware | 102 |
18
+ | Organization | 91 |
19
+ | System | 87 |
20
+ | Indicator | 55 |
21
+ | Vulnerability | 5 |
22
+
23
+ ## Entity Memorization
24
+
25
+ 57.3% of test entity surface forms (90/157) also appear in training data. This is expected — common malware names and organizations recur across cybersecurity corpora. The model must still detect spans in novel contexts, so this doesn't invalidate the benchmark.
26
+
27
+ ## Independence vs. Enriched Test
28
+
29
+ Our 3 existing test sets (enriched, CyNER, SB2) share 96–98% text overlap because they derive from the same annotation pipelines. APTNER has **0% text overlap** with training data and originates from a completely separate annotation effort on APT reports. This makes it a genuinely independent benchmark.
30
+
31
+ ## Weakness
32
+
33
+ Vulnerability class has only 5 mentions — too few for reliable per-class metrics on that label. The other 4 classes are well-represented.
34
+
35
+ ## Source
36
+
37
+ Original data: `data/raw/APTNER/APTNERtest.txt` → converted via `scripts/convert_aptner.py`
research/notes/progress/2026-04-24-cyner-deep-dive-and-datasets.md ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CyNER Deep Dive & Cybersecurity NER Dataset Catalog
2
+
3
+ **Date:** 2026-04-24
4
+ **Purpose:** Deep analysis of CyNER benchmark competitor + exhaustive dataset discovery for Arcspan training
5
+
6
+ ---
7
+
8
+ ## 1. CyNER Analysis
9
+
10
+ ### 1.1 Architecture
11
+
12
+ - **Backbone:** XLM-RoBERTa-large (560M params) — best performer among tested models
13
+ - **Head:** Single linear layer on top of transformer hidden representations for token classification
14
+ - **Training framework:** T-NER library (Ushio & Camacho-Collados, 2021) + HuggingFace Transformers
15
+ - **Sequence length:** 128 tokens max
16
+ - **Optimizer:** AdamW, LR 5e-6 (large models) / 1e-5 (base models)
17
+ - **Batch size:** 32
18
+ - **Epochs:** 20
19
+ - **Hardware:** Single Nvidia Tesla V100
20
+
21
+ ### 1.2 Entity Label Schema (5 classes)
22
+
23
+ | Class | Definition | Examples |
24
+ |-------|-----------|----------|
25
+ | **Malware** | Viruses, trojans, ransomware, etc. | FluBot, DroidJack RAT |
26
+ | **Indicator** | IOCs: domain, URL, IP, filename, hash, email, port | SHA256 hashes, IPs |
27
+ | **System** | OS, software, hardware | Android, Windows, Adobe Flash |
28
+ | **Organization** | Companies, groups, institutions | Proofpoint, Kaspersky |
29
+ | **Vulnerability** | CVE IDs and exploit mentions | CVE-2012-2825, "master key vulnerability" |
30
+
31
+ **Format:** BIO tagging (B-Entity, I-Entity, O) in CoNLL 2003 format.
32
+
33
+ **Notable exclusions:** Location, Person — delegated to off-the-shelf models (Flair/SpaCy).
34
+
35
+ ### 1.3 Training Corpus
36
+
37
+ - ~60 threat intelligence reports from MITRE ATT&CK software category
38
+ - Reports from Kaspersky, Symantec, McAfee (2018-2021)
39
+ - Manually cleaned text (not raw HTML/PDF)
40
+ - Annotated with BRAT annotation tool by trained graduate students
41
+ - **Total:** 106,991 tokens, 4,530 tagged entity spans
42
+
43
+ | Split | Malware | Indicator | System | Organization | Vulnerability |
44
+ |-------|---------|-----------|--------|-------------|---------------|
45
+ | Train (40 docs) | 703 | 1,021 | 837 | 284 | 48 |
46
+ | Dev (10 docs) | 254 | 208 | 182 | 92 | 9 |
47
+ | Test (10 docs) | 242 | 261 | 248 | 131 | 10 |
48
+
49
+ **Key observation:** Vulnerability class is extremely small (48 train / 10 test). Organization also underrepresented.
50
+
51
+ ### 1.4 Benchmark Results (span micro-F1 via seqeval)
52
+
53
+ **Overall model comparison:**
54
+
55
+ | Model | Precision | Recall | F1 |
56
+ |-------|-----------|--------|-----|
57
+ | BERT-base-uncased | 69.67 | 69.88 | 69.77 |
58
+ | BERT-large-uncased | 72.69 | 73.45 | 73.07 |
59
+ | RoBERTa-base | 37.22 | 42.50 | 39.69 |
60
+ | RoBERTa-large | 34.76 | 44.18 | 38.91 |
61
+ | XLM-RoBERTa-base | 74.57 | 77.23 | 75.88 |
62
+ | **XLM-RoBERTa-large** | **75.30** | **78.07** | **76.66** |
63
+
64
+ **Per-class results (XLM-RoBERTa-large):**
65
+
66
+ | Class | Precision | Recall | F1 |
67
+ |-------|-----------|--------|-----|
68
+ | Malware | 79.82 | 75.11 | 77.39 |
69
+ | Indicator | 78.34 | 86.62 | 82.27 |
70
+ | System | 70.36 | 79.93 | 74.84 |
71
+ | Organization | 70.64 | 60.16 | **64.98** |
72
+ | Vulnerability | 100.0 | 80.0 | 88.89 |
73
+
74
+ ### 1.5 Hybrid Pipeline (Priority-Based Merging)
75
+
76
+ CyNER combines four extraction approaches with configurable priority:
77
+
78
+ 1. **Heuristic (H):** Regex patterns for IOCs — SHA256, SHA1, CVE, IPv4, email, filepath. Highest default priority.
79
+ 2. **Transformer (T):** XLM-RoBERTa-large fine-tuned on cybersecurity corpus.
80
+ 3. **Flair (F):** Pre-trained Flair NER model for generic entities (PER, LOC, ORG, MISC).
81
+ 4. **SpaCy (S):** SpaCy NER for additional generic entities.
82
+
83
+ Default priority: **HTFS** — Heuristic > Transformer > Flair > SpaCy. When entity spans overlap, the higher-priority source wins.
84
+
85
+ **Key design insight:** Regex gets priority for IOC indicators because they don't need context. Transformer handles semantic entities (malware names, systems). Flair/SpaCy catch generic entities (locations, people) that CyNER deliberately excludes from its own training.
86
+
87
+ ### 1.6 Evaluation Methodology
88
+
89
+ - **Metric:** Span-level micro-F1 via seqeval library
90
+ - **Test set:** 10 documents with 892 entity spans
91
+ - **No cross-validation reported** — single train/dev/test split
92
+ - **No evaluation of the hybrid pipeline end-to-end** — only transformer component benchmarked in paper
93
+
94
+ ### 1.7 Strengths
95
+
96
+ 1. Clean, modular design — easy to use as a library
97
+ 2. Hybrid approach is pragmatic: regex for structured IOCs, ML for semantic entities
98
+ 3. XLM-RoBERTa backbone handles multilingual threat reports
99
+ 4. Open source with pretrained models available
100
+
101
+ ### 1.8 Weaknesses & Limitations
102
+
103
+ 1. **Small training data:** Only 4,530 entity spans from 60 documents. Vulnerability class has only 48 training examples.
104
+ 2. **76.66% F1 is mediocre.** Organization class at 64.98% F1 is particularly weak.
105
+ 3. **Short context window:** 128 tokens max — misses long-range dependencies in threat reports.
106
+ 4. **No BIOES tagging** — uses BIO only, losing boundary precision.
107
+ 5. **RoBERTa models perform terribly** (39% F1) — suspicious, possibly a training bug.
108
+ 6. **No evaluation of hybrid pipeline** — paper only benchmarks transformer component.
109
+ 7. **Android malware bias** — corpus is focused on Android malware and may not generalize to network intrusions, APTs, etc.
110
+ 8. **Indicator class is a catch-all** — lumps URLs, IPs, hashes, domains, emails together. No sub-type distinction at the model level (regex handles sub-types separately).
111
+ 9. **No Viterbi/CRF decoding** — just a linear head, no structured prediction.
112
+
113
+ ### 1.9 Implications for Arcspan
114
+
115
+ Our advantages over CyNER:
116
+ - **Finer entity types** — we can distinguish IP, URL, hash, domain, CVE at model level
117
+ - **BIOES tagging** with Viterbi decoding — better boundary detection
118
+ - **MoE architecture** — 50M active params vs 560M, 10x more efficient
119
+ - **Bidirectional token classifier** — same paradigm but more modern architecture
120
+ - Their 76.66% F1 is very beatable with better data and architecture
121
+
122
+ ---
123
+
124
+ ## 2. PRISM Benchmark
125
+
126
+ ### 2.1 Overview
127
+
128
+ PRISM (Froudakis et al., 2025) is the first openly available, expert-validated benchmark for IoC extraction from threat reports.
129
+
130
+ - **Published:** ACSAC 2025, arXiv:2506.11325
131
+ - **Code/Data:** https://github.com/EvanFr/LANCE (GPL-3.0)
132
+
133
+ ### 2.2 Scope & Entity Types
134
+
135
+ PRISM focuses on **4 IoC types only:**
136
+
137
+ | Type | Definition |
138
+ |------|-----------|
139
+ | **IP** | IPv4/IPv6 addresses |
140
+ | **Domain** | Domain names |
141
+ | **URL** | Full URLs |
142
+ | **Hash** | File hashes (MD5, SHA1, SHA256) |
143
+
144
+ **Explicitly excluded:** Bitcoin addresses, email addresses, file paths, registry keys — "harder to verify at scale."
145
+
146
+ ### 2.3 Dataset Size
147
+
148
+ | Indicator Type | BAP | GAP | Total (non-unique instances) |
149
+ |---------------|-----|-----|-----|
150
+ | IP | 177 | 112 | 289 |
151
+ | Domain | 694 | 729 | 1,423 |
152
+ | URL | 426 | 445 | 871 |
153
+ | Hash | 962 | 758 | 1,720 |
154
+ | **Total** | **2,259** | **2,044** | **4,303** |
155
+
156
+ **Unique labeled indicators:** 1,774 total (1,401 IoC + 373 nonIoC)
157
+
158
+ - 50 real-world threat reports from ORKL (Apr 2023 - Nov 2024)
159
+ - Sources: Kaspersky, Palo Alto Networks, Microsoft (via AlienVault)
160
+
161
+ ### 2.4 Annotation Methodology
162
+
163
+ - **LANCE:** LLM-Assisted Notation and Classification Engine (ChatGPT-4o based)
164
+ - **Human-in-the-loop:** 5 junior analysts (PhD cybersecurity students) + 1 senior analyst (7yr experience)
165
+ - **Two-phase annotation:**
166
+ - BAP (Baseline Annotation Pass): Analysts label without LLM assistance
167
+ - GAP (Guided Annotation Pass): Analysts see LANCE labels + justifications
168
+ - **Dispute resolution:** Senior analyst resolves disagreements
169
+ - **Binary classification:** Each indicator labeled IoC or nonIoC
170
+
171
+ ### 2.5 Baseline Results
172
+
173
+ **LANCE (ChatGPT-4o) performance on PRISM:**
174
+ - Overall F1: **97.6%**
175
+ - IP: F1 ~1.00
176
+ - Hash: F1 ~1.00
177
+ - URL: F1 ~0.98-0.99
178
+ - Domain: F1 ~0.87 (lowest — context-dependent benign vs malicious)
179
+
180
+ **Other methods on PRISM:**
181
+ - RegEx + Whitelist: High recall, low precision (many FPs)
182
+ - AlienVault: Low recall (~25% for URLs), coverage gaps
183
+ - VirusTotal (threshold=1): F1 ~86%
184
+ - VirusTotal (threshold=5): Higher precision but low recall
185
+ - Naive ChatGPT prompting: F1 = 66.9%
186
+
187
+ **Cross-LLM generalization (on BAP subset):**
188
+ - GPT-4o: F1 = 0.98
189
+ - Gemma 3 27b: F1 = 0.92
190
+ - Gemini 2.0 Flash: F1 = 0.98
191
+ - Llama 3.3 70b: F1 = 0.83
192
+ - Nvidia Nemotron 70b: F1 = 0.85
193
+
194
+ ### 2.6 Key Differences from CyNER
195
+
196
+ | Aspect | CyNER | PRISM |
197
+ |--------|-------|-------|
198
+ | Task | NER (span extraction) | IoC classification (binary: IoC/nonIoC) |
199
+ | Entity types | 5 semantic types | 4 indicator types |
200
+ | Approach | Token classification | Document-level indicator labeling |
201
+ | Granularity | Token-level BIO tags | Indicator-level binary labels |
202
+
203
+ **Important:** PRISM is NOT a NER dataset in the traditional sense. It's an IoC classification benchmark. Indicators are first extracted by regex, then labeled as malicious/benign. This is complementary to, not a replacement for, NER training data.
204
+
205
+ ### 2.7 Data Format & Download
206
+
207
+ - **Repository:** https://github.com/EvanFr/LANCE
208
+ - **Dataset location:** `PRISM/GT.json` (ground truth), `PRISM/ReportsJSON/`, `PRISM/ReportsPDF/`
209
+ - **Format:** JSON (indicator-level labels, not token-level BIO/BIOES)
210
+ - **License:** GPL-3.0
211
+ - **Convertibility to BIOES:** Would require re-annotating at token level. The indicator boundaries are known from regex extraction, so conversion is feasible but requires mapping back to document text.
212
+
213
+ ---
214
+
215
+ ## 3. Dataset Catalog
216
+
217
+ ### 3.1 Summary Table
218
+
219
+ | # | Dataset | Size | Entity Types | Format | License | Download | Convertible to BIOES? |
220
+ |---|---------|------|-------------|--------|---------|----------|----------------------|
221
+ | 1 | **CyNER MITRE corpus** | 107K tokens, 4,530 spans, 60 docs | Malware, Indicator, System, Organization, Vulnerability | CoNLL BIO | Open (GitHub) | `github.com/aiforsec/CyNER/tree/main/dataset/mitre/` | Yes — BIO→BIOES is trivial |
222
+ | 2 | **PRISM** | 1,774 unique indicators, 50 reports | IP, Domain, URL, Hash (binary IoC/nonIoC) | JSON | GPL-3.0 | `github.com/EvanFr/LANCE/PRISM/` | Partial — need token-level realignment |
223
+ | 3 | **bnsapa/cybersecurity-ner** (HuggingFace) | 4,166 rows (~2,660 train / 785 val / 717 test) | ~10 types: File, Malware, Organization, Application, URL/Domain, Malware variant, Company, Product version | Token tags (numeric) | Apache 2.0 | `load_dataset("bnsapa/cybersecurity-ner")` | Yes — map numeric tags to BIOES |
224
+ | 4 | **Universal-NER/Pile-NER-type** | 45,889 passages, 13K+ entity types | Superset includes cybersecurity-relevant types (needs filtering) | Conversational JSON | CC-BY-NC-4.0 | `load_dataset("Universal-NER/Pile-NER-type")` | Needs extraction + filtering |
225
+ | 5 | **MITRE ATT&CK STIX** | 700+ techniques, 130+ groups, 600+ software | Threat actors, malware, techniques, tools, campaigns | STIX 2.1 JSON | Open | `github.com/mitre/cti` | Synthetic NER generation possible |
226
+ | 6 | **MITRE CVE/NVD** | 200K+ CVEs | Vulnerability IDs, affected software, versions | JSON | Public domain | `nvd.nist.gov/developers` | Gazetteer, not NER training |
227
+
228
+ ### 3.2 Detailed Notes Per Dataset
229
+
230
+ #### 3.2.1 CyNER MITRE Corpus
231
+ - **Location:** `github.com/aiforsec/CyNER/tree/main/dataset/mitre/` — `train.txt`, `valid.txt`, `test.txt`
232
+ - **Format:** CoNLL 2003 (token\tBIO-tag per line, blank lines between sentences)
233
+ - **Conversion:** BIO→BIOES is deterministic: a B tag on a single-token entity becomes S, and the final I tag of a multi-token entity becomes E
234
+ - **Quality:** Manually annotated by trained graduate students, BRAT tool
235
+ - **Limitation:** Small (4,530 spans), biased toward Android malware
236
+
237
+ #### 3.2.2 PRISM
238
+ - **Best use:** Evaluation benchmark for IoC extraction, NOT primary training data
239
+ - **Conversion path:** Extract indicator spans from GT.json, map back to report text, generate token-level BIOES annotations. Feasible but engineering effort required.
240
+
241
+ #### 3.2.3 bnsapa/cybersecurity-ner (HuggingFace)
242
+ - **Appears to be derived from CyNER** — similar content (DroidJack RAT, FakeSpy examples), similar size
243
+ - **388 kB total** — very small
244
+ - **Has numeric labels** — need to verify exact mapping to entity type names
245
+ - Pre-trained model available: `yasserrmd/bert_cyber_ner`
246
+
247
+ #### 3.2.4 Pile-NER-type
248
+ - **Massive:** 45.9K passages with 13K+ entity types generated by GPT-3.5-turbo
249
+ - **Cybersecurity filtering strategy:**
250
+ - Filter conversation content for cybersecurity keywords (malware, CVE, vulnerability, exploit, threat, etc.)
251
+ - Filter by entity type names containing: IP_address, hash, malware, vulnerability, CVE, domain, URL, threat_actor, etc.
252
+ - Expected yield: ~500-2000 relevant passages (rough estimate)
253
+ - **Quality concern:** GPT-3.5-turbo generated, not human-validated
254
+ - **License:** CC-BY-NC-4.0 (non-commercial only)
255
+
256
+ #### 3.2.5 MITRE ATT&CK STIX
257
+ - **Download:** `git clone https://github.com/mitre/cti.git`
258
+ - **Contains:** Enterprise ATT&CK, Mobile ATT&CK, ICS ATT&CK
259
+ - **NER conversion:** Can extract (entity_name, entity_type) pairs from structured data, then use them as:
260
+ - Gazetteer for regex-based tagging of raw text
261
+ - Seed entities for distant supervision / weak labeling
262
+ - Entity dictionaries for data augmentation
263
+ - **Entity types extractable:** Threat groups (APT28, Lazarus), malware (Emotet, Cobalt Strike), tools (Mimikatz, PsExec), techniques (T1059, Credential Dumping)
264
+
265
+ ### 3.3 Datasets NOT Found / Not Publicly Available
266
+
267
+ | Dataset | Status |
268
+ |---------|--------|
269
+ | **DNRTI** | Referenced in some papers but no public download found |
270
+ | **SecureNLP** | No specific dataset found under this name |
271
+ | **APTNER** | No HuggingFace dataset found |
272
+ | **iACE dataset** | Paper mentions 1,500 IoC + 3,000 nonIoC but code/data not available |
273
+ | **Long et al. dataset** | 69,032 samples mentioned in PRISM survey but not available |
274
+
275
+ ---
276
+
277
+ ## 4. Proposed Label Taxonomy
278
+
279
+ ### 4.1 Recommended Unified Label Set
280
+
281
+ Based on cross-dataset analysis, we propose a **two-tier taxonomy** for Arcspan:
282
+
283
+ #### Tier 1: Core Labels (model-predicted, BIOES tagged)
284
+
285
+ | Label | Definition | Rationale |
286
+ |-------|-----------|-----------|
287
+ | **MALWARE** | Malware family names, variants | Appears in CyNER, bnsapa, Pile-NER. High value. Clear boundaries. |
288
+ | **THREAT_ACTOR** | APT groups, threat actor names | Extractable from ATT&CK. High value for attribution. |
289
+ | **TOOL** | Legitimate tools used in attacks | Mimikatz, Cobalt Strike, PsExec. Distinct from malware. |
290
+ | **VULNERABILITY** | CVE IDs and named vulnerabilities | Clear boundaries (CVE-XXXX-XXXXX). High value. |
291
+ | **SYSTEM** | OS, software, hardware platforms | CyNER class. Useful for affected-product extraction. |
292
+ | **ORGANIZATION** | Companies, institutions | CyNER class. Context for attribution and targeting. |
293
+
294
+ #### Tier 2: IOC Indicators (regex-extracted, optionally model-confirmed)
295
+
296
+ | Label | Definition | Rationale |
297
+ |-------|-----------|-----------|
298
+ | **IP_ADDRESS** | IPv4/IPv6 addresses | Regex handles well. Model confirms context (malicious vs benign). |
299
+ | **DOMAIN** | Domain names | Regex extracts, model disambiguates. |
300
+ | **URL** | Full URLs | Regex extracts, model disambiguates. |
301
+ | **HASH** | MD5, SHA1, SHA256 file hashes | Regex handles perfectly. |
302
+ | **EMAIL** | Email addresses | Regex handles well. |
303
+ | **CVE_ID** | CVE identifiers specifically | Regex: `CVE-\d{4}-\d{4,}`. Overlap with VULNERABILITY is intentional. |
304
+ | **FILEPATH** | File paths (Windows/Unix) | Regex-extractable. |
305
+
306
+ #### Excluded from model labeling (handled by regex only)
307
+
308
+ | Type | Reason for exclusion |
309
+ |------|---------------------|
310
+ | Registry keys | Too long, complex boundaries |
311
+ | Bitcoin addresses | Very rare, regex-perfect |
312
+ | Port numbers | Usually just integers, no NER value |
313
+ | MITRE technique IDs | Fixed pattern `T\d{4}(\.\d{3})?` (sub-technique suffix optional), pure regex |
314
+
315
+ ### 4.2 Rationale
316
+
317
+ 1. **Tier 1 entities** require semantic understanding — a malware name looks like any other proper noun. These benefit from learned representations and BIOES tagging.
318
+ 2. **Tier 2 entities** have structural patterns that regex handles well. The model's role is **confirmation/disambiguation** (is this IP malicious or benign in context?), similar to PRISM's approach.
319
+ 3. **Cross-dataset coverage:**
320
+ - CyNER maps to: MALWARE, SYSTEM, ORGANIZATION, VULNERABILITY + Tier 2 (Indicator → split into specific types)
321
+ - PRISM maps to: IP_ADDRESS, DOMAIN, URL, HASH (binary IoC/nonIoC overlay)
322
+ - bnsapa maps to: MALWARE, ORGANIZATION, APPLICATION (≈SYSTEM)
323
+ - ATT&CK provides: THREAT_ACTOR, MALWARE, TOOL gazetteers
324
+ 4. **6 Tier 1 + 7 Tier 2 = 13 total types.** For initial training, we could start with just Tier 1 (6 types) and use regex for Tier 2.
325
+
326
+ ---
327
+
328
+ ## 5. Data Acquisition Plan
329
+
330
+ ### 5.1 Immediate Downloads
331
+
332
+ | Priority | Dataset | Command | Estimated Effort |
333
+ |----------|---------|---------|-----------------|
334
+ | P0 | CyNER MITRE corpus | `git clone https://github.com/aiforsec/CyNER.git` → `dataset/mitre/` | 30 min to convert BIO→BIOES JSONL |
335
+ | P0 | bnsapa/cybersecurity-ner | `load_dataset("bnsapa/cybersecurity-ner")` | 1 hr to map labels and convert |
336
+ | P1 | PRISM | `git clone https://github.com/EvanFr/LANCE.git` → `PRISM/` | 4 hr to re-align to token-level BIOES |
337
+ | P1 | MITRE ATT&CK | `git clone https://github.com/mitre/cti.git` | 2 hr to extract entity gazetteers |
338
+ | P2 | Pile-NER cyber subset | `load_dataset("Universal-NER/Pile-NER-type")` + filter | 4 hr to filter, extract, convert |
339
+
340
+ ### 5.2 Conversion Pipeline
341
+
342
+ 1. **CyNER BIO → BIOES JSONL:**
343
+ - Read CoNLL format
344
+ - Convert BIO to BIOES (B on a single-token entity → S; final I of a multi-token entity → E)
345
+ - Split "Indicator" class into sub-types using regex on the entity text
346
+ - Output: `{"tokens": [...], "labels": [...]}` JSONL
347
+
348
+ 2. **bnsapa → BIOES JSONL:**
349
+ - Map numeric tags to named types
350
+ - Already tokenized, just need format conversion
351
+
352
+ 3. **PRISM → BIOES JSONL:**
353
+ - Parse GT.json for indicator spans + labels
354
+ - Load report text from ReportsJSON/
355
+ - Tokenize reports, align indicator spans to token boundaries
356
+ - Generate BIOES tags (only the Tier 2 types PRISM covers: IP_ADDRESS, DOMAIN, URL, HASH)
357
+ - This gives us **context-aware IoC/nonIoC labels** — unique training signal
358
+
359
+ 4. **ATT&CK → Gazetteers + Distant Supervision:**
360
+ - Extract all (name, type) pairs from STIX JSON
361
+ - Use as entity dictionaries for weak labeling of unlabeled text
362
+ - Can generate synthetic NER training data via string matching on large CTI corpus
363
+
364
+ 5. **Pile-NER → BIOES JSONL:**
365
+ - Filter passages containing cybersecurity content
366
+ - Extract entity spans from conversational format
367
+ - Map entity types to our taxonomy
368
+ - Convert to BIOES
369
+
370
+ ### 5.3 Estimated Total Training Data After Conversion
371
+
372
+ | Source | Estimated Spans | Quality |
373
+ |--------|----------------|---------|
374
+ | CyNER | 4,530 | High (human annotated) |
375
+ | bnsapa | ~4,000 | Medium (possibly CyNER derivative) |
376
+ | PRISM | ~1,774 | High (expert validated, but IoC types only) |
377
+ | Pile-NER filtered | ~2,000-5,000 | Low-medium (GPT-3.5 generated) |
378
+ | ATT&CK distant supervision | ~10,000-50,000 | Low (noisy distant supervision) |
379
+ | **Total** | **~20,000-65,000** | Mixed |
380
+
381
+ **Key gap:** We have good coverage for Tier 2 IOC types but limited training data for Tier 1 semantic types (THREAT_ACTOR, TOOL). MITRE ATT&CK distant supervision is crucial to fill this gap.
382
+
383
+ ---
384
+
385
+ ## 6. Sources
386
+
387
+ ### Papers
388
+ - Alam et al. (2022). "CyNER: A Python Library for Cybersecurity Named Entity Recognition." arXiv:2204.05754. https://arxiv.org/abs/2204.05754
389
+ - Froudakis et al. (2025). "Revealing the True Indicators: Understanding and Improving IoC Extraction From Threat Reports." arXiv:2506.11325, ACSAC 2025. https://arxiv.org/abs/2506.11325
390
+
391
+ ### Repositories & Datasets
392
+ - CyNER GitHub: https://github.com/aiforsec/CyNER
393
+ - CyNER dataset: https://github.com/aiforsec/CyNER/tree/main/dataset/mitre
394
+ - LANCE/PRISM GitHub: https://github.com/EvanFr/LANCE
395
+ - PRISM dataset: https://github.com/EvanFr/LANCE/tree/main/PRISM
396
+ - bnsapa/cybersecurity-ner: https://huggingface.co/datasets/bnsapa/cybersecurity-ner
397
+ - Universal-NER/Pile-NER-type: https://huggingface.co/datasets/Universal-NER/Pile-NER-type
398
+ - MITRE ATT&CK STIX: https://github.com/mitre/cti
399
+ - MITRE CVE: https://cve.mitre.org/
400
+ - NVD: https://nvd.nist.gov/
401
+
402
+ ### Other References
403
+ - MALOnt2.0: Christian et al. (2021), ACM CCS
404
+ - MALOnt: Rastogi et al. (2020), SIGKDD Workshop
405
+ - T-NER: Ushio & Camacho-Collados (2021), EACL
406
+ - seqeval: https://pypi.org/project/seqeval/
407
+ - BRAT annotation tool: Stenetorp et al. (2012)
408
+ - ORKL threat report repository: orkl.eu
research/notes/progress/2026-04-24-landscape-research-opus.md ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Landscape Analysis: Lightweight Span/Entity Detection for Arcspan
2
+
3
+ **Date:** 2026-04-24
4
+ **Researcher:** Claude (automated research session)
5
+
6
+ ---
7
+
8
+ ## 1. Executive Summary
9
+
10
+ **The biggest opportunity is cybersecurity IOC extraction from threat intelligence reports.** It sits at a unique intersection: (1) data is highly sensitive (can't send to cloud APIs), (2) current tools are either regex-only (miss context-dependent IOCs) or require BERT-large/GPT (too heavy for inline/edge use), (3) entity types are well-defined and short-span (IPs, hashes, CVEs, domains, malware names), (4) labeled datasets exist (CyNER, PRISM benchmark, Pile-NER subsets), and (5) the 257-token attention window is more than sufficient for the surrounding context needed. The gap between "regex that catches 70% of IOCs" and "BERT-large that catches 95% but requires a GPU" is exactly where a 50M-active-param model shines.
11
+
12
+ Close runners-up: **clinical/medical de-identification** (privacy-critical, well-funded, abundant datasets) and **developer tooling** (secret scanning, TODO detection — huge TAM, runs in CI/CD pipelines on CPU).
13
+
14
+ **Honest caveat:** The landscape is more crowded than expected at the ~100M param level (GLiNER, bert-base-NER, SpaCy transformers). Arcspan's edge is not raw accuracy — it's the combination of (a) extreme efficiency (50M active params via MoE vs 110M dense BERT), (b) trivial label-space reconfiguration via JSON, and (c) proven data efficiency (F1 0.962 with 10% of data). The play is "faster to customize, cheaper to run" rather than "better F1 on standard benchmarks."
15
+
16
+ ---
17
+
18
+ ## 2. Existing Tools Landscape
19
+
20
+ ### 2.1 Comparison Table
21
+
22
+ | Tool/Model | Params | Size on Disk | Domains | Local/Cloud | Fine-tunable | Key Gaps |
23
+ |---|---|---|---|---|---|---|
24
+ | **SpaCy en_core_web_sm** | ~4M (CNN) | ~40 MB | General (18 OntoNotes types) | Local | Yes (spacy train) | Low F1 (~85-86), no domain-specific types, CNN-based |
25
+ | **SpaCy en_core_web_md** | ~4M + vectors | ~140 MB | General (18 types) | Local | Yes | Marginal improvement over sm, bulky vectors |
26
+ | **SpaCy en_core_web_lg** | ~4M + vectors | ~600-800 MB | General (18 types) | Local | Yes | Large disk footprint for minimal NER gain (~86-87 F1) |
27
+ | **SpaCy en_core_web_trf** | ~110M (RoBERTa) | ~400 MB | General (18 types) | Local | Yes (slow) | Slow inference, GPU-preferred, 110M dense params |
28
+ | **dslim/bert-base-NER** | 110M | ~440 MB | General (4 CoNLL types: PER/ORG/LOC/MISC) | Local | Yes | Only 4 entity types, 110M dense, F1=91.3 on CoNLL |
29
+ | **GLiNER-S** | 50M (DeBERTa-v3-small) | ~200 MB | Zero-shot any type | Local | Limited | Zero-shot avg F1=52.7, needs many entity types at once |
30
+ | **GLiNER-M** | 90M (DeBERTa-v3-base) | ~360 MB | Zero-shot any type | Local | Limited | Avg F1=55.4 zero-shot, no BIOES, span-matching only |
31
+ | **GLiNER-L** | 300M (DeBERTa-v3-large) | ~1.2 GB | Zero-shot any type | Local | Limited | 300M params, F1=60.9 zero-shot avg, too large for edge |
32
+ | **GLiNER-BioMed** | Multiple scales | Varies | Biomedical zero-shot NER | Local | Yes (distilled) | Domain-specific, 5.96% F1 improvement over baselines |
33
+ | **StanfordAIMI deidentifier** | ~110M (PubMedBERT) | ~440 MB | Medical PHI/PII | Local | Yes | Medical-only, 110M dense, F1=97.9-99.6 on radiology |
34
+ | **Microsoft Presidio** | Varies (SpaCy backend) | ~100 MB+ | PII (credit cards, SSN, names, etc.) | Local | Extensible | Relies on SpaCy + regex + checksums; limited ML depth |
35
+ | **Google Cloud DLP** | Unknown (proprietary) | N/A | PII (50+ types) | Cloud only | No | Cloud-only, can't run locally, expensive at scale |
36
+ | **AWS Comprehend** | Unknown (proprietary) | N/A | General NER + PII | Cloud only | Custom models possible | Cloud-only, latency, cost per API call |
37
+ | **Flair NER (english-fast)** | ~50M (Flair embeddings) | ~250 MB | General (4 CoNLL types) | Local | Yes | Sequential LSTM, slower than transformers, limited types |
38
+ | **CyNER** | ~560M (XLM-RoBERTa-large) | ~2 GB | Cybersecurity IOCs | Local | Yes | Very large model, combines regex+transformer+SpaCy |
39
+ | **d4data/biomedical-ner-all** | 66M | ~265 MB | Biomedical (multiple entity types) | Local | Yes | Small community, limited benchmarking |
40
+ | **Arcspan (this project)** | 50M active (1.5B total MoE) | TBD | Any (JSON config) | Local/Edge/Browser | Yes (very data-efficient) | Unproven on non-PII tasks, 257-token window |
41
+
42
+ ### 2.2 Key Observations
43
+
44
+ **GLiNER is the closest competitor** to what Arcspan could become. Key differences:
45
+ - GLiNER uses span-matching in latent space (dot product of entity-type embeddings and span embeddings). Arcspan uses BIOES sequence labeling with Viterbi decoding.
46
+ - GLiNER-S at 50M params achieves avg F1 of 52.7 in zero-shot. This is the zero-shot ceiling for this parameter budget.
47
+ - GLiNER's strength is arbitrary entity types without retraining. Arcspan's strength would be higher F1 after minimal fine-tuning (F1 0.962 with 10% data).
48
+ - **Critical differentiator:** GLiNER requires entity type descriptions at inference time (added to context window). Arcspan's labels are baked in at training — no context-window tax.
49
+
50
+ **SpaCy dominates the "just works" space** but is stuck on 18 OntoNotes entity types for NER. Custom NER requires full retraining with `spacy train`. The CNN-based models (sm/md/lg) are fast but weak on edge cases; the transformer model (trf) is accurate but as heavy as BERT.
51
+
52
+ **The 110M-param dense BERT remains the workhorse.** dslim/bert-base-NER has 1.86M downloads. Most domain-specific NER models on HuggingFace are fine-tuned BERT-base variants (~110M params). This is the bar Arcspan must beat on efficiency while matching on accuracy.
53
+
54
+ ---
55
+
56
+ ## 3. Vertical-by-Vertical Analysis
57
+
58
+ ### 3.1 Cybersecurity — IOC Extraction from Threat Intelligence
59
+
60
+ **Current State:**
61
+ - **CyNER** (Alam et al., 2022): Python library combining XLM-RoBERTa-large (~560M params) + regex + SpaCy. Entities: malware names, threat actors, attack types, IOCs.
62
+ - **PRISM benchmark** (Froudakis et al., 2025): 1,791 labeled IOCs from 50 real-world threat reports. First high-quality ground truth for IOC extraction.
63
+ - Most SOC teams use regex-based YARA rules + manual extraction. Heavy tools like BERT are used in research papers but not deployed inline.
64
+ - OTuHunt framework proposes NLP-based IOC extraction for OT/ICS environments using MITRE ATT&CK mapping.
65
+
66
+ **Gaps:**
67
+ - CyNER is 560M+ params — can't run in a SIEM plugin or browser extension
68
+ - Regex catches structured IOCs (IPs, hashes) well but misses: malware family names in context, CVE references without standard format, threat actor aliases, attack technique descriptions
69
+ - No lightweight (<100M) production-ready cybersecurity NER model exists
70
+ - Privacy concern: threat reports often contain internal network details that can't be sent to cloud APIs
71
+
72
+ **Opportunity Assessment: HIGH**
73
+ - Entity types are well-bounded and short-span (perfect for 257-token window)
74
+ - PRISM + CyNER training data exists
75
+ - Clear "this is done with 560M params but could be done with 50M active" opportunity
76
+ - Data efficiency matters: new threat types emerge constantly, need fast retraining
77
+ - Commercial market: every SIEM vendor needs this
78
+
79
+ **Available Datasets:**
80
+ - Pile-NER (Universal-NER): 45,889 passages, 240k entity spans, 13k entity types — filter for cybersecurity subset
81
+ - PRISM: 1,791 labeled IOCs from 50 threat reports
82
+ - CyNER training corpus (publicly available)
83
+ - MITRE ATT&CK technique descriptions (structured, convertible)
84
+
85
+ ### 3.2 Medical/Clinical — De-identification and Entity Extraction
86
+
87
+ **Current State:**
88
+ - **StanfordAIMI de-identifier**: PubMedBERT-based (~110M), F1 97.9-99.6 on radiology reports. Gold standard for medical PII.
89
+ - **i2b2 2006/2014 shared tasks**: Standard benchmarks for clinical NER and de-identification.
90
+ - **GLiNER-BioMed** (Yazdani et al., 2025): Domain-adapted GLiNER for biomedical NER, 5.96% F1 improvement over baselines in zero-shot.
91
+ - **d4data/biomedical-ner-all**: 66M param model, 91.5k downloads.
92
+ - Discontinuous NER is a known challenge in medical text (Chen & Lin, 2024 — ensemble of 5 SOTA + ChatGPT on CADEC, ShARe13/14 datasets).
93
+
94
+ **Gaps:**
95
+ - Clinical NER models are mostly 110M+ params (BERT-base or larger)
96
+ - HIPAA compliance requires local processing — cloud APIs are a non-starter for many hospitals
97
+ - Edge deployment on medical devices (e.g., bedside monitors parsing clinical notes) needs <50M models
98
+ - Drug-drug interaction entities, adverse event mentions, and dosage spans are context-dependent (regex fails)
99
+ - Non-English clinical NER is severely underserved
100
+
101
+ **Opportunity Assessment: MEDIUM-HIGH**
102
+ - Strong need for local, lightweight processing (HIPAA)
103
+ - But: medical NER demands very high recall (missed entities = patient safety risk)
104
+ - The 257-token window may be limiting for long clinical documents
105
+ - StanfordAIMI at 110M with F1 99.6 is a high bar
106
+ - Better angle: Arcspan for quick fine-tuning to new clinical entity types (new drugs, new conditions) where the data-efficiency advantage shines
107
+
108
+ **Available Datasets:**
109
+ - i2b2 2006: ~870 clinical records, PHI entities (DUA required)
110
+ - i2b2 2014: ~1,304 records, more comprehensive PHI types (DUA required)
111
+ - CADEC: Adverse drug event corpus
112
+ - ShARe/CLEF 2013/2014: Clinical disorder mentions
113
+ - BC5CDR: Chemical-disease relations, ~1,500 PubMed articles
114
+ - JNLPBA: Gene/protein NER from MEDLINE abstracts
115
+ - NCBI Disease: Disease name recognition, 793 PubMed abstracts
116
+ - MedMentions: 4,392 PubMed abstracts, UMLS concepts
117
+
118
+ ### 3.3 Financial — Entities in SEC Filings, Reports, and News
119
+
120
+ **Current State:**
121
+ - **SEC-filings NER dataset**: CC-BY 3.0, financial entity annotations
122
+ - Most financial NER is done with general-purpose models (BERT-base) or LLMs via API
123
+ - Bloomberg, Reuters, and financial data providers use proprietary NER systems
124
+ - Ticker symbol detection is largely regex-based but fails for ambiguous tickers (e.g., "META" as word vs ticker, "CASH" the ticker vs the word)
125
+
126
+ **Gaps:**
127
+ - No widely-adopted lightweight financial NER model exists
128
+ - Monetary amounts with complex formatting (ranges, currencies, percentages in context)
129
+ - Risk indicators and sentiment-bearing financial terms are context-dependent
130
+ - Compliance teams need local processing for material non-public information (MNPI)
131
+
132
+ **Opportunity Assessment: MEDIUM**
133
+ - Financial text is relatively structured — regex handles more than in other domains
134
+ - But context-dependent ambiguity (ticker vs. word, amount vs. reference) is real
135
+ - MNPI sensitivity argues for local processing
136
+ - Smaller market than cybersecurity or medical
137
+
138
+ ### 3.4 Legal — Contract Analysis
139
+
140
+ **Current State:**
141
+ - German court decision segmentation dataset: 251,038 decisions (Darji et al., 2026)
142
+ - LegalEval@SemEval2023: Rhetorical role prediction in legal opinions
143
+ - Most legal NER is done with BERT-based models or LLMs
144
+ - Contract analysis tools (Kira Systems, Luminance, Ironclad) use proprietary models
145
+
146
+ **Gaps:**
147
+ - No public lightweight legal NER model for contract entities (parties, obligations, dates, defined terms)
148
+ - Legal language is highly domain-specific with nested references
149
+ - Clause boundary detection is not standard NER but is a span detection task
150
+ - Confidential contracts can't go to cloud APIs
151
+
152
+ **Opportunity Assessment: MEDIUM**
153
+ - Clear privacy need (contracts are confidential)
154
+ - But legal entities are often long spans (entire clauses) — may exceed the 257-token window
155
+ - Defined terms are often multi-word and require document-level context (cross-reference to definitions section)
156
+ - Would need significant labeled data creation
157
+
158
+ ### 3.5 Developer Tools — Secret Scanning, TODO/FIXME Detection
159
+
160
+ **Current State:**
161
+ - GitHub secret scanning uses regex patterns for known secret formats
162
+ - Tools like `detect-secrets` (Yelp), `truffleHog`, `gitleaks` use regex + entropy-based detection
163
+ - TODO/FIXME detection is purely regex (`grep -r "TODO"`)
164
+ - License detection tools (ScanCode, licensee) use text matching
165
+
166
+ **Gaps:**
167
+ - Regex-based secret scanning has known false positive/negative issues:
168
+ - High-entropy strings that aren't secrets (base64-encoded data, UUIDs)
169
+ - Secrets in non-standard formats (custom API key patterns)
170
+ - Test/example keys vs. real keys (context matters: `example_key = "sk_test_..."` vs `API_KEY = "sk_live_..."`)
171
+ - TODO/FIXME detection misses natural language equivalents ("we need to fix this later", "temporary workaround")
172
+ - No context-aware lightweight code annotation detector exists
173
+
174
+ **Opportunity Assessment: HIGH**
175
+ - Massive TAM (every development team, every CI/CD pipeline)
176
+ - Runs in CI/CD where speed and CPU-only inference matter
177
+ - Short spans with clear contextual cues — perfect for 257-token window
178
+ - False positive reduction in secret scanning is commercially valuable
179
+ - Data can be synthetically generated from code repositories
180
+
181
+ ### 3.6 Scientific Literature — Chemical/Gene/Material Entities
182
+
183
+ **Current State:**
184
+ - Biomedical NER is well-served (see section 3.2)
185
+ - Chemical NER: ChemNER, CHEMDNER dataset
186
+ - Gene NER: JNLPBA, BioCreative
187
+ - Material science NER: relatively underserved, mostly custom BERT models
188
+
189
+ **Opportunity Assessment: LOW-MEDIUM**
190
+ - Well-served by existing BERT-based models
191
+ - Scientific entities are often long, complex terms (chemical formulas, gene names with variants)
192
+ - Domain expertise needed for label schema design
193
+ - Niche market
194
+
195
+ ### 3.7 Energy/Power Systems — Equipment and Fault Entities
196
+
197
+ **Current State:**
198
+ - **No publicly available NER model or dataset specifically for power systems exists.** This was confirmed by extensive searching.
199
+ - Maintenance logs, SCADA alarm descriptions, and protection relay settings contain equipment IDs, fault codes, and protection settings
200
+ - Currently handled by regex patterns or manual extraction
201
+ - Some proprietary systems within ABB, Siemens, GE Vernova use custom NLP
202
+
203
+ **Gaps:**
204
+ - Equipment naming is highly inconsistent across utilities (e.g., "Breaker 101" vs "CB-101" vs "52-1" for the same physical device)
205
+ - Fault descriptions mix technical jargon with natural language ("phase A-to-ground fault on 138kV bus" — multiple entities interleaved)
206
+ - Protection settings are contextual ("Zone 1 reach set to 80% of line impedance" — "80%" is meaningless without context)
207
+
208
+ **Opportunity Assessment: LOW (for now)**
209
+ - No training data exists publicly
210
+ - Tiny market (hundreds of utilities vs. millions of developers)
211
+ - Would require partnerships with utilities to get labeled data
212
+ - The human's domain expertise is relevant but shouldn't force this choice
213
+ - Could revisit if a utility partner materializes
214
+
215
+ ### 3.8 Supply Chain/Manufacturing — Part Numbers and Defects
216
+
217
+ **Current State:**
218
+ - Part number extraction is largely regex-based
219
+ - Defect description NER is underserved
220
+ - Quality management systems use proprietary text analysis
221
+
222
+ **Opportunity Assessment: LOW-MEDIUM**
223
+ - Part numbers are highly structured (good for regex)
224
+ - Defect descriptions need context but data is proprietary
225
+ - No public datasets
226
+
227
+ ### 3.9 Education — Learning Objectives and Prerequisites
228
+
229
+ **Current State:**
230
+ - Minimal NER work in education domain
231
+ - Curriculum analysis is mostly manual or LLM-based
232
+
233
+ **Opportunity Assessment: LOW**
234
+ - Entities are often full sentences (learning objectives) — too long for span detection
235
+ - Small market
236
+ - No data
237
+
238
+ ### 3.10 HR/Recruiting — Resume Entities
239
+
240
+ **Current State:**
241
+ - Resume parsing is a mature field (Sovren, HireEZ, etc.)
242
+ - Most use regex + rule-based systems or cloud NER APIs
243
+ - Privacy concerns are significant (resume data is PII-heavy)
244
+
245
+ **Opportunity Assessment: MEDIUM**
246
+ - Clear privacy need (resumes contain PII)
247
+ - Skills, certifications, and experience spans are well-suited entity types
248
+ - But: commercial resume parsers are entrenched
249
+ - Resume NER datasets exist on HuggingFace (Chinese Resume NER dataset used in research)
250
+
251
+ ---
252
+
253
+ ## 4. The Regex Ceiling — Where Rules Fail and ML Is Needed
254
+
255
+ ### 4.1 Concrete Examples
256
+
257
+ **1. Cybersecurity: Malware Family Names**
258
+ - Regex can match `CVE-\d{4}-\d{4,}` but cannot identify "the Lazarus group deployed a new variant of their DreamJob toolkit" — "Lazarus group" (threat actor) and "DreamJob" (malware family) require contextual understanding.
259
+
260
+ **2. Financial: Ticker Symbol Ambiguity**
261
+ - "META" appears in: "Meta Platforms (META) reported earnings" vs. "the meta-analysis of clinical trials" vs. "META filed a 10-K." Regex matches all; only context resolves.
262
+
263
+ **3. Medical: Drug Names in Clinical Notes**
264
+ - "Patient was started on aspirin" is easy. "Pt was started on ASA 81 daily" requires knowing ASA=aspirin. "Started on baby aspirin" — "baby" modifies the entity but isn't part of the drug name.
265
+
266
+ **4. Secret Scanning: Context-Dependent Sensitivity**
267
+ - `password = "correct horse battery staple"` — is this a real password or a reference to the famous XKCD comic? Context (file path, variable name patterns, test file vs. production) matters.
268
+ - `API_KEY = "sk_test_abc123"` — test key, low risk. `API_KEY = "sk_live_abc123"` — production key, high risk. Regex treats both the same.
269
+
270
+ **5. Legal: Party References**
271
+ - "Acme Corp (hereinafter 'the Company')" — after this point, "the Company" refers to Acme Corp. Regex can't resolve coreference.
272
+
273
+ **6. Energy: Equipment References**
274
+ - "Open breaker at Station A" vs. "The breaker opened due to a fault at Station A" — "breaker" is equipment in both, but the action and causality differ. More critically: "The 138kV line from Station A to Station B" — the entity is the entire phrase, not just "138kV" or "Station A."
275
+
276
+ ### 4.2 The Pattern
277
+
278
+ Regex fails when:
279
+ 1. **The same surface form has different meanings** depending on context (META, ASA, "the Company")
280
+ 2. **Entity boundaries are context-dependent** ("baby aspirin" — is "baby" part of the entity?)
281
+ 3. **Entity types require understanding surrounding text** (a string is only a "secret" if it's in a production config, not a test file)
282
+ 4. **Entities are described in natural language** rather than following a pattern ("the threat actor known as...")
283
+ 5. **Domain-specific abbreviations** resolve differently in context (ASA = aspirin vs. ASA = American Standards Association)
284
+
285
+ This is precisely the sweet spot for a lightweight ML model: the entities are short spans, the context window needed is small (usually within a sentence or two), and the contextual cues are learnable.
286
+
287
+ ---
288
+
289
+ ## 5. Top 5 Opportunity Areas (Ranked)
290
+
291
+ ### Rank 1: Cybersecurity IOC Extraction
292
+ - **Impact:** High (every SOC team, every SIEM vendor)
293
+ - **Feasibility:** High (datasets exist, entity types are well-defined, short spans)
294
+ - **Gap in market:** Large (CyNER is 560M+ params, regex misses 30%+ of contextual IOCs)
295
+ - **Arcspan advantage:** 50M active params for inline/edge use, data-efficient retraining for new threat types
296
+ - **Score: 9/10**
297
+
298
+ ### Rank 2: Developer Tools — Context-Aware Secret & Annotation Scanning
299
+ - **Impact:** Very High (millions of developers, every CI/CD pipeline)
300
+ - **Feasibility:** High (code is structured, synthetic training data possible, short spans)
301
+ - **Gap in market:** Moderate (regex tools exist but high false positive rate; no ML-based lightweight alternative)
302
+ - **Arcspan advantage:** CPU-only inference in CI/CD, browser extension for code review, JSON-configurable for custom secret patterns
303
+ - **Score: 8.5/10**
304
+
305
+ ### Rank 3: Clinical/Medical De-identification
306
+ - **Impact:** High (HIPAA compliance, patient safety)
307
+ - **Feasibility:** Medium-High (i2b2 datasets exist but DUA-restricted, entity types well-defined)
308
+ - **Gap in market:** Moderate (StanfordAIMI exists at 110M but nothing at 50M; edge medical devices need smaller)
309
+ - **Arcspan advantage:** ~2x fewer active parameters than BERT-base (50M active vs. 110M), data-efficient for new institution adaptation
310
+ - **Score: 7.5/10**
311
+
312
+ ### Rank 4: PII Detection (Improved Presidio Backend)
313
+ - **Impact:** High (GDPR/CCPA compliance is universal)
314
+ - **Feasibility:** Very High (model is already trained for PII, just extend/adapt)
315
+ - **Gap in market:** Moderate (Presidio exists but uses SpaCy CNN or full BERT; Arcspan could be a drop-in replacement with better accuracy/speed)
316
+ - **Arcspan advantage:** Native capability (the model was designed for PII), MoE efficiency, already proven
317
+ - **Score: 7/10**
318
+
319
+ ### Rank 5: Financial Entity Extraction
320
+ - **Impact:** Medium-High (compliance, MNPI handling)
321
+ - **Feasibility:** Medium (SEC-filings dataset exists, but financial text is semi-structured)
322
+ - **Gap in market:** Moderate (no lightweight financial NER, but regex covers a lot)
323
+ - **Arcspan advantage:** Context-dependent disambiguation (ticker vs. word), local processing for MNPI
324
+ - **Score: 6/10**
325
+
326
+ ---
327
+
328
+ ## 6. Available Datasets
329
+
330
+ | Dataset | Domain | Size | Entity Types | Source/Link | BIOES-Convertible? |
331
+ |---|---|---|---|---|---|
332
+ | **CoNLL-2003** | General news | 14,987 sentences, 203K tokens | PER, ORG, LOC, MISC | Standard benchmark | Yes (natively BIO, trivial BIOES conversion) |
333
+ | **OntoNotes 5** | Mixed | ~1.7M tokens | 18 types (PERSON, ORG, GPE, etc.) | LDC2013T19 | Yes |
334
+ | **Pile-NER** | Multi-domain | 44,889 passages, 240K spans, 13K entity types | Open/diverse | [HuggingFace](https://huggingface.co/datasets/Universal-NER/Pile-NER-type) | Needs conversion from span format |
335
+ | **PRISM** | Cybersecurity IOCs | 1,791 IOCs from 50 threat reports | IP, hash, domain, CVE, malware name | arXiv:2506.11325 | Needs conversion |
336
+ | **CyNER corpus** | Cybersecurity | Unknown size (publicly available) | Malware, threat actor, attack type, IOC | [GitHub](https://github.com/aiforsec/CyNER) | Needs conversion |
337
+ | **i2b2 2006** | Clinical PHI | ~870 records | PHI types (name, date, location, etc.) | DUA required | Yes (natively BIO) |
338
+ | **i2b2 2014** | Clinical PHI | ~1,304 records | Expanded PHI types | DUA required | Yes |
339
+ | **BC5CDR** | Biomedical | ~1,500 PubMed articles | Chemical, Disease | [BioCreative](https://biocreative.bioinformatics.udel.edu/) | Yes (BIO format) |
340
+ | **JNLPBA** | Biomedical | 2,404 MEDLINE abstracts | Protein, DNA, RNA, cell line, cell type | JNLPBA shared task | Yes (natively BIO) |
341
+ | **NCBI Disease** | Biomedical | 793 PubMed abstracts | Disease names | [NCBI](https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/) | Yes |
342
+ | **SEC-filings** | Financial | Unknown size | Financial entities | CC-BY 3.0 | Likely convertible |
343
+ | **WikiNER** | General (multilingual) | ~170K sentences (French), similar for other langs | PER, ORG, LOC, MISC | Multiple HuggingFace repos | Yes |
344
+ | **WNUT17** | Social media | ~5K tweets | Person, Location, Corporation, Product, Creative Work, Group | CC-BY 4.0 | Yes (BIO format) |
345
+ | **Few-NERD** | General (fine-grained) | 188K sentences | 66 fine-grained types in 8 coarse types | [HuggingFace](https://huggingface.co/datasets/DFKI-SLT/few-nerd) | Yes |
346
+ | **conll2025-ner** | General | 144K examples | Multiple types | [HuggingFace](https://huggingface.co/datasets/boltuix/conll2025-ner) | Likely yes |
347
+ | **MedMentions** | Biomedical | 4,392 PubMed abstracts | UMLS concepts | PubMed | Needs conversion from UMLS |
348
+ | **pasteproof-pii-dataset** | PII | 150K examples | PII entity types | [HuggingFace](https://huggingface.co/datasets/joneauxedgar/pasteproof-pii-dataset-v2) | Likely yes |
349
+
350
+ ---
351
+
352
+ ## 7. Sources
353
+
354
+ ### Papers
355
+ - Zaratiana et al. (2023). "GLiNER: Generalist Model for Named Entity Recognition using Bidirectional Transformer." arXiv:2311.08526. https://arxiv.org/abs/2311.08526
356
+ - Yazdani et al. (2025). "GLiNER-biomed: A Suite of Efficient Models for Open Biomedical Named Entity Recognition." arXiv:2504.00676. https://arxiv.org/abs/2504.00676
357
+ - Yoo et al. (2025). "ReProCon: Scalable and Resource-Efficient Few-Shot Biomedical NER." arXiv:2508.16833
358
+ - Xu et al. (2024). "GoalBERT: A Lightweight Named-Entity Recognition Model Based on Multiple Fusion." Applied Sciences 14(23):11003
359
+ - Alam et al. (2022). "CyNER: A Python Library for Cybersecurity Named Entity Recognition." arXiv:2204.05754. https://arxiv.org/abs/2204.05754
360
+ - Froudakis et al. (2025). "Revealing the True Indicators: Understanding and Improving IoC Extraction From Threat Reports." arXiv:2506.11325. https://arxiv.org/abs/2506.11325
361
+ - Arikkat et al. (2023). "Discerning Reliable Cyber Threat Indicators for Timely Cyber Threat Intelligence." arXiv:2306.16087
362
+ - Chen & Lin (2024). "On Fusing ChatGPT and Ensemble Learning in Discontinuous NER in Health Corpora." arXiv:2412.16976
363
+ - Darji et al. (2026). "Segmentation and Processing of German Court Decisions from Open Legal Data." arXiv:2601.01449
364
+ - Belfathi et al. (2023). "Enhancing Pre-Trained Language Models with Sentence Position Embeddings for Rhetorical Roles Recognition in Legal Opinions." arXiv:2310.05276
365
+ - Atuhurra et al. (2024). "NERsocial: Efficient NER Dataset Construction for HRI Utilizing RapidNER." arXiv:2412.09634
366
+
367
+ ### Models & Tools (HuggingFace / GitHub)
368
+ - dslim/bert-base-NER: https://huggingface.co/dslim/bert-base-NER (110M params, F1=91.3 CoNLL)
369
+ - GLiNER models: https://huggingface.co/urchade/gliner_medium-v2.1 (50M/90M/300M variants)
370
+ - StanfordAIMI de-identifier: https://huggingface.co/StanfordAIMI/stanford-deidentifier-base (F1=97.9-99.6)
371
+ - d4data/biomedical-ner-all: https://huggingface.co/d4data/biomedical-ner-all (66M params)
372
+ - CyNER: https://github.com/aiforsec/CyNER
373
+ - Microsoft Presidio: https://github.com/microsoft/presidio
374
+ - GLiNER repo: https://github.com/urchade/GLiNER
375
+ - Entity recognition datasets list: https://github.com/juand-r/entity-recognition-datasets
376
+ - SpaCy models: https://spacy.io/models/en
377
+
378
+ ### Web Sources
379
+ - Edge AI & Vision Alliance (2026). "On-Device LLMs in 2026: What Changed, What Matters." https://www.edge-ai-vision.com/2026/01/on-device-llms-in-2026-what-changed-what-matters-whats-next/
380
+ - ACM Queue. "Generative AI at the Edge: Challenges and Opportunities." https://queue.acm.org/detail.cfm?id=3733702
381
+ - Facebook Research. MobileLLM. https://github.com/facebookresearch/MobileLLM
382
+ - HuggingFace token-classification models: https://huggingface.co/models?pipeline_tag=token-classification&sort=downloads
383
+ - HuggingFace token-classification datasets: https://huggingface.co/datasets?task_categories=token-classification&sort=downloads
384
+
385
+ ### Research Limitations
386
+ - Could not access OpenAI's PII redaction blog post (403 error) — technical details about the base model architecture sourced from the user's project description
387
+ - PapersWithCode redirected to HuggingFace (302) — could not access benchmark leaderboards directly
388
+ - Brave Search MCP was intermittently returning no results for many queries
389
+ - Paper search engine returned many irrelevant results for domain-specific queries (NER search polluted with unrelated papers)
390
+ - GLiNER paper PDF could not be parsed by WebFetch — extracted text via paper-search CLI instead
research/notes/progress/2026-04-24-ner-recall-improvement-techniques.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NER Recall Improvement Techniques Research
2
+
3
+ **Date:** 2026-04-24
4
+ **Context:** Cybersecurity NER, ~63% span F1, recall bottleneck (40-60%), 1.5B sparse MoE (50M active), BIOES + Viterbi
5
+
6
+ ## Ranked Recommendations (implementable in 1-2 days)
7
+
8
+ ### Tier 1: Highest impact, lowest effort
9
+
10
+ 1. **Lower the entity confidence threshold / bias the O-class down**
11
+ Viterbi decoding uses transition + emission scores. Add a negative bias to the "O" emission logit (e.g., -0.5 to -2.0) to make the model less eager to predict outside-entity. Tune on val set. Zero training cost.
12
+ - Source: calibration paper https://ar5iv.labs.arxiv.org/html/2004.04361
13
+
14
+ 2. **Self-training / pseudo-labeling on unlabeled cyber text**
15
+ Run current model on large unlabeled corpus (CVE descriptions, threat reports), keep only high-confidence spans (>0.9), add to training data, retrain. SeqUST framework shows this matches 3-8x more labeled data.
16
+ - Source: https://arxiv.org/abs/2302.08659 (AAAI), https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0246310
17
+
18
+ 3. **Negative sampling / entity ratio rebalancing**
19
+ If training data is dominated by O tokens (typically >80%), downsample sentences that contain no entities (all-O) or upsample entity-rich sentences. Alternatively, use focal loss or class-weighted cross-entropy that down-weights the O class.
20
+
21
+ ### Tier 2: Medium effort, strong evidence
22
+
23
+ 4. **Label taxonomy harmonization (CyberNER insight)**
24
+ CyberNER paper showed harmonizing disparate tag schemas onto STIX 2.1 improved F1 from 0.569 to 0.736 (~29% relative). Review whether our label definitions have overlapping/ambiguous categories.
25
+ - Source: https://arxiv.org/html/2510.26499v1
26
+
27
+ 5. **Two-stage: binary entity detection then classification**
28
+ Train first with binary BIOES (entity vs not-entity), then fine-tune with full label set. Addresses recall by simplifying the initial detection task. Common in biomedical NER.
29
+
30
+ 6. **Auxiliary binary "is-entity" head**
31
+ Add a parallel binary classification head alongside the BIOES head during training. Multi-task signal forces the model to first learn entity boundaries.
32
+
33
+ ### Tier 3: MoE-specific and architecture tweaks
34
+
35
+ 7. **Reduce auxiliary load-balancing loss coefficient during fine-tuning**
36
+ Default load-balancing loss may fight task adaptation. Reduce coefficient (e.g., 0.01 → 0.001) to let experts specialize for cyber entities.
37
+ - Source: https://apxml.com/courses/mixture-of-experts-advanced-implementation/chapter-3-training-large-scale-moes/fine-tuning-pretrained-moe
38
+
39
+ 8. **Freeze router, fine-tune experts only (or vice versa)**
40
+ BOND-MoE paper shows document-level expert routing helps NER. Try freezing router weights for first N epochs, then unfreezing.
41
+ - Source: https://arxiv.org/pdf/2404.19192
42
+
43
+ 9. **Data augmentation: entity mention replacement**
44
+ Replace entity mentions with synonyms/variants (e.g., swap malware names, IP formats). Preserves context while diversifying entity surface forms. Low effort with regex + lookup tables.
45
+
46
+ ### Tier 4: Worth trying if time permits
47
+
48
+ 10. **Iterative label re-correction**
49
+ Use model predictions to find likely missed annotations in the training data (model predicts an entity but gold says O). Manually review top disagreements — these often reveal annotation errors.
50
+ - Source: https://pmc.ncbi.nlm.nih.gov/articles/PMC8170952/
51
+
52
+ ## Key Cyber NER Benchmarks
53
+ - CyberNER (RoBERTa): 73.6% F1 on unified STIX corpus
54
+ - Fine-tuned LLMs (Mistral 7B etc.): ~74% F1
55
+ - BERT fine-tuned on clean single-domain: up to 96% F1 (narrow scope)
56
+ - Zero-shot self-improving: 61-75% F1 depending on dataset
57
+
58
+ ## Open Questions
59
+ - What is our current O-token ratio in training data?
60
+ - Are we using any load-balancing loss? What coefficient?
61
+ - Have we tried threshold tuning on Viterbi emission scores?
research/notes/progress/2026-04-26-01-feasibility-check-approach.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Feasibility Check: Is the Arcspan Cyber-NER Approach Viable?
2
+
3
+ **Date:** 2026-04-26
4
+ **Status:** External validation pass after R8/R9 local work
5
+
6
+ ## What we found
7
+
8
+ Three external facts materially support the current direction:
9
+
10
+ 1. **OpenAI Privacy Filter is explicitly designed for domain adaptation and precision/recall control.**
11
+ The model card states that the model can be adapted through fine-tuning, that runtime decoding can control precision/recall tradeoffs, and that even small fine-tuning sets can yield large gains on shifted distributions.
12
+
13
+ 2. **CyberNER validates the harmonization thesis.**
14
+ The CyberNER paper argues that naive concatenation of cyber NER datasets degrades performance, while principled STIX-based harmonization yields about a **30% relative F1 improvement** over naive merging. This directly supports the repo’s effort to normalize CyNER, DNRTI, APTNER, and related sources before training.
15
+
16
+ 3. **SecureBERT 2.0 is strong evidence of the ceiling, but mostly because of domain pretraining rather than just supervised NER fine-tuning.**
17
+ The SecureBERT 2.0 paper reports pretraining on **13B+ cybersecurity text tokens** plus **53M code tokens** and then achieving state-of-the-art benchmark performance. This is a useful target, but not a fair apples-to-apples baseline for the current Arcspan setup, which is adapting a general span model rather than a cyber-native encoder.
18
+
19
+ Also relevant:
20
+
21
+ 4. **MoE for NER is plausible, but evidence is about robustness to noisy labels more than guaranteed domain transfer.**
22
+ BOND-MoE supports the general idea that MoE can help NER under noisy or weak supervision, but it does not by itself prove that the current OpenAI Privacy Filter (OPF) routing behavior is optimal for cyber threat intelligence (CTI) extraction.
23
+
24
+ ## Why it matters
25
+
26
+ The current approach is **feasible**, but only under the right claim:
27
+
28
+ - Feasible claim: "A compact general-purpose span detector can be retargeted into a useful cyber NER model with careful schema harmonization, leakage control, and targeted data augmentation."
29
+ - Not yet supported claim: "This approach will match or beat cyber-native models like SecureBERT 2.0 on top-end benchmark F1."
30
+
31
+ The repo’s current bottleneck is no longer whether the base OPF model can learn the task at all. R8 already shows it can. The bottleneck is whether data representation and decoding can close the Organization/System recall gap enough to produce a competitive and useful model.
32
+
33
+ ## Source
34
+
35
+ - OpenAI Privacy Filter model card:
36
+ https://cdn.openai.com/pdf/c66281ed-b638-456a-8ce1-97e9f5264a90/OpenAI-Privacy-Filter-Model-Card.pdf
37
+ - CyberNER:
38
+ https://arxiv.org/abs/2510.26499
39
+ - SecureBERT 2.0:
40
+ https://arxiv.org/abs/2510.00240
41
+ - BOND-MoE:
42
+ https://arxiv.org/abs/2404.19192
43
+
44
+ ## Open questions
45
+
46
+ - Can strict R9 training plus decode calibration move exact-match APTNER F1 from ~0.50 into the low 0.60s?
47
+ - Is the propagated dataset genuinely useful, or does it mainly inject memorization and ambiguous false positives?
48
+ - Would the highest-leverage next step be better decoding/calibration, or another round of targeted Org/System data creation?
research/notes/progress/2026-04-26-02-cyner-exact-match-and-gap-analysis.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # R8 Exact-Match on CyNER + Gap Analysis vs APTNER
2
+
3
+ **Date:** 2026-04-26
4
+ **Status:** Completed locally on saved R8 checkpoint
5
+
6
+ ## What we found
7
+
8
+ Ran the exact-match evaluator on the saved R8 checkpoint against the converted original CyNER test set:
9
+
10
+ - Checkpoint: `checkpoints/r8_5class/epoch_4`
11
+ - Test data: `data/processed/cyner_test.jsonl`
12
+ - Output JSON: `results/r8_cyner_exact_match.json`
13
+
14
+ ### CyNER exact-match results
15
+
16
+ | Metric | Value |
17
+ |---|---:|
18
+ | Micro precision | 0.4540 |
19
+ | Micro recall | 0.3655 |
20
+ | **Micro F1** | **0.4050** |
21
+
22
+ For comparison, containment on the same run was:
23
+
24
+ | Metric | Value |
25
+ |---|---:|
26
+ | Micro precision | 0.5097 |
27
+ | Micro recall | 0.4283 |
28
+ | **Micro F1** | **0.4655** |
29
+
30
+ Per-class exact-match results on CyNER:
31
+
32
+ | Class | Precision | Recall | F1 | Support |
33
+ |---|---:|---:|---:|---:|
34
+ | Malware | 0.5847 | 0.5702 | **0.5774** | 242 |
35
+ | Indicator | 0.5181 | 0.1648 | **0.2500** | 261 |
36
+ | Organization | 0.2875 | 0.3511 | **0.3162** | 131 |
37
+ | System | 0.4120 | 0.3871 | **0.3992** | 248 |
38
+ | Vulnerability | 0.5000 | 0.3000 | **0.3750** | 10 |
39
+
40
+ ### Test-distribution comparison
41
+
42
+ Span distribution by class:
43
+
44
+ | Class | R8 train | R9 train | CyNER test | APTNER test |
45
+ |---|---:|---:|---:|---:|
46
+ | Indicator | 32.1% | 23.8% | 29.3% | 16.2% |
47
+ | Malware | 22.0% | 24.1% | 27.1% | 30.0% |
48
+ | Organization | 19.3% | 24.5% | 14.7% | 26.8% |
49
+ | System | 17.3% | 18.5% | 27.8% | 25.6% |
50
+ | Vulnerability | 9.3% | 9.1% | 1.1% | 1.5% |
51
+
52
+ ### Main gap signals
53
+
54
+ 1. **CyNER is the hardest Indicator benchmark** for the current model.
55
+ The repo’s audit already found that CyNER indicators are often defanged, partial, or unconventional: package names, registry paths, and odd multi-token indicators. R8 exact-match confirms this with Indicator F1 collapsing to **0.25**.
56
+
57
+ 2. **APTNER and CyNER stress different weaknesses.**
58
+ - APTNER exposes **Organization/System contextual recall** problems in APT-report text.
59
+ - CyNER exposes **Indicator format coverage** problems plus some boundary issues.
60
+
61
+ 3. **R9 is better aligned for APTNER than for CyNER.**
62
+ R9 meaningfully increases Organization share (19.3% → 24.5%) and slightly increases System share (17.3% → 18.5%), which should help APTNER more than CyNER. It does **not** directly solve the CyNER-specific defanged/unconventional Indicator problem.
63
+
64
+ ## Why it matters
65
+
66
+ This clarifies the anti-benchmaxxing position:
67
+
68
+ - Improving APTNER requires **APT-style Org/System data**.
69
+ - Improving CyNER requires **CyNER-style Indicator coverage**.
70
+
71
+ Those are related, but not identical, objectives. A generalized model needs both. If we only optimize for one benchmark distribution, we will regress on the other.
72
+
73
+ ## Source
74
+
75
+ - Result file: `results/r8_cyner_exact_match.json`
76
+ - Evaluator: `scripts/eval_exact_match.py`
77
+ - Prior audits:
78
+ - `research/notes/progress/2026-04-24-45-data-quality-audit.md`
79
+ - `research/notes/progress/2026-04-24-54-audit-train-test-leakage.md`
80
+
81
+ ## Open questions
82
+
83
+ - Should we split future data work into two explicit buckets: `APT-style Org/System` and `CyNER-style Indicator`?
84
+ - Is a separate Indicator-format normalization pass more valuable than adding more generic cybersecurity spans?
85
+ - Should R9 strict be followed immediately by an `R9+indicator-coverage` dataset branch rather than a single monolithic next round?
research/paper/outline.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paper Outline: Arcspan — Efficient Cybersecurity Entity Extraction via Sparse Mixture-of-Experts
2
+
3
+ **Working title — will refine later**
4
+
5
+ ---
6
+
7
+ ## Abstract (draft skeleton)
8
+
9
+ We present Arcspan, a cybersecurity entity extraction system built by fine-tuning a 1.5B-parameter sparse Mixture-of-Experts (MoE) bidirectional token classifier with only 50M active parameters per token. Starting from OpenAI's Privacy Filter architecture, we replace the PII label space with cybersecurity-specific entity types and fine-tune on [datasets TBD]. On [benchmark TBD], Arcspan achieves [F1 TBD] compared to CyNER's [F1 TBD] while using 11x fewer active parameters, running [Nx] faster on CPU, and requiring as few as [N] labeled examples to reach [F1 TBD]. Our results demonstrate that sparse MoE architectures offer a compelling efficiency–accuracy tradeoff for domain-specific entity extraction, enabling deployment in resource-constrained environments where dense transformers are impractical.
10
+
11
+ ---
12
+
13
+ ## 1. Introduction
14
+
15
+ - The need for automated IOC/entity extraction from threat intelligence reports
16
+ - Current approaches: regex (fast but misses contextual entities), dense transformers (accurate but heavy), hybrid (CyNER)
17
+ - The opportunity: sparse MoE models offer large parameter capacity with small active compute
18
+ - Our contribution: first application of sparse MoE token classification to cybersecurity NER
19
+ - Preview of results
20
+
21
+ ## 2. Related Work
22
+
23
+ ### 2.1 Cybersecurity NER
24
+ - CyNER (Alam et al., 2022) — hybrid regex + XLM-RoBERTa-large + SpaCy
25
+ - PRISM benchmark (Froudakis et al., 2025) — IOC extraction evaluation
26
+ - Regex-based tools (YARA, Sigma rules, IOC parsers)
27
+ - OTuHunt and other OT/ICS-specific NLP
28
+
29
+ ### 2.2 Lightweight NER
30
+ - GLiNER (Zaratiana et al., 2023) — zero-shot span detection
31
+ - SpaCy models — CNN and transformer variants
32
+ - BERT-base-NER (dslim) — the dense baseline
33
+ - Flair, Stanza, and other lightweight alternatives
34
+
35
+ ### 2.3 Sparse Mixture-of-Experts for NLP
36
+ - MoE in language modeling (Switch Transformer, GShard)
37
+ - MoE for classification tasks (less explored)
38
+ - The Privacy Filter architecture as an MoE token classifier
39
+
40
+ ## 3. Architecture
41
+
42
+ ### 3.1 Base Model
43
+ - OpenAI Privacy Filter: 1.5B total / 50M active params
44
+ - Architecture: 8-layer pre-norm transformer encoder, 128 experts (top-4 routing), GQA (14 Q / 2 KV heads), banded bidirectional attention (257-token window)
45
+ - Pretraining: autoregressive on gpt-oss pipeline, then converted to bidirectional classifier
46
+ - Decoding: constrained Viterbi with BIOES transition enforcement and 6 tunable transition biases
47
+
48
+ ### 3.2 Label Space Adaptation
49
+ - Custom label-space JSON → automatic output head rebuild with warm-start row copying
50
+ - Our cybersecurity label taxonomy: [entity types TBD]
51
+ - BIOES encoding: 1 + N×4 token-level classes
52
+
53
+ ### 3.3 Fine-tuning Procedure
54
+ - Full-model fine-tuning (all parameters, not just head)
55
+ - AdamW optimizer, hyperparameters: [TBD, will tune]
56
+ - Windowed training for long documents
57
+
58
+ ## 4. Experimental Setup
59
+
60
+ ### 4.1 Datasets
61
+ - Training data: [TBD — CyNER corpus + PRISM + Pile-NER subset]
62
+ - Evaluation data: [held-out test split from same, plus cross-dataset generalization]
63
+ - Data conversion pipeline: source format → BIOES JSONL
64
+
65
+ ### 4.2 Baselines
66
+ 1. **CyNER** (Alam et al., 2022) — 560M params, hybrid pipeline (our primary comparison)
67
+ 2. **BERT-base-NER** fine-tuned on same data — 110M dense params (standard NER baseline)
68
+ 3. **GLiNER-M** zero-shot — 90M params (zero-shot ceiling)
69
+ 4. **Regex-only** — pattern matching for structured IOCs (lower bound)
70
+ 5. **SpaCy en_core_web_trf** — 110M params, general NER (out-of-domain baseline)
71
+
72
+ ### 4.3 Evaluation Protocol
73
+ - **Primary metric:** Span-level F1, precision, recall (exact match)
74
+ - **Secondary metrics:** Token-level F1, partial span overlap F1
75
+ - **Per-entity-type breakdown** — critical for understanding where MoE wins/loses
76
+ - **Statistical significance:** bootstrap confidence intervals on F1
77
+ - All on same held-out test set, 5 random seed runs for variance estimation
78
+
79
+ ### 4.4 Efficiency Metrics
80
+ - Inference latency: mean ± std ms per document (CPU single-thread, batch size 1)
81
+ - Throughput: documents per second (CPU, various batch sizes)
82
+ - Memory: peak RSS during inference
83
+ - Model size on disk (bf16)
84
+ - All measured on same hardware (document specs: CPU model, RAM, OS)
85
+
86
+ ## 5. Results
87
+
88
+ ### 5.1 Main Results (Table 1)
89
+ - Arcspan vs. all baselines: span-level F1, precision, recall, latency, model size
90
+ - Per-entity-type F1 breakdown (Table 2)
91
+
92
+ ### 5.2 Data Efficiency (Figure 1 — the money chart)
93
+ - Learning curve: F1 at {1%, 5%, 10%, 25%, 50%, 100%} of training data
94
+ - Compare Arcspan curve vs. BERT-base fine-tuned on same fractions
95
+ - This is where the MoE pretraining advantage should be most visible
96
+
97
+ ### 5.3 Efficiency–Accuracy Tradeoff (Figure 2)
98
+ - Scatter plot: F1 vs. active parameters for all models
99
+ - Scatter plot: F1 vs. inference latency (CPU)
100
+ - Arcspan should be Pareto-optimal or near-Pareto
101
+
102
+ ### 5.4 Viterbi Decoding Analysis
103
+ - Impact of Viterbi vs. per-token argmax on span-level F1
104
+ - Precision–recall curves at different Viterbi operating points
105
+ - This is unique to our architecture — no other lightweight NER has this
106
+
107
+ ### 5.5 Error Analysis
108
+ - Where does Arcspan fail vs. CyNER?
109
+ - Structured IOCs (IPs, hashes) — how much do regex pre-filters help?
110
+ - Context-dependent entities (malware names, threat actors) — the ML sweet spot
111
+ - Obfuscated/defanged indicators — known hard case
112
+
113
+ ### 5.6 Ablation Studies
114
+ - Full model fine-tuning vs. head-only fine-tuning
115
+ - Effect of number of active experts (top-2 vs top-4 vs top-8)
116
+ - Effect of Viterbi transition biases
117
+ - Effect of banded attention window size (if modifiable)
118
+
119
+ ## 6. Discussion
120
+
121
+ - When does sparse MoE beat dense transformers for NER? (our hypothesis: when domain adaptation is needed and data is limited)
122
+ - Practical deployment considerations (CPU inference, edge devices, browser)
123
+ - Limitations: 257-token window, English-centric, static label space
124
+ - The label-space reconfigurability as a platform feature (same base model, different verticals)
125
+
126
+ ## 7. Conclusion
127
+
128
+ - Summary of contribution
129
+ - Open-source release: fine-tuned checkpoint, label space config, evaluation scripts
130
+ - Future work: additional verticals (energy, medical), multilingual cybersecurity NER
131
+
132
+ ---
133
+
134
+ ## Key Experiments We MUST Run
135
+
136
+ 1. **Main comparison table** — Arcspan vs CyNER vs BERT-base vs GLiNER vs regex (span F1, latency, size)
137
+ 2. **Data efficiency curve** — F1 vs training data fraction (the killer chart)
138
+ 3. **Per-entity-type breakdown** — where MoE wins/loses vs dense
139
+ 4. **Viterbi vs argmax** — unique architectural advantage
140
+ 5. **Ablation: experts_per_token** — the `OPF_EXPERTS_PER_TOKEN` env var lets us test top-2 vs top-4
141
+
142
+ ## What We Need Before We Can Run Experiments
143
+
144
+ - [ ] Finalized label taxonomy (from CyNER deep dive)
145
+ - [ ] Training data in BIOES JSONL format
146
+ - [ ] CyNER installed and runnable as baseline
147
+ - [ ] BERT-base fine-tuning pipeline (HuggingFace Trainer)
148
+ - [ ] GLiNER inference pipeline for zero-shot baseline
149
+ - [ ] Evaluation harness that runs all models on same test set
150
+ - [ ] Hardware specs documented
research/securebert2/.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ doc embedding/attack_bert_cont_eval.py
2
+ doc embedding/CrossEncoder_infer.py
3
+ mlm/code_mlm_eval_secure_bert.py
4
+ mlm/code_mlm_eval.py
5
+ mlm/dataset.py
6
+ mlm/primus_load.py
7
+ mlm/run_modernbert.py
8
+ mlm/SecureBERT_mlm_eval.py
9
+ mlm/code_mlm_eval_secure_bert.py
10
+ ner/dataset.py
11
+ vuln_classification/CodeVuln_infer.py
12
+ ner/NER_infer.py
research/securebert2/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, caste, color, religion, or sexual
10
+ identity and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the overall
26
+ community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or advances of
31
+ any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email address,
35
+ without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official email address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ [oss-conduct@cisco.com](mailto:oss-conduct@cisco.com). All complaints will be reviewed and investigated
64
+ promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series of
86
+ actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or permanent
93
+ ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within the
113
+ community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.1, available at
119
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
120
+
121
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
122
+ enforcement ladder][Mozilla CoC].
123
+
124
+ For answers to common questions about this code of conduct, see the FAQ at
125
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
126
+ [https://www.contributor-covenant.org/translations][translations].
127
+
128
+ [homepage]: https://www.contributor-covenant.org
129
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
130
+ [Mozilla CoC]: https://github.com/mozilla/diversity
131
+ [FAQ]: https://www.contributor-covenant.org/faq
132
+ [translations]: https://www.contributor-covenant.org/translations
research/securebert2/CONTRIBUTING.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to Contribute
2
+
3
+ Thanks for your interest in contributing to `securebert2`! Here are a few
4
+ general guidelines on contributing and reporting bugs that we ask you to review.
5
+ Following these guidelines helps to communicate that you respect the time of the
6
+ contributors managing and developing this open source project. In return, they
7
+ should reciprocate that respect in addressing your issue, assessing changes, and
8
+ helping you finalize your pull requests. In that spirit of mutual respect, we
9
+ endeavor to review incoming issues and pull requests within 10 days, and will
10
+ close any lingering issues or pull requests after 60 days of inactivity.
11
+
12
+ Please note that all of your interactions in the project are subject to our
13
+ [Code of Conduct](/CODE_OF_CONDUCT.md). This includes creation of issues or pull
14
+ requests, commenting on issues or pull requests, and extends to all interactions
15
+ in any real-time space e.g., Slack, Discord, etc.
16
+
17
+ ## Reporting Issues
18
+
19
+ Before reporting a new issue, please ensure that the issue was not already
20
+ reported or fixed by searching through our [issues
21
+ list](https://github.com/cisco-ai-defense/securebert2/issues).
22
+
23
+ When creating a new issue, please be sure to include a **title and clear
24
+ description**, as much relevant information as possible, and, if possible, a
25
+ test case.
26
+
27
+ **If you discover a security bug, please do not report it through GitHub.
28
+ Instead, please see security procedures in [SECURITY.md](/SECURITY.md).**
29
+
30
+ ## Sending Pull Requests
31
+
32
+ Before sending a new pull request, take a look at existing pull requests and
33
+ issues to see if the proposed change or fix has been discussed in the past, or
34
+ if the change was already implemented but not yet released.
35
+
36
+ We expect new pull requests to include tests for any affected behavior, and, as
37
+ we follow semantic versioning, we may reserve breaking changes until the next
38
+ major version release.
39
+
40
+ ## Other Ways to Contribute
41
+
42
+ We welcome anyone that wants to contribute to `securebert2` to triage and
43
+ reply to open issues to help troubleshoot and fix existing bugs. Here is what
44
+ you can do:
45
+
46
+ - Help ensure that existing issues follow the recommendations from the
47
+ _[Reporting Issues](#reporting-issues)_ section, providing feedback to the
48
+ issue's author on what might be missing.
49
+ - Review and update the existing content of our
50
+ [Wiki](https://github.com/cisco-ai-defense/securebert2) with up-to-date
51
+ instructions and code samples.
52
+ - Review existing pull requests, and test patches against real existing
53
+ applications that use `securebert2`.
54
+ - Write a test, or add a missing test case to an existing test.
55
+
56
+ Thanks again for your interest in contributing to `securebert2`!
57
+
58
+ :heart:
research/securebert2/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2025] [CISCO - AI DEFENSE]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
research/securebert2/MAINTAINERS.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Maintainers
2
+ - [ai-threat-intel@cisco.com](mailto:ai-threat-intel@cisco.com)
research/securebert2/README.md ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SecureBERT 2.0: Advanced Domain-Specific Language Model for Cybersecurity Intelligence
2
+ ## About The Project
3
+
4
+ [**SecureBERT 2.0**](https://arxiv.org/pdf/2510.00240) is **Cisco AI**'s officially released, domain-adapted encoder-based language model for cybersecurity and threat intelligence. Built on the ModernBERT architecture, it incorporates hierarchical encoding and long-context modeling, enabling effective processing of complex cybersecurity documents, source code, and threat intelligence reports. Pretrained on a massive, multi-modal corpus—including over 13 billion text tokens and 53 million code tokens—SecureBERT 2.0 achieves state-of-the-art performance in semantic search, named entity recognition, code vulnerability detection, and threat analysis. With this release, Cisco aims to advance research in cybersecurity and AI by promoting transparency, enabling collaboration, and empowering practitioners, researchers, and organizations to build upon this work, accelerate innovation, and strengthen defenses against emerging cyber threats.
5
+
6
+
7
+ ---
8
+
9
+ ## Key Features
10
+
11
+ - **Domain-Specific Pretraining**: Extensive cybersecurity corpus, including threat reports, vulnerability advisories, technical blogs, and source code.
12
+ - **Multi-Modal Understanding**: Integrates natural language and code for advanced vulnerability detection and threat intelligence.
13
+ - **Hierarchical & Long-Context Modeling**: Captures both fine-grained and high-level structures across extended documents.
14
+ - **Optimized for Cybersecurity Tasks**:
15
+ - Semantic search and document retrieval
16
+ - Named entity recognition (NER)
17
+ - Code vulnerability detection
18
+ - Threat intelligence analysis
19
+
20
+ ---
21
+
22
+ ## Pretraining Dataset
23
+
24
+ | Dataset Category | Code Tokens | Text Tokens |
25
+ |-------------------------------|------------|------------|
26
+ | Seed corpus | 9,406,451 | 256,859,788 |
27
+ | Large-scale web text | 268,993 | 12,231,942,693 |
28
+ | Reasoning-focused data | -- | 3,229,293 |
29
+ | Instruction-tuning data | 61,590 | 2,336,218 |
30
+ | Code vulnerability corpus | 2,146,875 | -- |
31
+ | Cybersecurity dialogue data | 41,503,749 | 56,871,556 |
32
+ | Original baseline dataset | -- | 1,072,798,637 |
33
+ | **Total** | 53,387,658 | 13,623,037,185 |
34
+
35
+ ---
36
+
37
+ ## MLM Evaluation (Masked Language Modeling)
38
+
39
+ SecureBERT 2.0 demonstrates strong domain-specific understanding:
40
+
41
+ | Top-n | Objects (Nouns) | Verbs (Actions) | Code Tokens |
42
+ |-------|----------------|----------------|-------------|
43
+ | 1 | 56.20% | 45.02% | 39.27% |
44
+ | 5 | 82.72% | 74.12% | 55.41% |
45
+ | 10 | 88.80% | 81.64% | 60.03% |
46
+
47
+ > Outperforms general-purpose models in predicting cybersecurity-specific terms and code elements.
48
+
49
+ ---
50
+
51
+ ## Downstream Tasks
52
+
53
+ ### 1. Document Embedding
54
+
55
+ **Cross-Encoder Results**
56
+
57
+ | Model | mAP | R@1 | NDCG@10 | MRR@10 |
58
+ |----------------------|-------|-------|---------|--------|
59
+ | ms-marco-TinyBERT-L2 | 0.920 | 0.849 | 0.964 | 0.955 |
60
+ | **SecureBERT 2.0** | 0.955 | 0.948 | 0.986 | 0.983 |
61
+
62
+ **Bi-Encoder Results**
63
+
64
+ | Model | mAP | R@1 | MRR@10 |
65
+ |--------------------------|-------|-------|--------|
66
+ | all-MiniLM-L12-v2 | 0.912 | 0.924 | 0.945 |
67
+ | **SecureBERT 2.0** | 0.951 | 0.984 | 0.989 |
68
+
69
+ > Demonstrates high precision in semantic search and scalable retrieval.
70
+
71
+ ---
72
+
73
+ ### 2. Named Entity Recognition (NER)
74
+
75
+ | Model | F1 | Recall | Precision |
76
+ |--------------------|-------|--------|-----------|
77
+ | CyBERT | 0.351 | 0.281 | 0.467 |
78
+ | SecureBERT | 0.734 | 0.759 | 0.717 |
79
+ | **SecureBERT 2.0** | 0.945 | 0.965 | 0.927 |
80
+
81
+ > Near-perfect recognition of cybersecurity entities such as Malware, Vulnerability, System, Indicator, and Organization.
82
+
83
+ ---
84
+
85
+ ### 3. Code Vulnerability Detection
86
+
87
+ | Model | Accuracy | F1 | Recall | Precision |
88
+ |-------------|----------|-------|--------|-----------|
89
+ | CodeBERT | 0.627 | 0.372 | 0.241 | 0.821 |
90
+ | CyBERT | 0.459 | 0.630 | 1.000 | 0.459 |
91
+ | **SecureBERT 2.0** | 0.655 | 0.616 | 0.602 | 0.630 |
92
+
93
+ > Balanced detection performance with higher F1 score and reduced false positives compared to prior models.
94
+
95
+
96
+ **All models are available on Hugging Face**
97
+ ## Hugging Face Model Paths
98
+
99
+ | Task | Model Path |
100
+ |------|------------|
101
+ | SecureBERT 2.0 | `cisco-ai/SecureBERT2.0-base` |
102
+ | Cross Encoder | `cisco-ai/SecureBERT2.0-cross_encoder` |
103
+ | Bi-Encoder | `cisco-ai/SecureBERT2.0-biencoder` |
104
+ | Named Entity Recognition (NER) | `cisco-ai/SecureBERT2.0-NER` |
105
+ | Vulnerability Classification | `cisco-ai/SecureBERT2.0-code-vuln-detection` |
106
+
107
+ # Getting Started
108
+
109
+ This repository provides the full framework for pretraining, fine-tuning, and evaluating SecureBERT 2.0 across key cybersecurity tasks.
110
+
111
+ Repository Structure
112
+ ```
113
+ .
114
+ ├── mlm/ # Model pretraining (Masked Language Modeling)
115
+ │ ├── train.py # Pretraining script for MLM
116
+ │ └── SecureBERT_mlm_eval.py # MLM evaluation script
117
+ ├── vuln_classification/ # Code vulnerability detection
118
+ │ ├── CodeVuln_train.py # Fine-tuning SecureBERT for vulnerability detection
119
+ │ └── CodeVuln_eval.py # Evaluation on code vulnerability datasets
120
+ ├── rt2/ner/ # Named Entity Recognition (NER) tasks
121
+ │ ├── NER_train.py # Fine-tuning SecureBERT for cybersecurity NER
122
+ │ └── NER_eval.py # Evaluation script for NER models
123
+ ├── doc_embedding/ # Document embedding tasks
124
+ │ ├── BiEncoder_train.py # Bi-encoder training for semantic search
125
+ │ ├── CrossEncoder_train.py # Cross-encoder training for fine-grained ranking
126
+ │ ├── BiEncoder_eval.py # Bi-encoder evaluation
127
+ │ └── CrossEncoder_eval.py # Cross-encoder evaluation
128
+ ├── opensource_data/ # Preprocessed datasets
129
+ │ ├── data_vuln_dataset.parquet
130
+ │ ├── data_vuln_dataset_test.parquet
131
+ │ ├── data_NER_train.json
132
+ │ ├── data_NER_test.json
133
+ │ ├── data_sentence_pairs.parquet
134
+ │ ├── data_sentence_pairs_test.parquet
135
+ │ └── data_pretrain.parquet
136
+ ├── dataset.py # Dataset loading and preprocessing utilities
137
+ ├── requirements.txt # Python dependencies
138
+ ├── LICENSE
139
+ ├── README.md
140
+ ├── CODE_OF_CONDUCT.md
141
+ ├── CONTRIBUTING.md
142
+ ├── SECURITY.md
143
+ ├── MAINTAINERS.md
144
+ └── .gitignore
145
+ ```
146
+
147
+
148
+
149
+ ## Requirements
150
+
151
+ - Python 3.10+
152
+ - PyTorch 2.1+ with CUDA
153
+ - Hugging Face Transformers
154
+ - Lightning Fabric
155
+ - tqdm
156
+
157
+ ## Installation
158
+
159
+ 1. **Clone the repository:**
160
+ ```bash
161
+ git clone https://github.com/cisco-ai-defense/securebert2.git
162
+ ```
163
+
164
+ 2. **Create a virtual environment (recommended):**
165
+ ```bash
166
+ python -m venv venv
167
+ source venv/bin/activate # On Windows: `venv\Scripts\activate`
168
+ ```
169
+
170
+ 3. **Install the required Python packages:**
171
+ ```bash
172
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 # Adjust cu121 for your CUDA version
173
+ pip install transformers lightning tqdm pandas pyarrow
174
+ ```
175
+ *Note: Ensure your `torch` installation matches your CUDA version. The example above is for CUDA 12.1.*
176
+
177
+ 4. **Ensure that `dataset.py` and the datasets are available.**
178
+
179
+ ## Train and Evaluate
180
+ Every directory contains `train` and `eval` files. Make sure to customize them with your desired model or dataset path.
181
+
182
+ ```bash
183
+ cd mlm
184
+ ```
185
+ By default, the dataset is set to `ModernBertDataset()` from `dataset.py`, and the model is set to `answerdotai/ModernBERT-base`.
186
+
187
+ To start training on a single GPU, simply run:
188
+ ```bash
189
+ python train.py
190
+ ```
191
+
192
+ For a multi-GPU setup, run:
193
+ ```bash
194
+ torchrun --nproc_per_node=8 train.py
195
+ ```
196
+
197
+ For evaluation, provide a list of Hugging Face model IDs along with the evaluation dataset. Below is an example format for the MLM task.
198
+ ```python
199
+ sentences = [
200
+ "The attacker gained access through a [MASK] vulnerability.",
201
+ "Users should always enable [MASK] authentication for better security.",
202
+ "The malicious [MASK] was detected by the intrusion detection system.",
203
+ "The ransomware encrypted all [MASK] on the server.",
204
+ "A strong [MASK] policy helps prevent brute-force attacks."
205
+ ]
206
+
207
+ ground_truths = ["software", "multi-factor", "payload", "files", "password"]
208
+
209
+ model_ids = [
210
+ "cisco-ai/SecureBERT2.0-base",
211
+ "answerdotai/ModernBERT-base",
212
+ "ehsanaghaei/SecureBERT",
213
+
214
+ ]
215
+
216
+ ```
217
+ Similar to training, simply run:
218
+ ```bash
219
+ python SecureBERT2_mlm_eval.py
220
+ ```
221
+
222
+ # Contribution
223
+
224
+ We welcome contributions to improve SecureBERT 2.0, including:
225
+ * New datasets and pretraining corpora
226
+ * Additional downstream cybersecurity tasks
227
+ * Model architecture enhancements
228
+ * Optimized evaluation pipelines
229
+
230
+ Please review CONTRIBUTING.md for guidelines.
231
+
research/securebert2/SECURITY.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Security Policies and Procedures
2
+
3
+ This document outlines security procedures and general policies for the
4
+ `securebert2` project.
5
+
6
+ - [Disclosing a security issue](#disclosing-a-security-issue)
7
+ - [Vulnerability management](#vulnerability-management)
8
+ - [Suggesting changes](#suggesting-changes)
9
+
10
+ ## Disclosing a security issue
11
+
12
+ The `securebert2` maintainers take all security issues in the project
13
+ seriously. Thank you for improving the security of `securebert2`. We
14
+ appreciate your dedication to responsible disclosure and will make every effort
15
+ to acknowledge your contributions.
16
+
17
+ `securebert2` leverages GitHub's private vulnerability reporting.
18
+
19
+ To learn more about this feature and how to submit a vulnerability report,
20
+ review [GitHub's documentation on private reporting](https://docs.github.com/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability).
21
+
22
+ Here are some helpful details to include in your report:
23
+
24
+ - a detailed description of the issue
25
+ - the steps required to reproduce the issue
26
+ - versions of the project that may be affected by the issue
27
+ - if known, any mitigations for the issue
28
+
29
+ A maintainer will acknowledge the report within three (3) business days, and
30
+ will send a more detailed response within an additional three (3) business days
31
+ indicating the next steps in handling your report.
32
+
33
+ If you've been unable to successfully draft a vulnerability report via GitHub
34
+ or have not received a response during the allotted response window, please
35
+ reach out via the [Cisco Open security contact email](mailto:oss-security@cisco.com).
36
+
37
+ After the initial reply to your report, the maintainers will endeavor to keep
38
+ you informed of the progress towards a fix and full announcement, and may ask
39
+ for additional information or guidance.
40
+
41
+ ## Vulnerability management
42
+
43
+ When the maintainers receive a disclosure report, they will assign it to a
44
+ primary handler.
45
+
46
+ This person will coordinate the fix and release process, which involves the
47
+ following steps:
48
+
49
+ - confirming the issue
50
+ - determining affected versions of the project
51
+ - auditing code to find any potential similar problems
52
+ - preparing fixes for all releases under maintenance
53
+
54
+ ## Suggesting changes
55
+
56
+ If you have suggestions on how this process could be improved, please submit an
57
+ issue or pull request.
research/securebert2/dataset.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Cisco Systems, Inc. and its affiliates
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from datasets import Dataset
6
+ import re
7
+ from transformers import AutoTokenizer
8
+ import json
9
+ import torch
10
+ import pandas as pd
11
+ from sentence_transformers import InputExample
12
+
13
class ModernBertDataset(Dataset):
    """Map-style pretraining dataset serving tokenized, markdown-stripped text.

    Rows are read from the ``text`` column of a parquet file and cleaned once
    at construction time with :meth:`clean_text`; tokenization happens lazily
    in :meth:`__getitem__`.
    """

    def __init__(self, parquet_path="./opensource_data/data_pretrain.parquet", n=None):
        """
        Args:
            parquet_path (str): Path to the single .parquet file. It must
                contain a column named "text".
            n (int, optional): If provided and smaller than the number of
                rows, randomly sample n rows (fixed seed 42) instead of
                using the whole file.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

        # Load the parquet file (must contain a column named "text")
        df = pd.read_parquet(parquet_path)

        # Sample n rows if requested; the fixed seed keeps the subset reproducible
        if n is not None and n < len(df):
            df = df.sample(n=n, random_state=42)

        # Clean the text eagerly so __getitem__ only has to tokenize
        self.txt_data = [self.clean_text(t) for t in df["text"].astype(str).tolist()]

    def clean_text(self, text):
        """Strip common markdown markup from ``text`` and normalize whitespace.

        Args:
            text (str): Raw document text.

        Returns:
            str: The text without fenced code blocks, heading markers,
            bold/italic markers, links and inline-code backticks, with runs
            of whitespace collapsed and the ends stripped.
        """
        # Fenced code blocks must be removed BEFORE inline code is unwrapped:
        # the inline-code pattern otherwise consumes the ``` delimiters pair by
        # pair, and the fenced-block pattern can never match afterwards.
        text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)  # code blocks -> remove
        text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)  # remove markdown headings
        text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)  # bold -> plain
        text = re.sub(r"\*(.*?)\*", r"\1", text)  # italic -> plain
        text = re.sub(r"\[.*?\]\(.*?\)", "", text)  # links -> remove
        text = re.sub(r"`([^`]*)`", r"\1", text)  # inline code -> plain
        text = re.sub(r"\n+", "\n", text)  # collapse multiple newlines
        text = re.sub(r"\s{2,}", " ", text)  # collapse extra spaces
        return text.strip()

    def __getitem__(self, idx):
        """Tokenize the cleaned text at ``idx``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` tensors, each padded or
            truncated to 1024 tokens with the batch dimension squeezed away.
        """
        curr_text = self.txt_data[idx]
        encoded = self.tokenizer(
            curr_text,
            padding="max_length",
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        )
        # return_tensors="pt" yields a leading batch axis of size 1; drop it so
        # that a DataLoader can add its own batch dimension.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        }

    def __len__(self):
        """Return the number of cleaned documents."""
        return len(self.txt_data)
59
+
60
class ContrastiveLearningDataset:
    """Training sentence pairs read from a parquet file.

    Each item is a ``(sentence1, sentence2)`` tuple taken from the columns of
    the same names.
    """

    def __init__(self, parquet_path="./opensource_data/data_sentence_pairs.parquet"):
        """Read the parquet file at ``parquet_path`` and store its row pairs."""
        frame = pd.read_parquet(parquet_path, engine="pyarrow")
        left_column = frame["sentence1"]
        right_column = frame["sentence2"]
        self.txt_data = list(zip(left_column, right_column))

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.txt_data)

    def __getitem__(self, idx):
        """Return the pair stored at ``idx`` (already a tuple)."""
        pair = self.txt_data[idx]
        return pair
70
+
71
class Eval_ContrastiveDataset:
    """Evaluation sentence pairs read from a parquet file.

    Each item is a ``(sentence1, sentence2)`` tuple taken from the columns of
    the same names.
    """

    def __init__(self, parquet_path="./opensource_data/data_sentence_pairs_test.parquet"):
        """Read the parquet file at ``parquet_path`` and store its row pairs."""
        frame = pd.read_parquet(parquet_path, engine="pyarrow")
        left_column = frame["sentence1"]
        right_column = frame["sentence2"]
        self.txt_data = list(zip(left_column, right_column))

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.txt_data)

    def __getitem__(self, idx):
        """Return the pair stored at ``idx``."""
        pair = self.txt_data[idx]
        return pair
81
+
82
class Mrr_ContrastiveLearningDataset:
    """Sentence pairs from a parquet file, served as ``InputExample`` objects."""

    def __init__(self, parquet_path="./opensource_data/data_sentence_pairs.parquet"):
        """Read the parquet file at ``parquet_path`` and store its row pairs."""
        frame = pd.read_parquet(parquet_path, engine="pyarrow")
        left_column = frame["sentence1"]
        right_column = frame["sentence2"]
        self.txt_data = list(zip(left_column, right_column))

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.txt_data)

    def __getitem__(self, idx):
        """Wrap the pair at ``idx`` in an ``InputExample`` with two texts."""
        pair = self.txt_data[idx]
        texts = [pair[0], pair[1]]
        return InputExample(texts=texts)
93
+
94
class NerDataset():
    """Token-classification dataset built from a JSON dump of words and tags.

    The JSON file must provide the keys ``txt_data`` (examples already split
    into words), ``ner_tags`` (one tag id per word) and ``num_labels``. All
    examples are tokenized once in the constructor; no padding is applied, so
    items have varying lengths.
    """

    def __init__(self, data_path="./opensource_data/data_NER_test.json", mode="train"):
        """
        Args:
            data_path (str): JSON file to load (see :meth:`load_data`).
            mode (str): Currently unused; kept so existing callers that pass
                it keep working.
        """
        # NOTE(review): the default path names the *test* split although
        # ``mode`` defaults to "train" -- confirm callers pass data_path.
        self.txt_data = list()
        self.ner_tags = list()
        self.load_data(data_path)
        assert len(self.ner_tags) == len(self.txt_data), (
            "txt_data and ner_tags must contain the same number of examples"
        )
        # Run tokenization step separately
        self.tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
        self.tokenized_inputs, self.labels = self.tokenize_and_align_labels()

    def tokenize_and_align_labels(self):
        """Tokenizes txt_data and aligns NER tags with subword tokens.

        Returns:
            tuple: ``(tokenized_inputs, labels)`` where ``labels`` holds one
            list per example. Only the first sub-token of each word carries
            the word's tag; special tokens and continuation sub-tokens get
            -100 (the default ``ignore_index`` of PyTorch's cross-entropy
            loss).
        """
        tokenized_inputs = self.tokenizer(
            self.txt_data,
            is_split_into_words=True,
            truncation=True,
            max_length=1024,
        )

        labels = []
        # Align labels with word pieces
        for i, ner_tags_for_example in enumerate(self.ner_tags):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            current_labels = []
            previous_word_idx = None
            for word_idx in word_ids:
                if word_idx is None:
                    # Special token: it maps to no source word
                    current_labels.append(-100)
                elif word_idx != previous_word_idx:
                    # First sub-token of a new word carries the word's tag
                    current_labels.append(ner_tags_for_example[word_idx])
                else:
                    # Continuation sub-token of the same word
                    current_labels.append(-100)
                previous_word_idx = word_idx
            labels.append(current_labels)

        return tokenized_inputs, labels

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as long tensors."""
        return {
            'input_ids': torch.tensor(self.tokenized_inputs['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.tokenized_inputs['attention_mask'][idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        """Return the number of examples."""
        return len(self.txt_data)

    def save_data(self, path, n=None):
        """Save txt_data, ner_tags, and label metadata to JSON file.

        Args:
            path (str): Destination file; it is overwritten.
            n (int, optional): Number of leading examples to save. ``None``
                saves everything; ``0`` saves an empty dataset.
        """
        # Compare against None explicitly: ``if not n`` treated n=0 as
        # "save everything" instead of "save nothing".
        if n is None:
            n = len(self.txt_data)
        with open(path, "w", encoding="utf-8") as f:
            json.dump({
                "txt_data": self.txt_data[:n],
                "ner_tags": self.ner_tags[:n],
                "num_labels": self.num_labels
            }, f)

    def load_data(self, path):
        """Load txt_data, ner_tags, and label metadata from JSON file.

        Raises:
            KeyError: If one of the three expected keys is missing.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.txt_data = data["txt_data"]
        self.ner_tags = data["ner_tags"]
        self.num_labels = data["num_labels"]
159
class SentimentVulnerabilityDataset():
    """Dataset of ``(code, label)`` pairs read from a parquet file.

    The file must provide ``code`` and ``label`` columns; rows are kept in
    file order in ``self.txt_data``.
    """

    def __init__(self, parquet_path="./opensource_data/data_vuln_dataset.parquet"):
        """Read ``parquet_path`` with the pyarrow engine and cache every row."""
        # Placeholder value; replaced below once the parquet file has been read.
        self.txt_data = []
        frame = pd.read_parquet(parquet_path, engine="pyarrow")
        rows = zip(frame["code"], frame["label"])
        self.txt_data = list(rows)

    def __getitem__(self, idx):
        """Return ``(code, label)`` for the row at ``idx``."""
        row = self.txt_data[idx]
        return (row[0], row[1])

    def __len__(self):
        """Return the number of rows loaded."""
        return len(self.txt_data)
169
+
170
class Eval_SentimentVulnerabilityDataset():
    """``(code, label)`` dataset whose default path points at the ``*_test`` parquet file.

    Rows come from the ``code`` and ``label`` columns and are stored in
    ``self.txt_data`` in file order.
    """

    def __init__(self, parquet_path="./opensource_data/data_vuln_dataset_test.parquet"):
        """Load every row of ``parquet_path`` (pyarrow engine) into memory."""
        # Placeholder value; replaced below once the parquet file has been read.
        self.txt_data = []
        table = pd.read_parquet(parquet_path, engine="pyarrow")
        self.txt_data = [*zip(table["code"], table["label"])]

    def __getitem__(self, idx):
        """Return ``(code, label)`` for the row at ``idx``."""
        entry = self.txt_data[idx]
        code = entry[0]
        label = entry[1]
        return code, label

    def __len__(self):
        """Return the number of rows loaded."""
        return len(self.txt_data)