"""Merge all LLM annotations into the enriched dataset.

Re-run anytime new ``llm_annotated_*.jsonl`` / ``llm_generated_*.jsonl`` files
appear in the processed-data directory.

Files written (all inside the data directory):

* ``enriched_13class_train.jsonl`` -- the aggregated 13-class training lines
  followed by every LLM record, unchanged.
* ``enriched_5class_train.jsonl`` -- the aggregated 5-class training lines
  followed by the LLM records with their span labels remapped through
  ``LABEL_MAP_5``; records left without any span are skipped.
* ``enriched_{13,5}class_{valid,test}.jsonl`` -- plain copies of the
  corresponding ``aggregated_*`` files.
"""
import json
from pathlib import Path
from collections import Counter
import shutil


DATA = Path("/home/ubuntu/alkyline/data/processed")

# The script treats everything before the first ": " of a span key as the
# label and everything after it as the rest of the key.
_KEY_SEP = ": "


def _find_llm_files(data_dir):
    """Return the annotated files, then the generated files, each group sorted."""
    annotated = sorted(data_dir.glob("llm_annotated_*.jsonl"))
    generated = sorted(data_dir.glob("llm_generated_*.jsonl"))
    return annotated + generated


LLM_FILES = _find_llm_files(DATA)


# 13-class label -> 5-class label. A value of None means the span is dropped
# from the 5-class dataset.
LABEL_MAP_5 = {
    "MALWARE": "Malware", "THREAT_ACTOR": None, "TOOL": None,
    "VULNERABILITY": "Vulnerability", "SYSTEM": "System", "ORGANIZATION": "Organization",
    "IP_ADDRESS": "Indicator", "DOMAIN": "Indicator", "URL": "Indicator",
    "HASH": "Indicator", "EMAIL": "Indicator", "CVE_ID": "Vulnerability", "FILEPATH": None,
}


def _read_lines(path):
    """Return the lines of *path* (line endings kept); the file is closed on return."""
    with open(path, encoding="utf-8") as handle:
        return list(handle)


def _count_lines(path):
    """Return the number of lines currently stored in *path*."""
    with open(path, encoding="utf-8") as handle:
        return sum(1 for _ in handle)


def _write_normalized(out, lines):
    """Write *lines* to *out*, each terminated by exactly one newline.

    Normalising matters when the source file lacks a final newline: without
    it the next record appended would share a line with the previous one.
    """
    out.writelines(line.rstrip("\n") + "\n" for line in lines)


def _load_llm_records(llm_files):
    """Read every LLM file and tally span counts per label.

    Blank lines are skipped instead of being handed to the JSON parser.
    Prints one ``<file name>: <n> examples`` line per file.

    Returns:
        ``(records, totals)`` where ``records`` is the list of stripped JSON
        lines in file order and ``totals`` is a ``Counter`` mapping each label
        to the total number of offsets found under it.
    """
    records = []
    totals = Counter()
    for path in llm_files:
        count = 0
        for raw in _read_lines(path):
            text = raw.strip()
            if not text:
                continue
            for key, offsets in json.loads(text)["spans"].items():
                totals[key.split(_KEY_SEP, 1)[0]] += len(offsets)
            records.append(text)
            count += 1
        print(f"{path.name}: {count} examples")
    return records, totals


def _remap_spans(spans, label_map):
    """Translate span keys to the labels in *label_map*.

    Spans whose label is absent from the map, or mapped to None, are dropped.
    Offsets of keys that collapse onto the same new key are concatenated.

    Raises:
        ValueError: if a key with a mapped label has no ``": "`` separator.
    """
    remapped = {}
    for key, offsets in spans.items():
        label, sep, rest = key.partition(_KEY_SEP)
        target = label_map.get(label)
        if not target:
            continue
        if not sep:
            raise ValueError(f"span key {key!r} has no {_KEY_SEP!r} separator")
        remapped.setdefault(f"{target}: {rest}", []).extend(offsets)
    return remapped


def main(data_dir=DATA, llm_files=None):
    """Build the enriched 13-class and 5-class datasets inside *data_dir*.

    Args:
        data_dir: directory holding the ``aggregated_*`` inputs; the
            ``enriched_*`` outputs are written there as well.
        llm_files: paths of the LLM JSONL files to merge. When None, they are
            discovered in *data_dir*.

    Returns:
        ``(n13, n5)``: line counts of the enriched 13-class and 5-class
        training files.
    """
    if llm_files is None:
        llm_files = _find_llm_files(data_dir)

    agg_13 = _read_lines(data_dir / "aggregated_13class_train.jsonl")
    llm_lines, totals = _load_llm_records(llm_files)

    print(f"\nTotal LLM: {len(llm_lines)} examples, {sum(totals.values())} spans")
    # most_common() sorts by descending count and keeps first-seen order on ties.
    for label, count in totals.most_common():
        print(f" {label}: {count}")

    train_13 = data_dir / "enriched_13class_train.jsonl"
    with open(train_13, "w", encoding="utf-8") as out:
        _write_normalized(out, agg_13)
        _write_normalized(out, llm_lines)

    train_5 = data_dir / "enriched_5class_train.jsonl"
    # Read the input before opening the output so a missing input does not
    # leave behind a truncated output file.
    agg_5 = _read_lines(data_dir / "aggregated_5class_train.jsonl")
    with open(train_5, "w", encoding="utf-8") as out:
        _write_normalized(out, agg_5)
        for line in llm_lines:
            rec = json.loads(line)
            rec["spans"] = _remap_spans(rec["spans"], LABEL_MAP_5)
            if rec["spans"]:
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")

    # Validation and test splits are copied over unchanged.
    for split in ("valid", "test"):
        for classes in ("13class", "5class"):
            shutil.copy(
                data_dir / f"aggregated_{classes}_{split}.jsonl",
                data_dir / f"enriched_{classes}_{split}.jsonl",
            )

    n13 = _count_lines(train_13)
    n5 = _count_lines(train_5)
    print(f"\nEnriched 13-class: {n13} | 5-class: {n5}")
    return n13, n5


if __name__ == "__main__":
    main(DATA, LLM_FILES)
|