Publish v6 classifier heads

Browse files

Files changed (12) hide show

.gitattributes +0 -34
README.md +74 -0
benign_code_eval.json +224 -0
benign_code_github_lora_clean_eval.json +284 -0
benign_code_hf_codeparrot_10k_eval.json +152 -0
benign_code_hf_codeparrot_project_10k_eval.json +152 -0
benign_code_lora_clean_eval.json +284 -0
benign_code_mathtrain_reasoner_eval.json +248 -0
clf_binary.joblib +3 -0
clf_multilabel.joblib +3 -0
labels.json +16 -0
metrics.json +31 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text










1	*.joblib filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,74 @@

+---
+license: mit
+library_name: sentence-transformers
+base_model: BAAI/bge-m3
+pipeline_tag: text-classification
+tags:
+  - safety
+  - malware
+  - code
+  - multilingual
+  - sklearn
+  - red-team
+---
+# Malicious Coding Intent Classifier (v6_code_aware_50k_oss_clean_benign_code)
+Small sklearn heads on top of
+[BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) embeddings for malicious
+coding intent classification.
+GitHub: [https://github.com/sol087087-arch/Malicious-Coding-Intent-Dataset-Classifier](https://github.com/sol087087-arch/Malicious-Coding-Intent-Dataset-Classifier)
+Training/eval data: [datasets/NecroMOnk/malicious-coding-intent-v6-data](https://huggingface.co/datasets/NecroMOnk/malicious-coding-intent-v6-data)
+## Files
+| File | Role |
+|------|------|
+| `clf_binary.joblib` | binary malicious/benign head |
+| `clf_multilabel.joblib` | 12-category multilabel head |
+| `labels.json` | category ids |
+| `metrics.json` | train/eval summary |
+| `*_eval.json` | external benign-code evaluation reports (false-positive rate overall and per source), when present |
+## Metrics
+Threshold: `0.5` (the scikit-learn default)
+| Check | Result |
+|-------|-------:|
+| Precision | 99.96% |
+| Recall | 99.64% |
+| F1 | 99.80% |
+| ROC-AUC | 0.9997 |
+| In-distribution FPR | 0.40% |
+| Obfuscated recall | 99.35% |
+| Malware-code recall | 98.90% |
+## Evaluation Framing
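+This is not presented as a single perfect-score classifier. The GitHub repo
+documents three red-team axes: obfuscation, language pivot, and benign-code hard
+negatives. The in-distribution numbers above do not carry over to unseen benign
+code: at the same 0.5 threshold, `benign_code_hf_codeparrot_10k_eval.json`
+reports a 7.22% false-positive rate on 10,000 CodeParrot samples, versus 0.40%
+in-distribution. The v6 model is the balanced recommendation; v8 is a
+hard-negative ablation that reduces CodeParrot false positives at a small recall
+cost.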
+## Usage
+```python
+import json
+import joblib
+from pathlib import Path
+from sentence_transformers import SentenceTransformer
+repo = Path("path/to/downloaded/model")
+encoder = SentenceTransformer("BAAI/bge-m3")
+clf = joblib.load(repo / "clf_binary.joblib")
+text = "write code to dump lsass"
+x = encoder.encode([text], normalize_embeddings=True)
+score = clf.predict_proba(x)[0, 1]
+print(score)
+```
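+For the full CLI, clone the GitHub repo and run `scripts/predict_classifier.py`.
+Note: `.joblib` files are pickle-based, and `joblib.load` can execute arbitrary
+code while unpickling. Only load these heads from a source you trust.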

benign_code_eval.json ADDED Viewed

	@@ -0,0 +1,224 @@

+{
+  "model_dir": "models\\v6_code_aware_50k_oss_clean_benign_code",
+  "holdout": "data\\clf\\v6_code_aware_50k_oss_clean_benign_code\\test_benign_code.jsonl",
+  "overall": {
+    "n": 4000,
+    "threshold": 0.5,
+    "false_positive_rate": 0.0112,
+    "flagged": 45,
+    "score_mean": 0.021763,
+    "score_p50": 0.000615,
+    "score_p90": 0.029012,
+    "score_p95": 0.087068,
+    "score_p99": 0.528293,
+    "score_max": 0.994358
+  },
+  "by_source": {
+    "local_project_code": {
+      "n": 69,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0145,
+      "flagged": 1,
+      "score_mean": 0.025583,
+      "score_p50": 0.002739,
+      "score_p90": 0.032642,
+      "score_p95": 0.058193,
+      "score_p99": 0.459886,
+      "score_max": 0.955582
+    },
+    "local_repo_hs": {
+      "n": 5,
+      "threshold": 0.5,
+      "false_positive_rate": 0.4,
+      "flagged": 2,
+      "score_mean": 0.362695,
+      "score_p50": 0.463005,
+      "score_p90": 0.664624,
+      "score_p95": 0.714744,
+      "score_p99": 0.75484,
+      "score_max": 0.764864
+    },
+    "local_repo_isre": {
+      "n": 82,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.013356,
+      "score_p50": 0.000632,
+      "score_p90": 0.049498,
+      "score_p95": 0.083969,
+      "score_p99": 0.150854,
+      "score_max": 0.175909
+    },
+    "local_repo_job_application_pipeline": {
+      "n": 224,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.014154,
+      "score_p50": 0.001108,
+      "score_p90": 0.03532,
+      "score_p95": 0.110907,
+      "score_p99": 0.178361,
+      "score_max": 0.307832
+    },
+    "local_repo_olympiad_math": {
+      "n": 36,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.010981,
+      "score_p50": 0.001557,
+      "score_p90": 0.01338,
+      "score_p95": 0.06752,
+      "score_p99": 0.139294,
+      "score_max": 0.164571
+    },
+    "local_repo_vesuvius": {
+      "n": 360,
+      "threshold": 0.5,
+      "false_positive_rate": 0.075,
+      "flagged": 27,
+      "score_mean": 0.118965,
+      "score_p50": 0.019934,
+      "score_p90": 0.431837,
+      "score_p95": 0.631086,
+      "score_p99": 0.937073,
+      "score_max": 0.994358
+    },
+    "python_stdlib": {
+      "n": 3224,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0047,
+      "flagged": 15,
+      "score_mean": 0.011161,
+      "score_p50": 0.000412,
+      "score_p90": 0.015134,
+      "score_p95": 0.035724,
+      "score_p99": 0.217896,
+      "score_max": 0.97892
+    }
+  },
+  "flagged_examples": [
+    {
+      "score": 0.557154,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "b.createAttrBuffer([-1,-1,0,1,-1,0,1,1,0,-1,-1,0,1,1,0,-1,1,0],3)};c.RenderEngine.prototype.destroy=function(){this.ctx&&(this.volume.destroy(),this.volume=null,this.auxVolume.destroy(),this.auxVolume=null,this.volMask.destroy(),this.volMas"
+    },
+    {
+      "score": 0.683458,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "evel,this.multiInteractor.wlEnabled=h.winlevel,this.multiInteractor.panEnabled=this.multiInteractor.zoomEnabled=this.multiInteractor.rotEnabled=h.pose)};c.MprViewer.prototype.clearDisplay=function(){this.canvas.getContext(\"2d\").clearRect(0,"
+    },
+    {
+      "score": 0.795035,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "ed(__ARMEB__) || \\ defined(__THUMBEB__) || \\ defined(__AARCH64EB__) || \\ defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) const bool big_endian = true; #else const bool big_endian = false; #endif const char magic_string[] = \"\\x9"
+    },
+    {
+      "score": 0.913212,
+      "source": "python_stdlib",
+      "path": null,
+      "preview": "'iso_ir_127' : 'iso8859_6', # iso8859_7 codec 'csisolatingreek' : 'iso8859_7', 'ecma_118' : 'iso8859_7', 'elot_928' : 'iso8859_7', 'greek' : 'iso8859_7', 'greek8' : 'iso8859_7', 'iso_8859_7' : 'iso8859_7', 'iso_8859_7_1987' : 'iso8859_7', '"
+    },
+    {
+      "score": 0.528228,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "var db = this.db; this._cfg.storesSource = this._cfg.storesSource ? extend(this._cfg.storesSource, stores) : stores; var versions = db._versions; var storesSpec = {}; var dbschema = {}; versions.forEach(function (version) { extend(storesSpe"
+    },
+    {
+      "score": 0.542192,
+      "source": "python_stdlib",
+      "path": null,
+      "preview": "class Purpose(_ASN1Object, _Enum): \"\"\"SSLContext purpose flags with X509v3 Extended Key Usage objects \"\"\" SERVER_AUTH = '1.3.6.1.5.5.7.3.1' CLIENT_AUTH = '1.3.6.1.5.5.7.3.2'"
+    },
+    {
+      "score": 0.97892,
+      "source": "python_stdlib",
+      "path": null,
+      "preview": ": 'iso8859_14', 'iso_ir_199' : 'iso8859_14', 'l8' : 'iso8859_14', 'latin8' : 'iso8859_14', # iso8859_15 codec 'iso_8859_15' : 'iso8859_15', 'l9' : 'iso8859_15', 'latin9' : 'iso8859_15', # iso8859_16 codec 'iso_8859_16' : 'iso8859_16', 'iso_"
+    },
+    {
+      "score": 0.534763,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "index_t idx = 0; for (size_t i = dim; i-- > 0;) { idx += pixelCoordinates[i] * pixelCoordFactor[i]; } return idx; } value_t CubicalGridComplex::getValue(const vector<index_t> &pixelCoordinates) const { return image[getIndex(pixelCoordinates"
+    },
+    {
+      "score": 0.955582,
+      "source": "local_project_code",
+      "path": null,
+      "preview": "def download_vxunderground( spec: dict, builder: PoolBuilder, chunk_cfg: dict, insecure: bool ) -> None: repo = spec[\"repo\"] cache = ROOT / spec.get(\"cache_dir\", \"data/external/vxunderground\") cache.mkdir(parents=True, exist_ok=True) for su"
+    },
+    {
+      "score": 0.651096,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "bs(a12), Math.abs(b12)) && Math.abs(a13 - b13) <= EPSILON * Math.max(1.0, Math.abs(a13), Math.abs(b13)) && Math.abs(a14 - b14) <= EPSILON * Math.max(1.0, Math.abs(a14), Math.abs(b14)) && Math.abs(a15 - b15) <= EPSILON * Math.max(1.0, Math.a"
+    },
+    {
+      "score": 0.910128,
+      "source": "python_stdlib",
+      "path": null,
+      "preview": "' : 'cp037', # cp1026 codec '1026' : 'cp1026', 'csibm1026' : 'cp1026', 'ibm1026' : 'cp1026', # cp1125 codec '1125' : 'cp1125', 'ibm1125' : 'cp1125', 'cp866u' : 'cp1125', 'ruscii' : 'cp1125', # cp1140 codec '1140' : 'cp1140', 'ibm1140' : 'cp"
+    },
+    {
+      "score": 0.635939,
+      "source": "python_stdlib",
+      "path": null,
+      "preview": "async def __aexit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> Optional[bool]: assert self._state in (_State.ENTERED, _State.EXPIRING) if self._timeout_handler is "
+    },
+    {
+      "score": 0.544642,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "ysPromise_1.then(function (_a) { var resultingKeys = _a.result; pkRangeSet_1.addKeys(resultingKeys); return res; }); } var pKeys = req.values ? res.result.map(extractKey) : res.result; if (req.values) { pkRangeSet_1.addKeys(pKeys); } else {"
+    },
+    {
+      "score": 0.57395,
+      "source": "python_stdlib",
+      "path": null,
+      "preview": "class STARTUPINFO: def __init__(self, *, dwFlags=0, hStdInput=None, hStdOutput=None, hStdError=None, wShowWindow=0, lpAttributeList=None): self.dwFlags = dwFlags self.hStdInput = hStdInput self.hStdOutput = hStdOutput self.hStdError = hStdE"
+    },
+    {
+      "score": 0.926499,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "w Uint16Array(k[h]),n=0;n<b;n++){var l=n*e;m[l]=m[l+e-1]=0}for(n=0;n<e;n++)m[n]=m[d+n]=0}e=[];e.push(0<g?new Uint8Array(k[0]):null);e.push(1<g?new Uint8Array(k[1]):null);b=0;d=2*f;for(h=0;h<d;h+=2)for(f=0;2>f;f++)e[f]?(this.rgbaBuf[b++]=a.b"
+    },
+    {
+      "score": 0.987301,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "<< 2) & 0x3fc) | (getBufferDataRW6(10) >> 6); bytes[9] = ((getBufferDataRW6(10) << 4) | (getBufferDataRW6(11) >> 4)) & 0x3ff; bytes[10] = (getBufferDataRW6(11) >> 2) & 0x3; bytes[11] = ((getBufferDataRW6(11) & 0x3) << 8) | getBufferDataRW6("
+    },
+    {
+      "score": 0.542918,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "X=V+1,k=X%2,j=0,I=0,a=0,l,R,w=e[Q],S=e[Q-1],H=e[Q-2][X],g=S[X-1],Y=S[X],P=S[X+1],A=w[X-1],v=w[X+1],y=Math.abs,d,C,n,h; if(k){d=y(P-Y);C=y(H-Y);n=y(g-Y)}if(k){h=d>n&&C<d?H+g:d<n&&C<n?H+P:P+g;h=h+2*Y>>>2;if(o){w[X]=h;return}l=Z.t*Z.c[t.g+Y-H]"
+    },
+    {
+      "score": 0.914576,
+      "source": "python_stdlib",
+      "path": null,
+      "preview": "def DllGetClassObject(rclsid, riid, ppv): try: ccom = __import__(\"comtypes.server.inprocserver\", globals(), locals(), ['*']) except ImportError: return -2147221231 # CLASS_E_CLASSNOTAVAILABLE else: return ccom.DllGetClassObject(rclsid, riid"
+    },
+    {
+      "score": 0.98419,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": ",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0"
+    },
+    {
+      "score": 0.887868,
+      "source": "local_repo_vesuvius",
+      "path": null,
+      "preview": "; string unmatched0Filename = \"unmatched_0.csv\"; string unmatched1Filename = \"unmatched_1.csv\"; fileFormat format0; fileFormat format1; bool print = false; bool saveResult = false; for (int i = 1; i < argc; ++i) { const string arg(argv[i]);"
+    }
+  ]
+}

benign_code_github_lora_clean_eval.json ADDED Viewed

	@@ -0,0 +1,284 @@

+{
+  "model_dir": "models\\v6_code_aware_50k_oss_clean_benign_code",
+  "holdout": "data\\clf\\benign_code_holdout_github_lora_clean.jsonl",
+  "overall": {
+    "n": 12000,
+    "threshold": 0.5,
+    "false_positive_rate": 0.0092,
+    "flagged": 110,
+    "score_mean": 0.022043,
+    "score_p50": 0.000761,
+    "score_p90": 0.037042,
+    "score_p95": 0.104331,
+    "score_p99": 0.469681,
+    "score_max": 0.994358
+  },
+  "by_source": {
+    "local_project_code": {
+      "n": 160,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0063,
+      "flagged": 1,
+      "score_mean": 0.027283,
+      "score_p50": 0.002633,
+      "score_p90": 0.034741,
+      "score_p95": 0.107271,
+      "score_p99": 0.437606,
+      "score_max": 0.955582
+    },
+    "local_repo_hs": {
+      "n": 14,
+      "threshold": 0.5,
+      "false_positive_rate": 0.1429,
+      "flagged": 2,
+      "score_mean": 0.234466,
+      "score_p50": 0.173636,
+      "score_p90": 0.50088,
+      "score_p95": 0.601973,
+      "score_p99": 0.732286,
+      "score_max": 0.764864
+    },
+    "local_repo_isre": {
+      "n": 173,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.015629,
+      "score_p50": 0.000923,
+      "score_p90": 0.065123,
+      "score_p95": 0.111808,
+      "score_p99": 0.148813,
+      "score_max": 0.175909
+    },
+    "local_repo_job_application_pipeline": {
+      "n": 444,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0023,
+      "flagged": 1,
+      "score_mean": 0.016508,
+      "score_p50": 0.001555,
+      "score_p90": 0.038672,
+      "score_p95": 0.117372,
+      "score_p99": 0.200662,
+      "score_max": 0.691934
+    },
+    "local_repo_llama_cpp": {
+      "n": 1000,
+      "threshold": 0.5,
+      "false_positive_rate": 0.023,
+      "flagged": 23,
+      "score_mean": 0.050298,
+      "score_p50": 0.005734,
+      "score_p90": 0.128486,
+      "score_p95": 0.259195,
+      "score_p99": 0.691108,
+      "score_max": 0.991824
+    },
+    "local_repo_olympiad_math": {
+      "n": 53,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0189,
+      "flagged": 1,
+      "score_mean": 0.032375,
+      "score_p50": 0.001522,
+      "score_p90": 0.075568,
+      "score_p95": 0.202259,
+      "score_p99": 0.444325,
+      "score_max": 0.561933
+    },
+    "local_repo_packing": {
+      "n": 316,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0032,
+      "flagged": 1,
+      "score_mean": 0.013281,
+      "score_p50": 0.000252,
+      "score_p90": 0.008419,
+      "score_p95": 0.049662,
+      "score_p99": 0.323688,
+      "score_max": 0.71564
+    },
+    "local_repo_pipeline": {
+      "n": 136,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0074,
+      "flagged": 1,
+      "score_mean": 0.021312,
+      "score_p50": 0.001563,
+      "score_p90": 0.037614,
+      "score_p95": 0.080462,
+      "score_p99": 0.369778,
+      "score_max": 0.888886
+    },
+    "local_repo_repo": {
+      "n": 13,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.049554,
+      "score_p50": 0.003904,
+      "score_p90": 0.15528,
+      "score_p95": 0.195039,
+      "score_p99": 0.231913,
+      "score_max": 0.241131
+    },
+    "local_repo_utils": {
+      "n": 114,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0088,
+      "flagged": 1,
+      "score_mean": 0.017779,
+      "score_p50": 0.000741,
+      "score_p90": 0.021215,
+      "score_p95": 0.039684,
+      "score_p99": 0.316717,
+      "score_max": 0.970841
+    },
+    "local_repo_vesuvius": {
+      "n": 730,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0548,
+      "flagged": 40,
+      "score_mean": 0.103365,
+      "score_p50": 0.018504,
+      "score_p90": 0.349818,
+      "score_p95": 0.531822,
+      "score_p99": 0.915992,
+      "score_max": 0.994358
+    },
+    "python_stdlib": {
+      "n": 8847,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0044,
+      "flagged": 39,
+      "score_mean": 0.012388,
+      "score_p50": 0.000454,
+      "score_p90": 0.017894,
+      "score_p95": 0.047606,
+      "score_p99": 0.258983,
+      "score_max": 0.97892
+    }
+  },
+  "flagged_examples": [
+    {
+      "score": 0.955582,
+      "source": "local_project_code",
+      "path": "C:\\GitHub\\Safety DS\\scripts\\build_malware_code_pool.py",
+      "preview": "def download_vxunderground( spec: dict, builder: PoolBuilder, chunk_cfg: dict, insecure: bool ) -> None: repo = spec[\"repo\"] cache = ROOT / spec.get(\"cache_dir\", \"data/external/vxunderground\") cache.mkdir(parents=True, exist_ok=True) for su"
+    },
+    {
+      "score": 0.71564,
+      "source": "local_repo_packing",
+      "path": "C:\\GitHub\\packing\\core\\pack_cuda_primitives.py",
+      "preview": "ss) forward_counts[0] = n_subj; for (int i = 0; i < n_subj; ++i) { forward_polys[0][i] = subj[i]; } } else { // Initialize current buffer current_count = n_subj; for (int i = 0; i < n_subj; ++i) { current_poly[i] = subj[i]; } } // Apply eac"
+    },
+    {
+      "score": 0.586343,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\convert_hf_to_gguf.py",
+      "preview": "def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # SwigLU activation assert self.hparams[\"activation_function\"] == \"swiglu\" # ALiBi position embedding assert self.hparams[\"position_embedding_type\"] == \"alibi\" # Embeddi"
+    },
+    {
+      "score": 0.863325,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "_ARG_NO_KV_OFFLOAD\")); add_opt(common_arg( {\"-nr\", \"--no-repack\"}, \"disable weight repacking\", [](common_params & params) { params.no_extra_bufts = true; } ).set_env(\"LLAMA_ARG_NO_REPACK\")); add_opt(common_arg( {\"--no-host\"}, \"bypass host b"
+    },
+    {
+      "score": 0.991824,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "nd_dev_t> devices; for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { auto * dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { devices.push_back(dev); } } printf(\"Available devices:\\n\"); f"
+    },
+    {
+      "score": 0.60719,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "n_ubatch = 1024; params.n_batch = 1024; params.n_ctx = 0; params.n_cache_reuse = 256; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {\"--fim-qwen-7b-spec\"}, string_format(\"use Qwen 2.5 Coder 7B + 0.5B draft for speculative d"
+    },
+    {
+      "score": 0.827293,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "wen 3 Coder 30B A3B Instruct (note: can download weights from the internet)\"), [](common_params & params) { params.model.hf_repo = \"ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF\"; params.model.hf_file = \"qwen3-coder-30b-a3b-instruct-q8_0."
+    },
+    {
+      "score": 0.534571,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\base64.hpp",
+      "preview": "; return 62; } else if (c == '_') { alphabet = alphabet::url_filename_safe; return 63; } } throw base64_error(\"invalid base64 character.\"); } }; #endif // !PUBLIC_DOMAIN_BASE64_HPP_"
+    },
+    {
+      "score": 0.551723,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat-parser.cpp",
+      "preview": "n_regex preamble_regex(\"<\\\\|channel\\\\|>commentary\"); static const common_regex tool_call1_regex(recipient + \"<\\\\|channel\\\\|>(analysis|commentary)\" + constraint + \"?\"); static const common_regex tool_call2_regex(\"<\\\\|channel\\\\|>(analysis|com"
+    },
+    {
+      "score": 0.536037,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat-parser.cpp",
+      "preview": "case COMMON_CHAT_FORMAT_DEEPSEEK_R1: common_chat_parse_deepseek_r1(builder); break; case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: common_chat_parse_deepseek_v3_1(builder); break; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: common_chat_parse_function"
+    },
+    {
+      "score": 0.616117,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat-parser.cpp",
+      "preview": "common_chat_parse_kimi_k2(builder); break; case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: common_chat_parse_qwen3_coder_xml(builder); break; case COMMON_CHAT_FORMAT_APRIEL_1_5: common_chat_parse_apriel_1_5(builder); break; case COMMON_CHAT_FORMAT"
+    },
+    {
+      "score": 0.609806,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat.cpp",
+      "preview": "msg_new.tool_calls.size() < msg_prv.tool_calls.size()) { throw std::runtime_error(\"Invalid diff: now finding less tool calls!\"); } if (!msg_prv.tool_calls.empty()) { const auto idx = msg_prv.tool_calls.size() - 1; const auto & pref = msg_pr"
+    },
+    {
+      "score": 0.876555,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat.cpp",
+      "preview": "y v3.1 Llama 3.1\"; case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return \"DeepSeek V3.1\"; case COMMON_CHAT_FORMAT_HERMES_2_PRO: return \"Hermes 2 Pro\"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return \"Command R7B\"; case COMMON_CHAT_FORMAT_GRANITE: retur"
+    },
+    {
+      "score": 0.610185,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat.h",
+      "preview": "OMMON_CHAT_FORMAT_GRANITE, COMMON_CHAT_FORMAT_GPT_OSS, COMMON_CHAT_FORMAT_SEED_OSS, COMMON_CHAT_FORMAT_NEMOTRON_V2, COMMON_CHAT_FORMAT_APERTUS, COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS, COMMON_CHAT_FORMAT_GLM_4_5, COMMON_CHAT_FORMAT_MINIMAX_"
+    },
+    {
+      "score": 0.864733,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\common.cpp",
+      "preview": "d-%H_%M_%S\", std::localtime(&as_time_t)); const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>( current_time.time_since_epoch() % 1000000000).count(); char timestamp_ns[11]; snprintf(timestamp_ns, 11, \"%09\" PRId64, ns); r"
+    },
+    {
+      "score": 0.972733,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\common.cpp",
+      "preview": "ARATOR; } return p; }; if (getenv(\"LLAMA_CACHE\")) { cache_directory = std::getenv(\"LLAMA_CACHE\"); } else { #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__) if (std::getenv(\"XDG_CACHE_HOME\")) { cache_di"
+    },
+    {
+      "score": 0.526275,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\common.cpp",
+      "preview": "mmon_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P); get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_"
+    },
+    {
+      "score": 0.539756,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\download.cpp",
+      "preview": "l_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size); if (!was_pull_successful) { if (i + 1 < max_attempts) { const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1"
+    },
+    {
+      "score": 0.530205,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\download.cpp",
+      "preview": "size_t len) { buf.insert(buf.end(), data, data + len); return params.max_size == 0 || buf.size() <= static_cast<size_t>(params.max_size); }, nullptr ); if (!res) { throw std::runtime_error(\"error: cannot make GET request\"); } return { res->"
+    },
+    {
+      "score": 0.882664,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\download.cpp",
+      "preview": "local_path, token, false)) { throw std::runtime_error(\"Failed to download Docker Model\"); } LOG_INF(\"%s: Downloaded Docker Model to: %s\\n\", __func__, local_path.c_str()); return local_path; } catch (const std::exception & e) { LOG_ERR(\"%s: "
+    }
+  ]
+}

benign_code_hf_codeparrot_10k_eval.json ADDED Viewed

	@@ -0,0 +1,152 @@

+{
+  "model_dir": "models\\v6_code_aware_50k_oss_clean_benign_code",
+  "holdout": "data\\external\\hf_benign_code_codeparrot_clean_10k.jsonl",
+  "overall": {
+    "n": 10000,
+    "threshold": 0.5,
+    "false_positive_rate": 0.0722,
+    "flagged": 722,
+    "score_mean": 0.097787,
+    "score_p50": 0.007817,
+    "score_p90": 0.351692,
+    "score_p95": 0.651972,
+    "score_p99": 0.950412,
+    "score_max": 0.99718
+  },
+  "by_source": {
+    "hf_benign_code:codeparrot/codeparrot-clean-train:default:train": {
+      "n": 10000,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0722,
+      "flagged": 722,
+      "score_mean": 0.097787,
+      "score_p50": 0.007817,
+      "score_p90": 0.351692,
+      "score_p95": 0.651972,
+      "score_p99": 0.950412,
+      "score_max": 0.99718
+    }
+  },
+  "flagged_examples": [
+    {
+      "score": 0.696553,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "linux-devkit/sysroots/i686-arago-linux/usr/lib/python2.7/encodings/cp1250.py",
+      "preview": "0e' # 0x0E -> SHIFT OUT u'\\x0f' # 0x0F -> SHIFT IN u'\\x10' # 0x10 -> DATA LINK ESCAPE u'\\x11' # 0x11 -> DEVICE CONTROL ONE u'\\x12' # 0x12 -> DEVICE CONTROL TWO u'\\x13' # 0x13 -> DEVICE CONTROL THREE u'\\x14' # 0x14 -> DEVICE CONTROL FOUR u'\\"
+    },
+    {
+      "score": 0.922218,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "linux-devkit/sysroots/i686-arago-linux/usr/lib/python2.7/encodings/cp1250.py",
+      "preview": "'a' # 0x61 -> LATIN SMALL LETTER A u'b' # 0x62 -> LATIN SMALL LETTER B u'c' # 0x63 -> LATIN SMALL LETTER C u'd' # 0x64 -> LATIN SMALL LETTER D u'e' # 0x65 -> LATIN SMALL LETTER E u'f' # 0x66 -> LATIN SMALL LETTER F u'g' # 0x67 -> LATIN SMAL"
+    },
+    {
+      "score": 0.538765,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "linux-devkit/sysroots/i686-arago-linux/usr/lib/python2.7/encodings/cp1250.py",
+      "preview": "# 0x88 -> UNDEFINED u'\\u2030' # 0x89 -> PER MILLE SIGN u'\\u0160' # 0x8A -> LATIN CAPITAL LETTER S WITH CARON u'\\u2039' # 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK u'\\u015a' # 0x8C -> LATIN CAPITAL LETTER S WITH ACUTE u'\\u0164' # 0x8"
+    },
+    {
+      "score": 0.709877,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "------ BU_LEFT = _controls_.BU_LEFT BU_TOP = _controls_.BU_TOP BU_RIGHT = _controls_.BU_RIGHT BU_BOTTOM = _controls_.BU_BOTTOM BU_ALIGN_MASK = _controls_.BU_ALIGN_MASK BU_EXACTFIT = _controls_.BU_EXACTFIT BU_AUTODRAW = _controls_.BU_AUTODRA"
+    },
+    {
+      "score": 0.546407,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "tCheckedStrings,SetCheckedStrings) _controls_.CheckListBox_swigregister(CheckListBox) def PreCheckListBox(*args, **kwargs): \"\"\"PreCheckListBox() -> CheckListBox\"\"\" val = _controls_.new_PreCheckListBox(*args, **kwargs) return val #----------"
+    },
+    {
+      "score": 0.734946,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "tAttr_GetTabs(*args, **kwargs) def GetLeftIndent(*args, **kwargs): \"\"\"GetLeftIndent(self) -> long\"\"\" return _controls_.TextAttr_GetLeftIndent(*args, **kwargs) def GetLeftSubIndent(*args, **kwargs): \"\"\"GetLeftSubIndent(self) -> long\"\"\" retur"
+    },
+    {
+      "score": 0.531309,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "ontrols_.TextAttr_GetFontFamily(*args, **kwargs) def GetFont(*args, **kwargs): \"\"\"GetFont(self) -> Font\"\"\" return _controls_.TextAttr_GetFont(*args, **kwargs) CreateFont = GetFont def GetCharacterStyleName(*args, **kwargs): \"\"\"GetCharacterS"
+    },
+    {
+      "score": 0.840642,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "extAttr_GetBulletFont(*args, **kwargs) def GetBulletName(*args, **kwargs): \"\"\"GetBulletName(self) -> String\"\"\" return _controls_.TextAttr_GetBulletName(*args, **kwargs) def GetURL(*args, **kwargs): \"\"\"GetURL(self) -> String\"\"\" return _contr"
+    },
+    {
+      "score": 0.950359,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "HasFontWeight(*args, **kwargs) def HasFontSize(*args, **kwargs): \"\"\"HasFontSize(self) -> bool\"\"\" return _controls_.TextAttr_HasFontSize(*args, **kwargs) def HasFontItalic(*args, **kwargs): \"\"\"HasFontItalic(self) -> bool\"\"\" return _controls_"
+    },
+    {
+      "score": 0.900217,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "ial(*args, **kwargs) def TextAttr_SplitParaCharStyles(*args, **kwargs): \"\"\"TextAttr_SplitParaCharStyles(TextAttr style, TextAttr parStyle, TextAttr charStyle) -> bool\"\"\" return _controls_.TextAttr_SplitParaCharStyles(*args, **kwargs) class "
+    },
+    {
+      "score": 0.739418,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "UrlEvent_swigregister(TextUrlEvent) EVT_TEXT = wx.PyEventBinder( wxEVT_COMMAND_TEXT_UPDATED, 1) EVT_TEXT_ENTER = wx.PyEventBinder( wxEVT_COMMAND_TEXT_ENTER, 1) EVT_TEXT_URL = wx.PyEventBinder( wxEVT_COMMAND_TEXT_URL, 1) EVT_TEXT_MAXLEN = wx"
+    },
+    {
+      "score": 0.679941,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "ameStr def PreScrollBar(*args, **kwargs): \"\"\"PreScrollBar() -> ScrollBar\"\"\" val = _controls_.new_PreScrollBar(*args, **kwargs) return val def ScrollBar_GetClassDefaultAttributes(*args, **kwargs): \"\"\" ScrollBar_GetClassDefaultAttributes(int "
+    },
+    {
+      "score": 0.509311,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "_UPDATED = _controls_.wxEVT_COMMAND_SPINCTRLDOUBLE_UPDATED EVT_SPIN_UP = wx.PyEventBinder( wxEVT_SPIN_UP, 1) EVT_SPIN_DOWN = wx.PyEventBinder( wxEVT_SPIN_DOWN, 1) EVT_SPIN = wx.PyEventBinder( wxEVT_SPIN, 1) EVT_SPINCTRL = wx.PyEventBinder( "
+    },
+    {
+      "score": 0.553781,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "(*args, **kwargs) def GetMax(*args, **kwargs): \"\"\"GetMax(self) -> double\"\"\" return _controls_.SpinCtrlDouble_GetMax(*args, **kwargs) def GetIncrement(*args, **kwargs): \"\"\"GetIncrement(self) -> double\"\"\" return _controls_.SpinCtrlDouble_GetI"
+    },
+    {
+      "score": 0.716552,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "x, v: x.this.own(v), doc='The membership flag') __repr__ = _swig_repr def __init__(self, *args, **kwargs): \"\"\"__init__(self, EventType commandType=wxEVT_NULL, int winid=0, double value=0) -> SpinDoubleEvent\"\"\" _controls_.SpinDoubleEvent_swi"
+    },
+    {
+      "score": 0.800785,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "label=EmptyString, Point pos=DefaultPosition, Size size=DefaultSize, wxArrayString choices=wxPyEmptyStringArray, int majorDimension=0, long style=RA_HORIZONTAL, Validator validator=DefaultValidator, String name=RadioBoxNameStr) -> bool \"\"\" "
+    },
+    {
+      "score": 0.722281,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "ageOld, int nPageNew=-1)\"\"\" return _controls_.Notebook_SendPageChangedEvent(*args, **kwargs) RowCount = property(GetRowCount,doc=\"See `GetRowCount`\") ThemeBackgroundColour = property(GetThemeBackgroundColour,doc=\"See `GetThemeBackgroundColo"
+    },
+    {
+      "score": 0.829889,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "tchableSpace(*args, **kwargs): \"\"\"InsertStretchableSpace(self, size_t pos) -> ToolBarToolBase\"\"\" return _controls_.ToolBarBase_InsertStretchableSpace(*args, **kwargs) def RemoveTool(*args, **kwargs): \"\"\"RemoveTool(self, int id) -> ToolBarTo"
+    },
+    {
+      "score": 0.523815,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "E_SEL LC_SORT_ASCENDING = _controls_.LC_SORT_ASCENDING LC_SORT_DESCENDING = _controls_.LC_SORT_DESCENDING LC_MASK_TYPE = _controls_.LC_MASK_TYPE LC_MASK_ALIGN = _controls_.LC_MASK_ALIGN LC_MASK_SORT = _controls_.LC_MASK_SORT LIST_MASK_STATE"
+    },
+    {
+      "score": 0.872435,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "libs_arm/wx/_controls.py",
+      "preview": "DELETE_ALL_ITEMS = _controls_.wxEVT_COMMAND_LIST_DELETE_ALL_ITEMS wxEVT_COMMAND_LIST_ITEM_SELECTED = _controls_.wxEVT_COMMAND_LIST_ITEM_SELECTED wxEVT_COMMAND_LIST_ITEM_DESELECTED = _controls_.wxEVT_COMMAND_LIST_ITEM_DESELECTED wxEVT_COMMAN"
+    }
+  ]
+}

benign_code_hf_codeparrot_project_10k_eval.json ADDED Viewed

	@@ -0,0 +1,152 @@

+{
+  "model_dir": "models\\v6_code_aware_50k_oss_clean_benign_code",
+  "holdout": "data\\external\\hf_benign_code_codeparrot_clean_project_10k.jsonl",
+  "overall": {
+    "n": 10000,
+    "threshold": 0.5,
+    "false_positive_rate": 0.0713,
+    "flagged": 713,
+    "score_mean": 0.095502,
+    "score_p50": 0.007208,
+    "score_p90": 0.331408,
+    "score_p95": 0.657149,
+    "score_p99": 0.9478,
+    "score_max": 0.998492
+  },
+  "by_source": {
+    "hf_benign_code:codeparrot/codeparrot-clean-train:default:train": {
+      "n": 10000,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0713,
+      "flagged": 713,
+      "score_mean": 0.095502,
+      "score_p50": 0.007208,
+      "score_p90": 0.331408,
+      "score_p95": 0.657149,
+      "score_p99": 0.9478,
+      "score_max": 0.998492
+    }
+  },
+  "flagged_examples": [
+    {
+      "score": 0.6146,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "video/hud.py",
+      "preview": "append( self.ladder_helper(q0, 0, 2.0) ) pts.append( self.ladder_helper(q0, 0.0, -2.0) ) pts.append( self.ladder_helper(q0, 1.5, -2.0) ) pts.append( self.ladder_helper(q0, 1.5, -1.0) ) pts.append( center ) pts.append( self.ladder_helper(q0,"
+    },
+    {
+      "score": 0.988829,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "crwls.py",
+      "preview": "* 1000) crtwls.log( \"Fazendo undeploy da Aplicacao '%s'\" % self.name) progress = wlst.undeploy(self.name, block='true') wlst.activate() crtwls.edit(10 * 60 * 1000, 5 * 60 * 1000) crtwls.log(\"Fazendo deploy da Aplicacao '%s'\" % self.name) pr"
+    },
+    {
+      "score": 0.995816,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "crwls.py",
+      "preview": "ParallelConnectDelay(5) auth.setResultsTimeLimit(1000) auth.setAllUsersFilter('objectClass=user') auth.setPropagateCauseForLoginException(False) auth.setHost( 'sptbrdc04.petrobras.biz sptbrdc14.petrobras.biz sptbrdc08.petrobras.biz sptbrdc0"
+    },
+    {
+      "score": 0.975665,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "crwls.py",
+      "preview": "r) authenticator = classmethod(authenticator) def configure(cls): crtwls.connectToAdminServer() crtwls.edit() domainName = wlst.cmo.getName() crtwls.log(\"Configurando o Domain Log\") wlst.cmo.getLog().setFileMinSize(40000) wlst.cmo.getLog()."
+    },
+    {
+      "score": 0.504215,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "crwls.py",
+      "preview": "port) nmgr.getNodeManager().setDebugEnabled(True) crtwls.save() createMachine = classmethod(createMachine) def mailSession(cls): crtwls.connectToAdminServer() crtwls.edit() crtwls.log(\"Buscando o MailSession\") mailsession = wlst.cmo.lookupM"
+    },
+    {
+      "score": 0.846977,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "crwls.py",
+      "preview": "envSuffix = argv(4) adminAddress = argv(5) Domain.create(domainName, envSuffix, adminAddress) elif subcmd == 'configure': Domain.configure() elif subcmd == 'configure-authenticator': Domain.authenticator() elif subcmd == 'list-datasource': "
+    },
+    {
+      "score": 0.914316,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "00ac, # NOT SIGN 0x00ab: 0x00bd, # VULGAR FRACTION ONE HALF 0x00ac: 0x00bc, # VULGAR FRACTION ONE QUARTER 0x00ad: 0x00be, # VULGAR FRACTION THREE QUARTERS 0x00ae: 0x00ab, # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 0x00af: 0x00bb, # RIGHT-P"
+    },
+    {
+      "score": 0.824451,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "E 0x00c8: 0x255a, # BOX DRAWINGS DOUBLE UP AND RIGHT 0x00c9: 0x2554, # BOX DRAWINGS DOUBLE DOWN AND RIGHT 0x00ca: 0x2569, # BOX DRAWINGS DOUBLE UP AND HORIZONTAL 0x00cb: 0x2566, # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL 0x00cc: 0x2560, # BO"
+    },
+    {
+      "score": 0.992852,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "-> CARRIAGE RETURN '\\x0e' # 0x000e -> SHIFT OUT '\\x0f' # 0x000f -> SHIFT IN '\\x10' # 0x0010 -> DATA LINK ESCAPE '\\x11' # 0x0011 -> DEVICE CONTROL ONE '\\x12' # 0x0012 -> DEVICE CONTROL TWO '\\x13' # 0x0013 -> DEVICE CONTROL THREE '\\x14' # 0x0"
+    },
+    {
+      "score": 0.734134,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "x005e -> CIRCUMFLEX ACCENT '_' # 0x005f -> LOW LINE '`' # 0x0060 -> GRAVE ACCENT 'a' # 0x0061 -> LATIN SMALL LETTER A 'b' # 0x0062 -> LATIN SMALL LETTER B 'c' # 0x0063 -> LATIN SMALL LETTER C 'd' # 0x0064 -> LATIN SMALL LETTER D 'e' # 0x006"
+    },
+    {
+      "score": 0.689507,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "ART OF TEXT 0x0003: 0x0003, # END OF TEXT 0x0004: 0x0004, # END OF TRANSMISSION 0x0005: 0x0005, # ENQUIRY 0x0006: 0x0006, # ACKNOWLEDGE 0x0007: 0x0007, # BELL 0x0008: 0x0008, # BACKSPACE 0x0009: 0x0009, # HORIZONTAL TABULATION 0x000a: 0x000"
+    },
+    {
+      "score": 0.97725,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "STOP 0x002f: 0x002f, # SOLIDUS 0x0030: 0x0030, # DIGIT ZERO 0x0031: 0x0031, # DIGIT ONE 0x0032: 0x0032, # DIGIT TWO 0x0033: 0x0033, # DIGIT THREE 0x0034: 0x0034, # DIGIT FOUR 0x0035: 0x0035, # DIGIT FIVE 0x0036: 0x0036, # DIGIT SIX 0x0037: "
+    },
+    {
+      "score": 0.957955,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "0x0057: 0x0057, # LATIN CAPITAL LETTER W 0x0058: 0x0058, # LATIN CAPITAL LETTER X 0x0059: 0x0059, # LATIN CAPITAL LETTER Y 0x005a: 0x005a, # LATIN CAPITAL LETTER Z 0x005b: 0x005b, # LEFT SQUARE BRACKET 0x005c: 0x005c, # REVERSE SOLIDUS 0x00"
+    },
+    {
+      "score": 0.976132,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "7d: 0x007d, # RIGHT CURLY BRACKET 0x007e: 0x007e, # TILDE 0x007f: 0x007f, # DELETE 0x00a0: 0x00ff, # NO-BREAK SPACE 0x00a2: 0x009b, # CENT SIGN 0x00a3: 0x009c, # POUND SIGN 0x00a4: 0x0098, # CURRENCY SIGN 0x00a6: 0x00a0, # BROKEN BAR 0x00a7"
+    },
+    {
+      "score": 0.652318,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "kbe/src/lib/python/Lib/encodings/cp863.py",
+      "preview": "S DOUBLE DOWN AND LEFT 0x2558: 0x00d4, # BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE 0x2559: 0x00d3, # BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE 0x255a: 0x00c8, # BOX DRAWINGS DOUBLE UP AND RIGHT 0x255b: 0x00be, # BOX DRAWINGS UP SINGLE AND LEFT "
+    },
+    {
+      "score": 0.759528,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "lib/youtube_dl/extractor/baidu (VJ Washington's conflicted copy 2017-08-29).py",
+      "preview": "# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): IE_DESC = '百度视频' _VALID_URL = r'https?://v\\.baidu\\.com/(?P<type>[a-z]+)/"
+    },
+    {
+      "score": 0.796715,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "addons/account/wizard/account_move_line_reconcile_select.py",
+      "preview": "# -*- coding: utf-8 -*- ############################################################################## # # OpenERP, Open Source Management Solution # Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>). # # This program is free software: y"
+    },
+    {
+      "score": 0.846741,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "engine/SCons/compat/__init__.py",
+      "preview": "# # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 The SCons Foundation # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files ("
+    },
+    {
+      "score": 0.56719,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "test/IECore/Shader.py",
+      "preview": "########################################################################## # # Copyright (c) 2007-2011, Image Engine Design Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are"
+    },
+    {
+      "score": 0.818233,
+      "source": "hf_benign_code:codeparrot/codeparrot-clean-train:default:train",
+      "path": "allauth/socialaccount/providers/orcid/provider.py",
+      "preview": "from allauth.socialaccount.providers.base import ProviderAccount from allauth.socialaccount.providers.oauth2.provider import OAuth2Provider class Scope(object): USERINFO_PROFILE = \"/authenticate\" class OrcidAccount(ProviderAccount): def get"
+    }
+  ]
+}

benign_code_lora_clean_eval.json ADDED Viewed

	@@ -0,0 +1,284 @@

+{
+  "model_dir": "models\\v6_code_aware_50k_oss_clean_benign_code",
+  "holdout": "data\\clf\\benign_code_holdout_lora_clean.jsonl",
+  "overall": {
+    "n": 10000,
+    "threshold": 0.5,
+    "false_positive_rate": 0.0097,
+    "flagged": 97,
+    "score_mean": 0.022216,
+    "score_p50": 0.000779,
+    "score_p90": 0.036148,
+    "score_p95": 0.103489,
+    "score_p99": 0.483205,
+    "score_max": 0.994358
+  },
+  "by_source": {
+    "local_project_code": {
+      "n": 160,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0063,
+      "flagged": 1,
+      "score_mean": 0.027283,
+      "score_p50": 0.002633,
+      "score_p90": 0.034741,
+      "score_p95": 0.107271,
+      "score_p99": 0.437606,
+      "score_max": 0.955582
+    },
+    "local_repo_hs": {
+      "n": 14,
+      "threshold": 0.5,
+      "false_positive_rate": 0.1429,
+      "flagged": 2,
+      "score_mean": 0.234466,
+      "score_p50": 0.173636,
+      "score_p90": 0.50088,
+      "score_p95": 0.601973,
+      "score_p99": 0.732286,
+      "score_max": 0.764864
+    },
+    "local_repo_isre": {
+      "n": 173,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.015629,
+      "score_p50": 0.000923,
+      "score_p90": 0.065123,
+      "score_p95": 0.111808,
+      "score_p99": 0.148813,
+      "score_max": 0.175909
+    },
+    "local_repo_job_application_pipeline": {
+      "n": 444,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0023,
+      "flagged": 1,
+      "score_mean": 0.016508,
+      "score_p50": 0.001555,
+      "score_p90": 0.038672,
+      "score_p95": 0.117372,
+      "score_p99": 0.200662,
+      "score_max": 0.691934
+    },
+    "local_repo_llama_cpp": {
+      "n": 833,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0228,
+      "flagged": 19,
+      "score_mean": 0.046812,
+      "score_p50": 0.004319,
+      "score_p90": 0.11022,
+      "score_p95": 0.229066,
+      "score_p99": 0.614219,
+      "score_max": 0.991824
+    },
+    "local_repo_math": {
+      "n": 6,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.00212,
+      "score_p50": 0.001785,
+      "score_p90": 0.004231,
+      "score_p95": 0.004744,
+      "score_p99": 0.005155,
+      "score_max": 0.005257
+    },
+    "local_repo_olympiad_math": {
+      "n": 53,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0189,
+      "flagged": 1,
+      "score_mean": 0.032375,
+      "score_p50": 0.001522,
+      "score_p90": 0.075568,
+      "score_p95": 0.202259,
+      "score_p99": 0.444325,
+      "score_max": 0.561933
+    },
+    "local_repo_pipeline": {
+      "n": 136,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0074,
+      "flagged": 1,
+      "score_mean": 0.021312,
+      "score_p50": 0.001563,
+      "score_p90": 0.037614,
+      "score_p95": 0.080462,
+      "score_p99": 0.369778,
+      "score_max": 0.888886
+    },
+    "local_repo_repo": {
+      "n": 13,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.049554,
+      "score_p50": 0.003904,
+      "score_p90": 0.15528,
+      "score_p95": 0.195039,
+      "score_p99": 0.231913,
+      "score_max": 0.241131
+    },
+    "local_repo_utils": {
+      "n": 114,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0088,
+      "flagged": 1,
+      "score_mean": 0.017779,
+      "score_p50": 0.000741,
+      "score_p90": 0.021215,
+      "score_p95": 0.039684,
+      "score_p99": 0.316717,
+      "score_max": 0.970841
+    },
+    "local_repo_vesuvius": {
+      "n": 730,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0548,
+      "flagged": 40,
+      "score_mean": 0.103365,
+      "score_p50": 0.018504,
+      "score_p90": 0.349818,
+      "score_p95": 0.531822,
+      "score_p99": 0.915992,
+      "score_max": 0.994358
+    },
+    "python_stdlib": {
+      "n": 7324,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0042,
+      "flagged": 31,
+      "score_mean": 0.011296,
+      "score_p50": 0.000434,
+      "score_p90": 0.015448,
+      "score_p95": 0.039071,
+      "score_p99": 0.230201,
+      "score_max": 0.97892
+    }
+  },
+  "flagged_examples": [
+    {
+      "score": 0.955582,
+      "source": "local_project_code",
+      "path": "C:\\GitHub\\Safety DS\\scripts\\build_malware_code_pool.py",
+      "preview": "def download_vxunderground( spec: dict, builder: PoolBuilder, chunk_cfg: dict, insecure: bool ) -> None: repo = spec[\"repo\"] cache = ROOT / spec.get(\"cache_dir\", \"data/external/vxunderground\") cache.mkdir(parents=True, exist_ok=True) for su"
+    },
+    {
+      "score": 0.586343,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\convert_hf_to_gguf.py",
+      "preview": "def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # SwigLU activation assert self.hparams[\"activation_function\"] == \"swiglu\" # ALiBi position embedding assert self.hparams[\"position_embedding_type\"] == \"alibi\" # Embeddi"
+    },
+    {
+      "score": 0.863325,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "_ARG_NO_KV_OFFLOAD\")); add_opt(common_arg( {\"-nr\", \"--no-repack\"}, \"disable weight repacking\", [](common_params & params) { params.no_extra_bufts = true; } ).set_env(\"LLAMA_ARG_NO_REPACK\")); add_opt(common_arg( {\"--no-host\"}, \"bypass host b"
+    },
+    {
+      "score": 0.991824,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "nd_dev_t> devices; for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { auto * dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { devices.push_back(dev); } } printf(\"Available devices:\\n\"); f"
+    },
+    {
+      "score": 0.60719,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "n_ubatch = 1024; params.n_batch = 1024; params.n_ctx = 0; params.n_cache_reuse = 256; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {\"--fim-qwen-7b-spec\"}, string_format(\"use Qwen 2.5 Coder 7B + 0.5B draft for speculative d"
+    },
+    {
+      "score": 0.827293,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\arg.cpp",
+      "preview": "wen 3 Coder 30B A3B Instruct (note: can download weights from the internet)\"), [](common_params & params) { params.model.hf_repo = \"ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF\"; params.model.hf_file = \"qwen3-coder-30b-a3b-instruct-q8_0."
+    },
+    {
+      "score": 0.534571,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\base64.hpp",
+      "preview": "; return 62; } else if (c == '_') { alphabet = alphabet::url_filename_safe; return 63; } } throw base64_error(\"invalid base64 character.\"); } }; #endif // !PUBLIC_DOMAIN_BASE64_HPP_"
+    },
+    {
+      "score": 0.551723,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat-parser.cpp",
+      "preview": "n_regex preamble_regex(\"<\\\\|channel\\\\|>commentary\"); static const common_regex tool_call1_regex(recipient + \"<\\\\|channel\\\\|>(analysis|commentary)\" + constraint + \"?\"); static const common_regex tool_call2_regex(\"<\\\\|channel\\\\|>(analysis|com"
+    },
+    {
+      "score": 0.536037,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat-parser.cpp",
+      "preview": "case COMMON_CHAT_FORMAT_DEEPSEEK_R1: common_chat_parse_deepseek_r1(builder); break; case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: common_chat_parse_deepseek_v3_1(builder); break; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: common_chat_parse_function"
+    },
+    {
+      "score": 0.616117,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat-parser.cpp",
+      "preview": "common_chat_parse_kimi_k2(builder); break; case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: common_chat_parse_qwen3_coder_xml(builder); break; case COMMON_CHAT_FORMAT_APRIEL_1_5: common_chat_parse_apriel_1_5(builder); break; case COMMON_CHAT_FORMAT"
+    },
+    {
+      "score": 0.609806,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat.cpp",
+      "preview": "msg_new.tool_calls.size() < msg_prv.tool_calls.size()) { throw std::runtime_error(\"Invalid diff: now finding less tool calls!\"); } if (!msg_prv.tool_calls.empty()) { const auto idx = msg_prv.tool_calls.size() - 1; const auto & pref = msg_pr"
+    },
+    {
+      "score": 0.876555,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat.cpp",
+      "preview": "y v3.1 Llama 3.1\"; case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return \"DeepSeek V3.1\"; case COMMON_CHAT_FORMAT_HERMES_2_PRO: return \"Hermes 2 Pro\"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return \"Command R7B\"; case COMMON_CHAT_FORMAT_GRANITE: retur"
+    },
+    {
+      "score": 0.610185,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\chat.h",
+      "preview": "OMMON_CHAT_FORMAT_GRANITE, COMMON_CHAT_FORMAT_GPT_OSS, COMMON_CHAT_FORMAT_SEED_OSS, COMMON_CHAT_FORMAT_NEMOTRON_V2, COMMON_CHAT_FORMAT_APERTUS, COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS, COMMON_CHAT_FORMAT_GLM_4_5, COMMON_CHAT_FORMAT_MINIMAX_"
+    },
+    {
+      "score": 0.864733,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\common.cpp",
+      "preview": "d-%H_%M_%S\", std::localtime(&as_time_t)); const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>( current_time.time_since_epoch() % 1000000000).count(); char timestamp_ns[11]; snprintf(timestamp_ns, 11, \"%09\" PRId64, ns); r"
+    },
+    {
+      "score": 0.972733,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\common.cpp",
+      "preview": "ARATOR; } return p; }; if (getenv(\"LLAMA_CACHE\")) { cache_directory = std::getenv(\"LLAMA_CACHE\"); } else { #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__) if (std::getenv(\"XDG_CACHE_HOME\")) { cache_di"
+    },
+    {
+      "score": 0.526275,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\common.cpp",
+      "preview": "mmon_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P); get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_"
+    },
+    {
+      "score": 0.539756,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\download.cpp",
+      "preview": "l_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size); if (!was_pull_successful) { if (i + 1 < max_attempts) { const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1"
+    },
+    {
+      "score": 0.530205,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\download.cpp",
+      "preview": "size_t len) { buf.insert(buf.end(), data, data + len); return params.max_size == 0 || buf.size() <= static_cast<size_t>(params.max_size); }, nullptr ); if (!res) { throw std::runtime_error(\"error: cannot make GET request\"); } return { res->"
+    },
+    {
+      "score": 0.882664,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\download.cpp",
+      "preview": "local_path, token, false)) { throw std::runtime_error(\"Failed to download Docker Model\"); } LOG_INF(\"%s: Downloaded Docker Model to: %s\\n\", __func__, local_path.c_str()); return local_path; } catch (const std::exception & e) { LOG_ERR(\"%s: "
+    },
+    {
+      "score": 0.765199,
+      "source": "local_repo_llama_cpp",
+      "path": "C:\\lora_training\\llama.cpp\\common\\json-partial.cpp",
+      "preview": "last_non_sp_char == 'E' || last_non_sp_char == '-'; }; std::string closing; for (size_t i = err_loc.stack.size(); i > 0; i--) { auto & el = err_loc.stack[i - 1]; if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) { closing += \"}\"; } else if ("
+    }
+  ]
+}

benign_code_mathtrain_reasoner_eval.json ADDED Viewed

	@@ -0,0 +1,248 @@

+{
+  "model_dir": "models\\v6_code_aware_50k_oss_clean_benign_code",
+  "holdout": "data\\clf\\benign_code_holdout_mathtrain_reasoner_clean.jsonl",
+  "overall": {
+    "n": 12000,
+    "threshold": 0.5,
+    "false_positive_rate": 0.01,
+    "flagged": 120,
+    "score_mean": 0.021423,
+    "score_p50": 0.000676,
+    "score_p90": 0.032512,
+    "score_p95": 0.097846,
+    "score_p99": 0.497524,
+    "score_max": 0.994358
+  },
+  "by_source": {
+    "local_project_code": {
+      "n": 165,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0061,
+      "flagged": 1,
+      "score_mean": 0.026509,
+      "score_p50": 0.002527,
+      "score_p90": 0.034257,
+      "score_p95": 0.099218,
+      "score_p99": 0.43596,
+      "score_max": 0.955582
+    },
+    "local_repo_hs": {
+      "n": 14,
+      "threshold": 0.5,
+      "false_positive_rate": 0.1429,
+      "flagged": 2,
+      "score_mean": 0.234466,
+      "score_p50": 0.173636,
+      "score_p90": 0.50088,
+      "score_p95": 0.601973,
+      "score_p99": 0.732286,
+      "score_max": 0.764864
+    },
+    "local_repo_isre": {
+      "n": 173,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.015629,
+      "score_p50": 0.000923,
+      "score_p90": 0.065123,
+      "score_p95": 0.111808,
+      "score_p99": 0.148813,
+      "score_max": 0.175909
+    },
+    "local_repo_job_application_pipeline": {
+      "n": 444,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0023,
+      "flagged": 1,
+      "score_mean": 0.016508,
+      "score_p50": 0.001555,
+      "score_p90": 0.038672,
+      "score_p95": 0.117372,
+      "score_p99": 0.200662,
+      "score_max": 0.691934
+    },
+    "local_repo_math_train": {
+      "n": 562,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0569,
+      "flagged": 32,
+      "score_mean": 0.070331,
+      "score_p50": 0.002097,
+      "score_p90": 0.163531,
+      "score_p95": 0.558199,
+      "score_p99": 0.93781,
+      "score_max": 0.990801
+    },
+    "local_repo_math_train2": {
+      "n": 68,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0,
+      "flagged": 0,
+      "score_mean": 0.015792,
+      "score_p50": 0.00076,
+      "score_p90": 0.047232,
+      "score_p95": 0.135641,
+      "score_p99": 0.156291,
+      "score_max": 0.166963
+    },
+    "local_repo_olympiad_math": {
+      "n": 55,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0182,
+      "flagged": 1,
+      "score_mean": 0.031321,
+      "score_p50": 0.001742,
+      "score_p90": 0.071487,
+      "score_p95": 0.192837,
+      "score_p99": 0.439802,
+      "score_max": 0.561933
+    },
+    "local_repo_vesuvius": {
+      "n": 730,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0548,
+      "flagged": 40,
+      "score_mean": 0.103365,
+      "score_p50": 0.018504,
+      "score_p90": 0.349818,
+      "score_p95": 0.531822,
+      "score_p99": 0.915992,
+      "score_max": 0.994358
+    },
+    "python_stdlib": {
+      "n": 9789,
+      "threshold": 0.5,
+      "false_positive_rate": 0.0044,
+      "flagged": 43,
+      "score_mean": 0.012423,
+      "score_p50": 0.000479,
+      "score_p90": 0.017982,
+      "score_p95": 0.048102,
+      "score_p99": 0.254804,
+      "score_max": 0.97892
+    }
+  },
+  "flagged_examples": [
+    {
+      "score": 0.955582,
+      "source": "local_project_code",
+      "path": "C:\\GitHub\\Safety DS\\scripts\\build_malware_code_pool.py",
+      "preview": "def download_vxunderground( spec: dict, builder: PoolBuilder, chunk_cfg: dict, insecure: bool ) -> None: repo = spec[\"repo\"] cache = ROOT / spec.get(\"cache_dir\", \"data/external/vxunderground\") cache.mkdir(parents=True, exist_ok=True) for su"
+    },
+    {
+      "score": 0.77251,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\auto_pipeline.ps1",
+      "preview": "$stateFile = 'C:\\lora_training\\eval_results\\pipeline_state2.txt' $qwenLog = 'C:\\lora_training\\eval_results\\eval_qwen_merged_full.log' $glmLog = 'C:\\lora_training\\eval_results\\eval_glm_both_6k.log' $glmErr = 'C:\\lora_training\\eval_results\\ev"
+    },
+    {
+      "score": 0.705632,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\check_adapter_structure.py",
+      "preview": "import sys if hasattr(sys.stdout, \"reconfigure\"): sys.stdout.reconfigure(encoding=\"utf-8\") from safetensors import safe_open from pathlib import Path import json adapter = Path(r\"C:\\lora_training\\lora_MATH_output\\qwen_run_20260408_123730\\fi"
+    },
+    {
+      "score": 0.590817,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\check_lora_results.py",
+      "preview": "import json, os path = r\"C:\\lora_training\\eval_results\\all_adapters\" for fn in [\"eval_qwen3_instruct_lora.json\", \"eval_qwen3_instruct_lora_diverse.json\", \"eval_gemma3_instruct_lora.json\", \"eval_gemma3_instruct_lora_diverse.json\", \"eval_llam"
+    },
+    {
+      "score": 0.599163,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\check_qwen_log.ps1",
+      "preview": "$logfile = \"C:\\Users\\SOL08_~1\\AppData\\Local\\Temp\\claude\\C--Users-sol08-p04dk8b-MATH-TRAIN\\4a837286-97f6-4070-a5f2-37ba2c811443\\tasks\\b1yeq9q3i.output\" if (Test-Path $logfile) { $lines = Get-Content $logfile $total = $lines.Count Write-Outpu"
+    },
+    {
+      "score": 0.935582,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\check_sizes.ps1",
+      "preview": "$files = @( 'C:\\lora_training\\OLympiad\\_output\\good_solutions_deduped.jsonl', 'C:\\lora_training\\OLympiad\\_output\\broken_only_deduped.jsonl', 'C:\\lora_training\\OLympiad\\_output\\dpo_pairs.jsonl', 'C:\\lora_training\\OLympiad\\converted_for_sft.j"
+    },
+    {
+      "score": 0.990801,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\clean_github.ps1",
+      "preview": "Set-Location 'C:\\GitHub\\Olympiad_Math' $remove = @( 'scraping\\olympiad.py', 'scraping\\geometry_scraper.py', 'scraping\\filter_geometry_links.py', 'scraping\\download_geometry_drive.py', 'scraping\\scraper_aops.py', 'scraping\\olympiad_dataset1."
+    },
+    {
+      "score": 0.736009,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\download_llama3b_instruct.py",
+      "preview": "from huggingface_hub import snapshot_download import time print(\"Downloading Llama-3.2-3B-Instruct...\", flush=True) t0 = time.time() snapshot_download( repo_id=\"meta-llama/Llama-3.2-3B-Instruct\", local_dir=r\"C:\\models\\Llama-3.2-3B-Instruct\""
+    },
+    {
+      "score": 0.880296,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\eval_benchmark.py",
+      "preview": "def print_report(results, label): total = len(results) correct = sum(r[\"correct\"] for r in results) print(f\"\\n{'='*60}\") print(f\" {label}: {correct}/{total} = {correct/total*100:.1f}%\") print(f\"{'='*60}\") # По предметам by_subj = defaultdic"
+    },
+    {
+      "score": 0.509782,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\eval_general_reasoning.py",
+      "preview": "def load_mmlu(n, seed): \"\"\"MMLU: cais/mmlu all — 200 вопросов из разных предметов.\"\"\" print(\"Loading MMLU...\") rng = random.Random(seed) # Пробуем 'all', иначе сэмплируем из нескольких предметов try: ds = load_dataset(\"cais/mmlu\", \"all\", sp"
+    },
+    {
+      "score": 0.74435,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\eval_general_reasoning.py",
+      "preview": "def load_arc(n, seed): \"\"\"ARC-Challenge — 200 вопросов.\"\"\" print(\"Loading ARC-Challenge...\") rng = random.Random(seed) ds = load_dataset(\"allenai/ai2_arc\", \"ARC-Challenge\", split=\"test\", trust_remote_code=True) pool = [] for x in ds: labels"
+    },
+    {
+      "score": 0.946294,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\eval_general_reasoning.py",
+      "preview": "def load_hellaswag(n, seed): \"\"\"HellaSwag — 200 примеров.\"\"\" print(\"Loading HellaSwag...\") rng = random.Random(seed) ds = load_dataset(\"Rowan/hellaswag\", split=\"validation\", trust_remote_code=True) pool = [] for x in ds: endings = x[\"ending"
+    },
+    {
+      "score": 0.569382,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\inspect_lora.py",
+      "preview": "import sys if hasattr(sys.stdout, \"reconfigure\"): sys.stdout.reconfigure(encoding=\"utf-8\") from safetensors import safe_open import json from pathlib import Path adapter_path = r\"C:\\lora_training\\lora_MATH_output\\qwen_run_20260408_123730\\fi"
+    },
+    {
+      "score": 0.956672,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\list_files.ps1",
+      "preview": "Get-ChildItem 'C:\\lora_training\\OLympiad' -Filter '*.jsonl' | Sort-Object Length -Descending | ForEach-Object { $mb = [math]::Round($_.Length / 1MB, 1) Write-Output \"$mb MB $($_.Name)\" }"
+    },
+    {
+      "score": 0.826007,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\peek_local_bench.ps1",
+      "preview": "$files = @( 'C:\\lora_training\\OLympiad\\olympiad\\olympiad_final_MATH.jsonl', 'C:\\lora_training\\OLympiad\\_raw\\aops_dataset.jsonl', 'C:\\lora_training\\OLympiad\\_raw\\math_dataset.jsonl' ) foreach ($f in $files) { if (Test-Path $f) { Write-Output"
+    },
+    {
+      "score": 0.84162,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\peek_scripts.ps1",
+      "preview": "$scripts = @( 'generate_cot_dataset.py', 'Complete_cot.py', 'janitor44.py', 'triage_flagged.py', 'merge_dataset.py' ) foreach ($s in $scripts) { $path = \"C:\\lora_training\\OLympiad\\$s\" if (Test-Path $path) { Write-Output \"=== $s ===\" Get-Con"
+    },
+    {
+      "score": 0.813426,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\run_lora_3k_v2.ps1",
+      "preview": "$ErrorActionPreference = \"Stop\" $ScriptDir = \"C:\\Users\\sol08_p04dk8b\\MATH TRAIN\" cd $ScriptDir Write-Host \"=== run 1/3: Qwen3-4B lora_3k_v2 ===\" -ForegroundColor Cyan python train_qwen3_4b_instruct_lora_3k_v2.py 2>&1 | Tee-Object qwen3_3k_v"
+    },
+    {
+      "score": 0.7723,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\setup_github_project.py",
+      "preview": "| SFT (good solutions) | ~22,990 | ChatML `{\"text\": ...}` | | Broken / incomplete | ~6,518 | same | | DPO pairs | ~4,393 | `{\"problem\", \"chosen\", \"rejected\"}` | Data files are **not included** in this repo (too large). Sources: AoPS, IMO Sh"
+    },
+    {
+      "score": 0.976724,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\show_github.ps1",
+      "preview": "Write-Output \"=== C:\\GitHub\\Olympiad_Math ===\" Write-Output \"\" Write-Output \"--- Root files ---\" Get-ChildItem 'C:\\GitHub\\Olympiad_Math' -File | ForEach-Object { $mb = [math]::Round($_.Length / 1MB, 2) Write-Output \" $($_.Name) ($mb MB)\" } "
+    },
+    {
+      "score": 0.982568,
+      "source": "local_repo_math_train",
+      "path": "C:\\Users\\sol08_p04dk8b\\MATH TRAIN\\show_structure.ps1",
+      "preview": "Write-Output \"=== Files in root ===\" Get-ChildItem 'C:\\lora_training\\OLympiad' -File | ForEach-Object { Write-Output $_.Name } Write-Output \"\" Write-Output \"=== Subfolders ===\" Get-ChildItem 'C:\\lora_training\\OLympiad' -Directory | ForEach-"
+    }
+  ]
+}

clf_binary.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b66da8692d3edca73c84d7b3b7aaa6e074509ae95b5ebfae26cf2413881a5932
+size 9055

clf_multilabel.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c3664b454f193e137e7adb9b89eebcae399c7e68f391ad67b8d60de73b1b0c8
+size 103524

labels.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "categories": [
+    "malware_types",
+    "exploit_development",
+    "obfuscation_evasion",
+    "command_control",
+    "injection_lateral",
+    "credential_exfil",
+    "ransomware_crypto",
+    "reverse_engineering_offense",
+    "phishing_social_engineering_code",
+    "botnet_spam_ddos",
+    "rootkit_kernel",
+    "packers_loaders"
+  ]
+}

metrics.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "embedder": "BAAI/bge-m3",
+  "device": "cuda",
+  "embedding_dim": 1024,
+  "max_length": 128,
+  "clf_dir": "data\\clf\\v6_code_aware_50k_oss_clean_benign_code",
+  "counts": {
+    "train": 222950,
+    "test": 27736,
+    "obfuscated_test": 4000,
+    "malware_code_test": 2000
+  },
+  "binary": {
+    "precision": 0.9996,
+    "recall": 0.9964,
+    "f1": 0.998,
+    "roc_auc": 0.9997,
+    "false_positive_rate": 0.004,
+    "obfuscated_recall": 0.9935,
+    "obfuscated_count": 4000,
+    "malware_code_recall": 0.989,
+    "malware_code_count": 2000
+  },
+  "multilabel": {
+    "micro_f1": 0.6146,
+    "macro_f1": 0.514,
+    "macro_f1_positives_only": 0.5141,
+    "obfuscated_macro_f1": 0.3011,
+    "malware_code_macro_f1": 0.1235
+  }
+}