pliny-the-prompter committed on
Commit
a9b2ff1
·
verified ·
1 Parent(s): f6b07b4

Upload 128 files

Browse files
app.py CHANGED
@@ -68,48 +68,72 @@ _lock = threading.Lock()
68
  _bench_configs: dict[str, dict] = {}
69
 
70
  # ---------------------------------------------------------------------------
71
- # Model presets (subset that fits on a T4 16GB)
72
  # ---------------------------------------------------------------------------
73
 
74
- MODELS = {
75
- # ── Tiny (< 2B) ──────────────────────────────────────────────────────
76
- # All models below are non-gated (no HF approval required)
77
- "Qwen2.5 0.5B Instruct": "Qwen/Qwen2.5-0.5B-Instruct",
78
- "Qwen3 0.6B": "Qwen/Qwen3-0.6B",
79
- "OLMo 2 1B Instruct": "allenai/OLMo-2-0425-1B-Instruct",
80
- "TinyLlama 1.1B Chat": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
81
- "DeepSeek R1 Distill Qwen 1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
82
- "Qwen2.5 1.5B Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
83
- "Qwen3 1.7B": "Qwen/Qwen3-1.7B",
84
- "SmolLM2 1.7B Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
85
- # ── Small (2-5B) ─────────────────────────────────────────────────────
86
- "Phi-2 (2.7B)": "microsoft/phi-2",
87
- "Qwen2.5 3B Instruct": "Qwen/Qwen2.5-3B-Instruct",
88
- "SmolLM3 3B": "HuggingFaceTB/SmolLM3-3B",
89
- "Falcon3 3B Instruct": "tiiuae/Falcon3-3B-Instruct",
90
- "Phi-4 Mini Instruct (3.8B)": "microsoft/Phi-4-mini-instruct",
91
- "MiniCPM3 4B": "openbmb/MiniCPM3-4B",
92
- "Qwen3 4B": "Qwen/Qwen3-4B",
93
- # ── Medium (5-9B) ────────────────────────────────────────────────────
94
- "Qwen2.5 7B Instruct": "Qwen/Qwen2.5-7B-Instruct",
95
- "Qwen2.5 Coder 7B Instruct": "Qwen/Qwen2.5-Coder-7B-Instruct",
96
- "OLMo 3 7B Instruct": "allenai/Olmo-3-7B-Instruct",
97
- "Falcon3 7B Instruct": "tiiuae/Falcon3-7B-Instruct",
98
- "Qwen3 8B": "Qwen/Qwen3-8B",
99
- "DeepSeek R1 0528 Qwen3 8B": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
100
- "InternLM3 8B Instruct": "internlm/internlm3-8b-instruct",
101
- "GLM-4 9B Chat": "THUDM/glm-4-9b-chat-hf",
102
- # ── Frontier (MoE β€” tight fit on T4 with quantization) ─────────────
103
- "GPT-OSS 20B (MoE, 3.6B active)": "openai/gpt-oss-20b",
104
- "Qwen3 30B-A3B (MoE, 3B active)": "Qwen/Qwen3-30B-A3B",
105
- "GLM-4.7 Flash (MoE, 3B active)": "zai-org/GLM-4.7-Flash",
106
- # ── Frontier (multi-GPU / cloud only) ──────────────────────────────
107
- "Qwen3.5 397B-A17B (MoE)": "Qwen/Qwen3.5-397B-A17B",
108
- "GLM-5 744B (MoE, 40B active)": "zai-org/GLM-5",
109
- "MiniMax M2.5 (MoE, 10B active)": "MiniMaxAI/MiniMax-M2.5",
110
- "DeepSeek-V3 685B (MoE)": "deepseek-ai/DeepSeek-V3",
111
  }
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  METHODS = {
114
  "advanced (recommended)": "advanced",
115
  "basic (fast, single direction)": "basic",
@@ -2678,7 +2702,7 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
2678
  with gr.Row():
2679
  model_dd = gr.Dropdown(
2680
  choices=list(MODELS.keys()),
2681
- value="Qwen3 4B",
2682
  label="Target Model",
2683
  info="Models sized for a free T4 GPU (16 GB VRAM)",
2684
  allow_custom_value=True,
@@ -2822,6 +2846,12 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
2822
  cleanup_btn = gr.Button("Purge Cache", variant="secondary", size="sm")
2823
  cleanup_status = gr.Markdown("")
2824
 
 
 
 
 
 
 
2825
  # ── Tab 2: Chat ───────────────────────────────────────────────────
2826
  with gr.Tab("Chat", id="chat"):
2827
  chat_status = gr.Markdown(get_chat_header)
@@ -2920,7 +2950,7 @@ tradeoff point where refusal is minimized with minimal capability damage.
2920
  with gr.Row():
2921
  sweep_model_dd = gr.Dropdown(
2922
  choices=list(MODELS.keys()),
2923
- value="Qwen2.5 0.5B Instruct",
2924
  label="Model",
2925
  allow_custom_value=True,
2926
  )
@@ -3007,7 +3037,7 @@ Great for finding the optimal strategy for a specific architecture.
3007
  from gradio_client import Client
3008
  client = Client("pliny-the-prompter/obliteratus")
3009
  result = client.predict(
3010
- model_choice="Qwen2.5 0.5B Instruct",
3011
  methods_to_test=["basic", "advanced", "surgical", "optimized"],
3012
  prompt_volume_choice="33 (fast)",
3013
  api_name="/benchmark",
@@ -3017,7 +3047,7 @@ result = client.predict(
3017
  with gr.Row():
3018
  bench_model = gr.Dropdown(
3019
  choices=list(MODELS.keys()),
3020
- value="Qwen2.5 0.5B Instruct",
3021
  label="Target Model",
3022
  allow_custom_value=True,
3023
  )
@@ -3122,7 +3152,7 @@ how well a technique generalizes β€” especially for MoE-aware methods like
3122
  from gradio_client import Client
3123
  client = Client("pliny-the-prompter/obliteratus")
3124
  result = client.predict(
3125
- model_choices=["Qwen2.5 0.5B Instruct", "GPT-OSS 20B (MoE, 3.6B active)"],
3126
  method_choice="surgical",
3127
  prompt_volume_choice="33 (fast)",
3128
  api_name="/benchmark_multi_model",
@@ -3133,8 +3163,8 @@ result = client.predict(
3133
  mm_models = gr.CheckboxGroup(
3134
  choices=list(MODELS.keys()),
3135
  value=[
3136
- "Qwen2.5 0.5B Instruct",
3137
- "Qwen2.5 3B Instruct",
3138
  ],
3139
  label="Models to Test",
3140
  )
@@ -3279,7 +3309,7 @@ Pre-configured benchmark configurations for common research questions.
3279
 
3280
  def _preset_gptoss(vol, ds):
3281
  yield from benchmark(
3282
- "GPT-OSS 20B (MoE, 3.6B active)",
3283
  ["basic", "advanced", "aggressive", "surgical",
3284
  "optimized", "inverted", "nuclear"],
3285
  vol, ds,
@@ -3288,10 +3318,10 @@ Pre-configured benchmark configurations for common research questions.
3288
  def _preset_moe_cross(vol, ds):
3289
  yield from benchmark_multi_model(
3290
  [
3291
- "Qwen2.5 0.5B Instruct",
3292
- "Qwen2.5 3B Instruct",
3293
- "Qwen2.5 7B Instruct",
3294
- "GPT-OSS 20B (MoE, 3.6B active)",
3295
  ],
3296
  "surgical", vol, ds,
3297
  )
@@ -3303,9 +3333,9 @@ Pre-configured benchmark configurations for common research questions.
3303
  # Part 1: basic method across models
3304
  for status, results_md, log, gallery in benchmark_multi_model(
3305
  [
3306
- "Qwen2.5 0.5B Instruct",
3307
- "Qwen2.5 3B Instruct",
3308
- "Qwen2.5 7B Instruct",
3309
  ],
3310
  "basic", vol, ds,
3311
  ):
@@ -3314,9 +3344,9 @@ Pre-configured benchmark configurations for common research questions.
3314
  # Part 2: optimized method across models
3315
  for status, results_md, log, gallery in benchmark_multi_model(
3316
  [
3317
- "Qwen2.5 0.5B Instruct",
3318
- "Qwen2.5 3B Instruct",
3319
- "Qwen2.5 7B Instruct",
3320
  ],
3321
  "optimized", vol, ds,
3322
  ):
@@ -3341,10 +3371,12 @@ Pre-configured benchmark configurations for common research questions.
3341
  # ── Tab 7: Leaderboard ────────────────────────────────────────────
3342
  with gr.Tab("Leaderboard", id="leaderboard"):
3343
  gr.Markdown("""### Community Leaderboard
3344
- All benchmark results from this Space are anonymously logged.
3345
- See which model + method combinations perform best across the community.
3346
 
3347
- *Telemetry is anonymous (no user identity, no prompts). Opt in: set `OBLITERATUS_TELEMETRY=1`.*
 
 
3348
  """)
3349
 
3350
  def _load_leaderboard():
@@ -3352,7 +3384,7 @@ See which model + method combinations perform best across the community.
3352
  try:
3353
  from obliteratus.telemetry import get_leaderboard_data, is_telemetry_enabled
3354
  if not is_telemetry_enabled():
3355
- return "Telemetry is disabled. Set `OBLITERATUS_TELEMETRY=1` to enable.", ""
3356
 
3357
  data = get_leaderboard_data()
3358
  if not data:
 
68
  _bench_configs: dict[str, dict] = {}
69
 
70
  # ---------------------------------------------------------------------------
71
+ # Model presets β€” 100+ models organized by provider
72
  # ---------------------------------------------------------------------------
73
 
74
+ # Map HF org prefixes to display provider names
75
+ _PROVIDER_NAMES = {
76
+ "01-ai": "01.AI",
77
+ "Qwen": "Alibaba (Qwen)",
78
+ "allenai": "Allen AI",
79
+ "apple": "Apple",
80
+ "CohereForAI": "Cohere",
81
+ "databricks": "Databricks",
82
+ "deepseek-ai": "DeepSeek",
83
+ "EleutherAI": "EleutherAI",
84
+ "google": "Google",
85
+ "distilbert": "HuggingFace",
86
+ "HuggingFaceTB": "HuggingFace",
87
+ "ibm-granite": "IBM",
88
+ "TinyLlama": "Meta (LLaMA)",
89
+ "meta-llama": "Meta (LLaMA)",
90
+ "microsoft": "Microsoft",
91
+ "MiniMaxAI": "MiniMax",
92
+ "mistralai": "Mistral",
93
+ "moonshotai": "Moonshot",
94
+ "nvidia": "NVIDIA",
95
+ "openai": "OpenAI",
96
+ "openai-community": "OpenAI",
97
+ "openbmb": "OpenBMB",
98
+ "internlm": "Shanghai AI Lab",
99
+ "stabilityai": "Stability AI",
100
+ "stepfun-ai": "StepFun",
101
+ "tiiuae": "TII (Falcon)",
102
+ "THUDM": "Zhipu AI (GLM)",
103
+ "zai-org": "Zhipu AI (GLM)",
104
+ # Community fine-tunes
105
+ "huihui-ai": "Community",
106
+ "cognitivecomputations": "Community",
107
+ "NousResearch": "Community",
108
+ "mlabonne": "Community",
109
+ "Orenguteng": "Community",
110
+ "WhiteRabbitNeo": "Community",
111
  }
112
 
113
+
114
+ def _build_model_choices() -> dict[str, str]:
115
+ """Build display_name β†’ hf_id mapping from presets, grouped by provider."""
116
+ from obliteratus.presets import list_all_presets
117
+ presets = list_all_presets()
118
+
119
+ # Group by provider
120
+ groups: dict[str, list[tuple[str, str]]] = {}
121
+ for p in presets:
122
+ org = p.hf_id.split("/")[0] if "/" in p.hf_id else ""
123
+ provider = _PROVIDER_NAMES.get(org, org)
124
+ groups.setdefault(provider, []).append((p.name, p.hf_id))
125
+
126
+ # Build ordered dict: providers alphabetically, models by name within each
127
+ models: dict[str, str] = {}
128
+ for provider in sorted(groups.keys()):
129
+ for name, hf_id in groups[provider]:
130
+ display = f"{provider} / {name}"
131
+ models[display] = hf_id
132
+ return models
133
+
134
+
135
+ MODELS = _build_model_choices()
136
+
137
  METHODS = {
138
  "advanced (recommended)": "advanced",
139
  "basic (fast, single direction)": "basic",
 
2702
  with gr.Row():
2703
  model_dd = gr.Dropdown(
2704
  choices=list(MODELS.keys()),
2705
+ value="Alibaba (Qwen) / Qwen3-4B",
2706
  label="Target Model",
2707
  info="Models sized for a free T4 GPU (16 GB VRAM)",
2708
  allow_custom_value=True,
 
2846
  cleanup_btn = gr.Button("Purge Cache", variant="secondary", size="sm")
2847
  cleanup_status = gr.Markdown("")
2848
 
2849
+ gr.Markdown(
2850
+ "*Anonymous telemetry is on by default (no user identity or prompts collected). "
2851
+ "Opt out: set `OBLITERATUS_TELEMETRY=0`.*",
2852
+ elem_classes=["telemetry-notice"],
2853
+ )
2854
+
2855
  # ── Tab 2: Chat ───────────────────────────────────────────────────
2856
  with gr.Tab("Chat", id="chat"):
2857
  chat_status = gr.Markdown(get_chat_header)
 
2950
  with gr.Row():
2951
  sweep_model_dd = gr.Dropdown(
2952
  choices=list(MODELS.keys()),
2953
+ value="Alibaba (Qwen) / Qwen2.5-0.5B Instruct",
2954
  label="Model",
2955
  allow_custom_value=True,
2956
  )
 
3037
  from gradio_client import Client
3038
  client = Client("pliny-the-prompter/obliteratus")
3039
  result = client.predict(
3040
+ model_choice="Alibaba (Qwen) / Qwen2.5-0.5B Instruct",
3041
  methods_to_test=["basic", "advanced", "surgical", "optimized"],
3042
  prompt_volume_choice="33 (fast)",
3043
  api_name="/benchmark",
 
3047
  with gr.Row():
3048
  bench_model = gr.Dropdown(
3049
  choices=list(MODELS.keys()),
3050
+ value="Alibaba (Qwen) / Qwen2.5-0.5B Instruct",
3051
  label="Target Model",
3052
  allow_custom_value=True,
3053
  )
 
3152
  from gradio_client import Client
3153
  client = Client("pliny-the-prompter/obliteratus")
3154
  result = client.predict(
3155
+ model_choices=["Alibaba (Qwen) / Qwen2.5-0.5B Instruct", "OpenAI / GPT-OSS 20B"],
3156
  method_choice="surgical",
3157
  prompt_volume_choice="33 (fast)",
3158
  api_name="/benchmark_multi_model",
 
3163
  mm_models = gr.CheckboxGroup(
3164
  choices=list(MODELS.keys()),
3165
  value=[
3166
+ "Alibaba (Qwen) / Qwen2.5-0.5B Instruct",
3167
+ "Alibaba (Qwen) / Qwen2.5-3B Instruct",
3168
  ],
3169
  label="Models to Test",
3170
  )
 
3309
 
3310
  def _preset_gptoss(vol, ds):
3311
  yield from benchmark(
3312
+ "OpenAI / GPT-OSS 20B",
3313
  ["basic", "advanced", "aggressive", "surgical",
3314
  "optimized", "inverted", "nuclear"],
3315
  vol, ds,
 
3318
  def _preset_moe_cross(vol, ds):
3319
  yield from benchmark_multi_model(
3320
  [
3321
+ "Alibaba (Qwen) / Qwen2.5-0.5B Instruct",
3322
+ "Alibaba (Qwen) / Qwen2.5-3B Instruct",
3323
+ "Alibaba (Qwen) / Qwen2.5-7B Instruct",
3324
+ "OpenAI / GPT-OSS 20B",
3325
  ],
3326
  "surgical", vol, ds,
3327
  )
 
3333
  # Part 1: basic method across models
3334
  for status, results_md, log, gallery in benchmark_multi_model(
3335
  [
3336
+ "Alibaba (Qwen) / Qwen2.5-0.5B Instruct",
3337
+ "Alibaba (Qwen) / Qwen2.5-3B Instruct",
3338
+ "Alibaba (Qwen) / Qwen2.5-7B Instruct",
3339
  ],
3340
  "basic", vol, ds,
3341
  ):
 
3344
  # Part 2: optimized method across models
3345
  for status, results_md, log, gallery in benchmark_multi_model(
3346
  [
3347
+ "Alibaba (Qwen) / Qwen2.5-0.5B Instruct",
3348
+ "Alibaba (Qwen) / Qwen2.5-3B Instruct",
3349
+ "Alibaba (Qwen) / Qwen2.5-7B Instruct",
3350
  ],
3351
  "optimized", vol, ds,
3352
  ):
 
3371
  # ── Tab 7: Leaderboard ────────────────────────────────────────────
3372
  with gr.Tab("Leaderboard", id="leaderboard"):
3373
  gr.Markdown("""### Community Leaderboard
3374
+ All benchmark results from this Space are anonymously logged to help the community
3375
+ find the best model + method combinations.
3376
 
3377
+ *Telemetry is **on by default** and is fully anonymous β€” no user identity, IP addresses, or prompt content
3378
+ is ever collected. Only aggregate benchmark metrics (model name, method, scores, hardware) are stored locally.
3379
+ To opt out, set the environment variable `OBLITERATUS_TELEMETRY=0` before launching.*
3380
  """)
3381
 
3382
  def _load_leaderboard():
 
3384
  try:
3385
  from obliteratus.telemetry import get_leaderboard_data, is_telemetry_enabled
3386
  if not is_telemetry_enabled():
3387
+ return "Telemetry is disabled. Remove `OBLITERATUS_TELEMETRY=0` or set it to `1` to re-enable.", ""
3388
 
3389
  data = get_leaderboard_data()
3390
  if not data:
obliteratus/.DS_Store CHANGED
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
 
obliteratus/abliterate.py CHANGED
@@ -941,9 +941,10 @@ class AbliterationPipeline:
941
  self.log(" Chat template not configured for this model; using raw prompts")
942
  return prompts
943
 
944
- self.log(" Wrapping prompts with chat template")
 
945
  wrapped = []
946
- for prompt in prompts:
947
  messages = [{"role": "user", "content": prompt}]
948
  try:
949
  text = tokenizer.apply_chat_template(
@@ -952,6 +953,8 @@ class AbliterationPipeline:
952
  wrapped.append(text)
953
  except Exception:
954
  wrapped.append(prompt) # fallback to raw if individual prompt fails
 
 
955
  return wrapped
956
 
957
  @staticmethod
 
941
  self.log(" Chat template not configured for this model; using raw prompts")
942
  return prompts
943
 
944
+ n = len(prompts)
945
+ self.log(f" Wrapping {n} prompts with chat template")
946
  wrapped = []
947
+ for i, prompt in enumerate(prompts):
948
  messages = [{"role": "user", "content": prompt}]
949
  try:
950
  text = tokenizer.apply_chat_template(
 
953
  wrapped.append(text)
954
  except Exception:
955
  wrapped.append(prompt) # fallback to raw if individual prompt fails
956
+ if (i + 1) % 100 == 0 or (i + 1) == n:
957
+ self.log(f" chat template {i + 1}/{n}")
958
  return wrapped
959
 
960
  @staticmethod
obliteratus/presets.py CHANGED
@@ -70,6 +70,22 @@ _PRESETS_LIST = [
70
  params="0.5B",
71
  recommended_dtype="float16",
72
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  ModelPreset(
74
  name="Qwen2.5-1.5B",
75
  hf_id="Qwen/Qwen2.5-1.5B",
@@ -78,6 +94,38 @@ _PRESETS_LIST = [
78
  params="1.5B",
79
  recommended_dtype="float16",
80
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  ModelPreset(
82
  name="Qwen2.5-7B",
83
  hf_id="Qwen/Qwen2.5-7B",
@@ -87,6 +135,33 @@ _PRESETS_LIST = [
87
  recommended_dtype="float16",
88
  recommended_quantization="4bit",
89
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  ModelPreset(
91
  name="Qwen2.5-14B",
92
  hf_id="Qwen/Qwen2.5-14B",
@@ -146,7 +221,24 @@ _PRESETS_LIST = [
146
  # β•‘ Allen Institute for AI (AI2) β•‘
147
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
148
  ModelPreset(
149
- name="OLMo 2 7B",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  hf_id="allenai/OLMo-2-0325-32B-Instruct",
151
  description="AI2's fully open 32B model (data+code+weights). Apache 2.0.",
152
  tier="large",
@@ -301,6 +393,15 @@ _PRESETS_LIST = [
301
  recommended_dtype="bfloat16",
302
  recommended_quantization="4bit",
303
  ),
 
 
 
 
 
 
 
 
 
304
  ModelPreset(
305
  name="DeepSeek-V3",
306
  hf_id="deepseek-ai/DeepSeek-V3",
@@ -448,6 +549,14 @@ _PRESETS_LIST = [
448
  params="1.7B",
449
  recommended_dtype="float16",
450
  ),
 
 
 
 
 
 
 
 
451
 
452
  # ╔══════════════════════════════════════════════════════════════════╗
453
  # β•‘ IBM (Granite) β•‘
@@ -537,6 +646,14 @@ _PRESETS_LIST = [
537
  params="3.8B",
538
  recommended_dtype="float16",
539
  ),
 
 
 
 
 
 
 
 
540
  ModelPreset(
541
  name="Phi-4",
542
  hf_id="microsoft/phi-4",
@@ -618,6 +735,18 @@ _PRESETS_LIST = [
618
  recommended_quantization="4bit",
619
  ),
620
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  # ╔══════════════════════════════════════════════════════════════════╗
622
  # β•‘ Moonshot AI (Kimi) β•‘
623
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
@@ -670,6 +799,19 @@ _PRESETS_LIST = [
670
  recommended_quantization="4bit",
671
  ),
672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
673
  # ╔══════════════════════════════════════════════════════════════════╗
674
  # β•‘ OpenAI Community (GPT-2) β•‘
675
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
@@ -709,6 +851,15 @@ _PRESETS_LIST = [
709
  # ╔══════════════════════════════════════════════════════════════════╗
710
  # β•‘ Shanghai AI Lab (InternLM) β•‘
711
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
 
 
 
 
 
 
 
 
 
712
  ModelPreset(
713
  name="InternLM2.5 7B Chat",
714
  hf_id="internlm/internlm2_5-7b-chat",
@@ -764,6 +915,14 @@ _PRESETS_LIST = [
764
  # ╔══════════════════════════════════════════════════════════════════╗
765
  # β•‘ Technology Innovation Institute (Falcon) β•‘
766
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•οΏ½οΏ½οΏ½β•β•β•β•β•β•
 
 
 
 
 
 
 
 
767
  ModelPreset(
768
  name="Falcon 7B",
769
  hf_id="tiiuae/falcon-7b",
@@ -773,6 +932,15 @@ _PRESETS_LIST = [
773
  recommended_dtype="float16",
774
  recommended_quantization="4bit",
775
  ),
 
 
 
 
 
 
 
 
 
776
  ModelPreset(
777
  name="Falcon 11B",
778
  hf_id="tiiuae/falcon-11B",
@@ -813,6 +981,15 @@ _PRESETS_LIST = [
813
  recommended_dtype="float16",
814
  recommended_quantization="4bit",
815
  ),
 
 
 
 
 
 
 
 
 
816
  ModelPreset(
817
  name="GLM-4 32B Chat",
818
  hf_id="zai-org/GLM-4-32B-0414",
 
70
  params="0.5B",
71
  recommended_dtype="float16",
72
  ),
73
+ ModelPreset(
74
+ name="Qwen2.5-0.5B Instruct",
75
+ hf_id="Qwen/Qwen2.5-0.5B-Instruct",
76
+ description="Tiny Qwen instruct model, fast ablation studies with chat template.",
77
+ tier="tiny",
78
+ params="0.5B",
79
+ recommended_dtype="float16",
80
+ ),
81
+ ModelPreset(
82
+ name="Qwen3-0.6B",
83
+ hf_id="Qwen/Qwen3-0.6B",
84
+ description="Qwen3 0.6B β€” smallest Qwen3 with think/non-think modes.",
85
+ tier="tiny",
86
+ params="0.6B",
87
+ recommended_dtype="float16",
88
+ ),
89
  ModelPreset(
90
  name="Qwen2.5-1.5B",
91
  hf_id="Qwen/Qwen2.5-1.5B",
 
94
  params="1.5B",
95
  recommended_dtype="float16",
96
  ),
97
+ ModelPreset(
98
+ name="Qwen2.5-1.5B Instruct",
99
+ hf_id="Qwen/Qwen2.5-1.5B-Instruct",
100
+ description="Qwen 1.5B instruct β€” strong multilingual chat model.",
101
+ tier="small",
102
+ params="1.5B",
103
+ recommended_dtype="float16",
104
+ ),
105
+ ModelPreset(
106
+ name="Qwen3-1.7B",
107
+ hf_id="Qwen/Qwen3-1.7B",
108
+ description="Qwen3 1.7B β€” compact Qwen3 with think/non-think modes.",
109
+ tier="small",
110
+ params="1.7B",
111
+ recommended_dtype="float16",
112
+ ),
113
+ ModelPreset(
114
+ name="Qwen2.5-3B Instruct",
115
+ hf_id="Qwen/Qwen2.5-3B-Instruct",
116
+ description="Qwen 3B instruct β€” excellent small chat model.",
117
+ tier="small",
118
+ params="3B",
119
+ recommended_dtype="float16",
120
+ ),
121
+ ModelPreset(
122
+ name="Qwen3-4B",
123
+ hf_id="Qwen/Qwen3-4B",
124
+ description="Qwen3 4B β€” strong reasoning with think/non-think modes. Apache 2.0.",
125
+ tier="small",
126
+ params="4B",
127
+ recommended_dtype="float16",
128
+ ),
129
  ModelPreset(
130
  name="Qwen2.5-7B",
131
  hf_id="Qwen/Qwen2.5-7B",
 
135
  recommended_dtype="float16",
136
  recommended_quantization="4bit",
137
  ),
138
+ ModelPreset(
139
+ name="Qwen2.5-7B Instruct",
140
+ hf_id="Qwen/Qwen2.5-7B-Instruct",
141
+ description="Qwen 7B instruct variant with chat template.",
142
+ tier="medium",
143
+ params="7B",
144
+ recommended_dtype="float16",
145
+ recommended_quantization="4bit",
146
+ ),
147
+ ModelPreset(
148
+ name="Qwen2.5 Coder 7B Instruct",
149
+ hf_id="Qwen/Qwen2.5-Coder-7B-Instruct",
150
+ description="Qwen 7B fine-tuned for code generation and understanding.",
151
+ tier="medium",
152
+ params="7B",
153
+ recommended_dtype="float16",
154
+ recommended_quantization="4bit",
155
+ ),
156
+ ModelPreset(
157
+ name="Qwen3-8B",
158
+ hf_id="Qwen/Qwen3-8B",
159
+ description="Qwen3 8B β€” strong reasoning, think/non-think modes. Apache 2.0.",
160
+ tier="medium",
161
+ params="8B",
162
+ recommended_dtype="float16",
163
+ recommended_quantization="4bit",
164
+ ),
165
  ModelPreset(
166
  name="Qwen2.5-14B",
167
  hf_id="Qwen/Qwen2.5-14B",
 
221
  # β•‘ Allen Institute for AI (AI2) β•‘
222
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
223
  ModelPreset(
224
+ name="OLMo 2 1B Instruct",
225
+ hf_id="allenai/OLMo-2-0425-1B-Instruct",
226
+ description="AI2's compact fully open 1B instruct model. Apache 2.0.",
227
+ tier="tiny",
228
+ params="1B",
229
+ recommended_dtype="float16",
230
+ ),
231
+ ModelPreset(
232
+ name="OLMo 3 7B Instruct",
233
+ hf_id="allenai/Olmo-3-7B-Instruct",
234
+ description="AI2's fully open 7B instruct model. Apache 2.0.",
235
+ tier="medium",
236
+ params="7B",
237
+ recommended_dtype="float16",
238
+ recommended_quantization="4bit",
239
+ ),
240
+ ModelPreset(
241
+ name="OLMo 2 32B Instruct",
242
  hf_id="allenai/OLMo-2-0325-32B-Instruct",
243
  description="AI2's fully open 32B model (data+code+weights). Apache 2.0.",
244
  tier="large",
 
393
  recommended_dtype="bfloat16",
394
  recommended_quantization="4bit",
395
  ),
396
+ ModelPreset(
397
+ name="DeepSeek-R1 0528 Qwen3-8B",
398
+ hf_id="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
399
+ description="DeepSeek-R1 reasoning distilled into Qwen3 8B. Latest R1 distillation. MIT.",
400
+ tier="medium",
401
+ params="8B",
402
+ recommended_dtype="float16",
403
+ recommended_quantization="4bit",
404
+ ),
405
  ModelPreset(
406
  name="DeepSeek-V3",
407
  hf_id="deepseek-ai/DeepSeek-V3",
 
549
  params="1.7B",
550
  recommended_dtype="float16",
551
  ),
552
+ ModelPreset(
553
+ name="SmolLM3-3B",
554
+ hf_id="HuggingFaceTB/SmolLM3-3B",
555
+ description="HuggingFace's SmolLM3 3B. Latest efficient small LM.",
556
+ tier="small",
557
+ params="3B",
558
+ recommended_dtype="float16",
559
+ ),
560
 
561
  # ╔══════════════════════════════════════════════════════════════════╗
562
  # β•‘ IBM (Granite) β•‘
 
646
  params="3.8B",
647
  recommended_dtype="float16",
648
  ),
649
+ ModelPreset(
650
+ name="Phi-4 Mini Instruct",
651
+ hf_id="microsoft/Phi-4-mini-instruct",
652
+ description="Microsoft's 3.8B Phi-4 Mini. Strong reasoning for its size. MIT license.",
653
+ tier="small",
654
+ params="3.8B",
655
+ recommended_dtype="float16",
656
+ ),
657
  ModelPreset(
658
  name="Phi-4",
659
  hf_id="microsoft/phi-4",
 
735
  recommended_quantization="4bit",
736
  ),
737
 
738
+ # ╔══════════════════════════════════════════════════════════════════╗
739
+ # β•‘ OpenBMB β•‘
740
+ # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
741
+ ModelPreset(
742
+ name="MiniCPM3-4B",
743
+ hf_id="openbmb/MiniCPM3-4B",
744
+ description="OpenBMB's MiniCPM3 4B. Efficient on-device LM with strong reasoning.",
745
+ tier="small",
746
+ params="4B",
747
+ recommended_dtype="float16",
748
+ ),
749
+
750
  # ╔══════════════════════════════════════════════════════════════════╗
751
  # β•‘ Moonshot AI (Kimi) β•‘
752
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
 
799
  recommended_quantization="4bit",
800
  ),
801
 
802
+ # ╔══════════════════════════════════════════════════════════════════╗
803
+ # β•‘ OpenAI β•‘
804
+ # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
805
+ ModelPreset(
806
+ name="GPT-OSS 20B",
807
+ hf_id="openai/gpt-oss-20b",
808
+ description="OpenAI's first open-weight MoE (20B total, 3.6B active). MIT license.",
809
+ tier="large",
810
+ params="20B MoE",
811
+ recommended_dtype="float16",
812
+ recommended_quantization="4bit",
813
+ ),
814
+
815
  # ╔══════════════════════════════════════════════════════════════════╗
816
  # β•‘ OpenAI Community (GPT-2) β•‘
817
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
 
851
  # ╔══════════════════════════════════════════════════════════════════╗
852
  # β•‘ Shanghai AI Lab (InternLM) β•‘
853
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
854
+ ModelPreset(
855
+ name="InternLM3-8B Instruct",
856
+ hf_id="internlm/internlm3-8b-instruct",
857
+ description="Shanghai AI Lab's InternLM3 8B instruct. Strong reasoning. Apache 2.0.",
858
+ tier="medium",
859
+ params="8B",
860
+ recommended_dtype="float16",
861
+ recommended_quantization="4bit",
862
+ ),
863
  ModelPreset(
864
  name="InternLM2.5 7B Chat",
865
  hf_id="internlm/internlm2_5-7b-chat",
 
915
  # ╔══════════════════════════════════════════════════════════════════╗
916
  # β•‘ Technology Innovation Institute (Falcon) β•‘
917
  # β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•οΏ½οΏ½οΏ½β•β•β•β•β•β•
918
+ ModelPreset(
919
+ name="Falcon3-3B Instruct",
920
+ hf_id="tiiuae/Falcon3-3B-Instruct",
921
+ description="TII's Falcon3 3B instruct. Modern architecture, Apache 2.0.",
922
+ tier="small",
923
+ params="3B",
924
+ recommended_dtype="float16",
925
+ ),
926
  ModelPreset(
927
  name="Falcon 7B",
928
  hf_id="tiiuae/falcon-7b",
 
932
  recommended_dtype="float16",
933
  recommended_quantization="4bit",
934
  ),
935
+ ModelPreset(
936
+ name="Falcon3-7B Instruct",
937
+ hf_id="tiiuae/Falcon3-7B-Instruct",
938
+ description="TII's Falcon3 7B instruct. Modern architecture, Apache 2.0.",
939
+ tier="medium",
940
+ params="7B",
941
+ recommended_dtype="float16",
942
+ recommended_quantization="4bit",
943
+ ),
944
  ModelPreset(
945
  name="Falcon 11B",
946
  hf_id="tiiuae/falcon-11B",
 
981
  recommended_dtype="float16",
982
  recommended_quantization="4bit",
983
  ),
984
+ ModelPreset(
985
+ name="GLM-4 9B Chat HF",
986
+ hf_id="THUDM/glm-4-9b-chat-hf",
987
+ description="GLM-4 9B chat variant (HuggingFace-compatible format). No trust_remote_code needed.",
988
+ tier="medium",
989
+ params="9B",
990
+ recommended_dtype="float16",
991
+ recommended_quantization="4bit",
992
+ ),
993
  ModelPreset(
994
  name="GLM-4 32B Chat",
995
  hf_id="zai-org/GLM-4-32B-0414",
obliteratus/telemetry.py CHANGED
@@ -5,8 +5,9 @@ HuggingFace Dataset for community leaderboard aggregation. No user
5
  identity, IP addresses, or prompt content is stored β€” only aggregate
6
  benchmark metrics (model name, method, scores, hardware info, timestamp).
7
 
8
- Users can opt in by setting OBLITERATUS_TELEMETRY=1 or calling
9
- enable_telemetry(). Telemetry is disabled by default.
 
10
 
11
  Architecture:
12
  1. Every benchmark/obliteration run appends a record to a local JSONL
@@ -38,9 +39,9 @@ logger = logging.getLogger(__name__)
38
 
39
  # ── Configuration ─────────────────────────────────────────────────────
40
 
41
- _TELEMETRY_ENABLED = os.environ.get("OBLITERATUS_TELEMETRY", "0") == "1"
42
 
43
- # ── Opt-in telemetry state (v2 API) ──────────────────────────────────
44
  _enabled: bool | None = None
45
  _TELEMETRY_REPO = os.environ.get(
46
  "OBLITERATUS_TELEMETRY_REPO", "pliny-the-prompter/obliteratus-telemetry"
@@ -95,12 +96,12 @@ def is_telemetry_enabled() -> bool:
95
 
96
 
97
  def is_enabled() -> bool:
98
- """Check if v2 opt-in telemetry is enabled."""
99
  global _enabled
100
  if _enabled is not None:
101
  return _enabled
102
- env = os.environ.get("OBLITERATUS_TELEMETRY", "")
103
- return env in ("1", "true")
104
 
105
 
106
  # ── Record schema ─────────────────────────────────────────────────────
 
5
  identity, IP addresses, or prompt content is stored β€” only aggregate
6
  benchmark metrics (model name, method, scores, hardware info, timestamp).
7
 
8
+ Telemetry is enabled by default to help the community build better
9
+ benchmarks. Users can opt out at any time by setting OBLITERATUS_TELEMETRY=0
10
+ or calling disable_telemetry().
11
 
12
  Architecture:
13
  1. Every benchmark/obliteration run appends a record to a local JSONL
 
39
 
40
  # ── Configuration ─────────────────────────────────────────────────────
41
 
42
+ _TELEMETRY_ENABLED = os.environ.get("OBLITERATUS_TELEMETRY", "1") != "0"
43
 
44
+ # ── Telemetry state (v2 API) ─────────────────────────────────────────
45
  _enabled: bool | None = None
46
  _TELEMETRY_REPO = os.environ.get(
47
  "OBLITERATUS_TELEMETRY_REPO", "pliny-the-prompter/obliteratus-telemetry"
 
96
 
97
 
98
  def is_enabled() -> bool:
99
+ """Check if telemetry is enabled (on by default, opt out with OBLITERATUS_TELEMETRY=0)."""
100
  global _enabled
101
  if _enabled is not None:
102
  return _enabled
103
+ env = os.environ.get("OBLITERATUS_TELEMETRY", "1")
104
+ return env not in ("0", "false")
105
 
106
 
107
  # ── Record schema ─────────────────────────────────────────────────────
tests/test_telemetry.py CHANGED
@@ -37,18 +37,23 @@ class TestTelemetryConfig:
37
  def setup_method(self):
38
  _reset_telemetry()
39
 
40
- def test_disabled_by_default(self):
41
  with patch.dict(os.environ, {}, clear=True):
 
 
 
 
 
42
  _reset_telemetry()
43
  assert not is_enabled()
44
 
45
- def test_enable_via_env(self):
46
- with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
47
  _reset_telemetry()
48
- assert is_enabled()
49
 
50
- def test_enable_via_env_true(self):
51
- with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "true"}):
52
  _reset_telemetry()
53
  assert is_enabled()
54
 
 
37
  def setup_method(self):
38
  _reset_telemetry()
39
 
40
+ def test_enabled_by_default(self):
41
  with patch.dict(os.environ, {}, clear=True):
42
+ _reset_telemetry()
43
+ assert is_enabled()
44
+
45
+ def test_disable_via_env_zero(self):
46
+ with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "0"}):
47
  _reset_telemetry()
48
  assert not is_enabled()
49
 
50
+ def test_disable_via_env_false(self):
51
+ with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "false"}):
52
  _reset_telemetry()
53
+ assert not is_enabled()
54
 
55
+ def test_enable_via_env_explicit(self):
56
+ with patch.dict(os.environ, {"OBLITERATUS_TELEMETRY": "1"}):
57
  _reset_telemetry()
58
  assert is_enabled()
59