Claude committed on
Commit
a69f12b
·
1 Parent(s): 998779f

Fix Windows encoding issues in diagnostic script

Browse files
scripts/diagnose_structural_clothing.py CHANGED
@@ -89,9 +89,9 @@ def run_diagnostic():
89
  caption = test_case["caption"]
90
  expected = test_case["expected"]
91
 
92
- print(f"\n{'─' * 80}")
93
  print(f"TEST {i}/{len(TEST_CASES)}: {name}")
94
- print(f"{'─' * 80}")
95
  print(f"Caption: {caption}")
96
  print(f"Expected tags: {expected}")
97
  print(f"\nCalling LLM...", flush=True)
@@ -123,13 +123,13 @@ def run_diagnostic():
123
 
124
  clothing_correct = expected_clothing == selected_clothing
125
 
126
- print(f"\n✓ Correct: {sorted(correct)}")
127
  if missed:
128
- print(f"✗ Missed: {sorted(missed)}")
129
  if extra:
130
- print(f"⚠ Extra: {sorted(extra)}")
131
 
132
- print(f"\nClothing state inference: {'✓ PASS' if clothing_correct else '✗ FAIL'}")
133
  if expected_clothing:
134
  print(f" Expected: {sorted(expected_clothing)}")
135
  print(f" Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
@@ -155,16 +155,16 @@ def run_diagnostic():
155
 
156
  print(f"\nTotal tests: {total_tests}")
157
  print(f"Clothing state inference:")
158
- print(f" ✓ Pass: {clothing_pass}/{total_tests} ({100*clothing_pass/total_tests:.0f}%)")
159
- print(f" ✗ Fail: {clothing_fail}/{total_tests} ({100*clothing_fail/total_tests:.0f}%)")
160
 
161
  if clothing_fail > 0:
162
- print(f"\n{'─' * 80}")
163
  print("FAILURES:")
164
- print(f"{'─' * 80}")
165
  for r in results:
166
  if not r["clothing_correct"]:
167
- print(f"\n• {r['name']}")
168
  print(f" Caption: {r['caption'][:60]}...")
169
  clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
170
  exp_clothing = set(r['expected']) & clothing_tags
@@ -178,15 +178,15 @@ def run_diagnostic():
178
  print(f"{'=' * 80}")
179
 
180
  if clothing_pass == total_tests:
181
- print("\n✓ All tests passed! Clothing inference is working correctly.")
182
  elif clothing_pass == 0:
183
- print("\n✗ ALL tests failed! The LLM is completely ignoring the clothing state group.")
184
  print("\nPossible causes:")
185
  print("1. Prompt design issue - clothing group not salient enough")
186
  print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
187
  print("3. Response parsing issue - LLM is selecting but parser is missing it")
188
  else:
189
- print(f"\n⚠ Partial failure! {clothing_fail}/{total_tests} tests failed.")
190
  print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
191
 
192
  return results
 
89
  caption = test_case["caption"]
90
  expected = test_case["expected"]
91
 
92
+ print(f"\n{'-' * 80}")
93
  print(f"TEST {i}/{len(TEST_CASES)}: {name}")
94
+ print(f"{'-' * 80}")
95
  print(f"Caption: {caption}")
96
  print(f"Expected tags: {expected}")
97
  print(f"\nCalling LLM...", flush=True)
 
123
 
124
  clothing_correct = expected_clothing == selected_clothing
125
 
126
+ print(f"\n[OK] Correct: {sorted(correct)}")
127
  if missed:
128
+ print(f"[X] Missed: {sorted(missed)}")
129
  if extra:
130
+ print(f"[!] Extra: {sorted(extra)}")
131
 
132
+ print(f"\nClothing state inference: {'[OK] PASS' if clothing_correct else '[X] FAIL'}")
133
  if expected_clothing:
134
  print(f" Expected: {sorted(expected_clothing)}")
135
  print(f" Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
 
155
 
156
  print(f"\nTotal tests: {total_tests}")
157
  print(f"Clothing state inference:")
158
+ print(f" [OK] Pass: {clothing_pass}/{total_tests} ({100*clothing_pass/total_tests:.0f}%)")
159
+ print(f" [X] Fail: {clothing_fail}/{total_tests} ({100*clothing_fail/total_tests:.0f}%)")
160
 
161
  if clothing_fail > 0:
162
+ print(f"\n{'-' * 80}")
163
  print("FAILURES:")
164
+ print(f"{'-' * 80}")
165
  for r in results:
166
  if not r["clothing_correct"]:
167
+ print(f"\n* {r['name']}")
168
  print(f" Caption: {r['caption'][:60]}...")
169
  clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
170
  exp_clothing = set(r['expected']) & clothing_tags
 
178
  print(f"{'=' * 80}")
179
 
180
  if clothing_pass == total_tests:
181
+ print("\n[OK] All tests passed! Clothing inference is working correctly.")
182
  elif clothing_pass == 0:
183
+ print("\n[X] ALL tests failed! The LLM is completely ignoring the clothing state group.")
184
  print("\nPossible causes:")
185
  print("1. Prompt design issue - clothing group not salient enough")
186
  print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
187
  print("3. Response parsing issue - LLM is selecting but parser is missing it")
188
  else:
189
+ print(f"\n[!] Partial failure! {clothing_fail}/{total_tests} tests failed.")
190
  print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
191
 
192
  return results