Spaces:
Running
Running
Claude committed on
Commit ·
a69f12b
1
Parent(s): 998779f
Fix Windows encoding issues in diagnostic script
Browse files
scripts/diagnose_structural_clothing.py
CHANGED
|
@@ -89,9 +89,9 @@ def run_diagnostic():
|
|
| 89 |
caption = test_case["caption"]
|
| 90 |
expected = test_case["expected"]
|
| 91 |
|
| 92 |
-
print(f"\n{'
|
| 93 |
print(f"TEST {i}/{len(TEST_CASES)}: {name}")
|
| 94 |
-
print(f"{'
|
| 95 |
print(f"Caption: {caption}")
|
| 96 |
print(f"Expected tags: {expected}")
|
| 97 |
print(f"\nCalling LLM...", flush=True)
|
|
@@ -123,13 +123,13 @@ def run_diagnostic():
|
|
| 123 |
|
| 124 |
clothing_correct = expected_clothing == selected_clothing
|
| 125 |
|
| 126 |
-
print(f"\n
|
| 127 |
if missed:
|
| 128 |
-
print(f"
|
| 129 |
if extra:
|
| 130 |
-
print(f"
|
| 131 |
|
| 132 |
-
print(f"\nClothing state inference: {'
|
| 133 |
if expected_clothing:
|
| 134 |
print(f" Expected: {sorted(expected_clothing)}")
|
| 135 |
print(f" Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
|
|
@@ -155,16 +155,16 @@ def run_diagnostic():
|
|
| 155 |
|
| 156 |
print(f"\nTotal tests: {total_tests}")
|
| 157 |
print(f"Clothing state inference:")
|
| 158 |
-
print(f"
|
| 159 |
-
print(f"
|
| 160 |
|
| 161 |
if clothing_fail > 0:
|
| 162 |
-
print(f"\n{'
|
| 163 |
print("FAILURES:")
|
| 164 |
-
print(f"{'
|
| 165 |
for r in results:
|
| 166 |
if not r["clothing_correct"]:
|
| 167 |
-
print(f"\n
|
| 168 |
print(f" Caption: {r['caption'][:60]}...")
|
| 169 |
clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
|
| 170 |
exp_clothing = set(r['expected']) & clothing_tags
|
|
@@ -178,15 +178,15 @@ def run_diagnostic():
|
|
| 178 |
print(f"{'=' * 80}")
|
| 179 |
|
| 180 |
if clothing_pass == total_tests:
|
| 181 |
-
print("\n
|
| 182 |
elif clothing_pass == 0:
|
| 183 |
-
print("\n
|
| 184 |
print("\nPossible causes:")
|
| 185 |
print("1. Prompt design issue - clothing group not salient enough")
|
| 186 |
print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
|
| 187 |
print("3. Response parsing issue - LLM is selecting but parser is missing it")
|
| 188 |
else:
|
| 189 |
-
print(f"\n
|
| 190 |
print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
|
| 191 |
|
| 192 |
return results
|
|
|
|
| 89 |
caption = test_case["caption"]
|
| 90 |
expected = test_case["expected"]
|
| 91 |
|
| 92 |
+
print(f"\n{'-' * 80}")
|
| 93 |
print(f"TEST {i}/{len(TEST_CASES)}: {name}")
|
| 94 |
+
print(f"{'-' * 80}")
|
| 95 |
print(f"Caption: {caption}")
|
| 96 |
print(f"Expected tags: {expected}")
|
| 97 |
print(f"\nCalling LLM...", flush=True)
|
|
|
|
| 123 |
|
| 124 |
clothing_correct = expected_clothing == selected_clothing
|
| 125 |
|
| 126 |
+
print(f"\n[OK] Correct: {sorted(correct)}")
|
| 127 |
if missed:
|
| 128 |
+
print(f"[X] Missed: {sorted(missed)}")
|
| 129 |
if extra:
|
| 130 |
+
print(f"[!] Extra: {sorted(extra)}")
|
| 131 |
|
| 132 |
+
print(f"\nClothing state inference: {'[OK] PASS' if clothing_correct else '[X] FAIL'}")
|
| 133 |
if expected_clothing:
|
| 134 |
print(f" Expected: {sorted(expected_clothing)}")
|
| 135 |
print(f" Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
|
|
|
|
| 155 |
|
| 156 |
print(f"\nTotal tests: {total_tests}")
|
| 157 |
print(f"Clothing state inference:")
|
| 158 |
+
print(f" [OK] Pass: {clothing_pass}/{total_tests} ({100*clothing_pass/total_tests:.0f}%)")
|
| 159 |
+
print(f" [X] Fail: {clothing_fail}/{total_tests} ({100*clothing_fail/total_tests:.0f}%)")
|
| 160 |
|
| 161 |
if clothing_fail > 0:
|
| 162 |
+
print(f"\n{'-' * 80}")
|
| 163 |
print("FAILURES:")
|
| 164 |
+
print(f"{'-' * 80}")
|
| 165 |
for r in results:
|
| 166 |
if not r["clothing_correct"]:
|
| 167 |
+
print(f"\n* {r['name']}")
|
| 168 |
print(f" Caption: {r['caption'][:60]}...")
|
| 169 |
clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
|
| 170 |
exp_clothing = set(r['expected']) & clothing_tags
|
|
|
|
| 178 |
print(f"{'=' * 80}")
|
| 179 |
|
| 180 |
if clothing_pass == total_tests:
|
| 181 |
+
print("\n[OK] All tests passed! Clothing inference is working correctly.")
|
| 182 |
elif clothing_pass == 0:
|
| 183 |
+
print("\n[X] ALL tests failed! The LLM is completely ignoring the clothing state group.")
|
| 184 |
print("\nPossible causes:")
|
| 185 |
print("1. Prompt design issue - clothing group not salient enough")
|
| 186 |
print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
|
| 187 |
print("3. Response parsing issue - LLM is selecting but parser is missing it")
|
| 188 |
else:
|
| 189 |
+
print(f"\n[!] Partial failure! {clothing_fail}/{total_tests} tests failed.")
|
| 190 |
print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
|
| 191 |
|
| 192 |
return results
|