Spaces:
Running
Running
Claude committed on
Commit ·
a69f12b
1
Parent(s): 998779f
Fix Windows encoding issues in diagnostic script
Browse files
scripts/diagnose_structural_clothing.py
CHANGED
|
@@ -89,9 +89,9 @@ def run_diagnostic():
|
|
| 89 |
caption = test_case["caption"]
|
| 90 |
expected = test_case["expected"]
|
| 91 |
|
| 92 |
-
print(f"\n{'
|
| 93 |
print(f"TEST {i}/{len(TEST_CASES)}: {name}")
|
| 94 |
-
print(f"{'
|
| 95 |
print(f"Caption: {caption}")
|
| 96 |
print(f"Expected tags: {expected}")
|
| 97 |
print(f"\nCalling LLM...", flush=True)
|
|
@@ -123,13 +123,13 @@ def run_diagnostic():
|
|
| 123 |
|
| 124 |
clothing_correct = expected_clothing == selected_clothing
|
| 125 |
|
| 126 |
-
print(f"\n
|
| 127 |
if missed:
|
| 128 |
-
print(f"
|
| 129 |
if extra:
|
| 130 |
-
print(f"
|
| 131 |
|
| 132 |
-
print(f"\nClothing state inference: {'
|
| 133 |
if expected_clothing:
|
| 134 |
print(f" Expected: {sorted(expected_clothing)}")
|
| 135 |
print(f" Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
|
|
@@ -155,16 +155,16 @@ def run_diagnostic():
|
|
| 155 |
|
| 156 |
print(f"\nTotal tests: {total_tests}")
|
| 157 |
print(f"Clothing state inference:")
|
| 158 |
-
print(f"
|
| 159 |
-
print(f"
|
| 160 |
|
| 161 |
if clothing_fail > 0:
|
| 162 |
-
print(f"\n{'
|
| 163 |
print("FAILURES:")
|
| 164 |
-
print(f"{'
|
| 165 |
for r in results:
|
| 166 |
if not r["clothing_correct"]:
|
| 167 |
-
print(f"\n
|
| 168 |
print(f" Caption: {r['caption'][:60]}...")
|
| 169 |
clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
|
| 170 |
exp_clothing = set(r['expected']) & clothing_tags
|
|
@@ -178,15 +178,15 @@ def run_diagnostic():
|
|
| 178 |
print(f"{'=' * 80}")
|
| 179 |
|
| 180 |
if clothing_pass == total_tests:
|
| 181 |
-
print("\n
|
| 182 |
elif clothing_pass == 0:
|
| 183 |
-
print("\n
|
| 184 |
print("\nPossible causes:")
|
| 185 |
print("1. Prompt design issue - clothing group not salient enough")
|
| 186 |
print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
|
| 187 |
print("3. Response parsing issue - LLM is selecting but parser is missing it")
|
| 188 |
else:
|
| 189 |
-
print(f"\n
|
| 190 |
print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
|
| 191 |
|
| 192 |
return results
|
|
|
|
| 89 |
caption = test_case["caption"]
|
| 90 |
expected = test_case["expected"]
|
| 91 |
|
| 92 |
+
print(f"\n{'-' * 80}")
|
| 93 |
print(f"TEST {i}/{len(TEST_CASES)}: {name}")
|
| 94 |
+
print(f"{'-' * 80}")
|
| 95 |
print(f"Caption: {caption}")
|
| 96 |
print(f"Expected tags: {expected}")
|
| 97 |
print(f"\nCalling LLM...", flush=True)
|
|
|
|
| 123 |
|
| 124 |
clothing_correct = expected_clothing == selected_clothing
|
| 125 |
|
| 126 |
+
print(f"\n[OK] Correct: {sorted(correct)}")
|
| 127 |
if missed:
|
| 128 |
+
print(f"[X] Missed: {sorted(missed)}")
|
| 129 |
if extra:
|
| 130 |
+
print(f"[!] Extra: {sorted(extra)}")
|
| 131 |
|
| 132 |
+
print(f"\nClothing state inference: {'[OK] PASS' if clothing_correct else '[X] FAIL'}")
|
| 133 |
if expected_clothing:
|
| 134 |
print(f" Expected: {sorted(expected_clothing)}")
|
| 135 |
print(f" Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
|
|
|
|
| 155 |
|
| 156 |
print(f"\nTotal tests: {total_tests}")
|
| 157 |
print(f"Clothing state inference:")
|
| 158 |
+
print(f" [OK] Pass: {clothing_pass}/{total_tests} ({100*clothing_pass/total_tests:.0f}%)")
|
| 159 |
+
print(f" [X] Fail: {clothing_fail}/{total_tests} ({100*clothing_fail/total_tests:.0f}%)")
|
| 160 |
|
| 161 |
if clothing_fail > 0:
|
| 162 |
+
print(f"\n{'-' * 80}")
|
| 163 |
print("FAILURES:")
|
| 164 |
+
print(f"{'-' * 80}")
|
| 165 |
for r in results:
|
| 166 |
if not r["clothing_correct"]:
|
| 167 |
+
print(f"\n* {r['name']}")
|
| 168 |
print(f" Caption: {r['caption'][:60]}...")
|
| 169 |
clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
|
| 170 |
exp_clothing = set(r['expected']) & clothing_tags
|
|
|
|
| 178 |
print(f"{'=' * 80}")
|
| 179 |
|
| 180 |
if clothing_pass == total_tests:
|
| 181 |
+
print("\n[OK] All tests passed! Clothing inference is working correctly.")
|
| 182 |
elif clothing_pass == 0:
|
| 183 |
+
print("\n[X] ALL tests failed! The LLM is completely ignoring the clothing state group.")
|
| 184 |
print("\nPossible causes:")
|
| 185 |
print("1. Prompt design issue - clothing group not salient enough")
|
| 186 |
print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
|
| 187 |
print("3. Response parsing issue - LLM is selecting but parser is missing it")
|
| 188 |
else:
|
| 189 |
+
print(f"\n[!] Partial failure! {clothing_fail}/{total_tests} tests failed.")
|
| 190 |
print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
|
| 191 |
|
| 192 |
return results
|