Anisha Bhatnagar
committed on
Commit
·
3ad08b5
1
Parent(s):
9a097e7
fixed bug in region computation for all task authors and reduced logging
Browse files
- app.py +1 -0
- utils/interp_space_utils.py +17 -17
- utils/llm_feat_utils.py +7 -2
app.py
CHANGED
|
@@ -22,6 +22,7 @@ def load_config(path="config/config.yaml"):
|
|
| 22 |
return yaml.safe_load(f)
|
| 23 |
|
| 24 |
# A comment to trigger change in spaces
|
|
|
|
| 25 |
cfg = load_config()
|
| 26 |
|
| 27 |
|
|
|
|
| 22 |
return yaml.safe_load(f)
|
| 23 |
|
| 24 |
# A comment to trigger change in spaces
|
| 25 |
+
# comment 2
|
| 26 |
cfg = load_config()
|
| 27 |
|
| 28 |
|
utils/interp_space_utils.py
CHANGED
|
@@ -697,19 +697,19 @@ def compute_clusters_g2v_representation(
|
|
| 697 |
filtered_features = []
|
| 698 |
for feature, score, z_score in top_g2v_feats:
|
| 699 |
# DEBUG: Print what we're checking for this feature
|
| 700 |
-
print(f"[DEBUG] Checking feature: {feature}")
|
| 701 |
-
print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
|
| 702 |
|
| 703 |
# Check if the feature has a non-zero value in all of the selected authors
|
| 704 |
feature_presence = []
|
| 705 |
for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
|
| 706 |
feature_value = author_g2v_feats.get(feature, 0)
|
| 707 |
feature_presence.append(feature_value)
|
| 708 |
-
print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
|
| 709 |
|
| 710 |
-
print(f"[DEBUG] All feature values: {feature_presence}")
|
| 711 |
-
print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
|
| 712 |
-
print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
|
| 713 |
|
| 714 |
# First check: feature must be present in Gram2Vec vectors
|
| 715 |
vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
|
|
@@ -727,11 +727,11 @@ def compute_clusters_g2v_representation(
|
|
| 727 |
|
| 728 |
spans = find_feature_spans(doc_text, feature)
|
| 729 |
if not spans: # No spans found in this document
|
| 730 |
-
print(f"[DEBUG] β Feature '{feature}' not found in document {i} of selected author")
|
| 731 |
text_present = False
|
| 732 |
break
|
| 733 |
-
else:
|
| 734 |
-
print(f"[DEBUG] β Feature '{feature}' found in document {i} with {len(spans)} spans")
|
| 735 |
except Exception as e:
|
| 736 |
print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
|
| 737 |
# Fall back to vector-based filtering if text checking fails
|
|
@@ -740,13 +740,13 @@ def compute_clusters_g2v_representation(
|
|
| 740 |
# Feature must pass BOTH checks
|
| 741 |
if vector_present and text_present:
|
| 742 |
filtered_features.append((feature, score, z_score))
|
| 743 |
-
print(f"[DEBUG] β Feature '{feature}' PASSED both vector and text checks")
|
| 744 |
-
else:
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
|
| 751 |
|
| 752 |
print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
|
|
@@ -947,7 +947,7 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_n
|
|
| 947 |
# Region 11: Centroid of all task authors (mystery + 3 candidates)
|
| 948 |
task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
|
| 949 |
regions["All Task Authors Centroid"] = get_region_around_point(
|
| 950 |
-
task_centroid, "All Task Authors"
|
| 951 |
)
|
| 952 |
|
| 953 |
def serialize_numpy_dtypes(obj):
|
|
|
|
| 697 |
filtered_features = []
|
| 698 |
for feature, score, z_score in top_g2v_feats:
|
| 699 |
# DEBUG: Print what we're checking for this feature
|
| 700 |
+
# print(f"[DEBUG] Checking feature: {feature}")
|
| 701 |
+
# print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
|
| 702 |
|
| 703 |
# Check if the feature has a non-zero value in all of the selected authors
|
| 704 |
feature_presence = []
|
| 705 |
for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
|
| 706 |
feature_value = author_g2v_feats.get(feature, 0)
|
| 707 |
feature_presence.append(feature_value)
|
| 708 |
+
# print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
|
| 709 |
|
| 710 |
+
# print(f"[DEBUG] All feature values: {feature_presence}")
|
| 711 |
+
# print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
|
| 712 |
+
# print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
|
| 713 |
|
| 714 |
# First check: feature must be present in Gram2Vec vectors
|
| 715 |
vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
|
|
|
|
| 727 |
|
| 728 |
spans = find_feature_spans(doc_text, feature)
|
| 729 |
if not spans: # No spans found in this document
|
| 730 |
+
# print(f"[DEBUG] β Feature '{feature}' not found in document {i} of selected author")
|
| 731 |
text_present = False
|
| 732 |
break
|
| 733 |
+
# else:
|
| 734 |
+
# print(f"[DEBUG] β Feature '{feature}' found in document {i} with {len(spans)} spans")
|
| 735 |
except Exception as e:
|
| 736 |
print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
|
| 737 |
# Fall back to vector-based filtering if text checking fails
|
|
|
|
| 740 |
# Feature must pass BOTH checks
|
| 741 |
if vector_present and text_present:
|
| 742 |
filtered_features.append((feature, score, z_score))
|
| 743 |
+
# print(f"[DEBUG] β Feature '{feature}' PASSED both vector and text checks")
|
| 744 |
+
# else:
|
| 745 |
+
# if not vector_present:
|
| 746 |
+
# # print(f"[DEBUG] β Feature '{feature}' FAILED vector check")
|
| 747 |
+
# if not text_present:
|
| 748 |
+
# # print(f"[DEBUG] β Feature '{feature}' FAILED text check")
|
| 749 |
+
# # print(f"[DEBUG] β Feature '{feature}' FAILED the filter")
|
| 750 |
|
| 751 |
|
| 752 |
print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
|
|
|
|
| 947 |
# Region 11: Centroid of all task authors (mystery + 3 candidates)
|
| 948 |
task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
|
| 949 |
regions["All Task Authors Centroid"] = get_region_around_point(
|
| 950 |
+
task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
|
| 951 |
)
|
| 952 |
|
| 953 |
def serialize_numpy_dtypes(obj):
|
utils/llm_feat_utils.py
CHANGED
|
@@ -100,6 +100,8 @@ def generate_feature_spans_cached(client, text: str, features: list[str], role:
|
|
| 100 |
cache = {}
|
| 101 |
result: dict[str, list[str]] = {}
|
| 102 |
missing_feats: list[str] = []
|
|
|
|
|
|
|
| 103 |
|
| 104 |
for feat in features:
|
| 105 |
if feat == "None":
|
|
@@ -108,12 +110,15 @@ def generate_feature_spans_cached(client, text: str, features: list[str], role:
|
|
| 108 |
|
| 109 |
h = _feat_hash(feat, text)
|
| 110 |
if h in cache:
|
| 111 |
-
print(f"Found feature: {feat}")
|
|
|
|
| 112 |
result[feat] = cache[h]["spans"]
|
| 113 |
else:
|
| 114 |
-
print(f"Missing feature: {feat}")
|
|
|
|
| 115 |
missing_feats.append(feat)
|
| 116 |
|
|
|
|
| 117 |
if missing_feats:
|
| 118 |
|
| 119 |
mapping = generate_feature_spans_with_retries(client, text, missing_feats)
|
|
|
|
| 100 |
cache = {}
|
| 101 |
result: dict[str, list[str]] = {}
|
| 102 |
missing_feats: list[str] = []
|
| 103 |
+
missing_feats_count = 0
|
| 104 |
+
found_feats_count = 0
|
| 105 |
|
| 106 |
for feat in features:
|
| 107 |
if feat == "None":
|
|
|
|
| 110 |
|
| 111 |
h = _feat_hash(feat, text)
|
| 112 |
if h in cache:
|
| 113 |
+
# print(f"Found feature: {feat}")
|
| 114 |
+
found_feats_count += 1
|
| 115 |
result[feat] = cache[h]["spans"]
|
| 116 |
else:
|
| 117 |
+
# print(f"Missing feature: {feat}")
|
| 118 |
+
missing_feats_count += 1
|
| 119 |
missing_feats.append(feat)
|
| 120 |
|
| 121 |
+
print(f"Found {found_feats_count} features in cache, {missing_feats_count} missing")
|
| 122 |
if missing_feats:
|
| 123 |
|
| 124 |
mapping = generate_feature_spans_with_retries(client, text, missing_feats)
|