Anisha Bhatnagar committed on
Commit
3ad08b5
·
1 Parent(s): 9a097e7

fixed bug in region computation for all task authors and reduced logging

Browse files
Files changed (3) hide show
  1. app.py +1 -0
  2. utils/interp_space_utils.py +17 -17
  3. utils/llm_feat_utils.py +7 -2
app.py CHANGED
@@ -22,6 +22,7 @@ def load_config(path="config/config.yaml"):
22
  return yaml.safe_load(f)
23
 
24
  # A comment to trigger change in spaces
 
25
  cfg = load_config()
26
 
27
 
 
22
  return yaml.safe_load(f)
23
 
24
  # A comment to trigger change in spaces
25
+ # comment 2
26
  cfg = load_config()
27
 
28
 
utils/interp_space_utils.py CHANGED
@@ -697,19 +697,19 @@ def compute_clusters_g2v_representation(
697
  filtered_features = []
698
  for feature, score, z_score in top_g2v_feats:
699
  # DEBUG: Print what we're checking for this feature
700
- print(f"[DEBUG] Checking feature: {feature}")
701
- print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
702
 
703
  # Check if the feature has a non-zero value in all of the selected authors
704
  feature_presence = []
705
  for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
706
  feature_value = author_g2v_feats.get(feature, 0)
707
  feature_presence.append(feature_value)
708
- print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
709
 
710
- print(f"[DEBUG] All feature values: {feature_presence}")
711
- print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
712
- print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
713
 
714
  # First check: feature must be present in Gram2Vec vectors
715
  vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
@@ -727,11 +727,11 @@ def compute_clusters_g2v_representation(
727
 
728
  spans = find_feature_spans(doc_text, feature)
729
  if not spans: # No spans found in this document
730
- print(f"[DEBUG] βœ— Feature '{feature}' not found in document {i} of selected author")
731
  text_present = False
732
  break
733
- else:
734
- print(f"[DEBUG] βœ“ Feature '{feature}' found in document {i} with {len(spans)} spans")
735
  except Exception as e:
736
  print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
737
  # Fall back to vector-based filtering if text checking fails
@@ -740,13 +740,13 @@ def compute_clusters_g2v_representation(
740
  # Feature must pass BOTH checks
741
  if vector_present and text_present:
742
  filtered_features.append((feature, score, z_score))
743
- print(f"[DEBUG] βœ“ Feature '{feature}' PASSED both vector and text checks")
744
- else:
745
- if not vector_present:
746
- print(f"[DEBUG] βœ— Feature '{feature}' FAILED vector check")
747
- if not text_present:
748
- print(f"[DEBUG] βœ— Feature '{feature}' FAILED text check")
749
- print(f"[DEBUG] βœ— Feature '{feature}' FAILED the filter")
750
 
751
 
752
  print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
@@ -947,7 +947,7 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_n
947
  # Region 11: Centroid of all task authors (mystery + 3 candidates)
948
  task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
949
  regions["All Task Authors Centroid"] = get_region_around_point(
950
- task_centroid, "All Task Authors"
951
  )
952
 
953
  def serialize_numpy_dtypes(obj):
 
697
  filtered_features = []
698
  for feature, score, z_score in top_g2v_feats:
699
  # DEBUG: Print what we're checking for this feature
700
+ # print(f"[DEBUG] Checking feature: {feature}")
701
+ # print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
702
 
703
  # Check if the feature has a non-zero value in all of the selected authors
704
  feature_presence = []
705
  for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
706
  feature_value = author_g2v_feats.get(feature, 0)
707
  feature_presence.append(feature_value)
708
+ # print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
709
 
710
+ # print(f"[DEBUG] All feature values: {feature_presence}")
711
+ # print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
712
+ # print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
713
 
714
  # First check: feature must be present in Gram2Vec vectors
715
  vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
 
727
 
728
  spans = find_feature_spans(doc_text, feature)
729
  if not spans: # No spans found in this document
730
+ # print(f"[DEBUG] βœ— Feature '{feature}' not found in document {i} of selected author")
731
  text_present = False
732
  break
733
+ # else:
734
+ # print(f"[DEBUG] βœ“ Feature '{feature}' found in document {i} with {len(spans)} spans")
735
  except Exception as e:
736
  print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
737
  # Fall back to vector-based filtering if text checking fails
 
740
  # Feature must pass BOTH checks
741
  if vector_present and text_present:
742
  filtered_features.append((feature, score, z_score))
743
+ # print(f"[DEBUG] βœ“ Feature '{feature}' PASSED both vector and text checks")
744
+ # else:
745
+ # if not vector_present:
746
+ # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED vector check")
747
+ # if not text_present:
748
+ # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED text check")
749
+ # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED the filter")
750
 
751
 
752
  print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
 
947
  # Region 11: Centroid of all task authors (mystery + 3 candidates)
948
  task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
949
  regions["All Task Authors Centroid"] = get_region_around_point(
950
+ task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
951
  )
952
 
953
  def serialize_numpy_dtypes(obj):
utils/llm_feat_utils.py CHANGED
@@ -100,6 +100,8 @@ def generate_feature_spans_cached(client, text: str, features: list[str], role:
100
  cache = {}
101
  result: dict[str, list[str]] = {}
102
  missing_feats: list[str] = []
 
 
103
 
104
  for feat in features:
105
  if feat == "None":
@@ -108,12 +110,15 @@ def generate_feature_spans_cached(client, text: str, features: list[str], role:
108
 
109
  h = _feat_hash(feat, text)
110
  if h in cache:
111
- print(f"Found feature: {feat}")
 
112
  result[feat] = cache[h]["spans"]
113
  else:
114
- print(f"Missing feature: {feat}")
 
115
  missing_feats.append(feat)
116
 
 
117
  if missing_feats:
118
 
119
  mapping = generate_feature_spans_with_retries(client, text, missing_feats)
 
100
  cache = {}
101
  result: dict[str, list[str]] = {}
102
  missing_feats: list[str] = []
103
+ missing_feats_count = 0
104
+ found_feats_count = 0
105
 
106
  for feat in features:
107
  if feat == "None":
 
110
 
111
  h = _feat_hash(feat, text)
112
  if h in cache:
113
+ # print(f"Found feature: {feat}")
114
+ found_feats_count += 1
115
  result[feat] = cache[h]["spans"]
116
  else:
117
+ # print(f"Missing feature: {feat}")
118
+ missing_feats_count += 1
119
  missing_feats.append(feat)
120
 
121
+ print(f"Found {found_feats_count} features in cache, {missing_feats_count} missing")
122
  if missing_feats:
123
 
124
  mapping = generate_feature_spans_with_retries(client, text, missing_feats)