peter-zeng committed on
Commit
5d8a39b
·
1 Parent(s): 3676d21

Revert "updated to use mystery + predicted"

Browse files

This reverts commit 07c4d0f325f6905ba1f854f1014066abb46d2a72.
remove gram2vec changes

Files changed (1) hide show
  1. utils/interp_space_utils.py +3 -133
utils/interp_space_utils.py CHANGED
@@ -31,14 +31,6 @@ os.makedirs(os.path.dirname(REGION_CACHE), exist_ok=True)
31
  # Bump this whenever there is a change etc...
32
  CACHE_VERSION = 1
33
 
34
- # Features to exclude from Gram2Vec outputs
35
- EXCLUDED_G2V_FEATURE_PREFIXES = [
36
- 'num_tokens'
37
- ]
38
- EXCLUDED_G2V_FEATURES = set([
39
- 'num_tokens:num_tokens'
40
- ])
41
-
42
  class style_analysis_schema(BaseModel):
43
  features: list[str]
44
  spans: dict[str, dict[str, list[str]]]
@@ -67,8 +59,8 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
67
  print(f"Number of authors after concatenation: {len(clustered_authors_df)}")
68
 
69
  # Gather the input texts (preserves list-of-strings if any)
70
- # If an entry is a list of strings, join; otherwise use the string as-is
71
- author_texts = [('\n\n'.join(x) if isinstance(x, list) else x) for x in clustered_authors_df.fullText.tolist()]
72
 
73
  print(f"Number of author_texts: {len(author_texts)}")
74
 
@@ -694,11 +686,7 @@ def compute_clusters_g2v_representation(
694
 
695
  # Keep only features that have a positive contrastive score
696
  top_g2v_feats = sorted(
697
- [
698
- (feat, val, z_score)
699
- for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores)
700
- if val > 0 and feat not in EXCLUDED_G2V_FEATURES and not any(feat.startswith(p) for p in EXCLUDED_G2V_FEATURE_PREFIXES)
701
- ],
702
  key=lambda x: -x[1] # Sort by contrastive score
703
  )
704
 
@@ -788,124 +776,6 @@ def compute_clusters_g2v_representation(
788
 
789
  return filtered_features[:top_n] # Return tuples with z-scores
790
 
791
- def compute_task_only_g2v_similarity(
792
- background_corpus_df: pd.DataFrame,
793
- visible_author_ids: List[Any],
794
- features_clm_name: str = 'g2v_vector',
795
- top_n: int = 10,
796
- require_spans: bool = True
797
- ) -> List[tuple]:
798
- """
799
- Compute top Gram2Vec features that are shared between the Mystery author and the
800
- predicted Candidate author, ignoring background authors and contrast.
801
-
802
- Selection is limited to task authors within the zoom (i.e., present in
803
- `visible_author_ids`). A feature is kept if:
804
- - it has a positive value (> 0) for both Mystery and Predicted Candidate,
805
- - and (optionally) at least one detected span exists in both authors' texts.
806
-
807
- Scoring strategy prioritizes features strong in both authors: score = min(mystery_value, predicted_value).
808
-
809
- Returns a list of (feature_name, score) tuples sorted by score desc, limited to top_n.
810
- """
811
- task_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
812
-
813
- # Filter to visible task authors
814
- is_visible = background_corpus_df['authorID'].isin(visible_author_ids)
815
- is_task = background_corpus_df['authorID'].isin(task_names)
816
- visible_task_df = background_corpus_df[is_visible & is_task]
817
-
818
- if visible_task_df.empty:
819
- return []
820
-
821
- # Identify Mystery author row within the visible set
822
- mystery_rows = visible_task_df[visible_task_df['authorID'] == 'Mystery author']
823
- if mystery_rows.empty:
824
- # If Mystery is not visible, fall back to using any available Mystery row in the corpus
825
- mystery_rows = background_corpus_df[background_corpus_df['authorID'] == 'Mystery author']
826
- if mystery_rows.empty:
827
- return []
828
-
829
- mystery_row = mystery_rows.iloc[0]
830
-
831
- # Identify the predicted candidate within the visible set using the 'predicted' flag if present
832
- predicted_row = None
833
- if 'predicted' in visible_task_df.columns:
834
- pred_candidates = visible_task_df[visible_task_df['predicted'] == True]
835
- if not pred_candidates.empty:
836
- predicted_row = pred_candidates.iloc[0]
837
-
838
- # If not found in visible, try to find anywhere in the corpus
839
- if predicted_row is None and 'predicted' in background_corpus_df.columns:
840
- pred_any = background_corpus_df[background_corpus_df['predicted'] == True]
841
- # Prefer one that is also a task author
842
- pred_any = pred_any[pred_any['authorID'].isin(task_names)] if not pred_any.empty else pred_any
843
- if not pred_any.empty:
844
- predicted_row = pred_any.iloc[0]
845
-
846
- # If still not found, we cannot build a pair
847
- if predicted_row is None:
848
- return []
849
-
850
- mystery_vec = mystery_row.get(features_clm_name, {})
851
- predicted_vec = predicted_row.get(features_clm_name, {})
852
-
853
- if not isinstance(mystery_vec, dict) or not isinstance(predicted_vec, dict):
854
- return []
855
-
856
- # Prepare texts for optional span gating
857
- def _norm_txt(x):
858
- if isinstance(x, list):
859
- return '\n\n'.join(x)
860
- return str(x)
861
- mystery_text = _norm_txt(mystery_row.get('fullText', ''))
862
- predicted_text = _norm_txt(predicted_row.get('fullText', ''))
863
-
864
- try:
865
- from gram2vec.feature_locator import find_feature_spans as _find_feature_spans
866
- except Exception:
867
- _find_feature_spans = None
868
-
869
- shared_features = []
870
- # Iterate over union of feature keys (both authors share the same feature space in practice)
871
- for feature_name in set(list(mystery_vec.keys()) + list(predicted_vec.keys())):
872
- # Exclude unwanted features
873
- if feature_name in EXCLUDED_G2V_FEATURES or any(feature_name.startswith(p) for p in EXCLUDED_G2V_FEATURE_PREFIXES):
874
- continue
875
- m_val = float(mystery_vec.get(feature_name, 0.0))
876
- p_val = float(predicted_vec.get(feature_name, 0.0))
877
-
878
- # Optional span gate: require at least one span in both texts
879
- spans_m = spans_p = None
880
- if require_spans and _find_feature_spans is not None:
881
- try:
882
- spans_m = _find_feature_spans(mystery_text, feature_name) or []
883
- spans_p = _find_feature_spans(predicted_text, feature_name) or []
884
- if len(spans_m) == 0 or len(spans_p) == 0:
885
- continue
886
- except Exception:
887
- # On span errors, skip gating and proceed
888
- spans_m = spans_m if spans_m is not None else []
889
- spans_p = spans_p if spans_p is not None else []
890
-
891
- # Similarity metric: |m| + |p| - |m - p|
892
- score = abs(m_val) + abs(p_val) - abs(m_val - p_val)
893
- shared_features.append((feature_name, score, m_val, p_val, len(spans_m) if spans_m is not None else -1, len(spans_p) if spans_p is not None else -1))
894
-
895
- # Rank by score desc and return top_n
896
- shared_features.sort(key=lambda x: x[1], reverse=True)
897
- top = shared_features[:top_n]
898
-
899
- # Debug print of top-N with values and span counts for presence sanity-check
900
- try:
901
- print("[DEBUG] Task-only G2V top features (feature, mystery_val, predicted_val, score | spans_mystery, spans_predicted):")
902
- for feat_name, sc, m_val, p_val, c_m, c_p in top:
903
- print(f" {feat_name} | mystery={m_val:.4f}, predicted={p_val:.4f}, S={sc:.4f} | spans=({c_m}, {c_p})")
904
- except Exception:
905
- pass
906
-
907
- return [(f, s) for (f, s, _, _, _, _) in top]
908
-
909
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
910
 
911
  styles_df = pd.read_csv(styles_df_path)[[feat_clm, "documentID"]]
 
31
  # Bump this whenever there is a change etc...
32
  CACHE_VERSION = 1
33
 
 
 
 
 
 
 
 
 
34
  class style_analysis_schema(BaseModel):
35
  features: list[str]
36
  spans: dict[str, dict[str, list[str]]]
 
59
  print(f"Number of authors after concatenation: {len(clustered_authors_df)}")
60
 
61
  # Gather the input texts (preserves list-of-strings if any)
62
+ #texts = background_corpus_df[text_clm].fillna("").tolist()
63
+ author_texts = ['\n\n'.join(x) for x in clustered_authors_df.fullText.tolist()]
64
 
65
  print(f"Number of author_texts: {len(author_texts)}")
66
 
 
686
 
687
  # Keep only features that have a positive contrastive score
688
  top_g2v_feats = sorted(
689
+ [(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
 
 
 
 
690
  key=lambda x: -x[1] # Sort by contrastive score
691
  )
692
 
 
776
 
777
  return filtered_features[:top_n] # Return tuples with z-scores
778
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
779
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
780
 
781
  styles_df = pd.read_csv(styles_df_path)[[feat_clm, "documentID"]]