Spaces:
Sleeping
Sleeping
Dana Atzil
committed on
Commit
·
685d696
1
Parent(s):
0f1ab6c
add files
Browse files- MIND_utils.py +120 -0
- clean_annotations_safe.csv +0 -0
- streamlit_app_LDA.py +145 -0
MIND_utils.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MIND_utils: convert long-format MIND self-state annotation rows into nested
# dict/JSON objects for downstream visualization and topic modeling.
import json  # noqa: F401 -- kept: may be relied on by importers of this module
import numpy as np  # noqa: F401
import pandas as pd

# Dimension codes used throughout the annotations. Per the layer names below,
# "A" is Affect and "D" is Desire; "B-S"/"B-O" and "C-S"/"C-O" are presumably
# the Self/Other variants of the B and C dimensions -- confirm with the codebook.
dimensions = ["A", "B-S", "B-O", "C-S", "C-O", "D"]

# Maps a dimension code to the annotation layer name used in the dataframe.
dimension_to_layer_name = {
    "B-S": "Segment Patient B-S",
    "C-S": "Segment Patient C-S",
    "D": "Segment Patient Desire (D)",
    "C-O": "Segment Patient C-O",
    "B-O": "Segment Patient B-O",
    "A": "Segment Patient Affect (A)",
}


def df_to_self_states_json(df, doc_name, annotator=None):
    """Convert an annotation dataframe into a nested dict for one document.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format annotation rows. Columns read here: ``document``,
        ``annotator``, ``segment``, ``layer``, ``feature``, ``value``,
        ``self_state_index``, ``is_adaptive``, ``begin``, ``end``.
    doc_name : str
        Name of the document to extract.
    annotator : str, optional
        If given, restrict to that annotator's rows.

    Returns
    -------
    dict
        ``{"document", "annotator", "segments": [...]}`` where each segment
        holds a "Segment Summary" feature map and a "self-states" list of up
        to two states; each state maps dimension code -> feature dict, with an
        optional "evidences" list of matching evidence spans.
    """

    def get_evidence_obj(evidence_df):
        """Flatten one evidence span (rows sharing begin/end) to {feature: value}."""
        return {feature: rows.value.iloc[0] for feature, rows in evidence_df.groupby("feature")}

    def add_dimension(state_obj, dimension, state_dim_df, dim_evidence_rows):
        """Attach a dimension's feature map and matching evidence spans to a state."""
        if state_dim_df.empty:
            return
        # Segment-level element features, e.g. "Category", "Adaptivity", "Presence".
        state_obj[dimension] = {feature: rows.value.iloc[0]
                                for feature, rows in state_dim_df.groupby("feature")}
        # .get() instead of [] so a missing "Category" feature does not raise.
        category = state_obj[dimension].get("Category")
        evidences = []
        # One evidence span == one (begin, end) character range.
        for _, evidence_df in dim_evidence_rows.groupby(["begin", "end"]):
            category_rows = evidence_df[evidence_df.feature == "Category"]
            # Keep only evidence whose Category matches this state's element
            # Category (guarding against evidence spans with no Category row).
            if not category_rows.empty and category_rows.value.iloc[0] == category:
                evidences.append(get_evidence_obj(evidence_df))
        if evidences:
            state_obj[dimension]["evidences"] = evidences

    def state_obj_from(state_df):
        """Build the base state dict (its is_adaptive flag) from the state's rows."""
        adaptive_values = state_df.is_adaptive.dropna()
        # None rather than IndexError when no row carries the flag.
        return {"is_adaptive": adaptive_values.iloc[0] if not adaptive_values.empty else None}

    doc_object = {"document": doc_name, "annotator": annotator, "segments": []}
    doc_df = df[df.document == doc_name]
    if annotator:
        doc_df = doc_df[doc_df.annotator == annotator]

    # BUGFIX: iterate the filtered doc_df (the original iterated the unfiltered
    # df[df.document == doc_name], silently ignoring the annotator filter).
    for segment_index, segment_group in doc_df.groupby("segment"):
        segment_object = {"segment": segment_index}
        segment_summary_df = segment_group[segment_group.layer == "Segment Summary"]
        segment_object["Segment Summary"] = {feature: rows.value.iloc[0]
                                             for feature, rows in segment_summary_df.groupby("feature")}

        # A segment can contain up to two self-states (index 1 and 2).
        state1_df = segment_group[segment_group.self_state_index == 1]
        state2_df = segment_group[segment_group.self_state_index == 2]
        states_list = []
        state1_obj, state2_obj = {}, {}
        if not state1_df.empty:
            state1_obj = state_obj_from(state1_df)
            states_list.append(state1_obj)
        if not state2_df.empty:
            state2_obj = state_obj_from(state2_df)
            states_list.append(state2_obj)

        # Collect the segment-level elements (and their evidence) per dimension.
        for dimension in dimensions:
            layer_name = dimension_to_layer_name[dimension]
            dim_evidence_rows = segment_group[segment_group.layer == f"Patient_{dimension}_evidence"]
            add_dimension(state1_obj, dimension, state1_df[state1_df.layer == layer_name], dim_evidence_rows)
            add_dimension(state2_obj, dimension, state2_df[state2_df.layer == layer_name], dim_evidence_rows)

        segment_object["self-states"] = states_list
        doc_object["segments"].append(segment_object)
    return doc_object
|
| 87 |
+
|
| 88 |
+
# Maps each full element label ("<dim>:(<n>) <long description>") to a short
# display label for visualizations. Keys must match the "<dim>:<Category>"
# tokens produced from the annotation values -- do not edit key text.
# NOTE(review): there is no "A:(6) ..." entry -- confirm whether that category
# exists in the data.
element_short_desc_map = {
    'A:(11) Proud': 'A:(11) Proud',
    'B-O:(1) Relating behavior': 'B-O:(1) Relating',
    'C-S:(1) Self-acceptance and compassion': 'C-S:(1) Self-compassion',
    'D:(1) Relatedness': 'D:(1) Relatedness',
    'A:(4) Depressed, despair, hopeless': 'A:(4) Depressed',
    'C-O:(4) Perception of the other as blocking autonomy needs': 'C-O:(4) Other blocks autonomy',
    'C-S:(2) Self criticism': 'C-S:(2) Self-criticism',
    'C-O:(2) Perception of the other as detached or over attached': 'C-O:(2) Other detached/overattached',
    'C-O:(1) Perception of the other as related': 'C-O:(1) Other related',
    'A:(3) Sad, emotional pain, grieving': 'A:(3) Sadness',
    'B-O:(2) Fight or flight behavior': 'B-O:(2) Fight/flight',
    'A:(14) Feel lonely': 'A:(14) Lonely',
    'D:(2) Expectation that relatedness needs will not be met': 'D:(2) Relatedness (-)',
    'B-S:(2) Self harm, neglect and avoidance': 'B-S:(2) Self-harm',
    'A:(10) Angry (aggression), disgust, contempt': 'A:(10) Angry/Aggressive',
    'A:(8) Apathic, don’t care, blunted': 'A:(8) Apathetic',
    'B-S:(1) Self care and improvement': 'B-S:(1) Self-care',
    'D:(5) Competence, self esteem, self-care': 'D:(5) Competence',
    'D:(6) Expectation that competence needs will not be met': 'D:(6) Competence (-)',
    'C-O:(3) Perception of the other as facilitating autonomy needs': 'C-O:(3) Other supports autonomy',
    'A:(2) Anxious/ fearful/ tense': 'A:(2) Anxious',
    'A:(12) Ashamed, guilty': 'A:(12) Ashamed/Guilty',
    'B-O:(4) Over controlled or controlling behavior': 'B-O:(4) Controlling',
    'A:(1) Calm/ laid back': 'A:(1) Calm',
    'D:(4) Expectation that autonomy needs will not be met': 'D:(4) Autonomy (-)',
    'D:(3) Autonomy and adaptive control': 'D:(3) Autonomy',
    'A:(5) Content, happy, joy, hopeful': 'A:(5) Happy',
    'B-O:(3) Autonomous or adaptive control behavior': 'B-O:(3) Adaptive control',
    'A:(9) Justifiable anger/ assertive anger, justifiable outrage': 'A:(9) Justified anger',
    'A:(13) Feel loved, belong': 'A:(13) Loved/Belonging',
    'A:(7) Vigor / energetic': 'A:(7) Vigor'
}
|
clean_annotations_safe.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
streamlit_app_LDA.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import json
|
| 3 |
+
import random
|
| 4 |
+
import numpy as np
|
| 5 |
+
from gensim import corpora, models
|
| 6 |
+
import pyLDAvis.gensim_models as gensimvis
|
| 7 |
+
import pyLDAvis
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import streamlit.components.v1 as components
|
| 10 |
+
|
| 11 |
+
from MIND_utils import df_to_self_states_json, element_short_desc_map
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# ---------------------------
# Streamlit App Layout
# ---------------------------
st.title("Prototypical Self-States via Topic Modeling")

# Sidebar controls for the LDA hyper-parameters.
st.sidebar.header("Model Parameters")
num_topics = st.sidebar.slider("Number of Topics", min_value=2, max_value=20, value=5)
num_passes = st.sidebar.slider("Number of Passes", min_value=5, max_value=50, value=10)
# NOTE(review): this radio renders in the main page, unlike the other
# parameters which live in the sidebar -- confirm the placement is intentional.
lda_document_is = st.radio("A 'Document' in the topic model will correspond to a:", ("self-state", "segment"))
seed_value = st.sidebar.number_input("Random Seed", value=42)
num_top_elements_to_show = st.sidebar.slider("# top element to show in a topic", min_value=2, max_value=15, value=5)

# ---------------------------
# Load Data
# ---------------------------
# You can also allow users to upload their file via st.file_uploader.
# NOTE(review): st.cache is deprecated in recent Streamlit; st.cache_data is
# the modern equivalent if caching is re-enabled.
# @st.cache(allow_output_mutation=True)
def load_data():
    """Load the cleaned annotation table (one row per annotated feature)."""
    return pd.read_csv("clean_annotations_safe.csv")

df = load_data()

# ---------------------------
# Preprocess Data: Build Documents
# ---------------------------
# Set random seeds for reproducibility (the LDA fit below also takes
# random_state, but gensim/numpy internals may consume these generators).
random.seed(seed_value)
np.random.seed(seed_value)
|
| 42 |
+
|
| 43 |
+
# Helpers turning a self-state / segment into LDA "words" of the form
# "<dim>:<category>".
def extract_elements_from_selfstate(selfstate):
    """Return the "<dimension>:<Category>" tokens of one self-state dict.

    The "is_adaptive" entry is state metadata rather than a dimension and is
    skipped; dimensions without a (non-NaN) "Category" feature contribute
    nothing.
    """
    return [
        f"{dimension}:{features['Category']}"
        for dimension, features in selfstate.items()
        if dimension != "is_adaptive"
        and "Category" in features
        and not pd.isna(features["Category"])
    ]

def extract_elements_from_segment(segment):
    """Concatenate the element tokens of every self-state in a segment."""
    tokens = []
    for state in segment["self-states"]:
        tokens.extend(extract_elements_from_selfstate(state))
    return tokens
|
| 59 |
+
|
| 60 |
+
# Build a list of "documents" (one per segment or per self-state, depending on
# the lda_document_is radio selection); lda_document_ids holds a parallel
# human-readable identifier for each document.
lda_documents = []
lda_document_ids = []
for (doc_id, annotator), df_ in df.groupby(["document", "annotator"]):
    # Nested JSON view of this (document, annotator) slice.
    doc_json = df_to_self_states_json(df_, doc_id, annotator)
    ### * for Segment-level LDA-documents:
    if lda_document_is == "segment":
        for segment in doc_json["segments"]:
            lda_doc = extract_elements_from_segment(segment)
            if lda_doc:  # only add if non-empty
                lda_documents.append(lda_doc)
                # NOTE(review): the id omits the annotator, so the same
                # document annotated by two annotators yields colliding ids --
                # confirm whether ids need to be unique.
                lda_document_ids.append(f"{doc_id}_seg{segment['segment']}")
    ### * for SelfState-level LDA-documents:
    elif lda_document_is == "self-state":
        for segment in doc_json["segments"]:
            for i, selfstate in enumerate(segment["self-states"]):
                lda_doc = extract_elements_from_selfstate(selfstate)
                if lda_doc:
                    lda_documents.append(lda_doc)
                    lda_document_ids.append(f"{doc_id}_seg{segment['segment']}_state{i+1}")

# Create a dictionary and corpus for LDA (bag-of-words over element tokens).
dictionary = corpora.Dictionary(lda_documents)
corpus = [dictionary.doc2bow(doc) for doc in lda_documents]
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------
# Run LDA Model
# ---------------------------
# Fit gensim LDA on the element corpus; random_state keeps the fit
# reproducible for a fixed seed.
lda_model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            passes=num_passes,
                            random_state=seed_value)
|
| 94 |
+
|
| 95 |
+
# ---------------------------
# Display Pretty Printed Topics
# ---------------------------
st.header("Pretty Printed Topics")

# Build a mapping from each topic to the list of (document index, topic probability).
topic_docs = {topic_id: [] for topic_id in range(lda_model.num_topics)}

# Iterate over the corpus to get topic distributions for each document.
for i, doc_bow in enumerate(corpus):
    # minimum_probability=0 so every topic appears in every document's distribution.
    doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    for topic_id, prob in doc_topics:
        topic_docs[topic_id].append((i, prob))

# For each topic, keep the 3 documents with the highest topic probability.
top_docs = {}
for topic_id, doc_list in topic_docs.items():
    sorted_docs = sorted(doc_list, key=lambda x: x[1], reverse=True)
    top_docs[topic_id] = sorted_docs[:3]

# Aggregate output into a single string.
output_str = "Identified Prototypical Self-States (Topics):\n\n"
for topic_id, topic_str in lda_model.print_topics(num_words=num_top_elements_to_show):
    output_str += f"Topic {topic_id}:\n"
    # print_topics yields strings like '0.123*"A:(3) Sad" + 0.045*"D:(1) Relatedness"'.
    terms = topic_str.split(" + ")
    for term in terms:
        weight, token = term.split("*")
        token = token.strip().replace('"', '')
        output_str += f"  {float(weight):.3f} -> {token}\n"

    output_str += "  Top 3 Documents (Segment Indices) for this topic:\n"
    for doc_index, prob in top_docs[topic_id]:
        # lda_document_ids maps corpus indices back to human-readable identifiers.
        output_str += f"  Doc {doc_index} ({lda_document_ids[doc_index]}) with probability {prob:.3f}\n"
    output_str += "-" * 60 + "\n"

# Display the aggregated string. (A redundant mid-script `import streamlit as st`
# was removed here -- streamlit is already imported at the top of the file.)
st.text(output_str)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# ---------------------------
# Prepare and Display pyLDAvis Visualization
# ---------------------------
st.header("Interactive Topic Visualization")
# Disabled attempt at relabeling tokens with element_short_desc_map for the
# visualization; left for reference.
# vis_dict = {i: element_short_desc_map[v] for i, v in dictionary.items()}
# vis_dictionary = corpora.dictionary.Dictionary([[new_token] for new_token in vis_dict.values()])
# Render the pyLDAvis inter-topic distance map as embedded HTML.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
html_string = pyLDAvis.prepared_data_to_html(vis_data)
components.html(html_string, width=1300, height=800)
|