Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,7 +11,8 @@ import plotly.graph_objs as go
|
|
| 11 |
|
| 12 |
# Download the punkt tokenizer
|
| 13 |
nltk.download('punkt_tab')
|
| 14 |
-
|
|
|
|
| 15 |
def split_text_into_topics(text, n_topics):
|
| 16 |
sentences = sent_tokenize(text)
|
| 17 |
vectorizer = TfidfVectorizer(stop_words='english')
|
|
@@ -23,10 +24,19 @@ def split_text_into_topics(text, n_topics):
|
|
| 23 |
clusters = kmeans.labels_.tolist()
|
| 24 |
topic_sentences = {i: [] for i in range(n_topics)}
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
for i, sentence in enumerate(sentences):
|
| 27 |
topic_sentences[clusters[i]].append(sentence)
|
| 28 |
|
| 29 |
-
return topic_sentences
|
| 30 |
|
| 31 |
# Recursive function to split subtopics
|
| 32 |
def recursive_split(topic_dict, depth, max_depth, subtopics):
|
|
@@ -38,38 +48,43 @@ def recursive_split(topic_dict, depth, max_depth, subtopics):
|
|
| 38 |
if len(sentences) <= 1:
|
| 39 |
new_topic_dict[topic] = sentences
|
| 40 |
else:
|
| 41 |
-
sub_topics = split_text_into_topics(' '.join(sentences), subtopics)
|
| 42 |
new_topic_dict[topic] = sub_topics
|
| 43 |
|
| 44 |
return new_topic_dict
|
| 45 |
|
| 46 |
# Function to convert the tree into edge data for Plotly visualization
|
| 47 |
-
def get_edges(tree, parent=None, level=0):
|
| 48 |
edges = []
|
| 49 |
labels = []
|
|
|
|
| 50 |
pos = {}
|
| 51 |
|
| 52 |
for key, value in tree.items():
|
| 53 |
node_label = f'Topic {key}' if parent is None else f'Subtopic {key}'
|
| 54 |
pos[node_label] = (level, len(labels))
|
|
|
|
| 55 |
labels.append(node_label)
|
|
|
|
| 56 |
|
| 57 |
if parent:
|
| 58 |
edges.append((parent, node_label))
|
| 59 |
|
| 60 |
if isinstance(value, dict):
|
| 61 |
-
new_edges, new_labels, new_pos = get_edges(value, node_label, level+1)
|
| 62 |
edges += new_edges
|
| 63 |
labels += new_labels
|
|
|
|
| 64 |
pos.update(new_pos)
|
| 65 |
else:
|
| 66 |
for i, sentence in enumerate(value):
|
| 67 |
sentence_label = f"{node_label} - Sentence {i+1}"
|
| 68 |
pos[sentence_label] = (level+1, len(labels))
|
| 69 |
labels.append(sentence_label)
|
|
|
|
| 70 |
edges.append((node_label, sentence_label))
|
| 71 |
|
| 72 |
-
return edges, labels, pos
|
| 73 |
|
| 74 |
# Streamlit App layout
|
| 75 |
st.title('Interactive Text Topic Tree Generator')
|
|
@@ -85,14 +100,14 @@ if uploaded_file is not None:
|
|
| 85 |
max_depth = st.slider('Select maximum depth of subtopics', 1, 5, 2)
|
| 86 |
subtopics_per_topic = st.slider('Select number of subtopics per topic', 2, 5, 3)
|
| 87 |
|
| 88 |
-
# Split text into main topics
|
| 89 |
-
topic_dict = split_text_into_topics(text, n_topics)
|
| 90 |
|
| 91 |
# Recursively split the topics into subtopics
|
| 92 |
full_tree = recursive_split(topic_dict, 0, max_depth, subtopics_per_topic)
|
| 93 |
|
| 94 |
-
# Get edges and positions for the plot
|
| 95 |
-
edges, labels, pos = get_edges(full_tree)
|
| 96 |
|
| 97 |
# Plot the tree graph using Plotly
|
| 98 |
edge_x = []
|
|
@@ -114,12 +129,13 @@ if uploaded_file is not None:
|
|
| 114 |
mode='lines'
|
| 115 |
)
|
| 116 |
|
| 117 |
-
# Create node trace
|
| 118 |
node_trace = go.Scatter(
|
| 119 |
x=node_x, y=node_y,
|
| 120 |
mode='markers+text',
|
| 121 |
text=labels,
|
| 122 |
hoverinfo='text',
|
|
|
|
| 123 |
marker=dict(
|
| 124 |
showscale=True,
|
| 125 |
colorscale='YlGnBu',
|
|
|
|
| 11 |
|
| 12 |
# Download the punkt tokenizer
|
| 13 |
nltk.download('punkt_tab')
|
| 14 |
+
|
| 15 |
+
# Helper function to split text into topics using KMeans clustering and extract top words
|
| 16 |
def split_text_into_topics(text, n_topics):
|
| 17 |
sentences = sent_tokenize(text)
|
| 18 |
vectorizer = TfidfVectorizer(stop_words='english')
|
|
|
|
| 24 |
clusters = kmeans.labels_.tolist()
|
| 25 |
topic_sentences = {i: [] for i in range(n_topics)}
|
| 26 |
|
| 27 |
+
# Store the top word for each cluster
|
| 28 |
+
top_words = []
|
| 29 |
+
for i in range(n_topics):
|
| 30 |
+
cluster_center = kmeans.cluster_centers_[i]
|
| 31 |
+
sorted_indices = np.argsort(cluster_center)[::-1]
|
| 32 |
+
top_word_index = sorted_indices[0]
|
| 33 |
+
top_word = vectorizer.get_feature_names_out()[top_word_index]
|
| 34 |
+
top_words.append(top_word)
|
| 35 |
+
|
| 36 |
for i, sentence in enumerate(sentences):
|
| 37 |
topic_sentences[clusters[i]].append(sentence)
|
| 38 |
|
| 39 |
+
return topic_sentences, top_words
|
| 40 |
|
| 41 |
# Recursive function to split subtopics
|
| 42 |
def recursive_split(topic_dict, depth, max_depth, subtopics):
|
|
|
|
| 48 |
if len(sentences) <= 1:
|
| 49 |
new_topic_dict[topic] = sentences
|
| 50 |
else:
|
| 51 |
+
sub_topics, _ = split_text_into_topics(' '.join(sentences), subtopics)
|
| 52 |
new_topic_dict[topic] = sub_topics
|
| 53 |
|
| 54 |
return new_topic_dict
|
| 55 |
|
| 56 |
# Function to convert the tree into edge data for Plotly visualization
|
| 57 |
+
def get_edges(tree, parent=None, level=0, top_words=None):
    """Flatten a nested topic tree into edge/label data for a Plotly graph.

    Parameters
    ----------
    tree : dict
        Mapping of cluster index -> either a nested dict (subtopics) or a
        list of sentence strings (leaves), as produced by
        split_text_into_topics / recursive_split.
    parent : str or None
        Label of the parent node; None marks the top level.
    level : int
        Depth in the tree, used as the x-coordinate of each node.
    top_words : list[str] or None
        Top TF-IDF word per top-level cluster, shown as hover text.
        NOTE(review): subtopic keys index into this same top-level list,
        so subtopic hover words are only approximate — confirm intent.

    Returns
    -------
    (edges, labels, hover_texts, pos) : tuple
        edges  - list of (parent_label, child_label) pairs
        labels - node labels in insertion order
        hover_texts - hover string per label (same order/length as labels)
        pos    - label -> (level, index) coordinate pairs
    """
    edges = []
    labels = []
    hover_texts = []
    pos = {}

    for key, value in tree.items():
        # BUGFIX: include the parent path in subtopic labels. Previously the
        # label was just f'Subtopic {key}', so sibling subtrees reusing the
        # same cluster index produced identical labels: `pos` entries were
        # silently overwritten and edges attached to the wrong node.
        if parent is None:
            node_label = f'Topic {key}'
        else:
            node_label = f'{parent} / Subtopic {key}'
        pos[node_label] = (level, len(labels))
        # Fall back to "N/A" when no top word is available for this index.
        top_word = top_words[key] if top_words and key < len(top_words) else "N/A"
        labels.append(node_label)
        hover_texts.append(f"Top Word: {top_word}")

        if parent:
            edges.append((parent, node_label))

        if isinstance(value, dict):
            # Recurse into the subtopic dict and merge its results.
            new_edges, new_labels, new_hover_texts, new_pos = get_edges(
                value, node_label, level + 1, top_words
            )
            edges += new_edges
            labels += new_labels
            hover_texts += new_hover_texts
            pos.update(new_pos)
        else:
            # Leaf: each sentence becomes its own node hanging off this topic.
            for i, sentence in enumerate(value):
                sentence_label = f"{node_label} - Sentence {i+1}"
                pos[sentence_label] = (level + 1, len(labels))
                labels.append(sentence_label)
                hover_texts.append(sentence)
                edges.append((node_label, sentence_label))

    return edges, labels, hover_texts, pos
|
| 88 |
|
| 89 |
# Streamlit App layout
|
| 90 |
st.title('Interactive Text Topic Tree Generator')
|
|
|
|
| 100 |
max_depth = st.slider('Select maximum depth of subtopics', 1, 5, 2)
|
| 101 |
subtopics_per_topic = st.slider('Select number of subtopics per topic', 2, 5, 3)
|
| 102 |
|
| 103 |
+
# Split text into main topics and extract top words
|
| 104 |
+
topic_dict, top_words = split_text_into_topics(text, n_topics)
|
| 105 |
|
| 106 |
# Recursively split the topics into subtopics
|
| 107 |
full_tree = recursive_split(topic_dict, 0, max_depth, subtopics_per_topic)
|
| 108 |
|
| 109 |
+
# Get edges, labels, hover texts, and positions for the plot
|
| 110 |
+
edges, labels, hover_texts, pos = get_edges(full_tree, top_words=top_words)
|
| 111 |
|
| 112 |
# Plot the tree graph using Plotly
|
| 113 |
edge_x = []
|
|
|
|
| 129 |
mode='lines'
|
| 130 |
)
|
| 131 |
|
| 132 |
+
# Create node trace with hover text showing top words
|
| 133 |
node_trace = go.Scatter(
|
| 134 |
x=node_x, y=node_y,
|
| 135 |
mode='markers+text',
|
| 136 |
text=labels,
|
| 137 |
hoverinfo='text',
|
| 138 |
+
hovertext=hover_texts, # Adding hover text
|
| 139 |
marker=dict(
|
| 140 |
showscale=True,
|
| 141 |
colorscale='YlGnBu',
|