Spaces:

kambris
/

V3

Sleeping

App Files Community

kambris committed on Feb 15, 2025

Commit

67dbb5a

verified ·

1 Parent(s): f57ddb1

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -47

app.py CHANGED Viewed

@@ -447,52 +447,52 @@ if uploaded_file is not None:
     # Folio Clustering Section
     with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
-    st.write("""
-    This section groups folios into clusters based on their word usage patterns.
-    - **PCA**: Reduces the data to 2D using Principal Component Analysis.
-    - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
-    - **K-Means**: Groups folios into clusters based on their word frequencies.
-    """)
-    # Feature Extraction
-    all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
-    word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0)  # Convert set to list
-    for folio, word_counter in folio_word_map.items():
-        for word, count in word_counter.items():
-            word_freq_matrix.loc[folio, word] = count
-    # Dimensionality Reduction Option
-    dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
-    if dim_reduction_method == "PCA":
-        reducer = PCA(n_components=2)
-        folio_coords = reducer.fit_transform(word_freq_matrix)
-    else:
-        reducer = TSNE(n_components=2, random_state=42)
-        folio_coords = reducer.fit_transform(word_freq_matrix)
-    # Clustering Algorithm Option
-    clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")
-    if clustering_method == "K-Means":
-        # K-Means Clustering
-        n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
-        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-        clusters = kmeans.fit_predict(word_freq_matrix)
-    else:
-        # DBSCAN Clustering
-        dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust parameters as needed
-        clusters = dbscan.fit_predict(word_freq_matrix)
-    # Visualization
-    plot_data = pd.DataFrame({
-        'Folio': word_freq_matrix.index,
-        'Dim1': folio_coords[:, 0],
-        'Dim2': folio_coords[:, 1],
-        'Cluster': clusters
-    })
-    fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
-                     hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
-    st.plotly_chart(fig)

     # Folio Clustering Section
     with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
+        st.write("""
+        This section groups folios into clusters based on their word usage patterns.
+        - **PCA**: Reduces the data to 2D using Principal Component Analysis.
+        - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
+        - **K-Means**: Groups folios into clusters based on their word frequencies.
+        """)
+        # Feature Extraction
+        all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
+        word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0)  # Convert set to list
+        for folio, word_counter in folio_word_map.items():
+            for word, count in word_counter.items():
+                word_freq_matrix.loc[folio, word] = count
+        # Dimensionality Reduction Option
+        dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
+        if dim_reduction_method == "PCA":
+            reducer = PCA(n_components=2)
+            folio_coords = reducer.fit_transform(word_freq_matrix)
+        else:
+            reducer = TSNE(n_components=2, random_state=42)
+            folio_coords = reducer.fit_transform(word_freq_matrix)
+        # Clustering Algorithm Option
+        clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")
+        if clustering_method == "K-Means":
+            # K-Means Clustering
+            n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
+            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+            clusters = kmeans.fit_predict(word_freq_matrix)
+        else:
+            # DBSCAN Clustering
+            dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust parameters as needed
+            clusters = dbscan.fit_predict(word_freq_matrix)
+        # Visualization
+        plot_data = pd.DataFrame({
+            'Folio': word_freq_matrix.index,
+            'Dim1': folio_coords[:, 0],
+            'Dim2': folio_coords[:, 1],
+            'Cluster': clusters
+        })
+        fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
+                         hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
+        st.plotly_chart(fig)