Update app.py
Browse files
app.py
CHANGED
|
@@ -447,52 +447,52 @@ if uploaded_file is not None:
|
|
| 447 |
|
| 448 |
# Folio Clustering Section
|
| 449 |
with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
# Feature Extraction
|
| 458 |
-
all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
|
| 459 |
-
word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0) # Convert set to list
|
| 460 |
-
|
| 461 |
-
for folio, word_counter in folio_word_map.items():
|
| 462 |
-
for word, count in word_counter.items():
|
| 463 |
-
word_freq_matrix.loc[folio, word] = count
|
| 464 |
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
if dim_reduction_method == "PCA":
|
| 469 |
-
reducer = PCA(n_components=2)
|
| 470 |
-
folio_coords = reducer.fit_transform(word_freq_matrix)
|
| 471 |
-
else:
|
| 472 |
-
reducer = TSNE(n_components=2, random_state=42)
|
| 473 |
-
folio_coords = reducer.fit_transform(word_freq_matrix)
|
| 474 |
-
|
| 475 |
-
# Clustering Algorithm Option
|
| 476 |
-
clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")
|
| 477 |
-
|
| 478 |
-
if clustering_method == "K-Means":
|
| 479 |
-
# K-Means Clustering
|
| 480 |
-
n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
|
| 481 |
-
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
| 482 |
-
clusters = kmeans.fit_predict(word_freq_matrix)
|
| 483 |
-
else:
|
| 484 |
-
# DBSCAN Clustering
|
| 485 |
-
dbscan = DBSCAN(eps=0.5, min_samples=5) # Adjust parameters as needed
|
| 486 |
-
clusters = dbscan.fit_predict(word_freq_matrix)
|
| 487 |
-
|
| 488 |
-
# Visualization
|
| 489 |
-
plot_data = pd.DataFrame({
|
| 490 |
-
'Folio': word_freq_matrix.index,
|
| 491 |
-
'Dim1': folio_coords[:, 0],
|
| 492 |
-
'Dim2': folio_coords[:, 1],
|
| 493 |
-
'Cluster': clusters
|
| 494 |
-
})
|
| 495 |
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
|
| 448 |
# Folio Clustering Section
# Builds a folio x word count matrix from `folio_word_map`, projects it to 2D
# for display, clusters the folios on the full count matrix, and renders an
# interactive scatter plot inside a Streamlit expander.
with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
    st.write("""
    This section groups folios into clusters based on their word usage patterns.
    - **PCA**: Reduces the data to 2D using Principal Component Analysis.
    - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
    - **K-Means**: Groups folios into clusters based on their word frequencies.
    - **DBSCAN**: Groups folios by density; folios that fit no cluster are labelled -1 (noise).
    """)

    # Feature Extraction
    # Sorting the vocabulary gives a stable column order across reruns (set
    # iteration order is not guaranteed to be stable between processes).
    # NOTE(review): assumes words are mutually comparable (e.g. all str).
    vocabulary = sorted({word for word_counter in folio_word_map.values() for word in word_counter})
    n_folios = len(folio_word_map)

    if n_folios < 2 or len(vocabulary) < 2:
        # A 2-component projection and any clustering need at least two
        # samples and two features; bail out with a message instead of
        # letting scikit-learn raise.
        st.warning("At least two folios and two distinct words are needed for clustering.")
    else:
        # Build the whole matrix in one constructor call rather than assigning
        # cell by cell with .loc, which is very slow for large vocabularies.
        word_freq_matrix = pd.DataFrame(
            [[word_counter.get(word, 0) for word in vocabulary] for word_counter in folio_word_map.values()],
            index=list(folio_word_map),
            columns=vocabulary,
        )

        # Dimensionality Reduction Option
        dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")

        if dim_reduction_method == "PCA":
            reducer = PCA(n_components=2)
        else:
            # t-SNE requires perplexity < n_samples; the library default (30)
            # would raise for small collections, so cap it.
            reducer = TSNE(n_components=2, random_state=42, perplexity=min(30.0, float(n_folios - 1)))
        folio_coords = reducer.fit_transform(word_freq_matrix)

        # Clustering Algorithm Option
        clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")

        if clustering_method == "K-Means":
            # K-Means Clustering
            n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
            if n_clusters > n_folios:
                # KMeans cannot produce more clusters than there are samples.
                st.info(f"Only {n_folios} folios available; using {n_folios} clusters.")
                n_clusters = n_folios
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            clusters = kmeans.fit_predict(word_freq_matrix)
        else:
            # DBSCAN Clustering
            # Defaults match the previous hard-coded values. On raw integer
            # counts with Euclidean distance, eps=0.5 only links identical
            # folios, so the parameters are exposed for tuning.
            dbscan_eps = st.number_input("DBSCAN eps (neighbourhood radius)", min_value=0.01, value=0.5, step=0.1, key="dbscan_eps")
            dbscan_min_samples = st.number_input("DBSCAN min_samples", min_value=1, value=5, step=1, key="dbscan_min_samples")
            dbscan = DBSCAN(eps=float(dbscan_eps), min_samples=int(dbscan_min_samples))
            clusters = dbscan.fit_predict(word_freq_matrix)

        # Visualization
        plot_data = pd.DataFrame({
            'Folio': word_freq_matrix.index,
            'Dim1': folio_coords[:, 0],
            'Dim2': folio_coords[:, 1],
            # String labels make Plotly treat clusters as categories (discrete
            # colours + legend) instead of a continuous colour scale.
            'Cluster': clusters.astype(str),
        })

        fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
                         hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
        st.plotly_chart(fig)
|