Update app.py
Browse files
app.py
CHANGED
|
@@ -424,3 +424,46 @@ if uploaded_file is not None:
|
|
| 424 |
color_continuous_scale='RdBu') # Use a valid Plotly colorscale
|
| 425 |
fig.update_layout(title="Word Length Correlations by Position")
|
| 426 |
st.plotly_chart(fig)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
color_continuous_scale='RdBu') # Use a valid Plotly colorscale
|
| 425 |
fig.update_layout(title="Word Length Correlations by Position")
|
| 426 |
st.plotly_chart(fig)
|
| 427 |
+
|
| 428 |
+
with st.expander("Folio Clustering Based on Word Usage Patterns"):
    # Explain the three techniques offered in this section to the user.
    st.write("""
    This section groups folios into clusters based on their word usage patterns.
    - **PCA**: Reduces the data to 2D using Principal Component Analysis.
    - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
    - **K-Means**: Groups folios into clusters based on their word frequencies.
    """)

    # Feature Extraction
    # Build a folio x word count matrix from folio_word_map (a mapping of
    # folio -> {word: count}).
    # The vocabulary is sorted so the column order is stable between runs; a
    # raw set has no defined order and is not a reliable `columns=` argument.
    # NOTE(review): sorting assumes all words are mutually comparable
    # (presumably str) -- confirm against where folio_word_map is built.
    all_words = sorted({word for folio in folio_word_map for word in folio_word_map[folio]})

    # Build the whole matrix in one step rather than assigning cell by cell
    # through .loc; words a folio never uses are filled with 0.
    word_freq_matrix = (
        pd.DataFrame.from_dict(folio_word_map, orient="index")
        .reindex(index=list(folio_word_map), columns=all_words)
        .fillna(0)
    )

    n_folios, n_words = word_freq_matrix.shape

    if n_folios < 2 or n_words < 2:
        # A 2-D projection and a 2+ cluster K-Means both need at least two
        # samples and two features; report this instead of raising.
        st.warning("At least two folios and two distinct words are needed for clustering.")
    else:
        # Dimensionality Reduction Option
        dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")

        if dim_reduction_method == "PCA":
            reducer = PCA(n_components=2)
        else:
            # t-SNE requires perplexity < number of samples. Keep the library
            # default of 30 when there are enough folios, otherwise shrink it.
            reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, n_folios - 1))
        folio_coords = reducer.fit_transform(word_freq_matrix)

        # Clustering (K-Means)
        n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
        if n_clusters > n_folios:
            # K-Means cannot form more clusters than there are samples.
            st.info(f"Only {n_folios} folios available; using {n_folios} clusters.")
            n_clusters = n_folios
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        # Clusters are fitted on the full word-frequency matrix, not on the
        # 2-D projection; the projection is used for display only.
        clusters = kmeans.fit_predict(word_freq_matrix)

        # Visualization
        plot_data = pd.DataFrame({
            'Folio': word_freq_matrix.index,
            'Dim1': folio_coords[:, 0],
            'Dim2': folio_coords[:, 1],
            'Cluster': clusters,
        })

        fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
                         hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method})")
        st.plotly_chart(fig)
|