kambris committed on
Commit
0c2ee4d
·
verified ·
1 Parent(s): b34a069

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -0
app.py CHANGED
@@ -424,3 +424,46 @@ if uploaded_file is not None:
424
  color_continuous_scale='RdBu') # Use a valid Plotly colorscale
425
  fig.update_layout(title="Word Length Correlations by Position")
426
  st.plotly_chart(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  color_continuous_scale='RdBu') # Use a valid Plotly colorscale
425
  fig.update_layout(title="Word Length Correlations by Position")
426
  st.plotly_chart(fig)
427
+
428
+ with st.expander("Folio Clustering Based on Word Usage Patterns"):
429
+ st.write("""
430
+ This section groups folios into clusters based on their word usage patterns.
431
+ - **PCA**: Reduces the data to 2D using Principal Component Analysis.
432
+ - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
433
+ - **K-Means**: Groups folios into clusters based on their word frequencies.
434
+ """)
435
+
436
+ # Feature Extraction
437
+ all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
438
+ word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=all_words, data=0)
439
+
440
+ for folio, word_counter in folio_word_map.items():
441
+ for word, count in word_counter.items():
442
+ word_freq_matrix.loc[folio, word] = count
443
+
444
+ # Dimensionality Reduction Option
445
+ dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
446
+
447
+ if dim_reduction_method == "PCA":
448
+ reducer = PCA(n_components=2)
449
+ folio_coords = reducer.fit_transform(word_freq_matrix)
450
+ else:
451
+ reducer = TSNE(n_components=2, random_state=42)
452
+ folio_coords = reducer.fit_transform(word_freq_matrix)
453
+
454
+ # Clustering (K-Means)
455
+ n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
456
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42)
457
+ clusters = kmeans.fit_predict(word_freq_matrix)
458
+
459
+ # Visualization
460
+ plot_data = pd.DataFrame({
461
+ 'Folio': word_freq_matrix.index,
462
+ 'Dim1': folio_coords[:, 0],
463
+ 'Dim2': folio_coords[:, 1],
464
+ 'Cluster': clusters
465
+ })
466
+
467
+ fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
468
+ hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method})")
469
+ st.plotly_chart(fig)