Spaces:
Sleeping
Update analyzing.py
Browse files - analyzing.py +82 -80
analyzing.py
CHANGED
|
@@ -594,88 +594,90 @@ st.session_state['stage'] = 1
|
|
| 594 |
|
| 595 |
|
| 596 |
if st.session_state['stage'] > 0 :
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
st.session_state['dataset'] = parsed_responses
|
| 645 |
-
st.session_state['new_data'] = new_data
|
| 646 |
-
st.session_state['data_processed'] = True
|
| 647 |
-
except Exception as e:
|
| 648 |
-
st.write(f"Error processing data: {e}")
|
| 649 |
-
|
| 650 |
-
if st.session_state['data_processed']:
|
| 651 |
-
try:
|
| 652 |
-
visualizer = UAPVisualizer(data=st.session_state['new_data'])
|
| 653 |
-
#new_data = pd.DataFrame() # Assuming new_data is prepared earlier in the code
|
| 654 |
-
fig2 = visualizer.plot_cramers_v_heatmap(data=st.session_state['new_data'], significance_level=0.05)
|
| 655 |
-
with st.status(f"Cramer's V Chart", expanded=True) as statuss:
|
| 656 |
-
st.pyplot(fig2)
|
| 657 |
-
statuss.update(label="Cramer's V chart plotted", expanded=False)
|
| 658 |
-
except Exception as e:
|
| 659 |
-
st.write(f"Error plotting Cramers V: {e}")
|
| 660 |
-
|
| 661 |
-
for i, column in enumerate(st.session_state['col_names']):
|
| 662 |
-
#if stateful_button(f"Show {column} clusters {i}", key=f"show_{column}_clusters"):
|
| 663 |
-
# if st.session_state['data_processed']:
|
| 664 |
-
# with st.status(f"Show clusters {column}", expanded=True) as stats:
|
| 665 |
-
# fig3 = st.session_state['analyzers'][i].plot_embeddings4(title=f"{column} clusters", cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'], cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'], reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'], column=f'Analyzer_{column}', data=st.session_state['new_data'])
|
| 666 |
-
# stats.update(label=f"Show clusters {column} complete", expanded=False)
|
| 667 |
if st.session_state['data_processed']:
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
)
|
| 677 |
-
|
| 678 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 679 |
|
| 680 |
|
| 681 |
# this will check if the dataframe is not empty
|
|
|
|
| 594 |
|
| 595 |
|
| 596 |
if st.session_state['stage'] > 0 :
|
| 597 |
+
with st.form(border=True, key='Select Columns for Analysis'):
|
| 598 |
+
columns_to_analyze = st.multiselect(
|
| 599 |
+
label='Select columns to analyze',
|
| 600 |
+
options=st.session_state['parsed_responses'].columns
|
| 601 |
+
)
|
| 602 |
+
if st.form_submit_button("Process Data"):
|
| 603 |
+
if columns_to_analyze:
|
| 604 |
+
analyzers = []
|
| 605 |
+
col_names = []
|
| 606 |
+
clusters = {}
|
| 607 |
+
for column in columns_to_analyze:
|
| 608 |
+
with torch.no_grad():
|
| 609 |
+
with st.status(f"Processing {column}", expanded=True) as status:
|
| 610 |
+
analyzer = UAPAnalyzer(st.session_state['parsed_responses'], column)
|
| 611 |
+
st.write(f"Processing {column}...")
|
| 612 |
+
analyzer.preprocess_data(top_n=32)
|
| 613 |
+
st.write("Reducing dimensionality...")
|
| 614 |
+
analyzer.reduce_dimensionality(method='UMAP', n_components=2, n_neighbors=15, min_dist=0.1)
|
| 615 |
+
st.write("Clustering data...")
|
| 616 |
+
analyzer.cluster_data(method='HDBSCAN', min_cluster_size=15)
|
| 617 |
+
analyzer.get_tf_idf_clusters(top_n=3)
|
| 618 |
+
st.write("Naming clusters...")
|
| 619 |
+
analyzers.append(analyzer)
|
| 620 |
+
col_names.append(column)
|
| 621 |
+
clusters[column] = analyzer.merge_similar_clusters(cluster_terms=analyzer.__dict__['cluster_terms'], cluster_labels=analyzer.__dict__['cluster_labels'])
|
| 622 |
+
|
| 623 |
+
# Run the visualization
|
| 624 |
+
# fig = datamapplot.create_plot(
|
| 625 |
+
# analyzer.__dict__['reduced_embeddings'],
|
| 626 |
+
# analyzer.__dict__['cluster_labels'].astype(str),
|
| 627 |
+
# #label_font_size=11,
|
| 628 |
+
# label_wrap_width=20,
|
| 629 |
+
# use_medoids=True,
|
| 630 |
+
# )#.to_html(full_html=False, include_plotlyjs='cdn')
|
| 631 |
+
# st.pyplot(fig.savefig())
|
| 632 |
+
status.update(label=f"Processing {column} complete", expanded=False)
|
| 633 |
+
st.session_state['analyzers'] = analyzers
|
| 634 |
+
st.session_state['col_names'] = col_names
|
| 635 |
+
st.session_state['clusters'] = clusters
|
| 636 |
+
|
| 637 |
+
# save space
|
| 638 |
+
parsed = None
|
| 639 |
+
analyzers = None
|
| 640 |
+
col_names = None
|
| 641 |
+
clusters = None
|
| 642 |
|
| 643 |
+
if st.session_state['clusters'] is not None:
|
| 644 |
+
try:
|
| 645 |
+
new_data, parsed_responses = analyze_and_predict(st.session_state['parsed_responses'], st.session_state['analyzers'], st.session_state['col_names'], st.session_state['clusters'])
|
| 646 |
+
st.session_state['dataset'] = parsed_responses
|
| 647 |
+
st.session_state['new_data'] = new_data
|
| 648 |
+
st.session_state['data_processed'] = True
|
| 649 |
+
except Exception as e:
|
| 650 |
+
st.write(f"Error processing data: {e}")
|
| 651 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 652 |
if st.session_state['data_processed']:
|
| 653 |
+
try:
|
| 654 |
+
visualizer = UAPVisualizer(data=st.session_state['new_data'])
|
| 655 |
+
#new_data = pd.DataFrame() # Assuming new_data is prepared earlier in the code
|
| 656 |
+
fig2 = visualizer.plot_cramers_v_heatmap(data=st.session_state['new_data'], significance_level=0.05)
|
| 657 |
+
with st.status(f"Cramer's V Chart", expanded=True) as statuss:
|
| 658 |
+
st.pyplot(fig2)
|
| 659 |
+
statuss.update(label="Cramer's V chart plotted", expanded=False)
|
| 660 |
+
except Exception as e:
|
| 661 |
+
st.write(f"Error plotting Cramers V: {e}")
|
| 662 |
+
|
| 663 |
+
for i, column in enumerate(st.session_state['col_names']):
|
| 664 |
+
#if stateful_button(f"Show {column} clusters {i}", key=f"show_{column}_clusters"):
|
| 665 |
+
# if st.session_state['data_processed']:
|
| 666 |
+
# with st.status(f"Show clusters {column}", expanded=True) as stats:
|
| 667 |
+
# fig3 = st.session_state['analyzers'][i].plot_embeddings4(title=f"{column} clusters", cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'], cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'], reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'], column=f'Analyzer_{column}', data=st.session_state['new_data'])
|
| 668 |
+
# stats.update(label=f"Show clusters {column} complete", expanded=False)
|
| 669 |
+
if st.session_state['data_processed']:
|
| 670 |
+
with st.status(f"Show clusters {column}", expanded=True) as stats:
|
| 671 |
+
fig3 = st.session_state['analyzers'][i].plot_embeddings4(
|
| 672 |
+
title=f"{column} clusters",
|
| 673 |
+
cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'],
|
| 674 |
+
cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'],
|
| 675 |
+
reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'],
|
| 676 |
+
column=column, # Use the original column name here
|
| 677 |
+
data=st.session_state['parsed_responses'] # Use the original dataset here
|
| 678 |
+
)
|
| 679 |
+
stats.update(label=f"Show clusters {column} complete", expanded=False)
|
| 680 |
+
st.session_state['analysis_complete'] = True
|
| 681 |
|
| 682 |
|
| 683 |
# this will check if the dataframe is not empty
|