Spaces:
Sleeping
Sleeping
Test spaces.GPU
Browse files
app.py
CHANGED
|
@@ -67,6 +67,7 @@ import matplotlib.colors as mcolors
|
|
| 67 |
import textwrap
|
| 68 |
import pandas as pd
|
| 69 |
import streamlit as st
|
|
|
|
| 70 |
|
| 71 |
# Configure logging
|
| 72 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
@@ -150,7 +151,7 @@ class UAPAnalyzer:
|
|
| 150 |
self.embeddings = self._extract_embeddings(string_data)
|
| 151 |
logging.info("Data preprocessing complete")
|
| 152 |
|
| 153 |
-
|
| 154 |
def _extract_embeddings(self, data_column):
|
| 155 |
"""
|
| 156 |
Extracts embeddings from the given data column.
|
|
@@ -165,6 +166,7 @@ class UAPAnalyzer:
|
|
| 165 |
# convert to str
|
| 166 |
return embed_model.encode(data_column.tolist(), show_progress_bar=True)
|
| 167 |
|
|
|
|
| 168 |
def reduce_dimensionality(self, method='UMAP', n_components=2, **kwargs):
|
| 169 |
"""
|
| 170 |
Reduces the dimensionality of embeddings using specified method.
|
|
@@ -184,7 +186,8 @@ class UAPAnalyzer:
|
|
| 184 |
|
| 185 |
self.reduced_embeddings = reducer.fit_transform(self.embeddings)
|
| 186 |
logging.info(f"Dimensionality reduced using {method}")
|
| 187 |
-
|
|
|
|
| 188 |
def cluster_data(self, method='HDBSCAN', **kwargs):
|
| 189 |
"""
|
| 190 |
Clusters the reduced dimensionality data using the specified clustering method.
|
|
@@ -205,7 +208,7 @@ class UAPAnalyzer:
|
|
| 205 |
self.cluster_labels = clusterer.labels_
|
| 206 |
logging.info(f"Data clustering complete using {method}")
|
| 207 |
|
| 208 |
-
|
| 209 |
def get_tf_idf_clusters(self, top_n=2):
|
| 210 |
"""
|
| 211 |
Names clusters using the most frequent terms based on TF-IDF analysis.
|
|
@@ -387,7 +390,8 @@ class UAPAnalyzer:
|
|
| 387 |
# Update string labels to reflect merged labels
|
| 388 |
updated_string_labels = [cluster_terms[label] for label in updated_cluster_labels]
|
| 389 |
return updated_string_labels
|
| 390 |
-
|
|
|
|
| 391 |
def cluster_cosine(self, cluster_terms, cluster_labels, similarity_threshold):
|
| 392 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 393 |
|
|
@@ -650,6 +654,7 @@ class UAPAnalyzer:
|
|
| 650 |
|
| 651 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 652 |
|
|
|
|
| 653 |
def analyze_and_predict(data, analyzers, col_names):
|
| 654 |
"""
|
| 655 |
Performs analysis on the data using provided analyzers and makes predictions on specified columns.
|
|
@@ -677,6 +682,8 @@ def analyze_and_predict(data, analyzers, col_names):
|
|
| 677 |
logging.error(f"Error processing {col}: {e}")
|
| 678 |
return new_data
|
| 679 |
|
|
|
|
|
|
|
| 680 |
def train_xgboost(x_train, y_train, x_test, y_test, num_classes):
|
| 681 |
"""
|
| 682 |
Trains an XGBoost model and evaluates its performance.
|
|
@@ -788,7 +795,7 @@ def plot_cramers_v_heatmap(data, significance_level=0.05):
|
|
| 788 |
class UAPVisualizer:
|
| 789 |
def __init__(self, data=None):
|
| 790 |
pass # Initialization can be added if needed
|
| 791 |
-
|
| 792 |
def analyze_and_predict(self, data, analyzers, col_names):
|
| 793 |
new_data = pd.DataFrame()
|
| 794 |
for i, (column, analyzer) in enumerate(zip(col_names, analyzers)):
|
|
|
|
| 67 |
import textwrap
|
| 68 |
import pandas as pd
|
| 69 |
import streamlit as st
|
| 70 |
+
import spaces
|
| 71 |
|
| 72 |
# Configure logging
|
| 73 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
| 151 |
self.embeddings = self._extract_embeddings(string_data)
|
| 152 |
logging.info("Data preprocessing complete")
|
| 153 |
|
| 154 |
+
@spaces.GPU
|
| 155 |
def _extract_embeddings(self, data_column):
|
| 156 |
"""
|
| 157 |
Extracts embeddings from the given data column.
|
|
|
|
| 166 |
# convert to str
|
| 167 |
return embed_model.encode(data_column.tolist(), show_progress_bar=True)
|
| 168 |
|
| 169 |
+
@spaces.GPU
|
| 170 |
def reduce_dimensionality(self, method='UMAP', n_components=2, **kwargs):
|
| 171 |
"""
|
| 172 |
Reduces the dimensionality of embeddings using specified method.
|
|
|
|
| 186 |
|
| 187 |
self.reduced_embeddings = reducer.fit_transform(self.embeddings)
|
| 188 |
logging.info(f"Dimensionality reduced using {method}")
|
| 189 |
+
|
| 190 |
+
@spaces.GPU
|
| 191 |
def cluster_data(self, method='HDBSCAN', **kwargs):
|
| 192 |
"""
|
| 193 |
Clusters the reduced dimensionality data using the specified clustering method.
|
|
|
|
| 208 |
self.cluster_labels = clusterer.labels_
|
| 209 |
logging.info(f"Data clustering complete using {method}")
|
| 210 |
|
| 211 |
+
@spaces.GPU
|
| 212 |
def get_tf_idf_clusters(self, top_n=2):
|
| 213 |
"""
|
| 214 |
Names clusters using the most frequent terms based on TF-IDF analysis.
|
|
|
|
| 390 |
# Update string labels to reflect merged labels
|
| 391 |
updated_string_labels = [cluster_terms[label] for label in updated_cluster_labels]
|
| 392 |
return updated_string_labels
|
| 393 |
+
|
| 394 |
+
@spaces.GPU
|
| 395 |
def cluster_cosine(self, cluster_terms, cluster_labels, similarity_threshold):
|
| 396 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 397 |
|
|
|
|
| 654 |
|
| 655 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 656 |
|
| 657 |
+
@spaces.GPU
|
| 658 |
def analyze_and_predict(data, analyzers, col_names):
|
| 659 |
"""
|
| 660 |
Performs analysis on the data using provided analyzers and makes predictions on specified columns.
|
|
|
|
| 682 |
logging.error(f"Error processing {col}: {e}")
|
| 683 |
return new_data
|
| 684 |
|
| 685 |
+
|
| 686 |
+
@spaces.GPU
|
| 687 |
def train_xgboost(x_train, y_train, x_test, y_test, num_classes):
|
| 688 |
"""
|
| 689 |
Trains an XGBoost model and evaluates its performance.
|
|
|
|
| 795 |
class UAPVisualizer:
|
| 796 |
def __init__(self, data=None):
|
| 797 |
pass # Initialization can be added if needed
|
| 798 |
+
|
| 799 |
def analyze_and_predict(self, data, analyzers, col_names):
|
| 800 |
new_data = pd.DataFrame()
|
| 801 |
for i, (column, analyzer) in enumerate(zip(col_names, analyzers)):
|