Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -113,16 +113,49 @@ from nltk.tokenize import word_tokenize
|
|
| 113 |
nltk.download('punkt')
|
| 114 |
nltk.download('stopwords')
|
| 115 |
|
| 116 |
-
def combined_text_processing(text):
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
# Tokenize and remove stopwords
|
| 123 |
-
tokens = word_tokenize(text.lower())
|
| 124 |
stop_words = set(stopwords.words('english'))
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
# Lemmatize tokens using SpaCy
|
| 128 |
doc = nlp(' '.join(tokens))
|
|
@@ -139,6 +172,97 @@ def combined_text_processing(text):
|
|
| 139 |
|
| 140 |
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
|
| 144 |
|
|
@@ -153,15 +277,25 @@ def combined_text_processing(text):
|
|
| 153 |
|
| 154 |
def nlp_pipeline(original_df):
|
| 155 |
# Data Preprocessing
|
| 156 |
-
processed_df = data_pre_processing(original_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
|
| 159 |
-
# Apply the combined function to your DataFrame
|
| 160 |
-
processed_df['Processed_ProblemDescription'] = processed_df['Problem_Description'].apply(combined_text_processing)
|
| 161 |
|
| 162 |
|
|
|
|
|
|
|
| 163 |
|
| 164 |
-
return processed_df
|
| 165 |
|
| 166 |
def process_excel(file):
|
| 167 |
try:
|
|
|
|
| 113 |
nltk.download('punkt')
|
| 114 |
nltk.download('stopwords')
|
| 115 |
|
| 116 |
+
# def combined_text_processing(text):
|
| 117 |
+
# # Remove punctuation, numbers, URLs, and special characters
|
| 118 |
+
# text = re.sub(r'[^\w\s]', '', text) # Remove punctuation and special characters
|
| 119 |
+
# text = re.sub(r'\d+', '', text) # Remove numbers
|
| 120 |
+
# text = re.sub(r'http\S+', '', text) # Remove URLs
|
| 121 |
+
|
| 122 |
+
# # Tokenize and remove stopwords
|
| 123 |
+
# tokens = word_tokenize(text.lower()) # Convert to lowercase
|
| 124 |
+
# stop_words = set(stopwords.words('english'))
|
| 125 |
+
# tokens = [word for word in tokens if word not in stop_words]
|
| 126 |
+
|
| 127 |
+
# # Lemmatize tokens using SpaCy
|
| 128 |
+
# doc = nlp(' '.join(tokens))
|
| 129 |
+
# lemmatized_text = ' '.join([token.lemma_ for token in doc])
|
| 130 |
+
|
| 131 |
+
# # Apply Hugging Face Transformers
|
| 132 |
+
# inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
|
| 133 |
+
# with torch.no_grad():
|
| 134 |
+
# outputs = model(**inputs)
|
| 135 |
+
|
| 136 |
+
# return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def text_processing_for_domain(text):
|
| 140 |
+
# Text Cleaning
|
| 141 |
+
text = re.sub(r'[^\w\s]', '', text)
|
| 142 |
+
text = re.sub(r'\d+', '', text)
|
| 143 |
+
text = re.sub(r'http\S+', '', text) # Remove https URLs
|
| 144 |
+
text = re.sub(r'www\.\S+', '', text) # Remove www URLs
|
| 145 |
|
| 146 |
# Tokenize and remove stopwords
|
| 147 |
+
tokens = word_tokenize(text.lower())
|
| 148 |
stop_words = set(stopwords.words('english'))
|
| 149 |
+
custom_stopwords = {'example', 'another'} # Add custom stopwords
|
| 150 |
+
tokens = [word for word in tokens if word not in stop_words and word not in custom_stopwords]
|
| 151 |
+
|
| 152 |
+
# NER - Remove named entities
|
| 153 |
+
doc = nlp(' '.join(tokens))
|
| 154 |
+
tokens = [token.text for token in doc if not token.ent_type_]
|
| 155 |
+
|
| 156 |
+
# POS Tagging (optional)
|
| 157 |
+
pos_tags = nltk.pos_tag(tokens)
|
| 158 |
+
tokens = [word for word, pos in pos_tags if pos in ['NN', 'NNS']] # Filter nouns
|
| 159 |
|
| 160 |
# Lemmatize tokens using SpaCy
|
| 161 |
doc = nlp(' '.join(tokens))
|
|
|
|
| 172 |
|
| 173 |
|
| 174 |
|
| 175 |
+
# # 2. Clustering from ChatGPT
|
| 176 |
+
# # Libraries: scikit-learn, sentence-transformers
|
| 177 |
+
# # Use sentence embeddings and clustering algorithms to group similar project proposals.
|
| 178 |
+
# from bertopic import BERTopic
|
| 179 |
+
# def perform_clustering(texts, n_clusters):
|
| 180 |
+
# topic_model = BERTopic(n_topics=n_clusters)
|
| 181 |
+
# topics, _ = topic_model.fit_transform(texts)
|
| 182 |
+
# return topics, topic_model
|
| 183 |
+
# # Clustering function call
|
| 184 |
+
# clustered_df, cluster_centers = clustering(processed_df)
|
| 185 |
+
# Method 1: Sentence Transformers + KMeans
|
| 186 |
+
|
| 187 |
+
# # 2. Clustering: from Claude
|
| 188 |
+
# # Use BERTopic for advanced topic modeling and clustering.
|
| 189 |
+
# from bertopic import BERTopic
|
| 190 |
+
# def perform_clustering(texts, n_clusters):
|
| 191 |
+
# topic_model = BERTopic(n_topics=n_clusters)
|
| 192 |
+
# topics, _ = topic_model.fit_transform(texts)
|
| 193 |
+
# return topics, topic_model
|
| 194 |
+
# # Clustering function call
|
| 195 |
+
# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
|
| 196 |
+
# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
|
| 197 |
+
# After the Method 2 (BERTopic) function above, the following needs to be done:
|
| 198 |
+
# processed_df['Problem_Cluster'] = problem_clusters
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# 2. Meta AI Function: Sentence Transformers + Hierarchical Clustering + Silhouette Analysis
|
| 203 |
+
# Now this also includes:
|
| 204 |
+
# Topic Modeling using BERTopic: Integrated BERTopic to extract representative words for each cluster.
|
| 205 |
+
# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
|
| 206 |
+
# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
|
| 207 |
+
|
| 208 |
+
from sentence_transformers import SentenceTransformer
|
| 209 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 210 |
+
from sklearn.metrics import silhouette_score
|
| 211 |
+
from bertopic import BERTopic
|
| 212 |
+
|
| 213 |
+
# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
|
| 214 |
+
def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
    """Cluster the texts in ``df[text_column]`` and attach representative words.

    Steps:
      1. Encode every document with the ``all-mpnet-base-v2`` sentence
         transformer.
      2. Run agglomerative clustering for every cluster count in
         ``cluster_range`` (inclusive) and keep the count with the highest
         silhouette score (the first one wins on ties).
      3. Fit BERTopic on the same documents/embeddings and, for each
         agglomerative cluster, take the words of the BERTopic topic that is
         most common among that cluster's documents.

    Args:
        df: DataFrame holding the documents. It is modified in place: the
            columns ``Problem_Cluster`` and ``Problem_Category_Words`` are
            added (or overwritten).
        text_column: Name of the column containing the raw text.
        cluster_range: ``(min_clusters, max_clusters)``, both inclusive. The
            range is clamped to ``[2, n_documents - 1]`` because the
            silhouette score is undefined outside of it.
        top_words: Maximum number of representative words kept per cluster.

    Returns:
        ``(df, optimal_n_clusters)`` - the same DataFrame object that was
        passed in and the selected number of clusters.

    Raises:
        ValueError: If fewer than three documents are supplied, since no
            cluster count then has a defined silhouette score.
    """
    from collections import Counter  # local import keeps this block self-contained

    # Missing cells would otherwise reach the encoder as float NaN.
    documents = df[text_column].fillna('').astype(str).tolist()
    n_documents = len(documents)
    if n_documents < 3:
        raise ValueError(
            "extract_problem_domains needs at least 3 documents to evaluate "
            f"clusterings, got {n_documents}"
        )

    # silhouette_score requires 2 <= n_clusters <= n_samples - 1, so clamp
    # the requested range instead of crashing on small inputs.
    upper = max(2, min(cluster_range[1], n_documents - 1))
    lower = max(2, min(cluster_range[0], upper))

    # Select Model (can we also optimize model selection automatically?)
    # Alternatives tried: 'all-MiniLM-L6-v2', 'paraphrase-MiniLM-L6-v2'
    model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = model.encode(documents)

    # Hierarchical clustering with silhouette analysis. The best labelling is
    # remembered so the winning model does not have to be fitted a second time.
    best_score = None
    optimal_n_clusters = lower
    cluster_labels = None
    for n_clusters in range(lower, upper + 1):
        candidate_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(embeddings)
        score = silhouette_score(embeddings, candidate_labels)
        if best_score is None or score > best_score:
            best_score = score
            optimal_n_clusters = n_clusters
            cluster_labels = candidate_labels

    # BERTopic for topic modelling. The constructor argument is `nr_topics`
    # (not `n_topics`), and `top_n_words` must be raised explicitly or at most
    # 10 words per topic are available. Passing the embeddings avoids
    # encoding every document a second time.
    topic_model = BERTopic(nr_topics=optimal_n_clusters, top_n_words=top_words)
    topics, _ = topic_model.fit_transform(documents, embeddings)

    # BERTopic topic ids are unrelated to the agglomerative labels, so link
    # them by majority vote: each cluster adopts its most frequent topic.
    topic_votes = {}
    for label, topic in zip(cluster_labels, topics):
        topic_votes.setdefault(int(label), Counter())[topic] += 1

    cluster_representations = {}
    for label, votes in topic_votes.items():
        dominant_topic = votes.most_common(1)[0][0]
        # get_topic yields (word, score) pairs; guard against a falsy result
        # for a topic id the model does not know.
        topic_words = topic_model.get_topic(dominant_topic) or []
        cluster_representations[label] = [word for word, _score in topic_words[:top_words]]

    # Map cluster labels to representative words
    df["Problem_Cluster"] = cluster_labels
    df['Problem_Category_Words'] = [cluster_representations[int(label)] for label in cluster_labels]

    return df, optimal_n_clusters
|
| 261 |
+
|
| 262 |
+
# Usage
|
| 263 |
+
# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
|
| 264 |
+
# print(f'Optimal number of clusters: {optimal_n_clusters}')
|
| 265 |
+
|
| 266 |
|
| 267 |
|
| 268 |
|
|
|
|
| 277 |
|
| 278 |
def nlp_pipeline(original_df):
    """Run preprocessing followed by problem-domain extraction.

    The input frame is passed through ``data_pre_processing``; the
    ``Problem_Description`` column is then transformed with
    ``text_processing_for_domain`` and stored under
    ``Processed_ProblemDescription_forDomainExtraction``. Finally
    ``extract_problem_domains`` is called on the frame and its first return
    value is handed back to the caller.
    """
    # Stage 1: data preprocessing (noted as the merged dataset by the author).
    merged_df = data_pre_processing(original_df)

    # Stage 2: domain-extraction text processing, applied row by row.
    descriptions = merged_df['Problem_Description']
    merged_df['Processed_ProblemDescription_forDomainExtraction'] = descriptions.apply(
        text_processing_for_domain
    )

    # Stage 3: domain clustering. The selected cluster count is not used here.
    # NOTE(review): clustering runs with its default text column, so the
    # processed column created above does not appear to be consumed - confirm
    # whether that is intended.
    domain_df, _ = extract_problem_domains(merged_df)

    # Disabled experiments kept for reference:
    # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
    # location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)

    return domain_df
|
| 298 |
|
|
|
|
| 299 |
|
| 300 |
def process_excel(file):
|
| 301 |
try:
|