Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -239,8 +239,83 @@ def extract_problem_domains(df,
|
|
| 239 |
|
| 240 |
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
# def nlp_pipeline(original_df):
|
| 246 |
def nlp_pipeline(original_df):
|
|
@@ -281,6 +356,27 @@ def nlp_pipeline(original_df):
|
|
| 281 |
# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
|
| 282 |
# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
|
| 286 |
|
|
|
|
| 239 |
|
| 240 |
|
| 241 |
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def Extract_Location(text):
    """Pull geographic entity mentions out of *text*.

    Runs the module-level spaCy pipeline ``nlp`` over the input and keeps
    entities labeled GPE (geo-political entity) or LOC (location),
    returning them joined by single spaces ('' when none are found).
    """
    parsed = nlp(text)
    place_names = []
    for entity in parsed.ents:
        if entity.label_ in ('GPE', 'LOC'):
            place_names.append(entity.text)
    return ' '.join(place_names)
|
| 250 |
+
|
| 251 |
+
def text_processing_for_location(text):
    """Reduce free text to a cleaned string of location words for clustering.

    Pipeline: spaCy location extraction -> lemmatization -> strip
    non-alphabetic characters -> lowercase tokenization -> stopword
    removal. Falls back to "India" when nothing survives cleaning.
    """
    # Locations only; the rest of the description is discarded here.
    raw_locations = Extract_Location(text)
    lemmatized = Lemmatize_text(raw_locations)

    # Keep letters and whitespace only (drops digits and punctuation).
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lemmatized)

    # Lowercase tokens minus English stopwords.
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in word_tokenize(letters_only.lower())
            if tok not in stop_words]

    cleaned = ' '.join(kept)
    if cleaned:
        return cleaned
    return "India"
|
| 270 |
|
| 271 |
|
| 272 |
+
def extract_location_clusters(df,
                              text_column='Processed_LocationText_forClustering',
                              cluster_range=(3, 10),
                              top_words=5):
    """Cluster the rows of *df* by the location text in *text_column*.

    Embeds each row's text with a SentenceTransformer, selects the number
    of clusters within *cluster_range* (inclusive) that maximizes the
    silhouette score of agglomerative clustering, then annotates the
    frame in place:

      * ``Location_Cluster``        - integer cluster label per row
      * ``Location_Category_Words`` - the *top_words* most frequent words
                                      of each row's cluster

    Returns ``(df, optimal_n_clusters)``. Appends progress notes to the
    module-level ``console_messages`` list.
    """
    console_messages.append("Extracting Location Clusters...")

    # Sentence Transformers approach for embeddings
    model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = model.encode(df[text_column].tolist())

    # silhouette_score is only defined for 2 <= k <= n_samples - 1, so
    # clamp the requested range to what the data can support instead of
    # letting sklearn raise on small frames.
    n_samples = len(df)
    lo = max(2, cluster_range[0])
    hi = min(cluster_range[1], n_samples - 1)

    if hi < lo:
        # Too few rows to split meaningfully: one cluster for everything.
        cluster_labels = [0] * n_samples
        optimal_n_clusters = 1
    else:
        # Hierarchical clustering with Silhouette Analysis over [lo, hi].
        silhouette_scores = []
        for n_clusters in range(lo, hi + 1):
            clustering = AgglomerativeClustering(n_clusters=n_clusters)
            cluster_labels = clustering.fit_predict(embeddings)
            silhouette_scores.append(silhouette_score(embeddings, cluster_labels))

        # Re-fit with the best-scoring number of clusters.
        optimal_n_clusters = lo + silhouette_scores.index(max(silhouette_scores))
        clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
        cluster_labels = clustering.fit_predict(embeddings)

    # Represent each cluster by its most frequent words. The boolean-list
    # mask works for both the ndarray labels and the list fallback above.
    cluster_representations = {}
    for i in range(optimal_n_clusters):
        mask = [label == i for label in cluster_labels]
        cluster_words = df.loc[mask, text_column].str.cat(sep=' ').split()
        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]

    # Map cluster labels to representative words
    df["Location_Cluster"] = cluster_labels
    df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]

    console_messages.append("Location Clustering completed.")
    return df, optimal_n_clusters
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
|
| 319 |
|
| 320 |
# def nlp_pipeline(original_df):
|
| 321 |
def nlp_pipeline(original_df):
|
|
|
|
| 356 |
# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
|
| 357 |
# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
|
| 358 |
|
| 359 |
+
console_messages.append("Starting NLP pipeline for location extraction...")
|
| 360 |
+
|
| 361 |
+
# Apply the text_processing_for_location function to the DataFrame
|
| 362 |
+
processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
|
| 363 |
+
|
| 364 |
+
# Location Clustering
|
| 365 |
+
try:
|
| 366 |
+
location_df, optimal_n_clusters = extract_location_clusters(processed_df)
|
| 367 |
+
console_messages.append("NLP pipeline for location extraction completed.")
|
| 368 |
+
return location_df
|
| 369 |
+
except Exception as e:
|
| 370 |
+
console_messages.append(f"Error in extract_location_clusters: {str(e)}")
|
| 371 |
+
return processed_df
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
|
| 380 |
|
| 381 |
|
| 382 |
|