Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -239,8 +239,83 @@ def extract_problem_domains(df,
|
|
| 239 |
|
| 240 |
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
# def nlp_pipeline(original_df):
|
| 246 |
def nlp_pipeline(original_df):
|
|
@@ -281,6 +356,27 @@ def nlp_pipeline(original_df):
|
|
| 281 |
# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
|
| 282 |
# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
|
| 286 |
|
|
|
|
| 239 |
|
| 240 |
|
| 241 |
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def Extract_Location(text):
    """Pull geographic entity mentions out of *text*.

    Runs the module-level spaCy pipeline ``nlp`` over the input and keeps
    entities labeled GPE (geo-political entity) or LOC (location),
    returning them joined by single spaces ('' when none are found).
    """
    parsed = nlp(text)
    place_names = []
    for entity in parsed.ents:
        if entity.label_ in ('GPE', 'LOC'):
            place_names.append(entity.text)
    return ' '.join(place_names)
|
| 250 |
+
|
| 251 |
+
def text_processing_for_location(text):
    """Reduce free text to a cleaned string of location words for clustering.

    Pipeline: spaCy location extraction -> lemmatization -> strip
    non-alphabetic characters -> lowercase tokenization -> stopword
    removal. Falls back to "India" when nothing survives cleaning.
    """
    # Locations only; the rest of the description is discarded here.
    raw_locations = Extract_Location(text)
    lemmatized = Lemmatize_text(raw_locations)

    # Keep letters and whitespace only (drops digits and punctuation).
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lemmatized)

    # Lowercase tokens minus English stopwords.
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in word_tokenize(letters_only.lower())
            if tok not in stop_words]

    cleaned = ' '.join(kept)
    if cleaned:
        return cleaned
    return "India"
|
| 270 |
|
| 271 |
|
| 272 |
+
def extract_location_clusters(df,
                              text_column='Processed_LocationText_forClustering',
                              cluster_range=(3, 10),
                              top_words=5):
    """Cluster the rows of *df* by the location text in *text_column*.

    Embeds each row's text with a SentenceTransformer, selects the number
    of clusters within *cluster_range* (inclusive) that maximizes the
    silhouette score of agglomerative clustering, then annotates the
    frame in place:

      * ``Location_Cluster``        - integer cluster label per row
      * ``Location_Category_Words`` - the *top_words* most frequent words
                                      of each row's cluster

    Returns ``(df, optimal_n_clusters)``. Appends progress notes to the
    module-level ``console_messages`` list.
    """
    console_messages.append("Extracting Location Clusters...")

    # Sentence Transformers approach for embeddings
    model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = model.encode(df[text_column].tolist())

    # silhouette_score is only defined for 2 <= k <= n_samples - 1, so
    # clamp the requested range to what the data can support instead of
    # letting sklearn raise on small frames.
    n_samples = len(df)
    lo = max(2, cluster_range[0])
    hi = min(cluster_range[1], n_samples - 1)

    if hi < lo:
        # Too few rows to split meaningfully: one cluster for everything.
        cluster_labels = [0] * n_samples
        optimal_n_clusters = 1
    else:
        # Hierarchical clustering with Silhouette Analysis over [lo, hi].
        silhouette_scores = []
        for n_clusters in range(lo, hi + 1):
            clustering = AgglomerativeClustering(n_clusters=n_clusters)
            cluster_labels = clustering.fit_predict(embeddings)
            silhouette_scores.append(silhouette_score(embeddings, cluster_labels))

        # Re-fit with the best-scoring number of clusters.
        optimal_n_clusters = lo + silhouette_scores.index(max(silhouette_scores))
        clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
        cluster_labels = clustering.fit_predict(embeddings)

    # Represent each cluster by its most frequent words. The boolean-list
    # mask works for both the ndarray labels and the list fallback above.
    cluster_representations = {}
    for i in range(optimal_n_clusters):
        mask = [label == i for label in cluster_labels]
        cluster_words = df.loc[mask, text_column].str.cat(sep=' ').split()
        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]

    # Map cluster labels to representative words
    df["Location_Cluster"] = cluster_labels
    df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]

    console_messages.append("Location Clustering completed.")
    return df, optimal_n_clusters
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
|
| 319 |
|
| 320 |
# def nlp_pipeline(original_df):
|
| 321 |
def nlp_pipeline(original_df):
|
|
|
|
| 356 |
# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
|
| 357 |
# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
|
| 358 |
|
| 359 |
+
console_messages.append("Starting NLP pipeline for location extraction...")
|
| 360 |
+
|
| 361 |
+
# Apply the text_processing_for_location function to the DataFrame
|
| 362 |
+
processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
|
| 363 |
+
|
| 364 |
+
# Location Clustering
|
| 365 |
+
try:
|
| 366 |
+
location_df, optimal_n_clusters = extract_location_clusters(processed_df)
|
| 367 |
+
console_messages.append("NLP pipeline for location extraction completed.")
|
| 368 |
+
return location_df
|
| 369 |
+
except Exception as e:
|
| 370 |
+
console_messages.append(f"Error in extract_location_clusters: {str(e)}")
|
| 371 |
+
return processed_df
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
|
| 380 |
|
| 381 |
|
| 382 |
|