Update app.py
app.py CHANGED
@@ -243,31 +243,63 @@ def extract_problem_domains(df,
 
-def Extract_Location(text):
-    doc = nlp(text)
-    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
-    return ' '.join(locations)
+import spacy
+from geopy.geocoders import Nominatim
+from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
+import pandas as pd
+
+nlp = spacy.load('en_core_web_sm')
+geolocator = Nominatim(user_agent="my_agent")
+
+def extract_and_geocode_locations(text, user_locations):
+    # Extract locations from text
+    doc = nlp(text)
+    extracted_locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
+
+    # Combine extracted locations with user-provided locations
+    all_locations = list(set(extracted_locations + user_locations.split(', ')))
+
+    geocoded_locations = []
+    for loc in all_locations:
+        try:
+            location = geolocator.geocode(loc)
+            if location:
+                geocoded_locations.append({
+                    'name': loc,
+                    'latitude': location.latitude,
+                    'longitude': location.longitude,
+                    'country': location.raw.get('display_name', '').split(', ')[-1]
+                })
+            else:
+                # If geocoding fails, add the location without coordinates
+                geocoded_locations.append({
+                    'name': loc,
+                    'latitude': None,
+                    'longitude': None,
+                    'country': None
+                })
+        except (GeocoderTimedOut, GeocoderUnavailable):
+            print(f"Geocoding failed for {loc}")
+            # Add the location without coordinates
+            geocoded_locations.append({
+                'name': loc,
+                'latitude': None,
+                'longitude': None,
+                'country': None
+            })
+
+    return geocoded_locations
+
+def text_processing_for_location(row):
+    locations = extract_and_geocode_locations(row['Problem_Description'], row['Geographical_Location'])
+    location_text = ' '.join([loc['name'] for loc in locations])
+    processed_text = Lemmatize_text(location_text)
+    return processed_text, locations
 
 def extract_location_clusters(df,
                               text_column='Processed_LocationText_forClustering',
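For reference, a minimal usage sketch of the new helper; the sample text and user-provided locations below are hypothetical, and it assumes the en_core_web_sm model has been downloaded:

# Hypothetical inputs; assumes `python -m spacy download en_core_web_sm`
# has been run so spacy.load('en_core_web_sm') succeeds.
text = "Flooding has displaced several families in Chennai and parts of Kerala."
user_locations = "Mumbai, Delhi"

for loc in extract_and_geocode_locations(text, user_locations):
    # Each entry is a dict with 'name', 'latitude', 'longitude', 'country';
    # coordinates are None when geocoding fails.
    print(loc['name'], loc['latitude'], loc['longitude'], loc['country'])

Because duplicates are dropped with set(), the order of the returned locations is not deterministic.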
@@ -314,10 +346,97 @@ def extract_location_clusters(df,
 
-import copy
 
+
+# def Extract_Location(text):
+#     doc = nlp(text)
+#     locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
+#     return ' '.join(locations)
+
+# def text_processing_for_location(text):
+#     # Extract locations
+#     locations_text = Extract_Location(text)
+
+#     # Perform further text cleaning if necessary
+#     processed_locations_text = Lemmatize_text(locations_text)
+
+#     # Remove special characters, digits, and punctuation
+#     processed_locations_text = re.sub(r'[^a-zA-Z\s]', '', processed_locations_text)
+
+#     # Tokenize and remove stopwords
+#     tokens = word_tokenize(processed_locations_text.lower())
+#     stop_words = set(stopwords.words('english'))
+#     tokens = [word for word in tokens if word not in stop_words]
+
+#     # Join location words into a single string
+#     final_locations_text = ' '.join(tokens)
+
+#     return final_locations_text if final_locations_text else "India"
+
+
+# def extract_location_clusters(df,
+#                               text_column='Processed_LocationText_forClustering',
+#                               cluster_range=(3, 10),
+#                               top_words=5):
+#     console_messages.append("Extracting Location Clusters...")
+
+#     # Sentence Transformers approach for embeddings
+#     model = SentenceTransformer('all-mpnet-base-v2')
+#     embeddings = model.encode(df[text_column].tolist())
+
+#     # Perform hierarchical clustering with Silhouette Analysis
+#     silhouette_scores = []
+#     for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+#         clustering = AgglomerativeClustering(n_clusters=n_clusters)
+#         cluster_labels = clustering.fit_predict(embeddings)
+#         silhouette_avg = silhouette_score(embeddings, cluster_labels)
+#         silhouette_scores.append(silhouette_avg)
+
+#     # Determine the optimal number of clusters
+#     optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+#     # Perform clustering with the optimal number of clusters
+#     clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+#     cluster_labels = clustering.fit_predict(embeddings)
+
+#     # Get representative words for each cluster
+#     cluster_representations = {}
+#     for i in range(optimal_n_clusters):
+#         cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+#         cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
+
+#     # Map cluster labels to representative words
+#     df["Location_Cluster"] = cluster_labels
+#     df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+
+#     console_messages.append("Location Clustering completed.")
+#     return df, optimal_n_clusters
 
 def nlp_pipeline(original_df):
     console_messages.append("Starting NLP pipeline...")
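The extract_location_clusters logic preserved (commented out) above selects the cluster count by silhouette analysis. A self-contained sketch of just that selection step, with random vectors standing in for the sentence embeddings:

# Silhouette-based selection of the cluster count, as in the commented
# helper; the toy embeddings are hypothetical stand-ins.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

embeddings = np.random.rand(20, 8)  # stand-in for SentenceTransformer output
scores = []
for n in range(3, 11):
    labels = AgglomerativeClustering(n_clusters=n).fit_predict(embeddings)
    scores.append(silhouette_score(embeddings, labels))
best_n = 3 + int(np.argmax(scores))  # cluster count with the best silhouette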
@@ -344,8 +463,10 @@ def nlp_pipeline(original_df):
 
     console_messages.append("Starting NLP pipeline for Location extraction with text processing.")
+
     # Apply the text_processing_for_location function to the DataFrame
-    processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
+    # processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
+    processed_df['Processed_LocationText_forClustering'], processed_df['Extracted_Locations'] = zip(*processed_df.apply(text_processing_for_location, axis=1))
 
     # Location Clustering
     try:
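The replacement line uses a common pandas idiom: a row-wise function that returns a tuple is fanned out into two columns with zip(*...). A minimal sketch, with a hypothetical stand-in for text_processing_for_location:

import pandas as pd

df = pd.DataFrame({'Problem_Description': ['Power outage in Pune'],
                   'Geographical_Location': ['Maharashtra, India']})

def row_fn(row):  # hypothetical stand-in for text_processing_for_location
    return row['Problem_Description'].lower(), [row['Geographical_Location']]

# apply(..., axis=1) yields one tuple per row; zip(*...) transposes those
# tuples into two sequences, one per new column.
df['Processed'], df['Extracted'] = zip(*df.apply(row_fn, axis=1))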
@@ -355,12 +476,10 @@ def nlp_pipeline(original_df):
         console_messages.append(f"Error in extract_location_clusters: {str(e)}")
     console_messages.append("NLP pipeline for location extraction completed.")
 
-
+
     console_messages.append("NLP pipeline completed.")
     return processed_df
-
-
-
+
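One caution on the new geocoding path: geolocator.geocode is called once per distinct location, and Nominatim's public endpoint expects roughly one request per second. If throttling becomes necessary, geopy's RateLimiter wrapper is one option; a sketch, where the one-second delay is an assumption based on that usage policy:

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="my_agent")
# Space out successive calls; retry and error handling can be tuned via
# RateLimiter's other arguments (max_retries, error_wait_seconds, ...).
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
location = geocode("Chennai")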