Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Community

SantanuBanerjee committed on Aug 5, 2024

Commit

89ee992

verified ·

1 Parent(s): 31a5c30

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -134

app.py CHANGED Viewed

@@ -248,58 +248,36 @@ def extract_problem_domains(df,
-import spacy
-from geopy.geocoders import Nominatim
-from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
-import pandas as pd
-nlp = spacy.load('en_core_web_sm')
-geolocator = Nominatim(user_agent="my_agent")
-def extract_and_geocode_locations(text, user_locations):
-    # Extract locations from text
     doc = nlp(text)
-    extracted_locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
-    # Combine extracted locations with user-provided locations
-    all_locations = list(set(extracted_locations + user_locations.split(', ')))
-    geocoded_locations = []
-    for loc in all_locations:
-        try:
-            location = geolocator.geocode(loc)
-            if location:
-                geocoded_locations.append({
-                    'name': loc,
-                    'latitude': location.latitude,
-                    'longitude': location.longitude,
-                    'country': location.raw.get('display_name', '').split(', ')[-1]
-                })
-            else:
-                # If geocoding fails, add the location without coordinates
-                geocoded_locations.append({
-                    'name': loc,
-                    'latitude': None,
-                    'longitude': None,
-                    'country': None
-                })
-        except (GeocoderTimedOut, GeocoderUnavailable):
-            print(f"Geocoding failed for {loc}")
-            # Add the location without coordinates
-            geocoded_locations.append({
-                'name': loc,
-                'latitude': None,
-                'longitude': None,
-                'country': None
-            })
-    return geocoded_locations
-def text_processing_for_location(row):
-    locations = extract_and_geocode_locations(row['Problem_Description'], row['Geographical_Location'])
-    location_text = ' '.join([loc['name'] for loc in locations])
-    processed_text = Lemmatize_text(location_text)
-    return processed_text, locations
 def extract_location_clusters(df,
                               text_column='Processed_LocationText_forClustering',
@@ -355,86 +333,6 @@ def extract_location_clusters(df,
-# def Extract_Location(text):
-#     doc = nlp(text)
-#     locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
-#     return ' '.join(locations)
-# def text_processing_for_location(text):
-#     # Extract locations
-#     locations_text = Extract_Location(text)
-#     # Perform further text cleaning if necessary
-#     processed_locations_text = Lemmatize_text(locations_text)
-#     # Remove special characters, digits, and punctuation
-#     processed_locations_text = re.sub(r'[^a-zA-Z\s]', '', processed_locations_text)
-#     # Tokenize and remove stopwords
-#     tokens = word_tokenize(processed_locations_text.lower())
-#     stop_words = set(stopwords.words('english'))
-#     tokens = [word for word in tokens if word not in stop_words]
-#     # Join location words into a single string
-#     final_locations_text = ' '.join(tokens)
-#     return final_locations_text if final_locations_text else "India"
-# def extract_location_clusters(df,
-#                               text_column='Processed_LocationText_forClustering',
-#                               cluster_range=(3, 10),
-#                               top_words=5):
-#     console_messages.append("Extracting Location Clusters...")
-#     # Sentence Transformers approach for embeddings
-#     model = SentenceTransformer('all-mpnet-base-v2')
-#     embeddings = model.encode(df[text_column].tolist())
-#     # Perform hierarchical clustering with Silhouette Analysis
-#     silhouette_scores = []
-#     for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-#         clustering = AgglomerativeClustering(n_clusters=n_clusters)
-#         cluster_labels = clustering.fit_predict(embeddings)
-#         silhouette_avg = silhouette_score(embeddings, cluster_labels)
-#         silhouette_scores.append(silhouette_avg)
-#     # Determine the optimal number of clusters
-#     optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-#     # Perform clustering with the optimal number of clusters
-#     clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-#     cluster_labels = clustering.fit_predict(embeddings)
-#     # Get representative words for each cluster
-#     cluster_representations = {}
-#     for i in range(optimal_n_clusters):
-#         cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-#         cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-#     # Map cluster labels to representative words
-#     df["Location_Cluster"] = cluster_labels
-#     df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-#     console_messages.append("Location Clustering completed.")
-#     return df, optimal_n_clusters
 def nlp_pipeline(original_df):
@@ -519,14 +417,16 @@ def process_excel(file):
 # example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
 #                  '#TaxDirection (Responses)_IntermediateExample.xlsx',
-#                  '#TaxDirection (Responses)_UltimateExample.xlsx'
-#                 ]
-example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
-                 '#TaxDirection (Responses)_IntermediateExample.xlsx',
-                 ]
 import random

+def Extract_Location(text):
     doc = nlp(text)
+    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
+    return ' '.join(locations)
+def text_processing_for_location(text):
+    # Extract locations
+    locations_text = Extract_Location(text)
+    # Perform further text cleaning if necessary
+    processed_locations_text = Lemmatize_text(locations_text)
+    # Remove special characters, digits, and punctuation
+    processed_locations_text = re.sub(r'[^a-zA-Z\s]', '', processed_locations_text)
+    # Tokenize and remove stopwords
+    tokens = word_tokenize(processed_locations_text.lower())
+    stop_words = set(stopwords.words('english'))
+    tokens = [word for word in tokens if word not in stop_words]
+    # Join location words into a single string
+    final_locations_text = ' '.join(tokens)
+    return final_locations_text if final_locations_text else "India"
 def extract_location_clusters(df,
                               text_column='Processed_LocationText_forClustering',
 def nlp_pipeline(original_df):
+example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
+                 '#TaxDirection (Responses)_IntermediateExample.xlsx',
+                 '#TaxDirection (Responses)_UltimateExample.xlsx'
+                ]
 # example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
 #                  '#TaxDirection (Responses)_IntermediateExample.xlsx',
+#                  ]
+# example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',]
 import random