Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Files Community

SantanuBanerjee committed on Aug 6, 2024

Commit

6a89968

verified ·

1 Parent(s): 95831be

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -41

app.py CHANGED Viewed

@@ -10,7 +10,7 @@ import gradio as gr
 import pandas as pd
 def data_pre_processing(file_responses):
-    console_messages.append("Starting data pre-processing...")
     # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
     try: # Define the columns to be processed
@@ -54,8 +54,7 @@ def data_pre_processing(file_responses):
         initial_dataset_2['Financial_Weight'] = file_responses['Personal_TaxDirection_2_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
         initial_dataset_3['Financial_Weight'] = file_responses['Personal_TaxDirection_3_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
-        # Removing useless rows
-        # Drop rows where Problem_Description is NaN or an empty string
         initial_dataset_1 = initial_dataset_1.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_2 = initial_dataset_2.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_3 = initial_dataset_3.dropna(subset=['Problem_Description'], axis=0)
@@ -65,19 +64,15 @@ def data_pre_processing(file_responses):
         initial_dataset_2['Problem_Description'] = initial_dataset_2['Problem_Description'].astype(str)
         initial_dataset_3['Problem_Description'] = initial_dataset_3['Problem_Description'].astype(str)
-        # Merging the Datasets
-        # Vertically concatenating (merging) the 3 DataFrames
         merged_dataset = pd.concat([initial_dataset_1, initial_dataset_2, initial_dataset_3], ignore_index=True)
         # A different return value can be substituted here to inspect intermediate processing results
-        console_messages.append("Data pre-processing completed.")
-        # return file_responses
         return merged_dataset
     except Exception as e:
-        console_messages.append(f"Error during data pre-processing: {str(e)}")
-        # return str(e), console_messages
         return None
@@ -201,7 +196,7 @@ def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
                             cluster_range=(6, 10),
                             top_words=7):
-    console_messages.append("Extracting Problem Domains...")
     # Sentence Transformers approach
     model = SentenceTransformer('all-mpnet-base-v2')
@@ -232,8 +227,7 @@ def extract_problem_domains(df,
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-    # console_messages.append("Returning from Problem Domain Extraction function.")
-    console_messages.append("Problem Domain Extraction completed.")
     return df, optimal_n_clusters, cluster_representations
@@ -282,13 +276,13 @@ def text_processing_for_location(text):
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
-                              cluster_range=(1, 5),
                               top_words=3):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
-    console_messages.append("Extracting Location Clusters...")
     # Sentence Transformers approach for embeddings
     model = SentenceTransformer('all-mpnet-base-v2')
@@ -320,7 +314,7 @@ def extract_location_clusters(df,
     df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
     df = df.drop(text_column, axis=1)
-    console_messages.append("Location Clustering completed.")
     return df, optimal_n_clusters, cluster_representations
@@ -408,17 +402,14 @@ def generate_project_proposal(prompt):
 def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
-    print("\n Starting function: create_project_proposals")
-    console_messages.append("\n Starting function: create_project_proposals")
     proposals = {}
     for loc in budget_cluster_df.index:
-        print("\n loc: ", loc)
-        console_messages.append(f"\n loc: {loc}")
         for prob in budget_cluster_df.columns:
-            console_messages.append(f"\n prob: {prob}")
-            print("\n prob: ", prob)
             location = ", ".join([item.strip() for item in location_clusters[loc] if item])  # Clean and join
             problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item])  # Clean and join
@@ -432,7 +423,7 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
             # Check if problem_descriptions is valid (not NaN and not an empty list)
             if isinstance(problem_descriptions, list) and problem_descriptions:
                 # print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
-                print(f"Generating PP")
                 # Prepare the prompt
                 # problems_summary = "; \n".join(problem_descriptions)  # Join all problem descriptions
@@ -531,17 +522,17 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
 def nlp_pipeline(original_df):
-    console_messages.append("Starting NLP pipeline...")
     # Data Preprocessing
     processed_df = data_pre_processing(original_df) # merged_dataset
     # Starting the Pipeline for Domain Extraction
-    console_messages.append("Executing Text processing function for Domain identification")
     # Apply the text_processing_for_domain function to the DataFrame
     processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
-    console_messages.append("Removing entries which could not be allocated to any Problem Domain")
     # processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
     # Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
     processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
@@ -549,13 +540,13 @@ def nlp_pipeline(original_df):
     # Domain Clustering
     try:
         processed_df, optimal_n_clusters, problem_clusters = extract_problem_domains(processed_df)
-        console_messages.append(f"Optimal clusters for Domain extraction: {optimal_n_clusters}")
     except Exception as e:
-        console_messages.append(f"Error in extract_problem_domains: {str(e)}")
-    console_messages.append("NLP pipeline for Problem Domain extraction completed.")
-    console_messages.append("Starting NLP pipeline for Location extraction with text processing.")
     # Apply the text_processing_for_location function to the DataFrame
     processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
@@ -564,10 +555,10 @@ def nlp_pipeline(original_df):
     # Location Clustering
     try:
         processed_df, optimal_n_clusters, location_clusters = extract_location_clusters(processed_df)
-        console_messages.append(f"Optimal clusters for Location extraction: {optimal_n_clusters}")
     except Exception as e:
-        console_messages.append(f"Error in extract_location_clusters: {str(e)}")
-    console_messages.append("NLP pipeline for location extraction completed.")
     # Create cluster dataframes
@@ -585,8 +576,7 @@ def nlp_pipeline(original_df):
     # print("\n problem_clusters_2: ", problem_clusters)
     project_proposals = create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters)
-    console_messages.append("NLP pipeline completed.")
-    print("NLP pipeline completed.")
     return processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters
@@ -597,8 +587,15 @@ def nlp_pipeline(original_df):
 console_messages = []
 def process_excel(file):
-    console_messages.append("Processing starts. Reading the uploaded Excel file...")
     # Ensure the file path is correct
     file_path = file.name if hasattr(file, 'name') else file
     # Read the Excel file
@@ -606,7 +603,7 @@ def process_excel(file):
     try:
         # Process the DataFrame
-        console_messages.append("Processing the DataFrame...")
         processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters = nlp_pipeline(df)
         # processed_df, budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters  = nlp_pipeline(df)
@@ -628,25 +625,25 @@ def process_excel(file):
             # if isinstance(location_clusters, pd.DataFrame):
             #     location_clusters.to_excel(writer, sheet_name='Location_Clusters', index=False)
             # else:
-            #     console_messages.append("Converting Location Clusters to df")
             #     pd.DataFrame(location_clusters).to_excel(writer, sheet_name='Location_Clusters', index=False)
             # if isinstance(problem_clusters, pd.DataFrame):
             #     problem_clusters.to_excel(writer, sheet_name='Problem_Clusters', index=False)
             # else:
-            #     console_messages.append("Converting Problem Clusters to df")
             #     pd.DataFrame(problem_clusters).to_excel(writer, sheet_name='Problem_Clusters', index=False)
-        console_messages.append("Processing completed. Ready for download.")
         return output_filename, "\n".join(console_messages)  # Return the processed DataFrame as Excel file
     except Exception as e:
         # return str(e)  # Return the error message
         # error_message = f"Error processing file: {str(e)}"
         # print(error_message)  # Log the error
-        console_messages.append(f"Error during processing: {str(e)}")
         # return error_message, "Santanu Banerjee" # Return the error message to the user
         return None, "\n".join(console_messages)

 import pandas as pd
 def data_pre_processing(file_responses):
+    consoleMessage_and_Print("Starting data pre-processing...")
     # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
     try: # Define the columns to be processed
         initial_dataset_2['Financial_Weight'] = file_responses['Personal_TaxDirection_2_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
         initial_dataset_3['Financial_Weight'] = file_responses['Personal_TaxDirection_3_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
+        # Removing useless rows # Drop rows where Problem_Description is NaN or an empty string
         initial_dataset_1 = initial_dataset_1.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_2 = initial_dataset_2.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_3 = initial_dataset_3.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_2['Problem_Description'] = initial_dataset_2['Problem_Description'].astype(str)
         initial_dataset_3['Problem_Description'] = initial_dataset_3['Problem_Description'].astype(str)
+        # Merging the Datasets # Vertically concatenating (merging) the 3 DataFrames
         merged_dataset = pd.concat([initial_dataset_1, initial_dataset_2, initial_dataset_3], ignore_index=True)
         # A different return value can be substituted here to inspect intermediate processing results
+        consoleMessage_and_Print("Data pre-processing completed.")
         return merged_dataset
     except Exception as e:
+        consoleMessage_and_Print(f"Error during data pre-processing: {str(e)}")
         return None
                             text_column='Processed_ProblemDescription_forDomainExtraction',
                             cluster_range=(6, 10),
                             top_words=7):
+    consoleMessage_and_Print("Extracting Problem Domains...")
     # Sentence Transformers approach
     model = SentenceTransformer('all-mpnet-base-v2')
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+    consoleMessage_and_Print("Problem Domain Extraction completed. Returning from Problem Domain Extraction function.")
     return df, optimal_n_clusters, cluster_representations
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
+                              cluster_range=(2, 5),
                               top_words=3):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
+    consoleMessage_and_Print("Extracting Location Clusters...")
     # Sentence Transformers approach for embeddings
     model = SentenceTransformer('all-mpnet-base-v2')
     df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
     df = df.drop(text_column, axis=1)
+    consoleMessage_and_Print("Location Clustering completed.")
     return df, optimal_n_clusters, cluster_representations
 def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
+    consoleMessage_and_Print("\n Starting function: create_project_proposals")
     proposals = {}
     for loc in budget_cluster_df.index:
+        consoleMessage_and_Print(f"\n loc: {loc}")
         for prob in budget_cluster_df.columns:
+            consoleMessage_and_Print(f"\n prob: {prob}")
             location = ", ".join([item.strip() for item in location_clusters[loc] if item])  # Clean and join
             problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item])  # Clean and join
             # Check if problem_descriptions is valid (not NaN and not an empty list)
             if isinstance(problem_descriptions, list) and problem_descriptions:
                 # print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
+                consoleMessage_and_Print(f"Generating PP")
                 # Prepare the prompt
                 # problems_summary = "; \n".join(problem_descriptions)  # Join all problem descriptions
 def nlp_pipeline(original_df):
+    consoleMessage_and_Print("Starting NLP pipeline...")
     # Data Preprocessing
     processed_df = data_pre_processing(original_df) # merged_dataset
     # Starting the Pipeline for Domain Extraction
+    consoleMessage_and_Print("Executing Text processing function for Domain identification")
     # Apply the text_processing_for_domain function to the DataFrame
     processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
+    consoleMessage_and_Print("Removing entries which could not be allocated to any Problem Domain")
     # processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
     # Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
     processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
     # Domain Clustering
     try:
         processed_df, optimal_n_clusters, problem_clusters = extract_problem_domains(processed_df)
+        consoleMessage_and_Print(f"Optimal clusters for Domain extraction: {optimal_n_clusters}")
     except Exception as e:
+        consoleMessage_and_Print(f"Error in extract_problem_domains: {str(e)}")
+    consoleMessage_and_Print("NLP pipeline for Problem Domain extraction completed.")
+    consoleMessage_and_Print("Starting NLP pipeline for Location extraction with text processing.")
     # Apply the text_processing_for_location function to the DataFrame
     processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
     # Location Clustering
     try:
         processed_df, optimal_n_clusters, location_clusters = extract_location_clusters(processed_df)
+        consoleMessage_and_Print(f"Optimal clusters for Location extraction: {optimal_n_clusters}")
     except Exception as e:
+        consoleMessage_and_Print(f"Error in extract_location_clusters: {str(e)}")
+    consoleMessage_and_Print("NLP pipeline for location extraction completed.")
     # Create cluster dataframes
     # print("\n problem_clusters_2: ", problem_clusters)
     project_proposals = create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters)
+    consoleMessage_and_Print("NLP pipeline completed.")
     return processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters
 console_messages = []
+def consoleMessage_and_Print(some_text = ""):
+    console_messages.append(some_text)
+    print(some_text)
 def process_excel(file):
+    consoleMessage_and_Print("Processing starts. Reading the uploaded Excel file...")
     # Ensure the file path is correct
     file_path = file.name if hasattr(file, 'name') else file
     # Read the Excel file
     try:
         # Process the DataFrame
+        consoleMessage_and_Print("Processing the DataFrame...")
         processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters = nlp_pipeline(df)
         # processed_df, budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters  = nlp_pipeline(df)
             # if isinstance(location_clusters, pd.DataFrame):
             #     location_clusters.to_excel(writer, sheet_name='Location_Clusters', index=False)
             # else:
+            #     consoleMessage_and_Print("Converting Location Clusters to df")
             #     pd.DataFrame(location_clusters).to_excel(writer, sheet_name='Location_Clusters', index=False)
             # if isinstance(problem_clusters, pd.DataFrame):
             #     problem_clusters.to_excel(writer, sheet_name='Problem_Clusters', index=False)
             # else:
+            #     consoleMessage_and_Print("Converting Problem Clusters to df")
             #     pd.DataFrame(problem_clusters).to_excel(writer, sheet_name='Problem_Clusters', index=False)
+        consoleMessage_and_Print("Processing completed. Ready for download.")
         return output_filename, "\n".join(console_messages)  # Return the processed DataFrame as Excel file
     except Exception as e:
         # return str(e)  # Return the error message
         # error_message = f"Error processing file: {str(e)}"
         # print(error_message)  # Log the error
+        consoleMessage_and_Print(f"Error during processing: {str(e)}")
         # return error_message, "Santanu Banerjee" # Return the error message to the user
         return None, "\n".join(console_messages)