Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,7 +10,7 @@ import gradio as gr
|
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
def data_pre_processing(file_responses):
|
| 13 |
-
|
| 14 |
# Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
|
| 15 |
|
| 16 |
try: # Define the columns to be processed
|
|
@@ -54,8 +54,7 @@ def data_pre_processing(file_responses):
|
|
| 54 |
initial_dataset_2['Financial_Weight'] = file_responses['Personal_TaxDirection_2_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
|
| 55 |
initial_dataset_3['Financial_Weight'] = file_responses['Personal_TaxDirection_3_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
|
| 56 |
|
| 57 |
-
# Removing useless rows
|
| 58 |
-
# Drop rows where Problem_Description is NaN or an empty string
|
| 59 |
initial_dataset_1 = initial_dataset_1.dropna(subset=['Problem_Description'], axis=0)
|
| 60 |
initial_dataset_2 = initial_dataset_2.dropna(subset=['Problem_Description'], axis=0)
|
| 61 |
initial_dataset_3 = initial_dataset_3.dropna(subset=['Problem_Description'], axis=0)
|
|
@@ -65,19 +64,15 @@ def data_pre_processing(file_responses):
|
|
| 65 |
initial_dataset_2['Problem_Description'] = initial_dataset_2['Problem_Description'].astype(str)
|
| 66 |
initial_dataset_3['Problem_Description'] = initial_dataset_3['Problem_Description'].astype(str)
|
| 67 |
|
| 68 |
-
# Merging the Datasets
|
| 69 |
-
# Vertically concatenating (merging) the 3 DataFrames
|
| 70 |
merged_dataset = pd.concat([initial_dataset_1, initial_dataset_2, initial_dataset_3], ignore_index=True)
|
| 71 |
-
|
| 72 |
|
| 73 |
# Different return can be used to check the processing
|
| 74 |
-
|
| 75 |
-
# return file_responses
|
| 76 |
return merged_dataset
|
| 77 |
|
| 78 |
except Exception as e:
|
| 79 |
-
|
| 80 |
-
# return str(e), console_messages
|
| 81 |
return None
|
| 82 |
|
| 83 |
|
|
@@ -201,7 +196,7 @@ def extract_problem_domains(df,
|
|
| 201 |
text_column='Processed_ProblemDescription_forDomainExtraction',
|
| 202 |
cluster_range=(6, 10),
|
| 203 |
top_words=7):
|
| 204 |
-
|
| 205 |
|
| 206 |
# Sentence Transformers approach
|
| 207 |
model = SentenceTransformer('all-mpnet-base-v2')
|
|
@@ -232,8 +227,7 @@ def extract_problem_domains(df,
|
|
| 232 |
df["Problem_Cluster"] = cluster_labels
|
| 233 |
df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 234 |
|
| 235 |
-
|
| 236 |
-
console_messages.append("Problem Domain Extraction completed.")
|
| 237 |
return df, optimal_n_clusters, cluster_representations
|
| 238 |
|
| 239 |
|
|
@@ -282,13 +276,13 @@ def text_processing_for_location(text):
|
|
| 282 |
def extract_location_clusters(df,
|
| 283 |
text_column1='Processed_LocationText_forClustering', # Extracted through NLP
|
| 284 |
text_column2='Geographical_Location', # User Input
|
| 285 |
-
cluster_range=(
|
| 286 |
top_words=3):
|
| 287 |
# Combine the two text columns
|
| 288 |
text_column = "Combined_Location_Text"
|
| 289 |
df[text_column] = df[text_column1] + ' ' + df[text_column2]
|
| 290 |
|
| 291 |
-
|
| 292 |
|
| 293 |
# Sentence Transformers approach for embeddings
|
| 294 |
model = SentenceTransformer('all-mpnet-base-v2')
|
|
@@ -320,7 +314,7 @@ def extract_location_clusters(df,
|
|
| 320 |
df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 321 |
|
| 322 |
df = df.drop(text_column, axis=1)
|
| 323 |
-
|
| 324 |
return df, optimal_n_clusters, cluster_representations
|
| 325 |
|
| 326 |
|
|
@@ -408,17 +402,14 @@ def generate_project_proposal(prompt):
|
|
| 408 |
|
| 409 |
|
| 410 |
def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
|
| 411 |
-
|
| 412 |
-
console_messages.append("\n Starting function: create_project_proposals")
|
| 413 |
proposals = {}
|
| 414 |
|
| 415 |
for loc in budget_cluster_df.index:
|
| 416 |
-
|
| 417 |
-
console_messages.append(f"\n loc: {loc}")
|
| 418 |
|
| 419 |
for prob in budget_cluster_df.columns:
|
| 420 |
-
|
| 421 |
-
print("\n prob: ", prob)
|
| 422 |
|
| 423 |
location = ", ".join([item.strip() for item in location_clusters[loc] if item]) # Clean and join
|
| 424 |
problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item]) # Clean and join
|
|
@@ -432,7 +423,7 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
|
|
| 432 |
# Check if problem_descriptions is valid (not NaN and not an empty list)
|
| 433 |
if isinstance(problem_descriptions, list) and problem_descriptions:
|
| 434 |
# print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
|
| 435 |
-
|
| 436 |
|
| 437 |
# Prepare the prompt
|
| 438 |
# problems_summary = "; \n".join(problem_descriptions) # Join all problem descriptions
|
|
@@ -531,17 +522,17 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
|
|
| 531 |
|
| 532 |
|
| 533 |
def nlp_pipeline(original_df):
|
| 534 |
-
|
| 535 |
|
| 536 |
# Data Preprocessing
|
| 537 |
processed_df = data_pre_processing(original_df) # merged_dataset
|
| 538 |
|
| 539 |
# Starting the Pipeline for Domain Extraction
|
| 540 |
-
|
| 541 |
# Apply the text_processing_for_domain function to the DataFrame
|
| 542 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
| 543 |
|
| 544 |
-
|
| 545 |
# processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
|
| 546 |
# Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
|
| 547 |
processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
|
|
@@ -549,13 +540,13 @@ def nlp_pipeline(original_df):
|
|
| 549 |
# Domain Clustering
|
| 550 |
try:
|
| 551 |
processed_df, optimal_n_clusters, problem_clusters = extract_problem_domains(processed_df)
|
| 552 |
-
|
| 553 |
except Exception as e:
|
| 554 |
-
|
| 555 |
-
|
| 556 |
|
| 557 |
|
| 558 |
-
|
| 559 |
|
| 560 |
# Apply the text_processing_for_location function to the DataFrame
|
| 561 |
processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
|
|
@@ -564,10 +555,10 @@ def nlp_pipeline(original_df):
|
|
| 564 |
# Location Clustering
|
| 565 |
try:
|
| 566 |
processed_df, optimal_n_clusters, location_clusters = extract_location_clusters(processed_df)
|
| 567 |
-
|
| 568 |
except Exception as e:
|
| 569 |
-
|
| 570 |
-
|
| 571 |
|
| 572 |
|
| 573 |
# Create cluster dataframes
|
|
@@ -585,8 +576,7 @@ def nlp_pipeline(original_df):
|
|
| 585 |
# print("\n problem_clusters_2: ", problem_clusters)
|
| 586 |
project_proposals = create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters)
|
| 587 |
|
| 588 |
-
|
| 589 |
-
print("NLP pipeline completed.")
|
| 590 |
return processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters
|
| 591 |
|
| 592 |
|
|
@@ -597,8 +587,15 @@ def nlp_pipeline(original_df):
|
|
| 597 |
|
| 598 |
|
| 599 |
console_messages = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
def process_excel(file):
|
| 601 |
-
|
| 602 |
# Ensure the file path is correct
|
| 603 |
file_path = file.name if hasattr(file, 'name') else file
|
| 604 |
# Read the Excel file
|
|
@@ -606,7 +603,7 @@ def process_excel(file):
|
|
| 606 |
|
| 607 |
try:
|
| 608 |
# Process the DataFrame
|
| 609 |
-
|
| 610 |
processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters = nlp_pipeline(df)
|
| 611 |
# processed_df, budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters = nlp_pipeline(df)
|
| 612 |
|
|
@@ -628,25 +625,25 @@ def process_excel(file):
|
|
| 628 |
# if isinstance(location_clusters, pd.DataFrame):
|
| 629 |
# location_clusters.to_excel(writer, sheet_name='Location_Clusters', index=False)
|
| 630 |
# else:
|
| 631 |
-
#
|
| 632 |
# pd.DataFrame(location_clusters).to_excel(writer, sheet_name='Location_Clusters', index=False)
|
| 633 |
|
| 634 |
# if isinstance(problem_clusters, pd.DataFrame):
|
| 635 |
# problem_clusters.to_excel(writer, sheet_name='Problem_Clusters', index=False)
|
| 636 |
# else:
|
| 637 |
-
#
|
| 638 |
# pd.DataFrame(problem_clusters).to_excel(writer, sheet_name='Problem_Clusters', index=False)
|
| 639 |
|
| 640 |
|
| 641 |
|
| 642 |
-
|
| 643 |
return output_filename, "\n".join(console_messages) # Return the processed DataFrame as Excel file
|
| 644 |
|
| 645 |
except Exception as e:
|
| 646 |
# return str(e) # Return the error message
|
| 647 |
# error_message = f"Error processing file: {str(e)}"
|
| 648 |
# print(error_message) # Log the error
|
| 649 |
-
|
| 650 |
# return error_message, "Santanu Banerjee" # Return the error message to the user
|
| 651 |
return None, "\n".join(console_messages)
|
| 652 |
|
|
|
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
def data_pre_processing(file_responses):
|
| 13 |
+
consoleMessage_and_Print("Starting data pre-processing...")
|
| 14 |
# Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
|
| 15 |
|
| 16 |
try: # Define the columns to be processed
|
|
|
|
| 54 |
initial_dataset_2['Financial_Weight'] = file_responses['Personal_TaxDirection_2_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
|
| 55 |
initial_dataset_3['Financial_Weight'] = file_responses['Personal_TaxDirection_3_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
|
| 56 |
|
| 57 |
+
# Removing useless rows # Drop rows where Problem_Description is NaN or an empty string
|
|
|
|
| 58 |
initial_dataset_1 = initial_dataset_1.dropna(subset=['Problem_Description'], axis=0)
|
| 59 |
initial_dataset_2 = initial_dataset_2.dropna(subset=['Problem_Description'], axis=0)
|
| 60 |
initial_dataset_3 = initial_dataset_3.dropna(subset=['Problem_Description'], axis=0)
|
|
|
|
| 64 |
initial_dataset_2['Problem_Description'] = initial_dataset_2['Problem_Description'].astype(str)
|
| 65 |
initial_dataset_3['Problem_Description'] = initial_dataset_3['Problem_Description'].astype(str)
|
| 66 |
|
| 67 |
+
# Merging the Datasets # Vertically concatenating (merging) the 3 DataFrames
|
|
|
|
| 68 |
merged_dataset = pd.concat([initial_dataset_1, initial_dataset_2, initial_dataset_3], ignore_index=True)
|
|
|
|
| 69 |
|
| 70 |
# Different return can be used to check the processing
|
| 71 |
+
consoleMessage_and_Print("Data pre-processing completed.")
|
|
|
|
| 72 |
return merged_dataset
|
| 73 |
|
| 74 |
except Exception as e:
|
| 75 |
+
consoleMessage_and_Print(f"Error during data pre-processing: {str(e)}")
|
|
|
|
| 76 |
return None
|
| 77 |
|
| 78 |
|
|
|
|
| 196 |
text_column='Processed_ProblemDescription_forDomainExtraction',
|
| 197 |
cluster_range=(6, 10),
|
| 198 |
top_words=7):
|
| 199 |
+
consoleMessage_and_Print("Extracting Problem Domains...")
|
| 200 |
|
| 201 |
# Sentence Transformers approach
|
| 202 |
model = SentenceTransformer('all-mpnet-base-v2')
|
|
|
|
| 227 |
df["Problem_Cluster"] = cluster_labels
|
| 228 |
df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 229 |
|
| 230 |
+
consoleMessage_and_Print("Problem Domain Extraction completed. Returning from Problem Domain Extraction function.")
|
|
|
|
| 231 |
return df, optimal_n_clusters, cluster_representations
|
| 232 |
|
| 233 |
|
|
|
|
| 276 |
def extract_location_clusters(df,
|
| 277 |
text_column1='Processed_LocationText_forClustering', # Extracted through NLP
|
| 278 |
text_column2='Geographical_Location', # User Input
|
| 279 |
+
cluster_range=(2, 5),
|
| 280 |
top_words=3):
|
| 281 |
# Combine the two text columns
|
| 282 |
text_column = "Combined_Location_Text"
|
| 283 |
df[text_column] = df[text_column1] + ' ' + df[text_column2]
|
| 284 |
|
| 285 |
+
consoleMessage_and_Print("Extracting Location Clusters...")
|
| 286 |
|
| 287 |
# Sentence Transformers approach for embeddings
|
| 288 |
model = SentenceTransformer('all-mpnet-base-v2')
|
|
|
|
| 314 |
df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 315 |
|
| 316 |
df = df.drop(text_column, axis=1)
|
| 317 |
+
consoleMessage_and_Print("Location Clustering completed.")
|
| 318 |
return df, optimal_n_clusters, cluster_representations
|
| 319 |
|
| 320 |
|
|
|
|
| 402 |
|
| 403 |
|
| 404 |
def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
|
| 405 |
+
consoleMessage_and_Print("\n Starting function: create_project_proposals")
|
|
|
|
| 406 |
proposals = {}
|
| 407 |
|
| 408 |
for loc in budget_cluster_df.index:
|
| 409 |
+
consoleMessage_and_Print(f"\n loc: {loc}")
|
|
|
|
| 410 |
|
| 411 |
for prob in budget_cluster_df.columns:
|
| 412 |
+
consoleMessage_and_Print(f"\n prob: {prob}")
|
|
|
|
| 413 |
|
| 414 |
location = ", ".join([item.strip() for item in location_clusters[loc] if item]) # Clean and join
|
| 415 |
problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item]) # Clean and join
|
|
|
|
| 423 |
# Check if problem_descriptions is valid (not NaN and not an empty list)
|
| 424 |
if isinstance(problem_descriptions, list) and problem_descriptions:
|
| 425 |
# print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
|
| 426 |
+
consoleMessage_and_Print(f"Generating PP")
|
| 427 |
|
| 428 |
# Prepare the prompt
|
| 429 |
# problems_summary = "; \n".join(problem_descriptions) # Join all problem descriptions
|
|
|
|
| 522 |
|
| 523 |
|
| 524 |
def nlp_pipeline(original_df):
|
| 525 |
+
consoleMessage_and_Print("Starting NLP pipeline...")
|
| 526 |
|
| 527 |
# Data Preprocessing
|
| 528 |
processed_df = data_pre_processing(original_df) # merged_dataset
|
| 529 |
|
| 530 |
# Starting the Pipeline for Domain Extraction
|
| 531 |
+
consoleMessage_and_Print("Executing Text processing function for Domain identification")
|
| 532 |
# Apply the text_processing_for_domain function to the DataFrame
|
| 533 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
| 534 |
|
| 535 |
+
consoleMessage_and_Print("Removing entries which could not be allocated to any Problem Domain")
|
| 536 |
# processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
|
| 537 |
# Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
|
| 538 |
processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
|
|
|
|
| 540 |
# Domain Clustering
|
| 541 |
try:
|
| 542 |
processed_df, optimal_n_clusters, problem_clusters = extract_problem_domains(processed_df)
|
| 543 |
+
consoleMessage_and_Print(f"Optimal clusters for Domain extraction: {optimal_n_clusters}")
|
| 544 |
except Exception as e:
|
| 545 |
+
consoleMessage_and_Print(f"Error in extract_problem_domains: {str(e)}")
|
| 546 |
+
consoleMessage_and_Print("NLP pipeline for Problem Domain extraction completed.")
|
| 547 |
|
| 548 |
|
| 549 |
+
consoleMessage_and_Print("Starting NLP pipeline for Location extraction with text processing.")
|
| 550 |
|
| 551 |
# Apply the text_processing_for_location function to the DataFrame
|
| 552 |
processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
|
|
|
|
| 555 |
# Location Clustering
|
| 556 |
try:
|
| 557 |
processed_df, optimal_n_clusters, location_clusters = extract_location_clusters(processed_df)
|
| 558 |
+
consoleMessage_and_Print(f"Optimal clusters for Location extraction: {optimal_n_clusters}")
|
| 559 |
except Exception as e:
|
| 560 |
+
consoleMessage_and_Print(f"Error in extract_location_clusters: {str(e)}")
|
| 561 |
+
consoleMessage_and_Print("NLP pipeline for location extraction completed.")
|
| 562 |
|
| 563 |
|
| 564 |
# Create cluster dataframes
|
|
|
|
| 576 |
# print("\n problem_clusters_2: ", problem_clusters)
|
| 577 |
project_proposals = create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters)
|
| 578 |
|
| 579 |
+
consoleMessage_and_Print("NLP pipeline completed.")
|
|
|
|
| 580 |
return processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters
|
| 581 |
|
| 582 |
|
|
|
|
| 587 |
|
| 588 |
|
| 589 |
console_messages = []
|
| 590 |
+
def consoleMessage_and_Print(some_text=""):
    """Record a progress message and echo it to standard output.

    The message is appended to the ``console_messages`` list defined just
    above this function, and is also printed. ``process_excel`` later joins
    that list with newlines and returns it alongside its result.

    Args:
        some_text: The message to record and print. Defaults to an empty
            string, which records and prints a blank line.
    """
    # Keep the message for the caller-facing log.
    console_messages.append(some_text)
    # Mirror it to stdout as well.
    print(some_text)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
|
| 597 |
def process_excel(file):
|
| 598 |
+
consoleMessage_and_Print("Processing starts. Reading the uploaded Excel file...")
|
| 599 |
# Ensure the file path is correct
|
| 600 |
file_path = file.name if hasattr(file, 'name') else file
|
| 601 |
# Read the Excel file
|
|
|
|
| 603 |
|
| 604 |
try:
|
| 605 |
# Process the DataFrame
|
| 606 |
+
consoleMessage_and_Print("Processing the DataFrame...")
|
| 607 |
processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters = nlp_pipeline(df)
|
| 608 |
# processed_df, budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters = nlp_pipeline(df)
|
| 609 |
|
|
|
|
| 625 |
# if isinstance(location_clusters, pd.DataFrame):
|
| 626 |
# location_clusters.to_excel(writer, sheet_name='Location_Clusters', index=False)
|
| 627 |
# else:
|
| 628 |
+
# consoleMessage_and_Print("Converting Location Clusters to df")
|
| 629 |
# pd.DataFrame(location_clusters).to_excel(writer, sheet_name='Location_Clusters', index=False)
|
| 630 |
|
| 631 |
# if isinstance(problem_clusters, pd.DataFrame):
|
| 632 |
# problem_clusters.to_excel(writer, sheet_name='Problem_Clusters', index=False)
|
| 633 |
# else:
|
| 634 |
+
# consoleMessage_and_Print("Converting Problem Clusters to df")
|
| 635 |
# pd.DataFrame(problem_clusters).to_excel(writer, sheet_name='Problem_Clusters', index=False)
|
| 636 |
|
| 637 |
|
| 638 |
|
| 639 |
+
consoleMessage_and_Print("Processing completed. Ready for download.")
|
| 640 |
return output_filename, "\n".join(console_messages) # Return the processed DataFrame as Excel file
|
| 641 |
|
| 642 |
except Exception as e:
|
| 643 |
# return str(e) # Return the error message
|
| 644 |
# error_message = f"Error processing file: {str(e)}"
|
| 645 |
# print(error_message) # Log the error
|
| 646 |
+
consoleMessage_and_Print(f"Error during processing: {str(e)}")
|
| 647 |
# return error_message, "Santanu Banerjee" # Return the error message to the user
|
| 648 |
return None, "\n".join(console_messages)
|
| 649 |
|