Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
-
def data_pre_processing(file_responses
|
| 5 |
console_messages.append("Starting data pre-processing...")
|
| 6 |
# Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
|
| 7 |
|
|
@@ -65,12 +65,12 @@ def data_pre_processing(file_responses, console_messages):
|
|
| 65 |
# A different return value can be used to check the processing
|
| 66 |
console_messages.append("Data pre-processing completed.")
|
| 67 |
# return file_responses
|
| 68 |
-
return merged_dataset
|
| 69 |
|
| 70 |
except Exception as e:
|
| 71 |
console_messages.append(f"Error during data pre-processing: {str(e)}")
|
| 72 |
# return str(e), console_messages
|
| 73 |
-
return None
|
| 74 |
|
| 75 |
|
| 76 |
|
|
@@ -146,6 +146,8 @@ nltk.download('averaged_perceptron_tagger')
|
|
| 146 |
|
| 147 |
|
| 148 |
def text_processing_for_domain(text):
|
|
|
|
|
|
|
| 149 |
# Text Cleaning
|
| 150 |
text = re.sub(r'[^\w\s]', '', text)
|
| 151 |
text = re.sub(r'\d+', '', text)
|
|
@@ -174,7 +176,8 @@ def text_processing_for_domain(text):
|
|
| 174 |
inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
|
| 175 |
with torch.no_grad():
|
| 176 |
outputs = model(**inputs)
|
| 177 |
-
|
|
|
|
| 178 |
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
| 179 |
|
| 180 |
|
|
@@ -226,7 +229,6 @@ import numpy as np
|
|
| 226 |
|
| 227 |
|
| 228 |
def extract_problem_domains(df,
|
| 229 |
-
console_messages,
|
| 230 |
text_column='Problem_Description',
|
| 231 |
cluster_range=(10, 50),
|
| 232 |
top_words=17,
|
|
@@ -354,11 +356,11 @@ def extract_problem_domains(df,
|
|
| 354 |
|
| 355 |
|
| 356 |
# def nlp_pipeline(original_df):
|
| 357 |
-
def nlp_pipeline(original_df
|
| 358 |
console_messages.append("Starting NLP pipeline...")
|
| 359 |
|
| 360 |
# Data Preprocessing
|
| 361 |
-
processed_df
|
| 362 |
|
| 363 |
|
| 364 |
# Starting the Pipeline for Domain Extraction
|
|
@@ -368,18 +370,18 @@ def nlp_pipeline(original_df, console_messages):
|
|
| 368 |
|
| 369 |
# Domain Clustering
|
| 370 |
try:
|
| 371 |
-
domain_df, optimal_n_clusters = extract_problem_domains(processed_df
|
| 372 |
# print(f"Optimal clusters: {optimal_clusters}")
|
| 373 |
# print(result_df.head())
|
| 374 |
# console_messages.append(f"Optimal clusters: {optimal_n_clusters}")
|
| 375 |
|
| 376 |
console_messages.append("NLP pipeline completed.")
|
| 377 |
-
return domain_df
|
| 378 |
except Exception as e:
|
| 379 |
# print(f"Error in extract_problem_domains: {e}")
|
| 380 |
console_messages.append(f"Error in extract_problem_domains: {str(e)}")
|
| 381 |
-
return processed_df
|
| 382 |
-
# return domain_df
|
| 383 |
|
| 384 |
|
| 385 |
# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
|
|
@@ -388,9 +390,8 @@ def nlp_pipeline(original_df, console_messages):
|
|
| 388 |
|
| 389 |
|
| 390 |
|
| 391 |
-
|
| 392 |
def process_excel(file):
|
| 393 |
-
console_messages = []
|
| 394 |
console_messages.append("Processing starts. Reading the uploaded Excel file...")
|
| 395 |
# Ensure the file path is correct
|
| 396 |
file_path = file.name if hasattr(file, 'name') else file
|
|
@@ -400,7 +401,7 @@ def process_excel(file):
|
|
| 400 |
try:
|
| 401 |
# Process the DataFrame
|
| 402 |
console_messages.append("Processing the DataFrame...")
|
| 403 |
-
result_df
|
| 404 |
|
| 405 |
# output_file = "Output_ProjectProposals.xlsx"
|
| 406 |
output_file = "Output_Proposals.xlsx"
|
|
@@ -445,7 +446,7 @@ interface = gr.Interface(
|
|
| 445 |
|
| 446 |
outputs=[
|
| 447 |
gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"), # File download output
|
| 448 |
-
gr.Textbox(label="Console Messages", lines=
|
| 449 |
],
|
| 450 |
|
| 451 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
+
def data_pre_processing(file_responses):
|
| 5 |
console_messages.append("Starting data pre-processing...")
|
| 6 |
# Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
|
| 7 |
|
|
|
|
| 65 |
# A different return value can be used to check the processing
|
| 66 |
console_messages.append("Data pre-processing completed.")
|
| 67 |
# return file_responses
|
| 68 |
+
return merged_dataset
|
| 69 |
|
| 70 |
except Exception as e:
|
| 71 |
console_messages.append(f"Error during data pre-processing: {str(e)}")
|
| 72 |
# return str(e), console_messages
|
| 73 |
+
return None
|
| 74 |
|
| 75 |
|
| 76 |
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
def text_processing_for_domain(text):
|
| 149 |
+
console_messages.append("Entering Text processing function for Domain identification")
|
| 150 |
+
|
| 151 |
# Text Cleaning
|
| 152 |
text = re.sub(r'[^\w\s]', '', text)
|
| 153 |
text = re.sub(r'\d+', '', text)
|
|
|
|
| 176 |
inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
|
| 177 |
with torch.no_grad():
|
| 178 |
outputs = model(**inputs)
|
| 179 |
+
|
| 180 |
+
console_messages.append("Exiting Text processing function for Domain identification")
|
| 181 |
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
| 182 |
|
| 183 |
|
|
|
|
| 229 |
|
| 230 |
|
| 231 |
def extract_problem_domains(df,
|
|
|
|
| 232 |
text_column='Problem_Description',
|
| 233 |
cluster_range=(10, 50),
|
| 234 |
top_words=17,
|
|
|
|
| 356 |
|
| 357 |
|
| 358 |
# def nlp_pipeline(original_df):
|
| 359 |
+
def nlp_pipeline(original_df):
|
| 360 |
console_messages.append("Starting NLP pipeline...")
|
| 361 |
|
| 362 |
# Data Preprocessing
|
| 363 |
+
processed_df = data_pre_processing(original_df) # merged_dataset
|
| 364 |
|
| 365 |
|
| 366 |
# Starting the Pipeline for Domain Extraction
|
|
|
|
| 370 |
|
| 371 |
# Domain Clustering
|
| 372 |
try:
|
| 373 |
+
domain_df, optimal_n_clusters = extract_problem_domains(processed_df)
|
| 374 |
# print(f"Optimal clusters: {optimal_clusters}")
|
| 375 |
# print(result_df.head())
|
| 376 |
# console_messages.append(f"Optimal clusters: {optimal_n_clusters}")
|
| 377 |
|
| 378 |
console_messages.append("NLP pipeline completed.")
|
| 379 |
+
return domain_df
|
| 380 |
except Exception as e:
|
| 381 |
# print(f"Error in extract_problem_domains: {e}")
|
| 382 |
console_messages.append(f"Error in extract_problem_domains: {str(e)}")
|
| 383 |
+
return processed_df
|
| 384 |
+
# return domain_df
|
| 385 |
|
| 386 |
|
| 387 |
# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
|
|
|
|
| 390 |
|
| 391 |
|
| 392 |
|
| 393 |
+
console_messages = []
|
| 394 |
def process_excel(file):
|
|
|
|
| 395 |
console_messages.append("Processing starts. Reading the uploaded Excel file...")
|
| 396 |
# Ensure the file path is correct
|
| 397 |
file_path = file.name if hasattr(file, 'name') else file
|
|
|
|
| 401 |
try:
|
| 402 |
# Process the DataFrame
|
| 403 |
console_messages.append("Processing the DataFrame...")
|
| 404 |
+
result_df = nlp_pipeline(df)
|
| 405 |
|
| 406 |
# output_file = "Output_ProjectProposals.xlsx"
|
| 407 |
output_file = "Output_Proposals.xlsx"
|
|
|
|
| 446 |
|
| 447 |
outputs=[
|
| 448 |
gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"), # File download output
|
| 449 |
+
gr.Textbox(label="Console Messages", lines=100, interactive=False) # Console messages output
|
| 450 |
],
|
| 451 |
|
| 452 |
|