Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -159,16 +159,17 @@ from transformers import pipeline
|
|
| 159 |
# Load a summarization model
|
| 160 |
summarizer = pipeline("summarization")
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
def text_processing_for_domain(unsummarized_text):
|
| 165 |
try:
|
| 166 |
# Summarization
|
| 167 |
-
|
|
|
|
| 168 |
except Exception as e:
|
| 169 |
print(f"Summarization failed: {e}")
|
| 170 |
-
|
|
|
|
| 171 |
|
|
|
|
| 172 |
# Text Cleaning
|
| 173 |
text = re.sub(r'[^\w\s]', '', text)
|
| 174 |
text = re.sub(r'\d+', '', text)
|
|
@@ -194,13 +195,31 @@ def text_processing_for_domain(unsummarized_text):
|
|
| 194 |
lemmatized_text = ' '.join([token.lemma_ for token in doc])
|
| 195 |
|
| 196 |
return lemmatized_text # Return the cleaned and lemmatized text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
#
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
|
| 206 |
|
|
@@ -393,8 +412,10 @@ def nlp_pipeline(original_df):
|
|
| 393 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
| 394 |
|
| 395 |
console_messages.append("Removing entries which could not be allocated to any Problem Domain")
|
| 396 |
-
processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
|
| 397 |
-
|
|
|
|
|
|
|
| 398 |
|
| 399 |
|
| 400 |
# Domain Clustering
|
|
|
|
| 159 |
# Load a summarization model
|
| 160 |
summarizer = pipeline("summarization")
|
| 161 |
|
| 162 |
+
def Summarized_text(passed_text):
    """Summarize *passed_text* with the module-level `summarizer` pipeline.

    Best-effort: if the pipeline raises for any reason, the input text is
    returned unchanged so downstream processing can continue.
    """
    try:
        # Run the HF summarization pipeline; take the single generated summary.
        pipeline_output = summarizer(
            passed_text, max_length=70, min_length=30, do_sample=False
        )
        return pipeline_output[0]['summary_text']
    except Exception as e:
        print(f"Summarization failed: {e}")
        return passed_text
###### Will uncomment Summarization during final deployment... as it takes a lot of time
|
| 171 |
|
| 172 |
+
def Lemmatize_text(text):
|
| 173 |
# Text Cleaning
|
| 174 |
text = re.sub(r'[^\w\s]', '', text)
|
| 175 |
text = re.sub(r'\d+', '', text)
|
|
|
|
| 195 |
lemmatized_text = ' '.join([token.lemma_ for token in doc])
|
| 196 |
|
| 197 |
return lemmatized_text # Return the cleaned and lemmatized text
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
from random import random
def text_processing_for_domain(text):
    """Produce the processed text used for problem-domain extraction.

    Summarization is currently disabled (it is too slow for interactive
    use), so in practice this returns the cleaned/lemmatized text, or a
    fail-safe phrase when cleaning yields an empty string.
    """
    # Summarization is switched off for now; re-enable for final deployment.
    summarized_text = ""
    # summarized_text = Summarized_text(text)

    # Clean and lemmatize the original text.
    lemmatized_text = Lemmatize_text(text)

    if summarized_text and lemmatized_text:
        # Fixed, deterministic order. The previous version randomized the
        # order of the two parts, which made pipeline output irreproducible
        # between runs; this branch is unreachable while summarization is
        # disabled, so the change is not observable today.
        return summarized_text + " " + lemmatized_text
    if summarized_text:
        return summarized_text
    if lemmatized_text:
        return lemmatized_text
    return "Sustainability and Longevity" # Default FailSafe
|
| 223 |
|
| 224 |
|
| 225 |
|
|
|
|
| 412 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
| 413 |
|
| 414 |
console_messages.append("Removing entries which could not be allocated to any Problem Domain")
|
| 415 |
+
# processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
|
| 416 |
+
# Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
|
| 417 |
+
processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
|
| 418 |
+
|
| 419 |
|
| 420 |
|
| 421 |
# Domain Clustering
|