Update app.py
app.py CHANGED
@@ -154,7 +154,6 @@ nltk.download('averaged_perceptron_tagger')
 
 
 def text_processing_for_domain(text):
-    console_messages.append("Entering Text processing function for Domain identification")
 
     # Text Cleaning
     text = re.sub(r'[^\w\s]', '', text)
@@ -179,14 +178,17 @@ def text_processing_for_domain(text):
     # Lemmatize tokens using SpaCy
     doc = nlp(' '.join(tokens))
     lemmatized_text = ' '.join([token.lemma_ for token in doc])
 
-    # Apply Hugging Face Transformers
-    inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    console_messages.append("Exiting Text processing function for Domain identification")
-    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+    return lemmatized_text # Return the cleaned and lemmatized text
+
+    # # Apply Hugging Face Transformers
+    # inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
+    # with torch.no_grad():
+    #     outputs = model(**inputs)
+
+    # return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
 
 
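Taken together, the two hunks above change text_processing_for_domain so it returns the cleaned, lemmatized string instead of a mean-pooled transformer embedding; the tokenizer/model block and the old return are now commented out. Below is a minimal sketch of how the whole function plausibly reads after this commit. The tokenization and stop-word step between the cleaning and lemmatization hunks is not shown in the diff, so that part, the NLTK downloads, and the spaCy model name (en_core_web_sm) are assumptions for illustration only.

# Sketch of the revised function; the middle section (tokenization / stop-word
# removal) is not visible in this commit and is assumed here.
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')        # assumed resources, matching the NLTK usage in app.py
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")  # assumed spaCy model name

def text_processing_for_domain(text):
    # Text Cleaning: strip everything except word characters and whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Assumed step (hidden between the two hunks): tokenize and drop stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in word_tokenize(text.lower()) if t not in stop_words]

    # Lemmatize tokens using spaCy
    doc = nlp(' '.join(tokens))
    lemmatized_text = ' '.join(token.lemma_ for token in doc)

    # After this commit the function returns plain text; the Hugging Face
    # transformer embedding step is left commented out in app.py.
    return lemmatized_text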
@@ -237,10 +239,10 @@ import numpy as np
 
 
 def extract_problem_domains(df,
-
-                            text_column='Problem_Description',
+                            text_column='Processed_ProblemDescription_forDomainExtraction',
+                            # text_column='Problem_Description',
                             cluster_range=(10, 50),
-                            top_words=
+                            top_words=30,
                             method='sentence_transformers'
                             # method='tfidf_kmeans'
                             ):
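The hunk above only touches the signature of extract_problem_domains: it now reads the pre-processed column written by text_processing_for_domain, fixes top_words=30, and keeps method='sentence_transformers' as the default (with 'tfidf_kmeans' left as a commented alternative). The body is not part of this commit, so the following is only a sketch of what a sentence-transformers plus KMeans implementation of that signature could look like; the embedding model name, the silhouette-based choice of k over cluster_range, the TF-IDF cluster keywords, and the 'Problem_Cluster' column name are all assumptions, and it presumes the DataFrame has comfortably more rows than cluster_range[1].

# Hypothetical body matching the new signature; only the 'sentence_transformers'
# path is sketched, and every concrete choice below is an assumption.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_problem_domains(df,
                            text_column='Processed_ProblemDescription_forDomainExtraction',
                            cluster_range=(10, 50),
                            top_words=30,
                            method='sentence_transformers'):
    texts = df[text_column].fillna('').tolist()

    # Embed the already-cleaned text column
    encoder = SentenceTransformer('all-MiniLM-L6-v2')  # assumed model
    embeddings = encoder.encode(texts)

    # Choose the cluster count within cluster_range by silhouette score
    best_k, best_score, best_labels = None, -1.0, None
    for k in range(cluster_range[0], cluster_range[1] + 1):
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score, best_labels = k, score, labels

    # Describe each cluster by its top TF-IDF terms, up to `top_words` of them
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf = vectorizer.fit_transform(texts)
    vocab = np.array(vectorizer.get_feature_names_out())
    cluster_terms = {}
    for c in range(best_k):
        mean_tfidf = tfidf[best_labels == c].mean(axis=0).A1
        cluster_terms[c] = vocab[mean_tfidf.argsort()[::-1][:top_words]].tolist()

    df = df.copy()
    df['Problem_Cluster'] = best_labels  # assumed output column name
    return df, cluster_terms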
@@ -373,8 +375,10 @@ def nlp_pipeline(original_df):
 
 
     # Starting the Pipeline for Domain Extraction
+    console_messages.append("Entering Text processing function for Domain identification")
     # Apply the text_processing_for_domain function to the DataFrame
     processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
+    console_messages.append("Exiting Text processing function for Domain identification")
 
 
     # Domain Clustering
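The last hunk moves the "Entering/Exiting" log lines out of the helper and into nlp_pipeline, bracketing the per-row apply. For reference, the same step in isolation looks roughly like the sketch below; the tiny synthetic DataFrame is made up, and console_messages is assumed to be a module-level list that the Space later surfaces as a progress log.

# Illustration of the pipeline step added in this commit, using
# text_processing_for_domain from app.py (sketched earlier).
import pandas as pd

console_messages = []  # assumed module-level log buffer

processed_df = pd.DataFrame({
    'Problem_Description': [
        "Crops are failing due to irregular rainfall!",
        "Local clinics lack basic diagnostic equipment.",
    ]
})

console_messages.append("Entering Text processing function for Domain identification")
processed_df['Processed_ProblemDescription_forDomainExtraction'] = (
    processed_df['Problem_Description'].apply(text_processing_for_domain)
)
console_messages.append("Exiting Text processing function for Domain identification")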