Update app.py
app.py CHANGED
@@ -154,7 +154,6 @@ nltk.download('averaged_perceptron_tagger')
 
 
 def text_processing_for_domain(text):
-    console_messages.append("Entering Text processing function for Domain identification")
 
     # Text Cleaning
     text = re.sub(r'[^\w\s]', '', text)
@@ -179,14 +178,17 @@ def text_processing_for_domain(text):
     # Lemmatize tokens using SpaCy
     doc = nlp(' '.join(tokens))
     lemmatized_text = ' '.join([token.lemma_ for token in doc])
 
-    # Apply Hugging Face Transformers
-    inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    console_messages.append("Exiting Text processing function for Domain identification")
-    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+    return lemmatized_text # Return the cleaned and lemmatized text
+
+    # # Apply Hugging Face Transformers
+    # inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
+    # with torch.no_grad():
+    #     outputs = model(**inputs)
+
+    # return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
 
 
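Taken together, the two hunks above change text_processing_for_domain so it returns the cleaned, lemmatized string instead of a mean-pooled transformer embedding; the tokenizer/model block and the old return are now commented out. Below is a minimal sketch of how the whole function plausibly reads after this commit. The tokenization and stop-word step between the cleaning and lemmatization hunks is not shown in the diff, so that part, the NLTK downloads, and the spaCy model name (en_core_web_sm) are assumptions for illustration only.

# Sketch of the revised function; the middle section (tokenization / stop-word
# removal) is not visible in this commit and is assumed here.
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')        # assumed resources, matching the NLTK usage in app.py
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")  # assumed spaCy model name

def text_processing_for_domain(text):
    # Text Cleaning: strip everything except word characters and whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Assumed step (hidden between the two hunks): tokenize and drop stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in word_tokenize(text.lower()) if t not in stop_words]

    # Lemmatize tokens using spaCy
    doc = nlp(' '.join(tokens))
    lemmatized_text = ' '.join(token.lemma_ for token in doc)

    # After this commit the function returns plain text; the Hugging Face
    # transformer embedding step is left commented out in app.py.
    return lemmatized_text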
@@ -237,10 +239,10 @@ import numpy as np
 
 
 def extract_problem_domains(df,
-
-                            text_column='Problem_Description',
+                            text_column='Processed_ProblemDescription_forDomainExtraction',
+                            # text_column='Problem_Description',
                             cluster_range=(10, 50),
-                            top_words=
+                            top_words=30,
                             method='sentence_transformers'
                             # method='tfidf_kmeans'
                             ):
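The hunk above only touches the signature of extract_problem_domains: it now reads the pre-processed column written by text_processing_for_domain, fixes top_words=30, and keeps method='sentence_transformers' as the default (with 'tfidf_kmeans' left as a commented alternative). The body is not part of this commit, so the following is only a sketch of what a sentence-transformers plus KMeans implementation of that signature could look like; the embedding model name, the silhouette-based choice of k over cluster_range, the TF-IDF cluster keywords, and the 'Problem_Cluster' column name are all assumptions, and it presumes the DataFrame has comfortably more rows than cluster_range[1].

# Hypothetical body matching the new signature; only the 'sentence_transformers'
# path is sketched, and every concrete choice below is an assumption.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_problem_domains(df,
                            text_column='Processed_ProblemDescription_forDomainExtraction',
                            cluster_range=(10, 50),
                            top_words=30,
                            method='sentence_transformers'):
    texts = df[text_column].fillna('').tolist()

    # Embed the already-cleaned text column
    encoder = SentenceTransformer('all-MiniLM-L6-v2')  # assumed model
    embeddings = encoder.encode(texts)

    # Choose the cluster count within cluster_range by silhouette score
    best_k, best_score, best_labels = None, -1.0, None
    for k in range(cluster_range[0], cluster_range[1] + 1):
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score, best_labels = k, score, labels

    # Describe each cluster by its top TF-IDF terms, up to `top_words` of them
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf = vectorizer.fit_transform(texts)
    vocab = np.array(vectorizer.get_feature_names_out())
    cluster_terms = {}
    for c in range(best_k):
        mean_tfidf = tfidf[best_labels == c].mean(axis=0).A1
        cluster_terms[c] = vocab[mean_tfidf.argsort()[::-1][:top_words]].tolist()

    df = df.copy()
    df['Problem_Cluster'] = best_labels  # assumed output column name
    return df, cluster_terms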
@@ -373,8 +375,10 @@ def nlp_pipeline(original_df):
 
 
     # Starting the Pipeline for Domain Extraction
+    console_messages.append("Entering Text processing function for Domain identification")
     # Apply the text_processing_for_domain function to the DataFrame
     processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
+    console_messages.append("Exiting Text processing function for Domain identification")
 
 
     # Domain Clustering
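The last hunk moves the "Entering/Exiting" log lines out of the helper and into nlp_pipeline, bracketing the per-row apply. For reference, the same step in isolation looks roughly like the sketch below; the tiny synthetic DataFrame is made up, and console_messages is assumed to be a module-level list that the Space later surfaces as a progress log.

# Illustration of the pipeline step added in this commit, using
# text_processing_for_domain from app.py (sketched earlier).
import pandas as pd

console_messages = []  # assumed module-level log buffer

processed_df = pd.DataFrame({
    'Problem_Description': [
        "Crops are failing due to irregular rainfall!",
        "Local clinics lack basic diagnostic equipment.",
    ]
})

console_messages.append("Entering Text processing function for Domain identification")
processed_df['Processed_ProblemDescription_forDomainExtraction'] = (
    processed_df['Problem_Description'].apply(text_processing_for_domain)
)
console_messages.append("Exiting Text processing function for Domain identification")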