Redaction_PDF

Sleeping

edithram23 committed on Jul 8, 2024

Commit

a9299cc

verified ·

1 Parent(s): 2d304b6

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -42,7 +42,7 @@ address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[add
 # Add the custom address recognizer to the analyzer
 analyzer.registry.add_recognizer(address_recognizer)
-analyzer.get_recognizers
 # Define a function to extract entities
@@ -181,18 +181,18 @@ if uploaded_file is not None:
             text = pg.get_text()
             sentences = sentence_tokenize(text)
             for sent in sentences:
-              x = mask_generation(sent)
-              sent_n_q_c=[]
-              sent_n = list(set(sent.lower().replace('.',' ').split("\n")))
-              for i in sent_n:
-                  for j in i.split(" "):
-                      sent_n_q_c+=j.split(',')
-              x_q = x.lower().replace('.',' ').split(' ')
-              e=[]
-              for i in x_q:
-                 e+=i.split(',')
-              t5_words=set(sent_n_q_c).difference(set(e))
               entities,words_out = extract_entities(sent)
               # print("\nwords_out:",words_out)
               # print("\nT5",t5_words)
@@ -202,7 +202,7 @@ if uploaded_file is not None:
               new=[]
               for w in words_out:
                 new+=w.split('\n')
-              words_out+=t5_words
               new+=bert_words
               words_out = [i for i in new if len(i)>3]
               # print("\nfinal:",words_out)

 # Add the custom address recognizer to the analyzer
 analyzer.registry.add_recognizer(address_recognizer)
+# analyzer.get_recognizers
 # Define a function to extract entities
             text = pg.get_text()
             sentences = sentence_tokenize(text)
             for sent in sentences:
+              # x = mask_generation(sent)
+              # sent_n_q_c=[]
+              # sent_n = list(set(sent.lower().replace('.',' ').split("\n")))
+              # for i in sent_n:
+              #     for j in i.split(" "):
+              #         sent_n_q_c+=j.split(',')
+              # x_q = x.lower().replace('.',' ').split(' ')
+              # e=[]
+              # for i in x_q:
+              #    e+=i.split(',')
+              # t5_words=set(sent_n_q_c).difference(set(e))
               entities,words_out = extract_entities(sent)
               # print("\nwords_out:",words_out)
               # print("\nT5",t5_words)
               new=[]
               for w in words_out:
                 new+=w.split('\n')
+              # words_out+=t5_words
               new+=bert_words
               words_out = [i for i in new if len(i)>3]
               # print("\nfinal:",words_out)