Redaction_PDF

Sleeping

edithram23 committed on Jul 7, 2024

Commit

09b20e0

verified ·

1 Parent(s): 9432ed7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -178,8 +178,7 @@ if uploaded_file is not None:
     if pdf_document:
         redacted_text = []
         for pg in pdf_document:
-            text = pg.get_text('text')
-            st.text_area(pg.get_text())
             sentences = sentence_tokenize(text)
             for sent in sentences:
               entities,words_out = extract_entities(sent)
@@ -189,13 +188,14 @@ if uploaded_file is not None:
                 new+=w.split('\n')
               words_out+=bert_words
               words_out = [i for i in new if len(i)>2]
               # print(words_out)
               words_out=sorted(words_out, key=len,reverse=True)
               print(words_out)
               for i in words_out:
                 redact_text(pg,i)
         output_pdf = "output_redacted.pdf"
         pdf_document.save(output_pdf)

     if pdf_document:
         redacted_text = []
         for pg in pdf_document:
+            text = pg.get_text()
             sentences = sentence_tokenize(text)
             for sent in sentences:
               entities,words_out = extract_entities(sent)
                 new+=w.split('\n')
               words_out+=bert_words
               words_out = [i for i in new if len(i)>2]
               # print(words_out)
               words_out=sorted(words_out, key=len,reverse=True)
+              redact_text+=words_out
               print(words_out)
               for i in words_out:
                 redact_text(pg,i)
+        st.text_area(pg.get_text())
         output_pdf = "output_redacted.pdf"
         pdf_document.save(output_pdf)