Redaction_PDF

Sleeping

App Files Files Community

edithram23 committed on Jul 6, 2024

Commit

67ff28f

verified ·

1 Parent(s): ca0d553

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -82

app.py CHANGED Viewed

@@ -28,7 +28,55 @@ model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
 #     pattern = r'\[.*?\]'
 #     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
 #     return redacted_text
 def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
     if len(text) < 90:
         text = text + '.'
@@ -42,55 +90,6 @@ def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
     return redacted_text
-def find_surrounding_words(text, target="[redacted]"):
-    """Find every occurrence of ``target`` in ``text`` together with the tokens around it.
-
-    Args:
-        text: String to scan.
-        target: Literal marker to look for (escaped before being put in the regex).
-
-    Returns:
-        A list with one dict per match, holding "before_word", "after_word",
-        "before_index" and "after_index". A side is reported as None when no
-        neighbouring token was captured, or when the match touches that end
-        of ``text``.
-    """
-    # Optional token, optional whitespace, the escaped target, optional whitespace, optional token.
-    pattern = re.compile(r'([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?\s*' + re.escape(target) + r'\s*([A-Za-z0-9_@#\$%\^&*\(\)\[\]\{\}\.\,]+)?')
-    matches = pattern.finditer(text)
-    results = []
-    for match in matches:
-        before, after = match.group(1), match.group(2)
-        if before:
-            # The captured token may contain commas; keep only its first non-blank piece.
-            before_parts = before.split(',')
-            before_parts = [item for item in before_parts if item.strip()]
-            # Both branches record the same index; they differ only in stripping the word.
-            # NOTE(review): a token made only of commas leaves before_parts empty,
-            # so before_parts[0] in the else branch would raise IndexError.
-            if len(before_parts) > 1:
-                before_word = before_parts[0].strip()
-                before_index = match.start(1)
-            else:
-                before_word = before_parts[0]
-                before_index = match.start(1)
-        else:
-            before_word = None
-            before_index = None
-        if after:
-            # Same handling as the "before" token, applied to the token after the target.
-            after_parts = after.split(',')
-            after_parts = [item for item in after_parts if item.strip()]
-            if len(after_parts) > 1:
-                after_word = after_parts[0].strip()
-                after_index = match.start(2)
-            else:
-                after_word = after_parts[0]
-                after_index = match.start(2)
-        else:
-            after_word = None
-            after_index = None
-        # A match that begins at offset 0 is reported without a "before" side.
-        if match.start() == 0:
-            before_word = None
-            before_index = None
-        # A match that ends at the end of the text is reported without an "after" side.
-        if match.end() == len(text):
-            after_word = None
-            after_index = None
-        results.append({
-            "before_word": before_word,
-            "after_word": after_word,
-            "before_index": before_index,
-            "after_index": after_index
-        })
-    return results
 def redact_text(page, text):
     text_instances = page.search_for(text)
     for inst in text_instances:
@@ -131,38 +130,27 @@ if uploaded_file is not None:
     file_contents, pdf_document = process_file(uploaded_file)
     if pdf_document:
         redacted_text = []
-        # First pass: collect the substrings to redact from every page.
-        for page in pdf_document:
-            pg = page.get_text()
-            pg_lower = pg.lower()
-            token = sentence_tokenize(pg)
-            final = ''
-            for t in token:
-                t_lower = t.lower()
-                # Masked version of the sentence, containing '[redacted]' markers.
-                final = mask_generation(t)
-                # Neighbouring words of each marker, used to locate the hidden span in the original sentence.
-                words = find_surrounding_words(final)
-                for i in range(len(words)):
-                    # NOTE(review): when both neighbours are None, `None in t_lower` below raises TypeError.
-                    if words[i]['after_index'] is None:
-                        # No word after the marker: take everything after the "before" word.
-                        if words[i]['before_word'] in t_lower:
-                            fi = t_lower.index(words[i]['before_word'])
-                            fi = fi + len(words[i]['before_word'])
-                            li = len(t)
-                            redacted_text.append(t[fi:li])
-                    elif words[i]['before_index'] is None:
-                        # No word before the marker: take everything up to the "after" word.
-                        if words[i]['after_word'] in t_lower:
-                            fi = 0
-                            li = t_lower.index(words[i]['after_word'])
-                            redacted_text.append(t[fi:li])
-                    else:
-                        # Both neighbours known: take the text between them.
-                        if words[i]['after_word'] in t_lower and words[i]['before_word'] in t_lower:
-                            before_word = words[i]['before_word']
-                            after_word = words[i]['after_word']
-                            fi = t_lower.index(before_word)
-                            fi = fi + len(before_word)
-                            li = t_lower.index(after_word)
-                            redacted_text.append(t[fi:li])
-        # Second pass: try every collected substring against every page.
-        for page in pdf_document:
-            for i in redacted_text:
-                redact_text(page, i)
         output_pdf = "output_redacted.pdf"
         pdf_document.save(output_pdf)

 #     pattern = r'\[.*?\]'
 #     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
 #     return redacted_text
+from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, Pattern
+# Initialize the analyzer engine
+analyzer = AnalyzerEngine()
+# Define a custom address recognizer using a regex pattern
+address_pattern = Pattern(name="address", regex=r"\d+\s\w+\s(?:street|st|road|rd|avenue|ave|lane|ln|drive|dr|blvd|boulevard)\s*\w*", score=0.5)
+address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[address_pattern])
+# Add the custom address recognizer to the analyzer
+analyzer.registry.add_recognizer(address_recognizer)
+analyzer.get_recognizers
+# Define a function to extract entities
+def extract_entities(text):
+    entities = {
+        "NAME": [],
+        "PHONE_NUMBER": [],
+        "EMAIL": [],
+        "ADDRESS": [],
+        "LOCATION": [],
+        "IN_AADHAAR": [],
+    }
+    output = []
+    # Analyze the text for PII
+    results = analyzer.analyze(text=text, language='en')
+    for result in results:
+        if result.entity_type == "PERSON":
+            entities["NAME"].append(text[result.start:result.end])
+            output+=[text[result.start:result.end]]
+        elif result.entity_type == "PHONE_NUMBER":
+            entities["PHONE_NUMBER"].append(text[result.start:result.end])
+            output+=[text[result.start:result.end]]
+        elif result.entity_type == "EMAIL_ADDRESS":
+            entities["EMAIL"].append(text[result.start:result.end])
+            output+=[text[result.start:result.end]]
+        elif result.entity_type == "ADDRESS":
+            entities["ADDRESS"].append(text[result.start:result.end])
+            output+=[text[result.start:result.end]]
+        elif result.entity_type == 'LOCATION':
+          entities['LOCATION'].append(text[result.start:result.end])
+          output+=[text[result.start:result.end]]
+        elif result.entity_type == 'IN_AADHAAR':
+          entities['IN_PAN'].append(text[result.start:result.end])
+          output+=[text[result.start:result.end]]
+    return entities,output
 def mask_generation(text, model=model_large, tokenizer=tokenizer_large):
     if len(text) < 90:
         text = text + '.'
     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
     return redacted_text
 def redact_text(page, text):
     text_instances = page.search_for(text)
     for inst in text_instances:
     file_contents, pdf_document = process_file(uploaded_file)
     if pdf_document:
         redacted_text = []
+        for pg in pdf_document:
+              text = pg.get_text('text')
+              sentences = sentence_tokenize(text)
+              for sent in sentences:
+                entities,words_out = extract_entities(sent)
+                avai_red = pg.search_for(sent)
+                new=[]
+                for w in words_out:
+                  new+=w.split('\n')
+                words_out = [i for i in new if len(i)>2]
+                print(words_out)
+                for i in avai_red:
+                        b = pg.get_text("text", clip=i)
+                        # result = [item for item in output if item in b]  # Get elements of 'a' that are in 'b'
+                        for j in words_out:
+                            new_n = pg.search_for(j, clip=i)
+                            for all in new_n:
+                              pg.add_redact_annot(all,fill=(0, 0, 0))
+              pg.apply_redactions()
         output_pdf = "output_redacted.pdf"
         pdf_document.save(output_pdf)