Redaction_st_bert

Sleeping

edithram23 committed on Jul 5, 2024

Commit

fb16cd6

verified ·

1 Parent(s): 0b66063

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -46,6 +46,38 @@ def read_pdf(file):
         text += page.get_text()
     return text, pdf_document
 def read_docx(file):
     doc = Document(file)
     text = "\n".join([para.text for para in doc.paragraphs])

         text += page.get_text()
     return text, pdf_document
def _strip_wordpiece_prefix(word):
    """Return *word* without a leading ``##`` sub-word continuation marker."""
    # Only the leading marker is tokenizer noise; a ``##`` elsewhere in the
    # token is real text and must be preserved.
    return word[2:] if word.startswith('##') else word


def combine_words(entities):
    """Merge adjacent token-level entities into contiguous spans.

    Each entity is a dict with at least the keys ``'word'``, ``'start'`` and
    ``'end'``. Entities are processed in the order given:

    * if an entity starts exactly where the current span ends, its word is
      appended with no separator;
    * if it starts one position after the current span ends, its word is
      appended after a single space;
    * otherwise the current span is closed and a new one is started.

    A leading ``##`` continuation marker (as emitted by WordPiece-style
    tokenizers) is stripped from every word before it is used.

    Args:
        entities: Iterable of entity dicts. ``None`` or an empty iterable
            yields an empty list.

    Returns:
        A new list of merged entity dicts. Each merged dict is a shallow copy
        of the first entity of its span with ``'word'`` and ``'end'``
        updated; all other keys keep the first entity's values. The input
        dicts are not modified.
    """
    if not entities:
        return []

    combined_entities = []
    current = None
    for entity in entities:
        word = _strip_wordpiece_prefix(entity['word'])
        gap = None if current is None else entity['start'] - current['end']

        if gap in (0, 1):
            # A gap of 1 is rendered as a single space; a gap of 0 means the
            # token continues the current span directly.
            separator = ' ' if gap else ''
            current['word'] += separator + word
            current['end'] = entity['end']
            continue

        # Not adjacent (or very first entity): close the span in progress,
        # then start a fresh one from a copy so the caller's dict is untouched.
        if current is not None:
            combined_entities.append(current)
        current = entity.copy()
        current['word'] = word

    if current is not None:
        combined_entities.append(current)
    return combined_entities
 def read_docx(file):
     doc = Document(file)
     text = "\n".join([para.text for para in doc.paragraphs])