Spaces:

delwinn
/

ner

Sleeping

Delwin Mathew committed on Jan 21, 2025

Commit

f5c57f1

1 Parent(s): 2382f1d

add workers & normalize nlp redaction

Files changed (2) hide show

Dockerfile CHANGED Viewed

@@ -14,4 +14,4 @@ RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app
 EXPOSE 7860
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 COPY --chown=user . /app
 EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers=3"]

app.py CHANGED Viewed

@@ -42,32 +42,44 @@ class NERRequest(BaseModel):
 async def process_text(request: NERRequest):
     global nlp
     redacted_chunks = []
-    supa_clien = []
     redacted_count = 0
     for text in request.chunks:
         doc = nlp(text)
         for ent in doc.ents:
             print(f"{ent.text} - {ent.label_}")
             metadata_record = {
                 "personal_info": ent.text,
                 "redaction_type": ent.label_,
             }
-            # insert into the table to get the primary key
-            query = supa_clien.append(metadata_record)
-            # Form the redacted word with label and primary key (REDACTED_PERSON_1)
             redacted_count += 1
             redacted_word = f"REDACTED_{ent.label_}_{redacted_count}"
-            metadata_record["redacted_word"] = redacted_word
             text = text.replace(ent.text, redacted_word)
         redacted_chunks.append(text)
     return {
         "redacted_chunks": redacted_chunks,
-        "metadata": supa_clien,
     }

 async def process_text(request: NERRequest):
     global nlp
     redacted_chunks = []
+    metadata_records = []
     redacted_count = 0
+    # Dictionary to keep track of already seen entities
+    seen_entities = {}
     for text in request.chunks:
         doc = nlp(text)
         for ent in doc.ents:
             print(f"{ent.text} - {ent.label_}")
+            # Check if we've seen this entity before
+            if ent.text in seen_entities:
+                # Use the existing redacted word
+                redacted_word = seen_entities[ent.text]
+                text = text.replace(ent.text, redacted_word)
+                continue
             metadata_record = {
                 "personal_info": ent.text,
                 "redaction_type": ent.label_,
             }
+            # Create new redacted word
             redacted_count += 1
             redacted_word = f"REDACTED_{ent.label_}_{redacted_count}"
+            # Store for future reference
+            seen_entities[ent.text] = redacted_word
+            metadata_record["redacted_word"] = redacted_word
+            metadata_records.append(metadata_record)
             text = text.replace(ent.text, redacted_word)
         redacted_chunks.append(text)
     return {
         "redacted_chunks": redacted_chunks,
+        "metadata": metadata_records,
     }