Delwin Mathew committed on
Commit
f5c57f1
·
1 Parent(s): 2382f1d

add workers & normalize nlp redaction

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. app.py +19 -7
Dockerfile CHANGED
@@ -14,4 +14,4 @@ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
 
15
  COPY --chown=user . /app
16
  EXPOSE 7860
17
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
14
 
15
  COPY --chown=user . /app
16
  EXPOSE 7860
17
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers=3"]
app.py CHANGED
@@ -42,32 +42,44 @@ class NERRequest(BaseModel):
42
  async def process_text(request: NERRequest):
43
  global nlp
44
  redacted_chunks = []
45
- supa_clien = []
46
  redacted_count = 0
 
 
 
 
47
  for text in request.chunks:
48
  doc = nlp(text)
49
 
50
  for ent in doc.ents:
51
  print(f"{ent.text} - {ent.label_}")
 
 
 
 
 
 
 
 
52
  metadata_record = {
53
  "personal_info": ent.text,
54
  "redaction_type": ent.label_,
55
  }
56
 
57
- # insert into the table to get the primary key
58
- query = supa_clien.append(metadata_record)
59
-
60
- # Form the redacted word with label and primary key (REDACTED_PERSON_1)
61
  redacted_count += 1
62
  redacted_word = f"REDACTED_{ent.label_}_{redacted_count}"
63
 
64
- metadata_record["redacted_word"] = redacted_word
 
65
 
 
 
66
  text = text.replace(ent.text, redacted_word)
67
 
68
  redacted_chunks.append(text)
69
 
70
  return {
71
  "redacted_chunks": redacted_chunks,
72
- "metadata": supa_clien,
73
  }
 
42
  async def process_text(request: NERRequest):
43
  global nlp
44
  redacted_chunks = []
45
+ metadata_records = []
46
  redacted_count = 0
47
+
48
+ # Dictionary to keep track of already seen entities
49
+ seen_entities = {}
50
+
51
  for text in request.chunks:
52
  doc = nlp(text)
53
 
54
  for ent in doc.ents:
55
  print(f"{ent.text} - {ent.label_}")
56
+
57
+ # Check if we've seen this entity before
58
+ if ent.text in seen_entities:
59
+ # Use the existing redacted word
60
+ redacted_word = seen_entities[ent.text]
61
+ text = text.replace(ent.text, redacted_word)
62
+ continue
63
+
64
  metadata_record = {
65
  "personal_info": ent.text,
66
  "redaction_type": ent.label_,
67
  }
68
 
69
+ # Create new redacted word
 
 
 
70
  redacted_count += 1
71
  redacted_word = f"REDACTED_{ent.label_}_{redacted_count}"
72
 
73
+ # Store for future reference
74
+ seen_entities[ent.text] = redacted_word
75
 
76
+ metadata_record["redacted_word"] = redacted_word
77
+ metadata_records.append(metadata_record)
78
  text = text.replace(ent.text, redacted_word)
79
 
80
  redacted_chunks.append(text)
81
 
82
  return {
83
  "redacted_chunks": redacted_chunks,
84
+ "metadata": metadata_records,
85
  }