|
|
from contextlib import asynccontextmanager

import spacy
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
|
|
|
|
|
|
|
|
# Process-wide spaCy pipeline handle: None until load_model() assigns it, and
# reset to None again by cleanup().
nlp = None
|
|
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the NLP model for the lifetime of the application.

    Awaits ``load_model()`` before yielding control to the framework and
    awaits ``cleanup()`` once the context is exited.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        None. The application runs while the generator is suspended here.
    """
    # Load outside the ``try`` so a failed load does not trigger cleanup of a
    # model that was never created.
    await load_model()
    try:
        yield
    finally:
        # Without ``finally`` an exception or cancellation thrown into the
        # generator at the ``yield`` would skip the shutdown step entirely.
        await cleanup()
|
|
|
|
|
# Application instance; startup and shutdown work is delegated to `lifespan`.
app = FastAPI(lifespan=lifespan)
|
|
|
|
|
async def load_model():
    """
    Startup hook: load the spaCy "en_core_web_sm" pipeline and store it in
    the module-level ``nlp`` variable.
    """
    global nlp
    pipeline = spacy.load("en_core_web_sm")
    nlp = pipeline
    print("NLP model loaded successfully.")
|
|
|
|
|
|
|
|
async def cleanup():
    """
    Shutdown hook: drop the reference held in the module-level ``nlp``
    variable by resetting it to None.
    """
    global nlp
    nlp = None
    print("NLP model unloaded.")
|
|
|
|
|
|
|
|
# Request body accepted by POST /process.
class NERRequest(BaseModel):
    # Text chunks to analyse; the handler runs the NLP model on each chunk
    # separately, in list order.
    chunks: list[str]
|
|
|
|
|
|
|
|
@app.post("/process")
async def process_text(request: NERRequest):
    """Extract named entities from the submitted chunks and describe their redactions.

    Each chunk in ``request.chunks`` is run through the module-level spaCy
    pipeline ``nlp``. Every distinct entity text yields exactly one record, in
    order of first appearance across all chunks; later occurrences of the same
    text are skipped, even when they carry a different label.

    Args:
        request: Request body holding the text chunks to analyse.

    Returns:
        ``{"metadata": [...]}`` where each record has the keys
        ``personal_info`` (entity text), ``redaction_type`` (entity label) and
        ``redacted_word`` (``"REDACTED_<label>"``).

    Raises:
        HTTPException: 503 when the NLP model has not been loaded.
    """
    # Read the global once so the None check and the calls below use the same
    # object.
    model = nlp
    if model is None:
        # Without this guard the handler fails with an opaque
        # "'NoneType' object is not callable" and a generic 500.
        raise HTTPException(status_code=503, detail="NLP model is not loaded.")

    metadata_records = []
    seen_entities = set()

    for text in request.chunks:
        # NOTE(review): model(text) is synchronous work inside an async
        # handler; consider offloading it if request latency matters.
        for ent in model(text).ents:
            # Entity text is deliberately not printed or logged here: it is
            # the personal data this endpoint exists to redact.
            if ent.text in seen_entities:
                continue
            seen_entities.add(ent.text)
            metadata_records.append(
                {
                    "personal_info": ent.text,
                    "redaction_type": ent.label_,
                    "redacted_word": f"REDACTED_{ent.label_}",
                }
            )

    return {"metadata": metadata_records}
|
|
|