"""FastAPI service that runs spaCy NER over text chunks and returns redaction metadata."""

import logging
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager

import spacy
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

logger = logging.getLogger(__name__)

# Global variable to store the model. Set by load_model() on startup and
# reset to None by cleanup() on shutdown.
nlp = None


async def load_model() -> None:
    """Load the ``en_core_web_sm`` spaCy pipeline into the module-level ``nlp``."""
    global nlp
    nlp = spacy.load("en_core_web_sm")
    print("NLP model loaded successfully.")


async def cleanup() -> None:
    """Drop the module-level reference to the spaCy pipeline on shutdown."""
    global nlp
    nlp = None
    print("NLP model unloaded.")


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Application lifespan: load the model before serving, unload it afterwards.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    await load_model()
    try:
        yield
    finally:
        # Run cleanup even when an exception propagates through the yield.
        await cleanup()


app = FastAPI(lifespan=lifespan)


class NERRequest(BaseModel):
    """Request body for ``POST /process``: the text chunks to analyse."""

    chunks: list[str]


@app.post("/process")
async def process_text(request: NERRequest):
    """Extract named entities from every chunk and describe their redaction.

    Each distinct entity text is reported once, in order of first appearance
    across all chunks. If the same text is detected again (even with a
    different label) the later occurrence is skipped, so the first label wins.

    Args:
        request: Body holding the list of text chunks to process.

    Returns:
        ``{"metadata": [...]}`` where each record has the keys
        ``personal_info`` (entity text), ``redaction_type`` (entity label)
        and ``redacted_word`` (``"REDACTED_<label>"``).

    Raises:
        HTTPException: 503 when the NLP model has not been loaded.
    """
    if nlp is None:
        # Without this guard the call below fails with an opaque TypeError.
        raise HTTPException(status_code=503, detail="NLP model is not loaded.")

    metadata_records = []
    # Entity texts already reported; only membership is ever needed.
    seen_entities = set()

    # nlp.pipe processes the chunks as a batch and yields docs in input order.
    for doc in nlp.pipe(request.chunks):
        for ent in doc.ents:
            # Log only the label: the entity text is the personal information
            # this endpoint exists to redact, so keep it out of the logs.
            logger.debug("Detected entity of type %s", ent.label_)

            if ent.text in seen_entities:
                continue
            seen_entities.add(ent.text)

            metadata_records.append(
                {
                    "personal_info": ent.text,
                    "redaction_type": ent.label_,
                    "redacted_word": f"REDACTED_{ent.label_}",
                }
            )

    return {
        "metadata": metadata_records,
    }