Spaces:

delwinn
/

ner

Sleeping

ner

File size: 1,691 Bytes

736fded
 
 
2382f1d
736fded
1437552
 
 
2382f1d
 
 
 
 
 
 
89ec2fe
2382f1d
1437552
 
 
 
 
2382f1d
1437552
 
2382f1d
1437552
 
 
 
 
2382f1d
1437552
 
 
736fded
 
 
 
 
 
1437552
f5c57f1
 
 
 
 
736fded
 
 
 
 
f5c57f1
 
 
 
 
 
736fded
 
 
 
 
78f775e
736fded
f5c57f1
 
736fded
f5c57f1
 
736fded
 
f5c57f1
736fded

from fastapi import FastAPI
from pydantic import BaseModel
import spacy
from contextlib import asynccontextmanager

# Shared spaCy pipeline. It starts as None, is assigned by load_model(),
# is read by the /process handler, and is reset to None by cleanup().
nlp = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Lifespan context for the FastAPI application.

    Loads the NLP model before yielding control to the application and
    releases it again once the application leaves the context.

    Args:
        app: The FastAPI application this context is attached to (unused).
    """
    await load_model()
    try:
        yield
    finally:
        # Release the model even when an exception or cancellation is
        # delivered at the yield point instead of a normal shutdown.
        await cleanup()

# FastAPI application instance; `lifespan` loads the NLP model before the app
# serves requests and unloads it afterwards.
app = FastAPI(lifespan=lifespan)

async def load_model():
    """
    Startup hook: load the spaCy ``en_core_web_sm`` pipeline, publish it
    through the module-level ``nlp`` variable and print a confirmation.
    """
    global nlp
    pipeline = spacy.load("en_core_web_sm")
    nlp = pipeline
    print("NLP model loaded successfully.")


async def cleanup():
    """
    Shutdown hook: drop the reference held in the module-level ``nlp``
    variable by rebinding it to ``None``, then print a confirmation.
    """
    global nlp
    farewell = "NLP model unloaded."
    nlp = None
    print(farewell)


# Request body accepted by POST /process.
class NERRequest(BaseModel):
    # Text chunks to scan for named entities; process_text runs each string
    # through the NLP pipeline.
    chunks: list[str]


@app.post("/process")
async def process_text(request: NERRequest):
    """
    Run named-entity recognition over the submitted chunks.

    Returns ``{"metadata": [...]}`` with one record per distinct entity text.
    Each record holds the entity text (``personal_info``), its entity label
    (``redaction_type``) and a ``REDACTED_<label>`` placeholder
    (``redacted_word``). An entity text that occurs again, in the same or a
    later chunk, is reported only once, with the label of its first
    occurrence.

    Responds with HTTP 503 if the NLP model has not been loaded.
    """
    # Imported locally so this handler does not depend on edits to the
    # module's import block; fastapi is already a dependency of this file.
    from fastapi import HTTPException

    # `nlp` is only read here, so no `global` statement is needed.
    if nlp is None:
        # Happens when the lifespan startup has not run (or cleanup already
        # did); fail with a clear status instead of a TypeError.
        raise HTTPException(status_code=503, detail="NLP model is not loaded.")

    metadata_records = []

    # Entity texts already reported; only membership matters.
    seen_texts = set()

    # nlp.pipe streams the chunks through the pipeline in batches and yields
    # one Doc per input text, in input order.
    for doc in nlp.pipe(request.chunks):
        for ent in doc.ents:
            # The entity text is deliberately not printed or logged: it is
            # the personal data this service exists to redact.
            if ent.text in seen_texts:
                continue
            seen_texts.add(ent.text)

            metadata_records.append(
                {
                    "personal_info": ent.text,
                    "redaction_type": ent.label_,
                    "redacted_word": f"REDACTED_{ent.label_}",
                }
            )

    return {
        "metadata": metadata_records,
    }