File size: 1,691 Bytes
736fded
 
 
2382f1d
736fded
1437552
 
 
2382f1d
 
 
 
 
 
 
89ec2fe
2382f1d
1437552
 
 
 
 
2382f1d
1437552
 
2382f1d
1437552
 
 
 
 
2382f1d
1437552
 
 
736fded
 
 
 
 
 
1437552
f5c57f1
 
 
 
 
736fded
 
 
 
 
f5c57f1
 
 
 
 
 
736fded
 
 
 
 
78f775e
736fded
f5c57f1
 
736fded
f5c57f1
 
736fded
 
f5c57f1
736fded
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from fastapi import FastAPI
from pydantic import BaseModel
import spacy
from contextlib import asynccontextmanager

# Module-level holder for the loaded spaCy pipeline. It starts as ``None``,
# is assigned by ``load_model`` and is reset to ``None`` by ``cleanup``.
nlp = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage startup and shutdown work around the application's lifetime.

    Loads the NLP model before yielding control and releases it once
    control comes back.

    Args:
        app: The application this lifespan is attached to (not used here).

    Yields:
        None. Execution is suspended at the ``yield`` while the app runs.
    """
    await load_model()
    try:
        yield
    finally:
        # Run the cleanup even when an exception or cancellation is thrown
        # into the generator at the ``yield``; without ``finally`` the
        # shutdown step would be skipped in that case.
        await cleanup()

# Application instance; ``lifespan`` is passed in as its lifespan handler.
app = FastAPI(lifespan=lifespan)

async def load_model():
    """
    Startup hook: load the ``en_core_web_sm`` spaCy pipeline and store it in
    the module-level ``nlp`` variable.
    """
    global nlp
    # Load first, then publish, so ``nlp`` is only rebound on success.
    pipeline = spacy.load("en_core_web_sm")
    nlp = pipeline
    print("NLP model loaded successfully.")


async def cleanup():
    """
    Shutdown hook: drop the module-level reference to the NLP model.
    """
    global nlp
    # Rebinding to ``None`` returns the holder to its initial, unloaded state.
    nlp = None
    print("NLP model unloaded.")


# Request body accepted by the ``/process`` endpoint.
class NERRequest(BaseModel):
    # Text chunks to analyse; the endpoint runs the model on each one in turn.
    chunks: list[str]


@app.post("/process")
async def process_text(request: NERRequest):
    """Run entity recognition over each chunk and describe the redactions.

    Every chunk in ``request.chunks`` is passed through the module-level
    ``nlp`` pipeline. One metadata record is emitted the first time a given
    entity text is seen within this request; later occurrences of the same
    text are skipped, even when they carry a different label.

    Args:
        request: Body holding the list of text chunks to analyse.

    Returns:
        A dict ``{"metadata": [...]}`` whose records, in order of first
        appearance, have the keys ``personal_info`` (entity text),
        ``redaction_type`` (entity label) and ``redacted_word``
        (``"REDACTED_<label>"``).

    Raises:
        RuntimeError: If the model has not been loaded, i.e. ``nlp`` is
            still ``None``.
    """
    # ``nlp`` is only read here, so no ``global`` declaration is required.
    if nlp is None:
        # Fail with a clear message instead of the opaque
        # "'NoneType' object is not callable" the call below would raise.
        raise RuntimeError(
            "NLP model is not loaded; application startup has not run."
        )

    metadata_records = []

    # Entity texts already reported for this request. Only membership is
    # needed, so a set replaces the previous text -> redacted-word mapping
    # whose values were never read.
    seen_texts = set()

    for text in request.chunks:
        for ent in nlp(text).ents:
            # NOTE(review): this writes the detected personal information to
            # stdout on every request; confirm that is acceptable outside of
            # local debugging.
            print(f"{ent.text} - {ent.label_}")

            if ent.text in seen_texts:
                continue
            seen_texts.add(ent.text)

            metadata_records.append(
                {
                    "personal_info": ent.text,
                    "redaction_type": ent.label_,
                    "redacted_word": f"REDACTED_{ent.label_}",
                }
            )

    return {
        "metadata": metadata_records,
    }