// Spaces: DCL-IBL / IfGPT-DataQualityComponents
// Running
// File size: 20,405 Bytes
// 18573e4

package bg.bas.dcl.LLMs;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import ai.philterd.phileas.model.configuration.PhileasConfiguration;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.responses.FilterResponse;
import ai.philterd.phileas.model.responses.Span;
import ai.philterd.phileas.services.PlainTextFilterService;

import bg.bas.dcl.general.FileHandler;

/**
 * PIIDetector
 *
 * Detects Personally Identifiable Information (PII) in Bulgarian text at
 * sentence level using the <b>Phileas</b> library (ai.philterd:phileas).
 *
 * -----------------------------------------------------------------------
 * NOTE ON "PIISA"
 * PIISA (https://piisa.org) is a Python-only PII framework with no Java
 * bindings.  The closest Java-native equivalent with a compatible
 * detection scope is Phileas (Apache 2.0, Maven Central, actively
 * maintained as of 2025).  This component uses Phileas and documents
 * all places where a future PIISA Java binding could be substituted.
 * -----------------------------------------------------------------------
 *
 * MAVEN DEPENDENCY (pom.xml):
 * <pre>
 *   &lt;dependency&gt;
 *     &lt;groupId&gt;ai.philterd&lt;/groupId&gt;
 *     &lt;artifactId&gt;phileas&lt;/artifactId&gt;
 *     &lt;version&gt;3.1.0&lt;/version&gt;
 *   &lt;/dependency&gt;
 * </pre>
 *
 * -----------------------------------------------------------------------
 * PII TYPES DETECTED BY THE DEFAULT POLICY (language-agnostic unless noted):
 *
 *   Person names (NER + census dictionary) | Ages | Email addresses
 *   Phone numbers | IP addresses (v4 + v6) | URLs | Credit card numbers
 *   SSN / TIN | IBAN codes | Bank account numbers | Dates | Zip codes
 *   MAC addresses | Bitcoin addresses | VINs
 *
 * Passport numbers, driver licence numbers and medical conditions are NOT
 * enabled by the default policy built in this class; supply a custom
 * {@link Policy} to the two-argument constructor to detect them.
 *
 * Language note: NER-based person-name detection uses English models by
 * default.  For Bulgarian names, supply a custom dictionary filter
 * (see {@link #buildPolicy()}) or integrate a Bulgarian NER model.
 * Regex-based filters (emails, phones, IPs, etc.) are language-independent
 * and work directly on Bulgarian text.
 *
 * -----------------------------------------------------------------------
 * ALGORITHM (per sentence):
 *
 *   1. Phileas scans the sentence and returns a list of PII *spans*, each
 *      carrying a character start/end offset and a PII type label.
 *   2. We map spans back to word tokens by checking which token positions
 *      overlap any detected span.
 *   3. piiCoverage = |tokens overlapping PII spans| / |total word tokens|
 *
 * -----------------------------------------------------------------------
 * USAGE
 *
 *   BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
 *   PIIDetector detector = new PIIDetector(splitter);
 *
 *   List&lt;PIIDetector.SentencePIIScore&gt; scores = detector.analyseText("Иван Петров живее на ул. Роза 5.");
 *   for (PIIDetector.SentencePIIScore s : scores) {
 *       System.out.printf("%.1f%% PII — %s%n", s.getPiiCoveragePercent(), s.getSentence());
 *   }
 *
 *   // Corpus-level processing with TSV output
 *   detector.analyseDirectory("/path/to/corpus/", "/path/to/pii_report.tsv");
 */
public class PIIDetector {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /**
     * Context string handed to the Phileas filter call for every sentence
     * (arbitrary label; presumably used by Phileas for logging/caching — confirm
     * against the Phileas documentation).
     */
    private static final String CONTEXT  = "bg-corpus";

    /**
     * Document ID prefix; {@code analyseText} appends a zero-based sentence
     * counter to it to form the per-sentence document ID.
     */
    private static final String DOC_ID   = "sent-";

    /**
     * Minimum word count for a sentence to be analysed; shorter sentences get
     * an empty (zero-PII) score without Phileas being called.
     */
    private static final int    MIN_WORDS = 3;

    // -----------------------------------------------------------------------
    // Dependencies
    // -----------------------------------------------------------------------

    /** Sentence splitter used by {@code analyseText}; never null after construction. */
    private final BulgarianSentenceSplitter splitter;
    /** Phileas service that performs the actual PII detection on each sentence. */
    private final PlainTextFilterService    filterService;
    /** Single-element list holding either the caller-supplied policy or the default one. */
    private final List<Policy>              policies;

    // -----------------------------------------------------------------------
    // Constructors
    // -----------------------------------------------------------------------

    /**
     * Convenience constructor: creates a detector that uses the default
     * policy produced by {@code buildPolicy()}.
     *
     * Equivalent to calling the two-argument constructor with a {@code null}
     * policy.
     *
     * @param splitter an initialised {@link BulgarianSentenceSplitter}; must not be null
     */
    public PIIDetector(BulgarianSentenceSplitter splitter) {
        this(splitter, (Policy) null);
    }

    /**
     * Creates a detector that filters with the given Phileas {@link Policy},
     * or with the default policy from {@code buildPolicy()} when
     * {@code customPolicy} is {@code null}.
     *
     * @param splitter     an initialised {@link BulgarianSentenceSplitter}
     * @param customPolicy a pre-built Phileas policy, or {@code null} for the default
     * @throws IllegalArgumentException if {@code splitter} is null
     * @throws RuntimeException         if the Phileas service or the default
     *                                  policy cannot be created (original
     *                                  exception attached as cause)
     */
    public PIIDetector(BulgarianSentenceSplitter splitter, Policy customPolicy) {
        if (splitter == null) {
            throw new IllegalArgumentException("splitter must not be null");
        }
        this.splitter = splitter;

        // Build everything into locals first so the final fields are assigned
        // exactly once, after initialisation is known to have succeeded.
        PlainTextFilterService service;
        List<Policy> activePolicies;
        try {
            PhileasConfiguration configuration = new PhileasConfiguration(new Properties());
            service = new PlainTextFilterService(configuration);

            Policy effectivePolicy;
            if (customPolicy == null) {
                effectivePolicy = buildPolicy();
            } else {
                effectivePolicy = customPolicy;
            }
            activePolicies = List.of(effectivePolicy);

            System.out.println("[PIIDetector] Phileas filter service initialised.");
        } catch (Exception e) {
            throw new RuntimeException("Failed to initialise Phileas filter service", e);
        }

        this.filterService = service;
        this.policies = activePolicies;
    }

    // -----------------------------------------------------------------------
    // Public API
    // -----------------------------------------------------------------------

    /**
     * Runs the sentence splitter over {@code text} and scores every resulting
     * sentence with {@link #analyseSentence(String, String)}.
     *
     * Each sentence is given the document ID {@code DOC_ID + index}, where
     * {@code index} is its zero-based position in the splitter output.
     * Sentences with fewer than {@link #MIN_WORDS} words come back with a
     * zero score (Phileas is not invoked for them).
     *
     * @param text any Bulgarian plain text (may span multiple paragraphs)
     * @return one score per sentence, in splitter order; an empty list when
     *         {@code text} is null or blank; never null
     */
    public List<SentencePIIScore> analyseText(String text) {
        List<SentencePIIScore> scores = new ArrayList<>();

        boolean nothingToAnalyse = (text == null) || text.isBlank();
        if (!nothingToAnalyse) {
            int sentenceIndex = 0;
            for (String current : splitter.split(text)) {
                String sentenceId = DOC_ID + sentenceIndex;
                sentenceIndex++;
                scores.add(analyseSentence(current, sentenceId));
            }
        }

        return scores;
    }

    /** Matches one maximal run of non-whitespace characters (a raw token). */
    private static final Pattern RAW_TOKEN = Pattern.compile("\\S+");

    /** Matches a single character that counts as part of a word token. */
    private static final Pattern WORD_CHAR = Pattern.compile("[\\p{L}\\p{N}@._+\\-]");

    /**
     * Analyses a single pre-split sentence.
     *
     * Steps:
     * <ol>
     *   <li>Tokenise on whitespace; a token counts as a word if it contains at
     *       least one letter, number or one of {@code @ . _ + -}.</li>
     *   <li>If fewer than {@link #MIN_WORDS} words remain, return an empty
     *       score without calling Phileas.</li>
     *   <li>Ask Phileas for PII spans and mark every token whose character
     *       range overlaps a span.</li>
     *   <li>coverage = marked tokens / total word tokens.</li>
     * </ol>
     *
     * If Phileas throws, the failure is logged to {@code System.err} and an
     * error score is returned instead of propagating the exception.
     *
     * @param sentence the sentence string (not null)
     * @param docId    a document/sentence identifier string passed to Phileas
     * @return a fully populated {@link SentencePIIScore}
     * @throws NullPointerException if {@code sentence} is null
     */
    public SentencePIIScore analyseSentence(String sentence, String docId) {

        // --- Tokenise, remembering where each token sits in the sentence ---
        List<int[]> tokenBounds = locateTokens(sentence);
        int totalWords = tokenBounds.size();

        if (totalWords < MIN_WORDS) {
            return SentencePIIScore.empty(sentence, totalWords);
        }

        // --- Run Phileas ---
        List<Span> spans;
        try {
            FilterResponse response = filterService.filter(
                    policies, CONTEXT, docId, sentence, null);
            spans = response.getSpans() != null ? response.getSpans() : List.of();
        } catch (Exception e) {
            // getMessage() can be null (e.g. a bare NullPointerException); a null
            // message would make the result look like a successful analysis.
            String reason = e.getMessage() != null ? e.getMessage() : e.toString();
            System.err.println("[PIIDetector] Phileas error on sentence: " + reason);
            return SentencePIIScore.error(sentence, totalWords, reason);
        }

        // --- Map character-level spans back to token positions ---
        // tokenIndex -> PII type; if several spans touch the same token, the
        // span processed last determines the recorded type.
        Map<Integer, String> piiTokenType = new LinkedHashMap<>();
        for (Span span : spans) {
            int spanStart = span.getStart();
            int spanEnd   = span.getEnd();
            String type   = span.getFilterType() != null
                            ? span.getFilterType().name()
                            : "UNKNOWN";

            for (int ti = 0; ti < totalWords; ti++) {
                int[] bounds = tokenBounds.get(ti);
                // Overlap: token and span share at least one character
                if (bounds[0] < spanEnd && bounds[1] > spanStart) {
                    piiTokenType.put(ti, type);
                }
            }
        }

        // --- Build type frequency map (number of PII tokens per type) ---
        Map<String, Integer> typeCounts = new LinkedHashMap<>();
        for (String type : piiTokenType.values()) {
            typeCounts.merge(type, 1, Integer::sum);
        }

        int piiTokenCount = piiTokenType.size();
        // Guard kept so a MIN_WORDS of 0 can never produce NaN.
        double coverage   = totalWords > 0
                ? (double) piiTokenCount / totalWords
                : 0.0;

        return new SentencePIIScore(
                sentence, totalWords, piiTokenCount, coverage,
                new ArrayList<>(piiTokenType.values()),
                typeCounts, spans, null);
    }

    /**
     * Finds the word tokens of {@code sentence} and their exact character
     * ranges in the original (unmodified) string.
     *
     * Offsets are read from the raw token positions rather than re-located by
     * searching for the cleaned token text, so they stay correct even when a
     * token contains stripped characters or its cleaned form also occurs
     * later in the sentence.
     *
     * @param sentence the sentence to tokenise (not null)
     * @return one {@code {start, end}} pair (end exclusive) per word token, in
     *         order; the range runs from the first to the last word character
     *         of the whitespace-delimited token
     */
    private static List<int[]> locateTokens(String sentence) {
        List<int[]> bounds = new ArrayList<>();
        Matcher raw = RAW_TOKEN.matcher(sentence);
        Matcher wordChar = WORD_CHAR.matcher(sentence);

        while (raw.find()) {
            // Restrict the word-character search to the current raw token.
            wordChar.region(raw.start(), raw.end());
            int first = -1;
            int last = -1;
            while (wordChar.find()) {
                if (first < 0) {
                    first = wordChar.start();
                }
                last = wordChar.end();
            }
            // Tokens made only of punctuation/symbols are not words.
            if (first >= 0) {
                bounds.add(new int[] {first, last});
            }
        }
        return bounds;
    }

    // -----------------------------------------------------------------------
    // Corpus-level processing
    // -----------------------------------------------------------------------

    /**
     * Analyses all .txt files in {@code corpusDir} sentence by sentence and
     * writes results to a TSV file at {@code reportPath} (UTF-8, overwritten
     * if it already exists).
     *
     * Each file is read line by line and its lines are joined with single
     * spaces before sentence splitting. Only sentences with at least one PII
     * token are written to the report; the first column is the file name,
     * followed by the columns of {@link SentencePIIScore#tsvHeader()}.
     *
     * This method is best-effort: any exception is caught and reported on
     * {@code System.err} instead of being thrown to the caller. Sentences for
     * which Phileas failed are not written to the report; their number is
     * reported on {@code System.err} at the end of the run.
     *
     * @param corpusDir  directory of plain-text .txt files
     * @param reportPath destination TSV report file path
     */
    public void analyseDirectory(String corpusDir, String reportPath) {
        try {
            FileHandler fh = new FileHandler();
            int filesProcessed = 0, sentencesWritten = 0, sentencesFailed = 0;

            try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(reportPath, false), StandardCharsets.UTF_8))) {

                bw.write("file\t" + SentencePIIScore.tsvHeader());
                bw.newLine();

                for (File f : fh.getFileListing(new File(corpusDir))) {
                    if (!f.isFile() || !f.getName().endsWith(".txt")) continue;

                    System.out.println("[PIIDetector] Processing: " + f.getName());

                    StringBuilder text = new StringBuilder();
                    try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
                        while (sc.hasNextLine()) text.append(sc.nextLine()).append(' ');
                        // Scanner swallows read errors and simply stops; surface them
                        // so a truncated analysis does not go unnoticed.
                        if (sc.ioException() != null) {
                            System.err.println("[PIIDetector] Warning: read error in "
                                    + f.getName() + " (file may be analysed only partially): "
                                    + sc.ioException());
                        }
                    }

                    for (SentencePIIScore score : analyseText(text.toString())) {
                        if (score.hasError()) {
                            sentencesFailed++;
                        }
                        if (score.hasPII()) {
                            bw.write(f.getName() + "\t" + score.toTsv());
                            bw.newLine();
                            sentencesWritten++;
                        }
                    }
                    filesProcessed++;
                }
            }

            System.out.printf("[PIIDetector] Done.  Files: %d  Sentences with PII written: %d%n",
                    filesProcessed, sentencesWritten);
            if (sentencesFailed > 0) {
                System.err.printf(
                        "[PIIDetector] Warning: %d sentence(s) could not be analysed and are missing from the report.%n",
                        sentencesFailed);
            }

        } catch (Exception e) {
            System.err.println("[PIIDetector] analyseDirectory failed (corpusDir=" + corpusDir
                    + ", reportPath=" + reportPath + "): " + e);
            e.printStackTrace();
        }
    }

    // -----------------------------------------------------------------------
    // Policy builder
    // -----------------------------------------------------------------------

    /**
     * Builds the default Phileas {@link Policy}, named {@code pii-all}.
     *
     * The policy enables the fifteen identifiers listed in
     * {@code identifierNames} below, each with a single {@code REDACT}
     * strategy and the redaction format <code>{{{REDACTED-%t}}}</code>.
     *
     * To customise, edit the identifier list below or deserialise your own
     * policy from a .json file with:
     *   Policy policy = Policy.fromJson(new String(Files.readAllBytes(path)));
     *
     * To add a Bulgarian names dictionary, add an "identifiers.dictionary"
     * block pointing to a file of Bulgarian given names and surnames.
     *
     * @return the parsed default policy
     * @throws Exception if the generated JSON cannot be parsed into a policy
     */
    private Policy buildPolicy() throws Exception {
        // Strategy list shared by every identifier.
        final String redactOnly =
                "[{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]";

        final String[] identifierNames = {
            "emailAddress", "phoneNumber", "ipAddress", "url", "creditCard",
            "ssn", "ibanCode", "bankAccountNumber", "date", "age",
            "macAddress", "bitcoinAddress", "vin", "zipCode", "person"
        };

        // Identifier names are padded with spaces up to this width so the
        // generated text keeps the column alignment of the hand-written JSON.
        // The padding is insignificant JSON whitespace.
        final int alignWidth = 16;

        StringBuilder json = new StringBuilder();
        json.append("{");
        json.append("\"name\": \"pii-all\",");
        json.append("\"identifiers\": {");

        for (int i = 0; i < identifierNames.length; i++) {
            String id = identifierNames[i];
            String pad = " ".repeat(Math.max(0, alignWidth - id.length()));

            if (i > 0) {
                json.append(',');
            }
            json.append('"').append(id).append("\":").append(pad)
                .append("{\"").append(id).append("FilterStrategies\":").append(pad)
                .append(redactOnly)
                .append('}');
        }

        json.append("}");
        json.append("}");
        return Policy.fromJson(json.toString());
    }

    // -----------------------------------------------------------------------
    // Inner result class
    // -----------------------------------------------------------------------

    /**
     * Immutable result object for one sentence's PII analysis.
     *
     * All collections passed to the constructor are defensively copied, so
     * later changes to the caller's collections do not affect this object,
     * and the collections returned by the accessors are unmodifiable.
     */
    public static class SentencePIIScore {

        private final String            sentence;
        private final int               totalWords;
        private final int               piiTokenCount;
        /** PII coverage: piiTokenCount / totalWords in [0, 1]. */
        private final double            piiCoverage;
        /** Ordered list of PII type labels for each PII token found. */
        private final List<String>      piiTypes;
        /** Number of PII tokens per PII type in this sentence (insertion-ordered). */
        private final Map<String, Integer> typeFrequency;
        /** Raw Phileas spans (character-level). */
        private final List<Span>        spans;
        /** Non-null if Phileas threw an exception for this sentence. */
        private final String            errorMessage;

        /**
         * @param sentence      the analysed sentence
         * @param totalWords    number of word tokens in the sentence
         * @param piiTokenCount number of tokens overlapping a PII span
         * @param piiCoverage   piiTokenCount / totalWords
         * @param piiTypes      PII type label per PII token; null is treated as empty
         * @param typeFrequency PII tokens per type; null is treated as empty
         * @param spans         raw Phileas spans; null is treated as empty
         * @param errorMessage  failure description, or null if analysis succeeded
         */
        SentencePIIScore(String sentence, int totalWords, int piiTokenCount,
                         double piiCoverage, List<String> piiTypes,
                         Map<String, Integer> typeFrequency,
                         List<Span> spans, String errorMessage) {
            this.sentence      = sentence;
            this.totalWords    = totalWords;
            this.piiTokenCount = piiTokenCount;
            this.piiCoverage   = piiCoverage;
            // Copy before wrapping: an unmodifiable *view* would still reflect
            // later mutations of the caller's collection.
            this.piiTypes      = piiTypes != null
                                 ? Collections.unmodifiableList(new ArrayList<>(piiTypes))
                                 : List.of();
            // LinkedHashMap keeps the insertion order that toTsv()/toString() print.
            this.typeFrequency = typeFrequency != null
                                 ? Collections.unmodifiableMap(new LinkedHashMap<>(typeFrequency))
                                 : Map.of();
            this.spans         = spans != null
                                 ? Collections.unmodifiableList(new ArrayList<>(spans))
                                 : List.of();
            this.errorMessage  = errorMessage;
        }

        /** Creates a zero-PII score for a sentence that was not sent to Phileas. */
        static SentencePIIScore empty(String sentence, int totalWords) {
            return new SentencePIIScore(sentence, totalWords, 0, 0.0,
                    List.of(), Map.of(), List.of(), null);
        }

        /**
         * Creates a zero-PII score flagged as failed.
         *
         * A null {@code msg} is replaced by a placeholder so that
         * {@link #hasError()} is always true for scores created here.
         */
        static SentencePIIScore error(String sentence, int totalWords, String msg) {
            return new SentencePIIScore(sentence, totalWords, 0, 0.0,
                    List.of(), Map.of(), List.of(),
                    msg != null ? msg : "unknown error");
        }

        // --- Accessors ---

        public String            getSentence()           { return sentence; }
        public int               getTotalWords()         { return totalWords; }
        public int               getPiiTokenCount()      { return piiTokenCount; }
        /** PII coverage ratio in [0, 1]. */
        public double            getPiiCoverage()        { return piiCoverage; }
        /** PII coverage expressed as a percentage [0, 100]. */
        public double            getPiiCoveragePercent() { return piiCoverage * 100.0; }
        public List<String>      getPiiTypes()           { return piiTypes; }
        public Map<String, Integer> getTypeFrequency()   { return typeFrequency; }
        public List<Span>        getSpans()              { return spans; }
        public boolean           hasPII()                { return piiTokenCount > 0; }
        public boolean           hasError()              { return errorMessage != null; }
        public String            getErrorMessage()       { return errorMessage; }

        /** Number of distinct PII categories detected in this sentence. */
        public int distinctPiiTypes() { return typeFrequency.size(); }

        // --- TSV export ---

        /**
         * TSV row with the seven columns named by {@link #tsvHeader()}:
         * sentence | totalWords | piiTokens | piiCoverage (ratio) |
         * piiCoverage% | distinctPiiTypes | typeFrequency
         *
         * Tabs, carriage returns and line feeds inside the sentence are
         * replaced by spaces so one score always occupies exactly one row.
         * Numbers are formatted with {@link Locale#ROOT}, i.e. always with a
         * decimal point, independent of the JVM's default locale.
         */
        public String toTsv() {
            String singleLine = sentence
                    .replace('\t', ' ')
                    .replace('\r', ' ')
                    .replace('\n', ' ');
            return String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%.2f\t%d\t%s",
                    singleLine,
                    totalWords,
                    piiTokenCount,
                    piiCoverage,
                    getPiiCoveragePercent(),
                    distinctPiiTypes(),
                    typeFrequency.toString());
        }

        /** Column names matching {@link #toTsv()}, tab-separated. */
        public static String tsvHeader() {
            return "sentence\ttotalWords\tpiiTokens\tpiiCoverage\tpiiCoverage%\tdistinctPiiTypes\ttypeFrequency";
        }

        @Override
        public String toString() {
            return String.format("SentencePIIScore{words=%d, piiTokens=%d, coverage=%.1f%%, types=%s}",
                    totalWords, piiTokenCount, getPiiCoveragePercent(), typeFrequency.keySet());
        }
    }
}