package bg.bas.dcl.LLMs; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Scanner; import ai.philterd.phileas.model.configuration.PhileasConfiguration; import ai.philterd.phileas.model.policy.Policy; import ai.philterd.phileas.model.responses.FilterResponse; import ai.philterd.phileas.model.responses.Span; import ai.philterd.phileas.services.PlainTextFilterService; import bg.bas.dcl.general.FileHandler; /** * PIIDetector * * Detects Personally Identifiable Information (PII) in Bulgarian text at * sentence level using the Phileas library (ai.philterd:phileas). * * ----------------------------------------------------------------------- * NOTE ON "PIISA" * PIISA (https://piisa.org) is a Python-only PII framework with no Java * bindings. The closest Java-native equivalent with a compatible * detection scope is Phileas (Apache 2.0, Maven Central, actively * maintained as of 2025). This component uses Phileas and documents * all places where a future PIISA Java binding could be substituted. * ----------------------------------------------------------------------- * * MAVEN DEPENDENCY (pom.xml): *
 *   <dependency>
 *     <groupId>ai.philterd</groupId>
 *     <artifactId>phileas</artifactId>
 *     <version>3.1.0</version>
 *   </dependency>
 * 
* * ----------------------------------------------------------------------- * PII TYPES DETECTED (Phileas built-in, language-agnostic unless noted): * * Person names (NER + census dictionary) | Ages | Email addresses * Phone numbers | IP addresses (v4 + v6) | URLs | Credit card numbers * SSN / TIN | IBAN codes | Bank account numbers | Dates | Zip codes * MAC addresses | Bitcoin addresses | VINs | Passport numbers * Driver licence numbers | Medical conditions * * Language note: NER-based person-name detection uses English models by * default. For Bulgarian names, supply a custom dictionary filter * (see {@link #buildPolicy()}) or integrate a Bulgarian NER model. * Regex-based filters (emails, phones, IPs, etc.) are language-independent * and work directly on Bulgarian text. * * ----------------------------------------------------------------------- * ALGORITHM (per sentence): * * 1. Phileas scans the sentence and returns a list of PII *spans*, each * carrying a character start/end offset and a PII type label. * 2. We map spans back to word tokens by checking which token positions * overlap any detected span. * 3. piiCoverage = |tokens overlapping PII spans| / |total word tokens| * * ----------------------------------------------------------------------- * USAGE * * BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter(); * PIIDetector detector = new PIIDetector(splitter); * * List<SentencePIIScore> scores = detector.analyseText("Иван Петров живее на ул. Роза 5."); * for (SentencePIIScore s : scores) { * System.out.printf("%.1f%% PII — %s%n", s.getPiiCoveragePercent(), s.getSentence()); * } * * // Corpus-level processing with TSV output * detector.analyseDirectory("/path/to/corpus/", "/path/to/pii_report.tsv"); */ public class PIIDetector { // ----------------------------------------------------------------------- // Constants // ----------------------------------------------------------------------- /** Context string passed to Phileas (arbitrary; used for logging/caching). */ private static final String CONTEXT = "bg-corpus"; /** Document ID prefix; a counter suffix is appended per sentence. */ private static final String DOC_ID = "sent-"; /** Minimum word count for a sentence to be analysed. */ private static final int MIN_WORDS = 3; // ----------------------------------------------------------------------- // Dependencies // ----------------------------------------------------------------------- private final BulgarianSentenceSplitter splitter; private final PlainTextFilterService filterService; private final List policies; // ----------------------------------------------------------------------- // Constructors // ----------------------------------------------------------------------- /** * Creates a PIIDetector with the default policy (all built-in Phileas * filters active, REDACT strategy so spans are easy to count). * * @param splitter an initialised {@link BulgarianSentenceSplitter} */ public PIIDetector(BulgarianSentenceSplitter splitter) { this(splitter, null); } /** * Creates a PIIDetector with a custom Phileas {@link Policy}. * Pass {@code null} to use the built-in all-PII policy. * * @param splitter an initialised {@link BulgarianSentenceSplitter} * @param customPolicy a pre-built Phileas Policy, or null for default */ public PIIDetector(BulgarianSentenceSplitter splitter, Policy customPolicy) { if (splitter == null) throw new IllegalArgumentException("splitter must not be null"); this.splitter = splitter; try { Properties props = new Properties(); PhileasConfiguration config = new PhileasConfiguration(props); this.filterService = new PlainTextFilterService(config); this.policies = List.of(customPolicy != null ? customPolicy : buildPolicy()); System.out.println("[PIIDetector] Phileas filter service initialised."); } catch (Exception e) { throw new RuntimeException("Failed to initialise Phileas filter service", e); } } // ----------------------------------------------------------------------- // Public API // ----------------------------------------------------------------------- /** * Splits {@code text} into sentences and returns a {@link SentencePIIScore} * for each sentence. * * Sentences shorter than {@link #MIN_WORDS} words receive a zero score * without calling Phileas (to avoid spurious detections on fragments). * * @param text any Bulgarian plain text (may span multiple paragraphs) * @return one score per detected sentence, in order; never null */ public List analyseText(String text) { List results = new ArrayList<>(); if (text == null || text.isBlank()) return results; int docCounter = 0; for (String sentence : splitter.split(text)) { results.add(analyseSentence(sentence, DOC_ID + (docCounter++))); } return results; } /** * Analyses a single pre-split sentence. * * @param sentence the sentence string (not null) * @param docId a document/sentence identifier string for Phileas context * @return a fully populated {@link SentencePIIScore} */ public SentencePIIScore analyseSentence(String sentence, String docId) { // --- Tokenise --- String[] rawTokens = sentence.trim().split("\\s+"); List tokens = new ArrayList<>(); for (String t : rawTokens) { String clean = t.replaceAll("[^\\p{L}\\p{N}@._+\\-]", ""); if (!clean.isEmpty()) tokens.add(clean); } int totalWords = tokens.size(); if (totalWords < MIN_WORDS) { return SentencePIIScore.empty(sentence, totalWords); } // --- Run Phileas --- List spans; try { FilterResponse response = filterService.filter( policies, CONTEXT, docId, sentence, null); spans = response.getSpans() != null ? response.getSpans() : List.of(); } catch (Exception e) { System.err.println("[PIIDetector] Phileas error on sentence: " + e.getMessage()); return SentencePIIScore.error(sentence, totalWords, e.getMessage()); } // --- Map character-level spans back to token positions --- // Build token character offsets from the original sentence string int[] tokenStart = new int[tokens.size()]; int[] tokenEnd = new int[tokens.size()]; int cursor = 0; for (int ti = 0; ti < tokens.size(); ti++) { String tok = tokens.get(ti); int idx = sentence.indexOf(tok, cursor); if (idx < 0) { // Fallback: token not found at expected position (normalisation artefact) tokenStart[ti] = cursor; tokenEnd[ti] = cursor + tok.length(); } else { tokenStart[ti] = idx; tokenEnd[ti] = idx + tok.length(); cursor = idx + tok.length(); } } // Count distinct PII tokens and collect type labels per token Map piiTokenType = new LinkedHashMap<>(); // tokenIndex → PII type for (Span span : spans) { int spanStart = span.getStart(); int spanEnd = span.getEnd(); String type = span.getFilterType() != null ? span.getFilterType().name() : "UNKNOWN"; for (int ti = 0; ti < tokens.size(); ti++) { // Overlap: token and span share at least one character if (tokenStart[ti] < spanEnd && tokenEnd[ti] > spanStart) { piiTokenType.put(ti, type); } } } // --- Build type frequency map --- Map typeCounts = new LinkedHashMap<>(); for (String type : piiTokenType.values()) { typeCounts.merge(type, 1, Integer::sum); } int piiTokenCount = piiTokenType.size(); double coverage = totalWords > 0 ? (double) piiTokenCount / totalWords : 0.0; return new SentencePIIScore( sentence, totalWords, piiTokenCount, coverage, new ArrayList<>(piiTokenType.values()), typeCounts, spans, null); } // ----------------------------------------------------------------------- // Corpus-level processing // ----------------------------------------------------------------------- /** * Analyses all .txt files in {@code corpusDir} sentence by sentence and * writes results to a TSV file at {@code reportPath}. * * Only sentences with at least one PII token are written to the report. * * @param corpusDir directory of plain-text .txt files * @param reportPath destination TSV report file path */ public void analyseDirectory(String corpusDir, String reportPath) { try { FileHandler fh = new FileHandler(); int filesProcessed = 0, sentencesWritten = 0; try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(reportPath, false), StandardCharsets.UTF_8))) { bw.write("file\t" + SentencePIIScore.tsvHeader()); bw.newLine(); for (File f : fh.getFileListing(new File(corpusDir))) { if (!f.isFile() || !f.getName().endsWith(".txt")) continue; System.out.println("[PIIDetector] Processing: " + f.getName()); StringBuilder text = new StringBuilder(); try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) { while (sc.hasNextLine()) text.append(sc.nextLine()).append(' '); } int docCounter = 0; for (SentencePIIScore score : analyseText(text.toString())) { if (score.hasPII()) { bw.write(f.getName() + "\t" + score.toTsv()); bw.newLine(); sentencesWritten++; } docCounter++; } filesProcessed++; } } System.out.printf("[PIIDetector] Done. Files: %d Sentences with PII written: %d%n", filesProcessed, sentencesWritten); } catch (Exception e) { e.printStackTrace(); } } // ----------------------------------------------------------------------- // Policy builder // ----------------------------------------------------------------------- /** * Builds the default Phileas {@link Policy} that activates all * language-agnostic PII filters with a REDACT strategy (so that * span positions remain stable for overlap calculation). * * To customise, edit the JSON string below or deserialise your own * policy from a .json file with: * Policy policy = Policy.fromJson(new String(Files.readAllBytes(path))); * * To add a Bulgarian names dictionary, add an "identifiers.dictionary" * block pointing to a file of Bulgarian given names and surnames. */ private Policy buildPolicy() throws Exception { String policyJson = "{" + "\"name\": \"pii-all\"," + "\"identifiers\": {" + "\"emailAddress\": {\"emailAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"phoneNumber\": {\"phoneNumberFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"ipAddress\": {\"ipAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"url\": {\"urlFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"creditCard\": {\"creditCardFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"ssn\": {\"ssnFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"ibanCode\": {\"ibanCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"bankAccountNumber\":{\"bankAccountNumberFilterStrategies\":[{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"date\": {\"dateFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"age\": {\"ageFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"macAddress\": {\"macAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"bitcoinAddress\": {\"bitcoinAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"vin\": {\"vinFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"zipCode\": {\"zipCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," + "\"person\": {\"personFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}" + "}" + "}"; return Policy.fromJson(policyJson); } // ----------------------------------------------------------------------- // Inner result class // ----------------------------------------------------------------------- /** * Immutable result object for one sentence's PII analysis. */ public static class SentencePIIScore { private final String sentence; private final int totalWords; private final int piiTokenCount; /** PII coverage: piiTokenCount / totalWords in [0, 1]. */ private final double piiCoverage; /** Ordered list of PII type labels for each PII token found. */ private final List piiTypes; /** Frequency of each PII type in this sentence. */ private final Map typeFrequency; /** Raw Phileas spans (character-level). */ private final List spans; /** Non-null if Phileas threw an exception for this sentence. */ private final String errorMessage; SentencePIIScore(String sentence, int totalWords, int piiTokenCount, double piiCoverage, List piiTypes, Map typeFrequency, List spans, String errorMessage) { this.sentence = sentence; this.totalWords = totalWords; this.piiTokenCount = piiTokenCount; this.piiCoverage = piiCoverage; this.piiTypes = Collections.unmodifiableList(piiTypes); this.typeFrequency = Collections.unmodifiableMap(typeFrequency); this.spans = spans != null ? Collections.unmodifiableList(spans) : List.of(); this.errorMessage = errorMessage; } static SentencePIIScore empty(String sentence, int totalWords) { return new SentencePIIScore(sentence, totalWords, 0, 0.0, List.of(), Map.of(), List.of(), null); } static SentencePIIScore error(String sentence, int totalWords, String msg) { return new SentencePIIScore(sentence, totalWords, 0, 0.0, List.of(), Map.of(), List.of(), msg); } // --- Accessors --- public String getSentence() { return sentence; } public int getTotalWords() { return totalWords; } public int getPiiTokenCount() { return piiTokenCount; } /** PII coverage ratio in [0, 1]. */ public double getPiiCoverage() { return piiCoverage; } /** PII coverage expressed as a percentage [0, 100]. */ public double getPiiCoveragePercent() { return piiCoverage * 100.0; } public List getPiiTypes() { return piiTypes; } public Map getTypeFrequency() { return typeFrequency; } public List getSpans() { return spans; } public boolean hasPII() { return piiTokenCount > 0; } public boolean hasError() { return errorMessage != null; } public String getErrorMessage() { return errorMessage; } /** Number of distinct PII categories detected in this sentence. */ public int distinctPiiTypes() { return typeFrequency.size(); } // --- TSV export --- /** * TSV row: sentence | totalWords | piiTokens | coverage% | distinctTypes | typeFrequency */ public String toTsv() { return String.format("%s\t%d\t%d\t%.4f\t%.2f\t%d\t%s", sentence.replace('\t', ' '), totalWords, piiTokenCount, piiCoverage, getPiiCoveragePercent(), distinctPiiTypes(), typeFrequency.toString()); } public static String tsvHeader() { return "sentence\ttotalWords\tpiiTokens\tpiiCoverage\tpiiCoverage%\tdistinctPiiTypes\ttypeFrequency"; } @Override public String toString() { return String.format("SentencePIIScore{words=%d, piiTokens=%d, coverage=%.1f%%, types=%s}", totalWords, piiTokenCount, getPiiCoveragePercent(), typeFrequency.keySet()); } } }