| package bg.bas.dcl.LLMs; |
|
|
| import java.io.BufferedWriter; |
| import java.io.File; |
| import java.io.FileOutputStream; |
| import java.io.OutputStreamWriter; |
| import java.nio.charset.StandardCharsets; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Properties; |
| import java.util.Scanner; |
|
|
| import ai.philterd.phileas.model.configuration.PhileasConfiguration; |
| import ai.philterd.phileas.model.policy.Policy; |
| import ai.philterd.phileas.model.responses.FilterResponse; |
| import ai.philterd.phileas.model.responses.Span; |
| import ai.philterd.phileas.services.PlainTextFilterService; |
|
|
| import bg.bas.dcl.general.FileHandler; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public class PIIDetector { |
|
|
| |
| |
| |
|
|
| |
| private static final String CONTEXT = "bg-corpus"; |
|
|
| |
| private static final String DOC_ID = "sent-"; |
|
|
| |
| private static final int MIN_WORDS = 3; |
|
|
| |
| |
| |
|
|
| private final BulgarianSentenceSplitter splitter; |
| private final PlainTextFilterService filterService; |
| private final List<Policy> policies; |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| public PIIDetector(BulgarianSentenceSplitter splitter) { |
| this(splitter, null); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| public PIIDetector(BulgarianSentenceSplitter splitter, Policy customPolicy) { |
| if (splitter == null) |
| throw new IllegalArgumentException("splitter must not be null"); |
|
|
| this.splitter = splitter; |
|
|
| try { |
| Properties props = new Properties(); |
| PhileasConfiguration config = new PhileasConfiguration(props); |
| this.filterService = new PlainTextFilterService(config); |
| this.policies = List.of(customPolicy != null ? customPolicy : buildPolicy()); |
| System.out.println("[PIIDetector] Phileas filter service initialised."); |
| } catch (Exception e) { |
| throw new RuntimeException("Failed to initialise Phileas filter service", e); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public List<SentencePIIScore> analyseText(String text) { |
| List<SentencePIIScore> results = new ArrayList<>(); |
| if (text == null || text.isBlank()) return results; |
|
|
| int docCounter = 0; |
| for (String sentence : splitter.split(text)) { |
| results.add(analyseSentence(sentence, DOC_ID + (docCounter++))); |
| } |
| return results; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| public SentencePIIScore analyseSentence(String sentence, String docId) { |
|
|
| |
| String[] rawTokens = sentence.trim().split("\\s+"); |
| List<String> tokens = new ArrayList<>(); |
| for (String t : rawTokens) { |
| String clean = t.replaceAll("[^\\p{L}\\p{N}@._+\\-]", ""); |
| if (!clean.isEmpty()) tokens.add(clean); |
| } |
| int totalWords = tokens.size(); |
|
|
| if (totalWords < MIN_WORDS) { |
| return SentencePIIScore.empty(sentence, totalWords); |
| } |
|
|
| |
| List<Span> spans; |
| try { |
| FilterResponse response = filterService.filter( |
| policies, CONTEXT, docId, sentence, null); |
| spans = response.getSpans() != null ? response.getSpans() : List.of(); |
| } catch (Exception e) { |
| System.err.println("[PIIDetector] Phileas error on sentence: " + e.getMessage()); |
| return SentencePIIScore.error(sentence, totalWords, e.getMessage()); |
| } |
|
|
| |
| |
| int[] tokenStart = new int[tokens.size()]; |
| int[] tokenEnd = new int[tokens.size()]; |
| int cursor = 0; |
| for (int ti = 0; ti < tokens.size(); ti++) { |
| String tok = tokens.get(ti); |
| int idx = sentence.indexOf(tok, cursor); |
| if (idx < 0) { |
| |
| tokenStart[ti] = cursor; |
| tokenEnd[ti] = cursor + tok.length(); |
| } else { |
| tokenStart[ti] = idx; |
| tokenEnd[ti] = idx + tok.length(); |
| cursor = idx + tok.length(); |
| } |
| } |
|
|
| |
| Map<Integer, String> piiTokenType = new LinkedHashMap<>(); |
| for (Span span : spans) { |
| int spanStart = span.getStart(); |
| int spanEnd = span.getEnd(); |
| String type = span.getFilterType() != null |
| ? span.getFilterType().name() |
| : "UNKNOWN"; |
|
|
| for (int ti = 0; ti < tokens.size(); ti++) { |
| |
| if (tokenStart[ti] < spanEnd && tokenEnd[ti] > spanStart) { |
| piiTokenType.put(ti, type); |
| } |
| } |
| } |
|
|
| |
| Map<String, Integer> typeCounts = new LinkedHashMap<>(); |
| for (String type : piiTokenType.values()) { |
| typeCounts.merge(type, 1, Integer::sum); |
| } |
|
|
| int piiTokenCount = piiTokenType.size(); |
| double coverage = totalWords > 0 |
| ? (double) piiTokenCount / totalWords |
| : 0.0; |
|
|
| return new SentencePIIScore( |
| sentence, totalWords, piiTokenCount, coverage, |
| new ArrayList<>(piiTokenType.values()), |
| typeCounts, spans, null); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public void analyseDirectory(String corpusDir, String reportPath) { |
| try { |
| FileHandler fh = new FileHandler(); |
| int filesProcessed = 0, sentencesWritten = 0; |
|
|
| try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter( |
| new FileOutputStream(reportPath, false), StandardCharsets.UTF_8))) { |
|
|
| bw.write("file\t" + SentencePIIScore.tsvHeader()); |
| bw.newLine(); |
|
|
| for (File f : fh.getFileListing(new File(corpusDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
|
|
| System.out.println("[PIIDetector] Processing: " + f.getName()); |
|
|
| StringBuilder text = new StringBuilder(); |
| try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) { |
| while (sc.hasNextLine()) text.append(sc.nextLine()).append(' '); |
| } |
|
|
| int docCounter = 0; |
| for (SentencePIIScore score : analyseText(text.toString())) { |
| if (score.hasPII()) { |
| bw.write(f.getName() + "\t" + score.toTsv()); |
| bw.newLine(); |
| sentencesWritten++; |
| } |
| docCounter++; |
| } |
| filesProcessed++; |
| } |
| } |
|
|
| System.out.printf("[PIIDetector] Done. Files: %d Sentences with PII written: %d%n", |
| filesProcessed, sentencesWritten); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| private Policy buildPolicy() throws Exception { |
| String policyJson = "{" |
| + "\"name\": \"pii-all\"," |
| + "\"identifiers\": {" |
| + "\"emailAddress\": {\"emailAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"phoneNumber\": {\"phoneNumberFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"ipAddress\": {\"ipAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"url\": {\"urlFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"creditCard\": {\"creditCardFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"ssn\": {\"ssnFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"ibanCode\": {\"ibanCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"bankAccountNumber\":{\"bankAccountNumberFilterStrategies\":[{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"date\": {\"dateFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"age\": {\"ageFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"macAddress\": {\"macAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"bitcoinAddress\": {\"bitcoinAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"vin\": {\"vinFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"zipCode\": {\"zipCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}," |
| + "\"person\": {\"personFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}" |
| + "}" |
| + "}"; |
| return Policy.fromJson(policyJson); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| public static class SentencePIIScore { |
|
|
| private final String sentence; |
| private final int totalWords; |
| private final int piiTokenCount; |
| |
| private final double piiCoverage; |
| |
| private final List<String> piiTypes; |
| |
| private final Map<String, Integer> typeFrequency; |
| |
| private final List<Span> spans; |
| |
| private final String errorMessage; |
|
|
| SentencePIIScore(String sentence, int totalWords, int piiTokenCount, |
| double piiCoverage, List<String> piiTypes, |
| Map<String, Integer> typeFrequency, |
| List<Span> spans, String errorMessage) { |
| this.sentence = sentence; |
| this.totalWords = totalWords; |
| this.piiTokenCount = piiTokenCount; |
| this.piiCoverage = piiCoverage; |
| this.piiTypes = Collections.unmodifiableList(piiTypes); |
| this.typeFrequency = Collections.unmodifiableMap(typeFrequency); |
| this.spans = spans != null |
| ? Collections.unmodifiableList(spans) |
| : List.of(); |
| this.errorMessage = errorMessage; |
| } |
|
|
| static SentencePIIScore empty(String sentence, int totalWords) { |
| return new SentencePIIScore(sentence, totalWords, 0, 0.0, |
| List.of(), Map.of(), List.of(), null); |
| } |
|
|
| static SentencePIIScore error(String sentence, int totalWords, String msg) { |
| return new SentencePIIScore(sentence, totalWords, 0, 0.0, |
| List.of(), Map.of(), List.of(), msg); |
| } |
|
|
| |
|
|
| public String getSentence() { return sentence; } |
| public int getTotalWords() { return totalWords; } |
| public int getPiiTokenCount() { return piiTokenCount; } |
| |
| public double getPiiCoverage() { return piiCoverage; } |
| |
| public double getPiiCoveragePercent() { return piiCoverage * 100.0; } |
| public List<String> getPiiTypes() { return piiTypes; } |
| public Map<String, Integer> getTypeFrequency() { return typeFrequency; } |
| public List<Span> getSpans() { return spans; } |
| public boolean hasPII() { return piiTokenCount > 0; } |
| public boolean hasError() { return errorMessage != null; } |
| public String getErrorMessage() { return errorMessage; } |
|
|
| |
| public int distinctPiiTypes() { return typeFrequency.size(); } |
|
|
| |
|
|
| |
| |
| |
| public String toTsv() { |
| return String.format("%s\t%d\t%d\t%.4f\t%.2f\t%d\t%s", |
| sentence.replace('\t', ' '), |
| totalWords, |
| piiTokenCount, |
| piiCoverage, |
| getPiiCoveragePercent(), |
| distinctPiiTypes(), |
| typeFrequency.toString()); |
| } |
|
|
| public static String tsvHeader() { |
| return "sentence\ttotalWords\tpiiTokens\tpiiCoverage\tpiiCoverage%\tdistinctPiiTypes\ttypeFrequency"; |
| } |
|
|
| @Override |
| public String toString() { |
| return String.format("SentencePIIScore{words=%d, piiTokens=%d, coverage=%.1f%%, types=%s}", |
| totalWords, piiTokenCount, getPiiCoveragePercent(), typeFrequency.keySet()); |
| } |
| } |
| } |
|
|