File size: 20,405 Bytes
package bg.bas.dcl.LLMs;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import ai.philterd.phileas.model.configuration.PhileasConfiguration;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.responses.FilterResponse;
import ai.philterd.phileas.model.responses.Span;
import ai.philterd.phileas.services.PlainTextFilterService;
import bg.bas.dcl.general.FileHandler;
/**
* PIIDetector
*
* Detects Personally Identifiable Information (PII) in Bulgarian text at
* sentence level using the <b>Phileas</b> library (ai.philterd:phileas).
*
* -----------------------------------------------------------------------
* NOTE ON "PIISA"
* PIISA (https://piisa.org) is a Python-only PII framework with no Java
* bindings. The closest Java-native equivalent with a compatible
* detection scope is Phileas (Apache 2.0, Maven Central, actively
* maintained as of 2025). This component uses Phileas and documents
* all places where a future PIISA Java binding could be substituted.
* -----------------------------------------------------------------------
*
* MAVEN DEPENDENCY (pom.xml):
* <pre>{@code
* <dependency>
*   <groupId>ai.philterd</groupId>
*   <artifactId>phileas</artifactId>
*   <version>3.1.0</version>
* </dependency>
* }</pre>
*
* -----------------------------------------------------------------------
* PII TYPES DETECTED by the default policy (language-agnostic unless noted):
*
* Person names (NER + census dictionary) | Ages | Email addresses
* Phone numbers | IP addresses (v4 + v6) | URLs | Credit card numbers
* SSN / TIN | IBAN codes | Bank account numbers | Dates | Zip codes
* MAC addresses | Bitcoin addresses | VINs
*
* Passport numbers, driver licence numbers and medical conditions are NOT
* enabled by the default policy built in this class; pass a custom Policy
* to the two-argument constructor to detect them.
*
* Language note: NER-based person-name detection uses English models by
* default. For Bulgarian names, supply a custom dictionary filter
* (see {@link #buildPolicy()}) or integrate a Bulgarian NER model.
* Regex-based filters (emails, phones, IPs, etc.) are language-independent
* and work directly on Bulgarian text.
*
* -----------------------------------------------------------------------
* ALGORITHM (per sentence):
*
* 1. Phileas scans the sentence and returns a list of PII *spans*, each
* carrying a character start/end offset and a PII type label.
* 2. We map spans back to word tokens by checking which token positions
* overlap any detected span.
* 3. piiCoverage = |tokens overlapping PII spans| / |total word tokens|
*
* -----------------------------------------------------------------------
* USAGE
*
* <pre>{@code
* BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
* PIIDetector detector = new PIIDetector(splitter);
*
* List<PIIDetector.SentencePIIScore> scores =
*         detector.analyseText("Иван Петров живее на ул. Роза 5.");
* for (PIIDetector.SentencePIIScore s : scores) {
*     System.out.printf("%.1f%% PII — %s%n", s.getPiiCoveragePercent(), s.getSentence());
* }
*
* // Corpus-level processing with TSV output
* detector.analyseDirectory("/path/to/corpus/", "/path/to/pii_report.tsv");
* }</pre>
*/
public class PIIDetector {
// -----------------------------------------------------------------------
// Constants
// -----------------------------------------------------------------------
/**
 * Context string passed unchanged as the second argument of every
 * {@code filterService.filter(...)} call made by this class.
 * NOTE(review): presumably Phileas uses it only for logging/caching — confirm.
 */
private static final String CONTEXT = "bg-corpus";
/**
 * Document ID prefix; {@code analyseText} appends a zero-based sentence
 * counter to it to form the per-sentence document id.
 */
private static final String DOC_ID = "sent-";
/**
 * Minimum word count for a sentence to be analysed; shorter sentences get
 * an empty score without the filter service being called.
 */
private static final int MIN_WORDS = 3;
// -----------------------------------------------------------------------
// Dependencies
// -----------------------------------------------------------------------
/** Sentence splitter supplied by the caller; never null after construction. */
private final BulgarianSentenceSplitter splitter;
/** Phileas filter service created once in the constructor and reused for every sentence. */
private final PlainTextFilterService filterService;
/** Single-element list holding the active policy (custom or default). */
private final List<Policy> policies;
// -----------------------------------------------------------------------
// Constructors
// -----------------------------------------------------------------------
/**
 * Convenience constructor: builds a detector that uses the default policy
 * assembled by this class (REDACT strategy on every enabled filter).
 *
 * Equivalent to calling the two-argument constructor with a {@code null}
 * policy.
 *
 * @param splitter an initialised {@link BulgarianSentenceSplitter}
 */
public PIIDetector(BulgarianSentenceSplitter splitter) {
    this(splitter, (Policy) null);
}
/**
 * Builds a detector around the given splitter and an optional custom
 * Phileas {@link Policy}. When {@code customPolicy} is {@code null} the
 * default policy assembled by this class is used instead.
 *
 * @param splitter     an initialised {@link BulgarianSentenceSplitter}
 * @param customPolicy a pre-built Phileas Policy, or null for default
 * @throws IllegalArgumentException if {@code splitter} is null
 * @throws RuntimeException         if the filter service or the policy
 *                                  cannot be created (cause preserved)
 */
public PIIDetector(BulgarianSentenceSplitter splitter, Policy customPolicy) {
    if (splitter == null) {
        throw new IllegalArgumentException("splitter must not be null");
    }
    this.splitter = splitter;

    try {
        // Phileas is configured from an empty property set.
        PhileasConfiguration configuration = new PhileasConfiguration(new Properties());
        this.filterService = new PlainTextFilterService(configuration);

        Policy activePolicy;
        if (customPolicy == null) {
            activePolicy = buildPolicy();
        } else {
            activePolicy = customPolicy;
        }
        this.policies = List.of(activePolicy);

        System.out.println("[PIIDetector] Phileas filter service initialised.");
    } catch (Exception initFailure) {
        throw new RuntimeException("Failed to initialise Phileas filter service", initFailure);
    }
}
// -----------------------------------------------------------------------
// Public API
// -----------------------------------------------------------------------
/**
 * Runs the sentence splitter over {@code text} and scores every resulting
 * sentence with {@link #analyseSentence(String, String)}.
 *
 * Each sentence is given the document id {@code "sent-" + index}, where
 * the index starts at 0. Sentences with fewer than {@link #MIN_WORDS}
 * words come back with a zero score and never reach Phileas.
 *
 * @param text any Bulgarian plain text (may span multiple paragraphs)
 * @return one score per detected sentence, in order; never null (an empty,
 *         mutable list for null or blank input)
 */
public List<SentencePIIScore> analyseText(String text) {
    final List<SentencePIIScore> scores = new ArrayList<>();

    boolean nothingToAnalyse = (text == null) || text.isBlank();
    if (!nothingToAnalyse) {
        int sentenceIndex = 0;
        for (String sentence : splitter.split(text)) {
            String docId = DOC_ID + sentenceIndex;
            sentenceIndex++;
            scores.add(analyseSentence(sentence, docId));
        }
    }
    return scores;
}
/**
 * Analyses a single pre-split sentence.
 *
 * A <em>token</em> is a whitespace-delimited chunk of the sentence that
 * contains at least one letter, digit or one of {@code @ . _ + -}. Its
 * character extent runs from its first to its last such character, so
 * surrounding punctuation (quotes, brackets, commas) is not part of it.
 * Extents are measured on the original sentence string, which is also the
 * string handed to Phileas, so they are directly comparable with the
 * offsets of the returned spans.
 *
 * A token counts as PII when its extent shares at least one character
 * with any span. When several spans overlap the same token, the label of
 * the last such span (in the order Phileas returned them) is kept. Labels
 * are reported in token order.
 *
 * @param sentence the sentence string (not null)
 * @param docId    a document/sentence identifier string for Phileas context
 * @return a fully populated {@link SentencePIIScore}; an empty score when
 *         the sentence has fewer than {@link #MIN_WORDS} tokens, or an
 *         error score (with a non-null message) when Phileas throws
 * @throws NullPointerException if {@code sentence} is null
 */
public SentencePIIScore analyseSentence(String sentence, String docId) {
    // --- Tokenise, remembering where each token sits in the sentence ---
    List<int[]> tokenBounds = locateTokens(sentence);
    int totalWords = tokenBounds.size();
    if (totalWords < MIN_WORDS) {
        return SentencePIIScore.empty(sentence, totalWords);
    }

    // --- Run Phileas ---
    List<Span> spans;
    try {
        FilterResponse response = filterService.filter(
                policies, CONTEXT, docId, sentence, null);
        spans = response.getSpans() != null ? response.getSpans() : List.of();
    } catch (Exception e) {
        // getMessage() may be null (e.g. a bare NullPointerException); fall back
        // to the exception class so the result is still recognisable as an error.
        String reason = e.getMessage() != null ? e.getMessage() : e.getClass().getName();
        System.err.println("[PIIDetector] Phileas error on sentence: " + reason);
        return SentencePIIScore.error(sentence, totalWords, reason);
    }

    // --- Map character-level spans back to token positions ---
    // tokenType[i] stays null for tokens that no span touches.
    String[] tokenType = new String[totalWords];
    for (Span span : spans) {
        int spanStart = span.getStart();
        int spanEnd = span.getEnd();
        String type = span.getFilterType() != null
                ? span.getFilterType().name()
                : "UNKNOWN";
        for (int ti = 0; ti < totalWords; ti++) {
            int[] bounds = tokenBounds.get(ti);
            // Overlap: token and span share at least one character.
            if (bounds[0] < spanEnd && bounds[1] > spanStart) {
                tokenType[ti] = type;
            }
        }
    }

    // --- Collect labels (token order) and build the type frequency map ---
    List<String> piiTypes = new ArrayList<>();
    Map<String, Integer> typeCounts = new LinkedHashMap<>();
    for (String type : tokenType) {
        if (type != null) {
            piiTypes.add(type);
            typeCounts.merge(type, 1, Integer::sum);
        }
    }

    int piiTokenCount = piiTypes.size();
    double coverage = totalWords > 0
            ? (double) piiTokenCount / totalWords
            : 0.0;
    return new SentencePIIScore(
            sentence, totalWords, piiTokenCount, coverage,
            piiTypes, typeCounts, spans, null);
}

/** Matches one whitespace-delimited chunk of a sentence. */
private static final Pattern RAW_TOKEN = Pattern.compile("\\S+");

/** Matches a run of characters that count as token content. */
private static final Pattern TOKEN_CHARS = Pattern.compile("[\\p{L}\\p{N}@._+\\-]+");

/**
 * Finds the character extent of every token in {@code sentence}.
 *
 * @return one {@code {start, endExclusive}} pair per token, in sentence
 *         order; chunks without any token character are skipped
 */
private static List<int[]> locateTokens(String sentence) {
    List<int[]> bounds = new ArrayList<>();
    Matcher raw = RAW_TOKEN.matcher(sentence);
    Matcher content = TOKEN_CHARS.matcher(sentence);
    while (raw.find()) {
        // Restrict the content search to the current whitespace-delimited chunk.
        content.region(raw.start(), raw.end());
        int first = -1;
        int last = -1;
        while (content.find()) {
            if (first < 0) {
                first = content.start();
            }
            last = content.end();
        }
        if (first >= 0) {
            bounds.add(new int[] {first, last});
        }
    }
    return bounds;
}
// -----------------------------------------------------------------------
// Corpus-level processing
// -----------------------------------------------------------------------
/**
 * Analyses the {@code .txt} files returned by
 * {@code FileHandler.getFileListing(corpusDir)} sentence by sentence and
 * writes the results to a UTF-8 TSV file at {@code reportPath}
 * (overwriting any existing file).
 *
 * Each file is read as UTF-8; its lines are joined with single spaces
 * before being passed to {@link #analyseText(String)}. Only sentences with
 * at least one PII token are written to the report, one row per sentence,
 * prefixed with the file name.
 *
 * This method is best-effort: any exception (including a read error on a
 * corpus file) stops processing and is reported on standard error instead
 * of being thrown to the caller.
 *
 * @param corpusDir  directory of plain-text .txt files
 * @param reportPath destination TSV report file path
 */
public void analyseDirectory(String corpusDir, String reportPath) {
    try {
        FileHandler fh = new FileHandler();
        int filesProcessed = 0;
        int sentencesWritten = 0;
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(reportPath, false), StandardCharsets.UTF_8))) {
            bw.write("file\t" + SentencePIIScore.tsvHeader());
            bw.newLine();
            for (File f : fh.getFileListing(new File(corpusDir))) {
                if (!f.isFile() || !f.getName().endsWith(".txt")) {
                    continue;
                }
                System.out.println("[PIIDetector] Processing: " + f.getName());
                StringBuilder text = new StringBuilder();
                try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
                    while (sc.hasNextLine()) {
                        text.append(sc.nextLine()).append(' ');
                    }
                    // Scanner suppresses read errors and simply stops; surface
                    // them so a truncated file is not analysed as if complete.
                    IOException readFailure = sc.ioException();
                    if (readFailure != null) {
                        throw readFailure;
                    }
                }
                for (SentencePIIScore score : analyseText(text.toString())) {
                    if (score.hasPII()) {
                        bw.write(f.getName() + "\t" + score.toTsv());
                        bw.newLine();
                        sentencesWritten++;
                    }
                }
                filesProcessed++;
            }
        }
        System.out.printf("[PIIDetector] Done. Files: %d Sentences with PII written: %d%n",
                filesProcessed, sentencesWritten);
    } catch (Exception e) {
        System.err.println("[PIIDetector] Corpus analysis failed for '" + corpusDir + "': " + e);
        e.printStackTrace();
    }
}
// -----------------------------------------------------------------------
// Policy builder
// -----------------------------------------------------------------------
/**
 * Assembles the default Phileas {@link Policy}, named {@code pii-all}.
 *
 * Every identifier listed below is enabled with a single REDACT strategy
 * using the redaction format <code>{{{REDACTED-%t}}}</code>: e-mail
 * address, phone number, IP address, URL, credit card, SSN, IBAN code,
 * bank account number, date, age, MAC address, bitcoin address, VIN, zip
 * code and person.
 *
 * To customise, edit the identifier list below or deserialise your own
 * policy from a .json file with:
 * Policy policy = Policy.fromJson(new String(Files.readAllBytes(path)));
 *
 * To add a Bulgarian names dictionary, add an "identifiers.dictionary"
 * block pointing to a file of Bulgarian given names and surnames.
 *
 * @return the policy parsed from the generated JSON
 * @throws Exception if the JSON cannot be turned into a policy
 */
private Policy buildPolicy() throws Exception {
    // Strategy array plus the brace that closes the identifier object;
    // identical for every identifier.
    final String redactStrategy =
            "[{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}";

    // Opening part of each identifier entry, up to (not including) the strategy array.
    final String[] identifierOpenings = {
        "\"emailAddress\": {\"emailAddressFilterStrategies\": ",
        "\"phoneNumber\": {\"phoneNumberFilterStrategies\": ",
        "\"ipAddress\": {\"ipAddressFilterStrategies\": ",
        "\"url\": {\"urlFilterStrategies\": ",
        "\"creditCard\": {\"creditCardFilterStrategies\": ",
        "\"ssn\": {\"ssnFilterStrategies\": ",
        "\"ibanCode\": {\"ibanCodeFilterStrategies\": ",
        "\"bankAccountNumber\":{\"bankAccountNumberFilterStrategies\":",
        "\"date\": {\"dateFilterStrategies\": ",
        "\"age\": {\"ageFilterStrategies\": ",
        "\"macAddress\": {\"macAddressFilterStrategies\": ",
        "\"bitcoinAddress\": {\"bitcoinAddressFilterStrategies\": ",
        "\"vin\": {\"vinFilterStrategies\": ",
        "\"zipCode\": {\"zipCodeFilterStrategies\": ",
        "\"person\": {\"personFilterStrategies\": "
    };

    StringBuilder json = new StringBuilder();
    json.append("{");
    json.append("\"name\": \"pii-all\",");
    json.append("\"identifiers\": {");
    for (int i = 0; i < identifierOpenings.length; i++) {
        if (i > 0) {
            json.append(",");
        }
        json.append(identifierOpenings[i]).append(redactStrategy);
    }
    json.append("}");
    json.append("}");

    return Policy.fromJson(json.toString());
}
// -----------------------------------------------------------------------
// Inner result class
// -----------------------------------------------------------------------
/**
 * Immutable result object for one sentence's PII analysis.
 *
 * The collections handed to the constructor are copied, so later changes
 * to the caller's lists or map do not affect this object, and the
 * collections returned by the accessors cannot be modified.
 */
public static class SentencePIIScore {
    private final String sentence;
    private final int totalWords;
    private final int piiTokenCount;
    /** PII coverage: piiTokenCount / totalWords in [0, 1]. */
    private final double piiCoverage;
    /** Ordered list of PII type labels for each PII token found. */
    private final List<String> piiTypes;
    /** Frequency of each PII type in this sentence (insertion order preserved). */
    private final Map<String, Integer> typeFrequency;
    /** Raw Phileas spans (character-level). */
    private final List<Span> spans;
    /** Non-null if Phileas threw an exception for this sentence. */
    private final String errorMessage;

    /**
     * @param sentence      the analysed sentence
     * @param totalWords    number of word tokens in the sentence
     * @param piiTokenCount number of tokens that overlap a PII span
     * @param piiCoverage   piiTokenCount / totalWords
     * @param piiTypes      PII label per PII token (not null; copied)
     * @param typeFrequency count per PII label (not null; copied, order kept)
     * @param spans         raw spans, or null for none (copied)
     * @param errorMessage  failure description, or null when analysis succeeded
     */
    SentencePIIScore(String sentence, int totalWords, int piiTokenCount,
                     double piiCoverage, List<String> piiTypes,
                     Map<String, Integer> typeFrequency,
                     List<Span> spans, String errorMessage) {
        this.sentence = sentence;
        this.totalWords = totalWords;
        this.piiTokenCount = piiTokenCount;
        this.piiCoverage = piiCoverage;
        // Defensive copies: an unmodifiable *view* alone would still change
        // whenever the caller mutates the collection it passed in.
        this.piiTypes = Collections.unmodifiableList(new ArrayList<>(piiTypes));
        this.typeFrequency = Collections.unmodifiableMap(new LinkedHashMap<>(typeFrequency));
        this.spans = spans != null
                ? Collections.unmodifiableList(new ArrayList<>(spans))
                : List.of();
        this.errorMessage = errorMessage;
    }

    /** Creates a zero-PII score for a sentence that was not sent to Phileas. */
    static SentencePIIScore empty(String sentence, int totalWords) {
        return new SentencePIIScore(sentence, totalWords, 0, 0.0,
                List.of(), Map.of(), List.of(), null);
    }

    /**
     * Creates a zero-PII score that records an analysis failure. A null
     * {@code msg} is replaced by a placeholder so that {@link #hasError()}
     * is always true for the returned object.
     */
    static SentencePIIScore error(String sentence, int totalWords, String msg) {
        return new SentencePIIScore(sentence, totalWords, 0, 0.0,
                List.of(), Map.of(), List.of(),
                msg != null ? msg : "unknown error");
    }

    // --- Accessors ---
    public String getSentence() { return sentence; }
    public int getTotalWords() { return totalWords; }
    public int getPiiTokenCount() { return piiTokenCount; }
    /** PII coverage ratio in [0, 1]. */
    public double getPiiCoverage() { return piiCoverage; }
    /** PII coverage expressed as a percentage [0, 100]. */
    public double getPiiCoveragePercent() { return piiCoverage * 100.0; }
    public List<String> getPiiTypes() { return piiTypes; }
    public Map<String, Integer> getTypeFrequency() { return typeFrequency; }
    public List<Span> getSpans() { return spans; }
    public boolean hasPII() { return piiTokenCount > 0; }
    public boolean hasError() { return errorMessage != null; }
    public String getErrorMessage() { return errorMessage; }
    /** Number of distinct PII categories detected in this sentence. */
    public int distinctPiiTypes() { return typeFrequency.size(); }

    // --- TSV export ---
    /**
     * TSV row matching {@link #tsvHeader()}:
     * sentence | totalWords | piiTokens | piiCoverage | piiCoverage% | distinctPiiTypes | typeFrequency
     *
     * Tabs and line breaks inside the sentence are replaced by spaces so the
     * row always occupies exactly one line with seven columns. Numbers are
     * formatted with {@link Locale#ROOT} so the decimal separator is always
     * a dot, whatever the JVM's default locale.
     */
    public String toTsv() {
        String singleLine = sentence
                .replace('\t', ' ')
                .replace('\n', ' ')
                .replace('\r', ' ');
        return String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%.2f\t%d\t%s",
                singleLine,
                totalWords,
                piiTokenCount,
                piiCoverage,
                getPiiCoveragePercent(),
                distinctPiiTypes(),
                typeFrequency.toString());
    }

    /** Column names for {@link #toTsv()}, tab-separated. */
    public static String tsvHeader() {
        return "sentence\ttotalWords\tpiiTokens\tpiiCoverage\tpiiCoverage%\tdistinctPiiTypes\ttypeFrequency";
    }

    @Override
    public String toString() {
        return String.format("SentencePIIScore{words=%d, piiTokens=%d, coverage=%.1f%%, types=%s}",
                totalWords, piiTokenCount, getPiiCoveragePercent(), typeFrequency.keySet());
    }
}
}
|