// Source: IfGPT-DataQualityComponents/java/bg/bas/dcl/LLMs/BiasAnalyser.java
// Uploaded by dcl-ibl-bas ("Upload 22 files", commit 18573e4, verified)
package bg.bas.dcl.LLMs;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
import bg.bas.dcl.general.FileHandler;
/**
 * BiasAnalyser
 *
 * Detects linguistic bias in Bulgarian text using the Bulgarian Bias Dictionary
 * (v4 TSV format). Works at sentence level: for each sentence it returns a
 * {@link SentenceBiasScore} whose primary metric is the pair coverage —
 * the fraction (0.0–1.0) of word tokens in the sentence that participate in at
 * least one signal–evaluator pair for each bias category.
 *
 * -----------------------------------------------------------------------
 * ALGORITHM (per sentence)
 *
 * 1. TOKENISE — lower-case, split on whitespace (including Unicode separators
 *               such as the non-breaking space), strip non-letter characters
 *               from each token and drop tokens that become empty.
 * 2. MATCH    — look the tokens up in the {@link BiasLexicon}. Multi-word
 *               entries are tried first, longest first: trigram, then bigram,
 *               then the single token. Every token of a matched multi-word
 *               entry points to the same entry.
 * 3. PAIR     — for every signal token, search within ±PAIR_WINDOW tokens for
 *               an evaluator token of the same bias type (or a general one,
 *               which applies to all types). A signal that is itself an
 *               evaluative modifier counts as paired on its own.
 * 4. SCORE    — pairCoverage[type] = distinctPairTokens[type] / totalWords
 *               where distinctPairTokens = set of positions involved in
 *               at least one confirmed pair for that type.
 *
 */
public class BiasAnalyser {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /**
     * Maximum token distance between a signal and an evaluator for them to
     * be counted as a pair. 10 matches the window used in the original
     * BiasDetector.
     */
    public static final int PAIR_WINDOW = 10;

    /**
     * Sentences with fewer words than this are skipped entirely.
     */
    public static final int MIN_WORDS = 6;

    /**
     * Sentences with more words than this are still processed but a warning
     * is printed (very long sentences may inflate scores).
     */
    public static final int MAX_WORDS = 200;

    /** Longest multi-word lexicon entry (in tokens) that the matcher tries. */
    private static final int MAX_NGRAM = 3;

    /**
     * Token separator: ASCII whitespace plus Unicode separators (category Z),
     * so that e.g. a non-breaking space splits words instead of being stripped
     * by {@link #clean(String)} and gluing two words together.
     */
    private static final Pattern TOKEN_SEPARATOR = Pattern.compile("[\\s\\p{Z}]+");

    /** Everything that is neither a Unicode letter nor whitespace. */
    private static final Pattern NON_LETTER = Pattern.compile("[^\\p{L}\\s]");

    // -----------------------------------------------------------------------
    // Dependencies
    // -----------------------------------------------------------------------

    private final BiasLexicon lexicon;
    private final BulgarianSentenceSplitter splitter;

    // -----------------------------------------------------------------------
    // Constructor
    // -----------------------------------------------------------------------

    /**
     * @param lexicon  the loaded bias dictionary
     * @param splitter an initialised Bulgarian sentence splitter
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public BiasAnalyser(BiasLexicon lexicon, BulgarianSentenceSplitter splitter) {
        if (lexicon == null) {
            throw new IllegalArgumentException("lexicon must not be null");
        }
        if (splitter == null) {
            throw new IllegalArgumentException("splitter must not be null");
        }
        this.lexicon = lexicon;
        this.splitter = splitter;
    }

    // -----------------------------------------------------------------------
    // Public API
    // -----------------------------------------------------------------------

    /**
     * Splits {@code text} into sentences and returns a bias score for each.
     *
     * @param text the text to analyse; may be {@code null} or blank
     * @return one score per sentence produced by the splitter, in splitter
     *         order; an empty list when {@code text} is {@code null} or blank
     */
    public List<SentenceBiasScore> analyseText(String text) {
        List<SentenceBiasScore> results = new ArrayList<>();
        if (text == null || text.isBlank()) {
            return results;
        }
        for (String sentence : splitter.split(text)) {
            results.add(analyseSentence(sentence));
        }
        return results;
    }

    /**
     * Analyses a single pre-split sentence.
     *
     * Sentences with fewer than {@link #MIN_WORDS} word tokens get a zero-score
     * result (all per-type values 0, no lemmas, multi-type flag false).
     * Sentences with more than {@link #MAX_WORDS} word tokens are analysed
     * normally, but a warning is written to standard error.
     *
     * @param sentence the sentence text; must not be {@code null}
     * @return the score for this sentence
     * @throws NullPointerException if {@code sentence} is {@code null}
     */
    public SentenceBiasScore analyseSentence(String sentence) {
        if (sentence == null) {
            throw new NullPointerException("sentence must not be null");
        }

        // --- Tokenise --------------------------------------------------
        List<String> words = tokenise(sentence);
        int totalWords = words.size();

        String[] biasTypes = SentenceBiasScore.BIAS_TYPES;
        Map<String, Integer> signalCount = new HashMap<>();
        Map<String, Integer> evaluatorCount = new HashMap<>();
        Map<String, Double> pairCoverage = new HashMap<>();
        for (String type : biasTypes) {
            signalCount.put(type, 0);
            evaluatorCount.put(type, 0);
            pairCoverage.put(type, 0.0);
        }
        List<String> matchedLemmas = new ArrayList<>();

        if (totalWords < MIN_WORDS) {
            // Return zero-score result for very short sentences
            return new SentenceBiasScore(sentence, totalWords,
                    pairCoverage, signalCount, evaluatorCount,
                    matchedLemmas, 0, 0, 0, false);
        }
        if (totalWords > MAX_WORDS) {
            System.err.printf(
                    "[BiasAnalyser] Warning: sentence has %d words (more than %d); scores may be inflated%n",
                    totalWords, MAX_WORDS);
        }

        // --- Match -----------------------------------------------------
        // matched is parallel to words; null means "no lexicon entry".
        List<BiasEntry> matched = matchEntries(words);

        // --- Collect matched positions ---------------------------------
        Set<String> seenLemmas = new HashSet<>();
        // signalPos[type] = token indices that are signals for that type
        Map<String, List<Integer>> signalPos = new HashMap<>();
        // evalPos[type] = token indices that are evaluators for that type
        Map<String, List<Integer>> evalPos = new HashMap<>();
        for (String type : biasTypes) {
            signalPos.put(type, new ArrayList<>());
            evalPos.put(type, new ArrayList<>());
        }

        int totalBiasWords = 0;
        int totalDerogatory = 0;
        int totalColloquial = 0;
        for (int ti = 0; ti < totalWords; ti++) {
            BiasEntry entry = matched.get(ti);
            if (entry == null) {
                continue;
            }
            // List each lemma only once, even if it (or an inflected form of
            // it) occurs several times in the sentence. The three totals
            // below are counted per token position, not per unique lemma.
            String lemma = entry.getWord();
            if (seenLemmas.add(lemma)) {
                matchedLemmas.add(lemma);
            }
            if (entry.isEvaluative()) {
                totalBiasWords++;
            }
            if (entry.isDerogatory()) {
                totalDerogatory++;
            }
            if (entry.isColloquial()) {
                totalColloquial++;
            }
            // Determine which types this entry applies to
            List<String> applicableTypes = entry.isTyped()
                    ? List.of(entry.getBiasType())
                    : Arrays.asList(biasTypes); // general entry → all types
            for (String type : applicableTypes) {
                if (entry.isSignal()) {
                    signalPos.get(type).add(ti);
                }
                if (entry.isEvaluativeModifier()) {
                    evalPos.get(type).add(ti);
                }
            }
        }

        // --- Pair detection & score computation -----------------------
        int typesWithPairs = 0;
        for (String type : biasTypes) {
            List<Integer> signals = signalPos.get(type);
            List<Integer> evaluators = evalPos.get(type);
            Set<Integer> paired = collectPairedPositions(signals, evaluators, matched);

            // Only evaluators that ended up in a pair are reported.
            int pairedEvaluators = 0;
            for (int eIdx : evaluators) {
                if (paired.contains(eIdx)) {
                    pairedEvaluators++;
                }
            }
            signalCount.put(type, signals.size());
            evaluatorCount.put(type, pairedEvaluators);
            pairCoverage.put(type, totalWords > 0
                    ? (double) paired.size() / totalWords
                    : 0.0);
            if (!paired.isEmpty()) {
                typesWithPairs++;
            }
        }

        // --- Multi-type flag ------------------------------------------
        boolean multiType = typesWithPairs >= 2;
        return new SentenceBiasScore(
                sentence, totalWords,
                pairCoverage, signalCount, evaluatorCount,
                matchedLemmas, totalBiasWords, totalDerogatory, totalColloquial,
                multiType);
    }

    /**
     * Analyses the {@code .txt} files listed by
     * {@code FileHandler.getFileListing(corpusDir)} and writes the biased
     * sentences to a UTF-8 TSV file.
     *
     * The result file is overwritten. It starts with
     * {@link SentenceBiasScore#tsvHeader()}, followed by one row per sentence
     * for which {@code isBiased()} is true, each prefixed with the source file
     * name and a tab. A file that cannot be read is reported on standard error
     * and skipped. Any other failure is reported on standard error and ends
     * the run; no exception is propagated to the caller.
     *
     * @param corpusDir  directory containing the corpus
     * @param resultPath path of the TSV file to (over)write
     */
    public void analyseDirectory(String corpusDir, String resultPath) {
        try {
            FileHandler fh = new FileHandler();
            try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(resultPath, false), StandardCharsets.UTF_8))) {
                bw.write(SentenceBiasScore.tsvHeader());
                bw.newLine();
                int filesProcessed = 0;
                int sentencesWritten = 0;
                for (File f : fh.getFileListing(new File(corpusDir))) {
                    if (!f.isFile() || !f.getName().endsWith(".txt")) {
                        continue;
                    }
                    System.out.println("[BiasAnalyser] Processing: " + f.getName());
                    String text;
                    try {
                        text = readText(f);
                    } catch (IOException e) {
                        // One unreadable file must not abort the whole corpus run.
                        System.err.println("[BiasAnalyser] Skipping unreadable file " + f + ": " + e);
                        continue;
                    }
                    for (SentenceBiasScore score : analyseText(text)) {
                        if (score.isBiased()) {
                            bw.write(f.getName() + "\t" + score.toTsv());
                            bw.newLine();
                            sentencesWritten++;
                        }
                    }
                    filesProcessed++;
                }
                System.out.printf("[BiasAnalyser] Done. Files: %d Biased sentences written: %d%n",
                        filesProcessed, sentencesWritten);
            }
        } catch (Exception e) {
            System.err.println("[BiasAnalyser] Analysis of '" + corpusDir + "' aborted: " + e);
            e.printStackTrace();
        }
    }

    // -----------------------------------------------------------------------
    // Helpers
    // -----------------------------------------------------------------------

    /**
     * Lower-cases the sentence, splits it into tokens and returns the cleaned,
     * non-empty word tokens in order.
     */
    private List<String> tokenise(String sentence) {
        List<String> words = new ArrayList<>();
        // Locale.ROOT keeps the casing independent of the JVM's default locale.
        for (String raw : TOKEN_SEPARATOR.split(sentence.toLowerCase(Locale.ROOT))) {
            String word = clean(raw);
            if (!word.isEmpty()) {
                words.add(word);
            }
        }
        return words;
    }

    /**
     * Looks the words up in the lexicon, longest n-gram first, and returns a
     * list parallel to {@code words} holding the matched entry per position
     * ({@code null} where nothing matched).
     */
    private List<BiasEntry> matchEntries(List<String> words) {
        List<BiasEntry> matched = new ArrayList<>(words.size());
        int i = 0;
        while (i < words.size()) {
            BiasEntry entry = null;
            int span = 1;
            // Multi-word entries take precedence: trigram first, then bigram.
            for (int n = Math.min(MAX_NGRAM, words.size() - i); n >= 2; n--) {
                BiasEntry candidate = lexicon.lookup(String.join(" ", words.subList(i, i + n)));
                if (candidate != null) {
                    entry = candidate;
                    span = n;
                    break;
                }
            }
            if (entry == null) {
                entry = lexicon.lookup(words.get(i));
            }
            // Every position covered by the match points to the same entry.
            for (int k = 0; k < span; k++) {
                matched.add(entry);
            }
            i += span;
        }
        return matched;
    }

    /**
     * Returns the token positions that take part in at least one pair: signals
     * that are themselves evaluative modifiers, plus every signal/evaluator
     * couple at distinct positions no more than {@link #PAIR_WINDOW} apart.
     */
    private static Set<Integer> collectPairedPositions(
            List<Integer> signals, List<Integer> evaluators, List<BiasEntry> matched) {
        Set<Integer> paired = new HashSet<>();
        for (int sIdx : signals) {
            // Self-pair: signal is itself evaluative
            BiasEntry sEntry = matched.get(sIdx);
            if (sEntry != null && sEntry.isEvaluativeModifier()) {
                paired.add(sIdx);
            }
            // Pair with a distinct evaluator within window
            for (int eIdx : evaluators) {
                if (eIdx != sIdx && Math.abs(sIdx - eIdx) <= PAIR_WINDOW) {
                    paired.add(sIdx);
                    paired.add(eIdx);
                }
            }
        }
        return paired;
    }

    /**
     * Reads a UTF-8 text file, joining its lines with single spaces (each line
     * is followed by one space).
     *
     * @throws IOException if the file cannot be opened or a read error occurs
     */
    private static String readText(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (Scanner sc = new Scanner(file, StandardCharsets.UTF_8)) {
            while (sc.hasNextLine()) {
                text.append(sc.nextLine()).append(' ');
            }
            // Scanner swallows read errors and just stops; surface them so a
            // truncated file is not analysed as if it were complete.
            IOException readError = sc.ioException();
            if (readError != null) {
                throw readError;
            }
        }
        return text.toString();
    }

    /** Removes every character that is neither a letter nor whitespace, then trims. */
    private String clean(String token) {
        return NON_LETTER.matcher(token).replaceAll("").trim();
    }
}