| package bg.bas.dcl.LLMs; |
|
|
| import java.io.BufferedWriter; |
| import java.io.File; |
| import java.io.FileOutputStream; |
| import java.io.OutputStreamWriter; |
| import java.nio.charset.StandardCharsets; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Scanner; |
| import java.util.Set; |
|
|
| import bg.bas.dcl.general.FileHandler; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public class BiasAnalyser { |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| public static final int PAIR_WINDOW = 10; |
|
|
| |
| |
| |
| public static final int MIN_WORDS = 6; |
|
|
| |
| |
| |
| |
| public static final int MAX_WORDS = 200; |
|
|
| |
| |
| |
|
|
| private final BiasLexicon lexicon; |
| private final BulgarianSentenceSplitter splitter; |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| public BiasAnalyser(BiasLexicon lexicon, BulgarianSentenceSplitter splitter) { |
| if (lexicon == null) throw new IllegalArgumentException("lexicon must not be null"); |
| if (splitter == null) throw new IllegalArgumentException("splitter must not be null"); |
| this.lexicon = lexicon; |
| this.splitter = splitter; |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| public List<SentenceBiasScore> analyseText(String text) { |
| List<SentenceBiasScore> results = new ArrayList<>(); |
| if (text == null || text.isBlank()) return results; |
|
|
| for (String sentence : splitter.split(text)) { |
| results.add(analyseSentence(sentence)); |
| } |
| return results; |
| } |
|
|
| |
| |
| |
| |
| public SentenceBiasScore analyseSentence(String sentence) { |
| |
| String lower = sentence.toLowerCase(); |
| String[] rawTokens = lower.split("\\s+"); |
|
|
| |
| |
| List<String> cleanTokens = new ArrayList<>(); |
| List<BiasEntry> matched = new ArrayList<>(); |
|
|
| int i = 0; |
| while (i < rawTokens.length) { |
| |
| if (i + 2 < rawTokens.length) { |
| String tri = clean(rawTokens[i]) + " " |
| + clean(rawTokens[i + 1]) + " " |
| + clean(rawTokens[i + 2]); |
| BiasEntry e = lexicon.lookup(tri); |
| if (e != null) { |
| |
| for (int k = 0; k < 3; k++) { |
| cleanTokens.add(clean(rawTokens[i + k])); |
| matched.add(e); |
| } |
| i += 3; |
| continue; |
| } |
| } |
| |
| if (i + 1 < rawTokens.length) { |
| String bi = clean(rawTokens[i]) + " " + clean(rawTokens[i + 1]); |
| BiasEntry e = lexicon.lookup(bi); |
| if (e != null) { |
| for (int k = 0; k < 2; k++) { |
| cleanTokens.add(clean(rawTokens[i + k])); |
| matched.add(e); |
| } |
| i += 2; |
| continue; |
| } |
| } |
| |
| String tok = clean(rawTokens[i]); |
| if (!tok.isEmpty()) { |
| cleanTokens.add(tok); |
| matched.add(lexicon.lookup(tok)); |
| } |
| i++; |
| } |
|
|
| int totalWords = cleanTokens.size(); |
| |
| String[] biasTypes = SentenceBiasScore.BIAS_TYPES; |
|
|
| Map<String, Integer> signalCount = new HashMap<>(); |
| Map<String, Integer> evaluatorCount = new HashMap<>(); |
| Map<String, Double> pairCoverage = new HashMap<>(); |
|
|
| for (String type : biasTypes) { |
| signalCount.put(type, 0); |
| evaluatorCount.put(type, 0); |
| pairCoverage.put(type, 0.0); |
| } |
|
|
| List<String> matchedLemmas = new ArrayList<>(); |
| int totalBiasWords = 0; |
| int totalDerogatory = 0; |
| int totalColloquial = 0; |
|
|
| if (totalWords < MIN_WORDS) { |
| |
| return new SentenceBiasScore(sentence, totalWords, |
| pairCoverage, signalCount, evaluatorCount, |
| matchedLemmas, 0, 0, 0, false); |
| } |
|
|
| |
| Set<String> seenLemmas = new HashSet<>(); |
|
|
| |
| Map<String, List<Integer>> signalPos = new HashMap<>(); |
| |
| Map<String, List<Integer>> evalPos = new HashMap<>(); |
|
|
| for (String type : biasTypes) { |
| signalPos.put(type, new ArrayList<>()); |
| evalPos.put(type, new ArrayList<>()); |
| } |
|
|
| for (int ti = 0; ti < totalWords; ti++) { |
| BiasEntry entry = matched.get(ti); |
| if (entry == null) continue; |
|
|
| String lemma = entry.getWord(); |
|
|
| |
| |
| if (seenLemmas.add(lemma)) { |
| matchedLemmas.add(lemma); |
| } |
|
|
| if (entry.isEvaluative()) totalBiasWords++; |
| if (entry.isDerogatory()) totalDerogatory++; |
| if (entry.isColloquial()) totalColloquial++; |
|
|
| |
| List<String> applicableTypes = entry.isTyped() |
| ? List.of(entry.getBiasType()) |
| : Arrays.asList(biasTypes); |
|
|
| for (String type : applicableTypes) { |
| if (entry.isSignal()) { |
| signalPos.get(type).add(ti); |
| } |
| if (entry.isEvaluativeModifier()) { |
| evalPos.get(type).add(ti); |
| } |
| } |
| } |
|
|
| |
| Map<String, Set<Integer>> pairTokens = new HashMap<>(); |
| for (String type : biasTypes) pairTokens.put(type, new HashSet<>()); |
|
|
| for (String type : biasTypes) { |
| List<Integer> signals = signalPos.get(type); |
| List<Integer> evaluators = evalPos.get(type); |
|
|
| for (int sIdx : signals) { |
| boolean paired = false; |
|
|
| |
| BiasEntry sEntry = matched.get(sIdx); |
| if (sEntry != null && sEntry.isEvaluativeModifier()) { |
| pairTokens.get(type).add(sIdx); |
| paired = true; |
| } |
|
|
| |
| for (int eIdx : evaluators) { |
| if (eIdx == sIdx) continue; |
| if (Math.abs(sIdx - eIdx) <= PAIR_WINDOW) { |
| pairTokens.get(type).add(sIdx); |
| pairTokens.get(type).add(eIdx); |
| paired = true; |
| } |
| } |
| } |
|
|
| int sigCount = signals.size(); |
| int evalCount = (int) evaluators.stream() |
| .filter(eIdx -> pairTokens.get(type).contains(eIdx)) |
| .count(); |
|
|
| signalCount.put(type, sigCount); |
| evaluatorCount.put(type, evalCount); |
| |
| double coverage = totalWords > 0 |
| ? (double) pairTokens.get(type).size() / totalWords |
| : 0.0; |
| pairCoverage.put(type, coverage); |
| } |
|
|
| |
| int typesWithPairs = 0; |
| for (String type : biasTypes) |
| if (!pairTokens.get(type).isEmpty()) typesWithPairs++; |
| boolean multiType = typesWithPairs >= 2; |
|
|
| return new SentenceBiasScore( |
| sentence, totalWords, |
| pairCoverage, signalCount, evaluatorCount, |
| matchedLemmas, totalBiasWords, totalDerogatory, totalColloquial, |
| multiType); |
| } |
|
|
| |
|
|
| |
| |
| |
| public void analyseDirectory(String corpusDir, String resultPath) { |
| try { |
| FileHandler fh = new FileHandler(); |
|
|
| try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter( |
| new FileOutputStream(resultPath, false), StandardCharsets.UTF_8))) { |
|
|
| bw.write(SentenceBiasScore.tsvHeader()); |
| bw.newLine(); |
|
|
| int filesProcessed = 0; |
| int sentencesWritten = 0; |
|
|
| for (File f : fh.getFileListing(new File(corpusDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
|
|
| System.out.println("[BiasAnalyser] Processing: " + f.getName()); |
|
|
| StringBuilder text = new StringBuilder(); |
| try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) { |
| while (sc.hasNextLine()) { |
| text.append(sc.nextLine()).append(' '); |
| } |
| } |
|
|
| for (SentenceBiasScore score : analyseText(text.toString())) { |
| if (score.isBiased()) { |
| bw.write(f.getName() + "\t" + score.toTsv()); |
| bw.newLine(); |
| sentencesWritten++; |
| } |
| } |
| filesProcessed++; |
| } |
|
|
| System.out.printf("[BiasAnalyser] Done. Files: %d Biased sentences written: %d%n", |
| filesProcessed, sentencesWritten); |
| } |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| private String clean(String token) { |
| return token.replaceAll("[^\\p{L}\\s]", "").trim(); |
| } |
| } |
|
|