File size: 12,990 Bytes
18573e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
package bg.bas.dcl.LLMs;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

import bg.bas.dcl.general.FileHandler;

/**
 * BiasAnalyser
 *
 * Detects linguistic bias in Bulgarian text using the Bulgarian Bias Dictionary
 * (v4 TSV format).  Works at sentence level: for each sentence it returns a
 * {@link SentenceBiasScore} whose primary metric is the pair-coverage percentage —
 * the fraction of word tokens in the sentence that participate in at least one
 * signal–evaluator pair for each bias category.
 *
 * -----------------------------------------------------------------------
 * ALGORITHM (per sentence)
 *
 *   1. TOKENISE — lower-case, split on (Unicode) whitespace, strip non-letter
 *                 characters per token.
 *   2. MATCH    — look each token up in the {@link BiasLexicon}.  Multi-word
 *                 entries are tried first via a forward scan: trigrams, then
 *                 bigrams, then the single token.
 *   3. PAIR     — for every signal token, search within ±PAIR_WINDOW tokens for
 *                 an evaluator token of the same bias type (or a general one).
 *                 A signal that is itself an evaluative modifier pairs with itself.
 *   4. SCORE    — pairCoverage[type] = distinctPairTokens[type] / totalWords
 *                 where distinctPairTokens = set of positions involved in
 *                 at least one confirmed pair for that type.
 */
public class BiasAnalyser {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /**
     * Maximum token distance between a signal and an evaluator for them to
     * be counted as a pair.  10 matches the window used in the original
     * BiasDetector.
     */
    public static final int PAIR_WINDOW = 10;

    /**
     * Sentences with fewer words than this are skipped entirely (a zero-score
     * result is returned for them).
     */
    public static final int MIN_WORDS = 6;

    /**
     * Sentences with more words than this are still processed but a warning
     * is printed (very long sentences may inflate scores).
     */
    public static final int MAX_WORDS = 200;

    /** Longest multi-word lexicon entry (in tokens) that the matcher attempts. */
    private static final int MAX_NGRAM = 3;

    /**
     * Token separator.  Unicode-aware so that non-breaking and other non-ASCII
     * spaces separate words instead of being silently deleted by {@link #clean}.
     */
    private static final Pattern WHITESPACE =
            Pattern.compile("\\s+", Pattern.UNICODE_CHARACTER_CLASS);

    /** Everything that is neither a letter nor whitespace; removed from tokens. */
    private static final Pattern NON_LETTER = Pattern.compile("[^\\p{L}\\s]");

    // -----------------------------------------------------------------------
    // Dependencies
    // -----------------------------------------------------------------------

    private final BiasLexicon               lexicon;
    private final BulgarianSentenceSplitter splitter;

    // -----------------------------------------------------------------------
    // Constructor
    // -----------------------------------------------------------------------

    /**
     * @param lexicon  the loaded bias dictionary
     * @param splitter an initialised Bulgarian sentence splitter
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public BiasAnalyser(BiasLexicon lexicon, BulgarianSentenceSplitter splitter) {
        if (lexicon == null) {
            throw new IllegalArgumentException("lexicon must not be null");
        }
        if (splitter == null) {
            throw new IllegalArgumentException("splitter must not be null");
        }
        this.lexicon  = lexicon;
        this.splitter = splitter;
    }

    // -----------------------------------------------------------------------
    // Public API
    // -----------------------------------------------------------------------

    /**
     * Splits {@code text} into sentences and returns a bias score for each.
     *
     * @param text the text to analyse; {@code null} or blank yields an empty list
     * @return one score per sentence produced by the splitter, in splitter order
     */
    public List<SentenceBiasScore> analyseText(String text) {
        List<SentenceBiasScore> results = new ArrayList<>();
        if (text == null || text.isBlank()) {
            return results;
        }

        for (String sentence : splitter.split(text)) {
            results.add(analyseSentence(sentence));
        }
        return results;
    }

    /**
     * Analyses a single pre-split sentence.
     *
     * <p>Sentences with fewer than {@link #MIN_WORDS} word tokens get a zero-score
     * result.  Sentences with more than {@link #MAX_WORDS} word tokens are scored
     * normally but a warning is written to {@code System.err}.
     *
     * @param sentence the sentence text; must not be {@code null}
     * @return the per-type signal/evaluator counts and pair coverage for the sentence
     * @throws NullPointerException if {@code sentence} is {@code null}
     */
    public SentenceBiasScore analyseSentence(String sentence) {
        Objects.requireNonNull(sentence, "sentence must not be null");

        // --- Tokenise --------------------------------------------------
        // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
        String[] rawTokens = WHITESPACE.split(sentence.toLowerCase(Locale.ROOT));

        // Clean every token exactly once; entries may be empty (pure punctuation).
        String[] cleaned = new String[rawTokens.length];
        for (int t = 0; t < rawTokens.length; t++) {
            cleaned[t] = clean(rawTokens[t]);
        }

        // --- Match -----------------------------------------------------
        // Two parallel lists: the word tokens and, per position, the lexicon
        // entry covering that position (null = no match).
        List<String>    cleanTokens = new ArrayList<>();
        List<BiasEntry> matched     = new ArrayList<>();

        int i = 0;
        while (i < cleaned.length) {
            int consumed = 0;

            // Longest multi-word entry first: trigram, then bigram.
            for (int n = MAX_NGRAM; n >= 2 && consumed == 0; n--) {
                BiasEntry multi = lookupNgram(cleaned, i, n);
                if (multi != null) {
                    // One position per component word, all sharing the same entry.
                    for (int k = 0; k < n; k++) {
                        cleanTokens.add(cleaned[i + k]);
                        matched.add(multi);
                    }
                    consumed = n;
                }
            }

            // Unigram fallback; punctuation-only tokens are dropped, not counted.
            if (consumed == 0) {
                String tok = cleaned[i];
                if (!tok.isEmpty()) {
                    cleanTokens.add(tok);
                    matched.add(lexicon.lookup(tok));
                }
                consumed = 1;
            }
            i += consumed;
        }

        int totalWords = cleanTokens.size();

        String[] biasTypes = SentenceBiasScore.BIAS_TYPES;

        Map<String, Integer> signalCount    = new HashMap<>();
        Map<String, Integer> evaluatorCount = new HashMap<>();
        Map<String, Double>  pairCoverage   = new HashMap<>();

        for (String type : biasTypes) {
            signalCount.put(type, 0);
            evaluatorCount.put(type, 0);
            pairCoverage.put(type, 0.0);
        }

        List<String> matchedLemmas = new ArrayList<>();

        if (totalWords < MIN_WORDS) {
            // Return zero-score result for very short sentences
            return new SentenceBiasScore(sentence, totalWords,
                    pairCoverage, signalCount, evaluatorCount,
                    matchedLemmas, 0, 0, 0, false);
        }

        if (totalWords > MAX_WORDS) {
            System.err.printf(
                    "[BiasAnalyser] WARNING: sentence has %d words (more than %d); scores may be inflated%n",
                    totalWords, MAX_WORDS);
        }

        // --- Collect matched positions ---------------------------------
        int totalBiasWords  = 0;
        int totalDerogatory = 0;
        int totalColloquial = 0;

        Set<String> seenLemmas = new HashSet<>();

        // signalPos[type] = token indices that are signals for that type
        Map<String, List<Integer>> signalPos = new HashMap<>();
        // evalPos[type]   = token indices that are evaluators for that type
        Map<String, List<Integer>> evalPos   = new HashMap<>();

        for (String type : biasTypes) {
            signalPos.put(type, new ArrayList<>());
            evalPos.put(type,   new ArrayList<>());
        }

        for (int ti = 0; ti < totalWords; ti++) {
            BiasEntry entry = matched.get(ti);
            if (entry == null) {
                continue;
            }

            // List each lemma once, even if it occurs several times in the sentence.
            String lemma = entry.getWord();
            if (seenLemmas.add(lemma)) {
                matchedLemmas.add(lemma);
            }

            // These totals are per position, so a multi-word entry contributes
            // once for every token it spans.
            if (entry.isEvaluative()) {
                totalBiasWords++;
            }
            if (entry.isDerogatory()) {
                totalDerogatory++;
            }
            if (entry.isColloquial()) {
                totalColloquial++;
            }

            // Determine which types this entry applies to
            List<String> applicableTypes = entry.isTyped()
                    ? List.of(entry.getBiasType())
                    : Arrays.asList(biasTypes);    // general entry → all types

            for (String type : applicableTypes) {
                List<Integer> signalsForType = signalPos.get(type);
                if (signalsForType == null) {
                    // Type is not one of BIAS_TYPES, so there is no score slot for it.
                    continue;
                }
                if (entry.isSignal()) {
                    signalsForType.add(ti);
                }
                if (entry.isEvaluativeModifier()) {
                    evalPos.get(type).add(ti);
                }
            }
        }

        // --- Pair detection & score computation -----------------------
        int typesWithPairs = 0;

        for (String type : biasTypes) {
            List<Integer> signals    = signalPos.get(type);
            List<Integer> evaluators = evalPos.get(type);

            // Positions that take part in at least one pair for this type.
            Set<Integer> pairedPositions = new HashSet<>();

            for (int sIdx : signals) {
                // Self-pair: signal is itself evaluative.  Signal positions always
                // carry a non-null entry (see the collection loop above).
                if (matched.get(sIdx).isEvaluativeModifier()) {
                    pairedPositions.add(sIdx);
                }

                // Pair with a distinct evaluator within window
                for (int eIdx : evaluators) {
                    if (eIdx != sIdx && Math.abs(sIdx - eIdx) <= PAIR_WINDOW) {
                        pairedPositions.add(sIdx);
                        pairedPositions.add(eIdx);
                    }
                }
            }

            // Only evaluators that ended up in a pair are reported.
            int pairedEvaluators = 0;
            for (int eIdx : evaluators) {
                if (pairedPositions.contains(eIdx)) {
                    pairedEvaluators++;
                }
            }

            signalCount.put(type,    signals.size());
            evaluatorCount.put(type, pairedEvaluators);

            double coverage = totalWords > 0
                    ? (double) pairedPositions.size() / totalWords
                    : 0.0;
            pairCoverage.put(type, coverage);

            if (!pairedPositions.isEmpty()) {
                typesWithPairs++;
            }
        }

        // --- Multi-type flag ------------------------------------------
        boolean multiType = typesWithPairs >= 2;

        return new SentenceBiasScore(
                sentence, totalWords,
                pairCoverage, signalCount, evaluatorCount,
                matchedLemmas, totalBiasWords, totalDerogatory, totalColloquial,
                multiType);
    }

    /**
     * Analyses all {@code .txt} files returned by {@link FileHandler#getFileListing}
     * for {@code corpusDir} and writes one TSV row per biased sentence to
     * {@code resultPath} (UTF-8, overwriting any existing file).  Each row is the
     * source file name, a tab, and the score's own TSV representation.
     *
     * <p>This is a best-effort operation: any failure is reported on
     * {@code System.err} and the method returns normally.
     *
     * @param corpusDir  directory containing the corpus files
     * @param resultPath path of the TSV file to write
     */
    public void analyseDirectory(String corpusDir, String resultPath) {
        try {
            FileHandler fh = new FileHandler();

            try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(resultPath, false), StandardCharsets.UTF_8))) {

                bw.write(SentenceBiasScore.tsvHeader());
                bw.newLine();

                int filesProcessed = 0;
                int sentencesWritten = 0;

                for (File f : fh.getFileListing(new File(corpusDir))) {
                    if (!f.isFile() || !f.getName().endsWith(".txt")) {
                        continue;
                    }

                    System.out.println("[BiasAnalyser] Processing: " + f.getName());

                    // Join the lines with spaces so a sentence may span line breaks.
                    StringBuilder text = new StringBuilder();
                    try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
                        while (sc.hasNextLine()) {
                            text.append(sc.nextLine()).append(' ');
                        }
                    }

                    for (SentenceBiasScore score : analyseText(text.toString())) {
                        if (score.isBiased()) {
                            bw.write(f.getName() + "\t" + score.toTsv());
                            bw.newLine();
                            sentencesWritten++;
                        }
                    }
                    filesProcessed++;
                }

                System.out.printf("[BiasAnalyser] Done. Files: %d  Biased sentences written: %d%n",
                        filesProcessed, sentencesWritten);
            }

        } catch (Exception e) {
            // Deliberately best-effort: report with context, do not propagate.
            System.err.println("[BiasAnalyser] Failed while analysing '" + corpusDir
                    + "' into '" + resultPath + "': " + e);
            e.printStackTrace();
        }
    }

    // -----------------------------------------------------------------------
    // Helpers
    // -----------------------------------------------------------------------

    /**
     * Looks up the {@code n}-word expression starting at {@code start}.
     *
     * @param tokens cleaned tokens (entries may be empty)
     * @param start  index of the first component
     * @param n      number of components
     * @return the lexicon's result for the components joined by single spaces, or
     *         {@code null} when the window runs past the end of {@code tokens} or
     *         contains an empty token (which cannot be part of a real expression)
     */
    private BiasEntry lookupNgram(String[] tokens, int start, int n) {
        if (start + n > tokens.length) {
            return null;
        }
        for (int k = 0; k < n; k++) {
            if (tokens[start + k].isEmpty()) {
                return null;
            }
        }
        return lexicon.lookup(String.join(" ", Arrays.asList(tokens).subList(start, start + n)));
    }

    /**
     * Removes every character that is neither a letter nor whitespace and trims
     * the result.
     *
     * @param token a raw token
     * @return the cleaned token, possibly empty
     */
    private String clean(String token) {
        return NON_LETTER.matcher(token).replaceAll("").trim();
    }
}