dcl-ibl-bas committed on
Commit
18573e4
·
verified ·
1 Parent(s): 4ae0bcb

Upload 22 files

Browse files
java/bg/bas/dcl/LLMs/BiasAnalyser.java ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs;
2
+
3
+ import java.io.BufferedWriter;
4
+ import java.io.File;
5
+ import java.io.FileOutputStream;
6
+ import java.io.OutputStreamWriter;
7
+ import java.nio.charset.StandardCharsets;
8
+ import java.util.ArrayList;
9
+ import java.util.Arrays;
10
+ import java.util.HashMap;
11
+ import java.util.HashSet;
12
+ import java.util.List;
13
+ import java.util.Map;
14
+ import java.util.Scanner;
15
+ import java.util.Set;
16
+
17
+ import bg.bas.dcl.general.FileHandler;
18
+
19
+ /**
20
+ * BiasAnalyser
21
+ *
22
+ * Detects linguistic bias in Bulgarian text using the Bulgarian Bias Dictionary
23
+ * (v4 TSV format). Works at sentence level: for each sentence it returns a
24
+ * {@link SentenceBiasScore} whose primary metric is the pair-coverage percentage —
25
+ * the fraction of word tokens in the sentence that participate in at least one
26
+ * signal–evaluator pair for each bias category.
27
+ *
28
+ * -----------------------------------------------------------------------
29
+ * ALGORITHM (per sentence)
30
+ *
31
+ * 1. TOKENISE — split on whitespace, strip non-letter characters per token.
32
+ * 2. MATCH — look each token up in the {@link BiasLexicon} (form index,
33
+ * case-insensitive). Multi-word entries are tried first via a
34
+ * forward-scan for bigrams and trigrams.
35
+ * 3. PAIR — for every signal token, search within ±PAIR_WINDOW tokens for
36
+ * an evaluator token of the same bias type (or a general one).
37
+ * Each unique (signal position, evaluator position) is a pair.
38
+ * 4. SCORE — pairCoverage[type] = distinctPairTokens[type] / totalWords
39
+ * where distinctPairTokens = set of positions involved in
40
+ * at least one confirmed pair for that type.
41
+ *
42
+
43
+ */
44
+ public class BiasAnalyser {
45
+
46
+ // -----------------------------------------------------------------------
47
+ // Constants
48
+ // -----------------------------------------------------------------------
49
+
50
+ /**
51
+ * Maximum token distance between a signal and an evaluator for them to
52
+ * be counted as a pair. 10 matches the window used in the original
53
+ * BiasDetector.
54
+ */
55
+ public static final int PAIR_WINDOW = 10;
56
+
57
+ /**
58
+ * Sentences with fewer words than this are skipped entirely.
59
+ */
60
+ public static final int MIN_WORDS = 6;
61
+
62
+ /**
63
+ * Sentences with more words than this are still processed but a warning
64
+ * is printed (very long sentences may inflate scores).
65
+ */
66
+ public static final int MAX_WORDS = 200;
67
+
68
+ // -----------------------------------------------------------------------
69
+ // Dependencies
70
+ // -----------------------------------------------------------------------
71
+
72
+ private final BiasLexicon lexicon;
73
+ private final BulgarianSentenceSplitter splitter;
74
+
75
+ // -----------------------------------------------------------------------
76
+ // Constructor
77
+ // -----------------------------------------------------------------------
78
+
79
+ /**
80
+ * @param lexicon the loaded bias dictionary
81
+ * @param splitter an initialised Bulgarian sentence splitter
82
+ */
83
+ public BiasAnalyser(BiasLexicon lexicon, BulgarianSentenceSplitter splitter) {
84
+ if (lexicon == null) throw new IllegalArgumentException("lexicon must not be null");
85
+ if (splitter == null) throw new IllegalArgumentException("splitter must not be null");
86
+ this.lexicon = lexicon;
87
+ this.splitter = splitter;
88
+ }
89
+
90
+ // -----------------------------------------------------------------------
91
+ // Public API
92
+ // -----------------------------------------------------------------------
93
+
94
+ /**
95
+ * Splits {@code text} into sentences and returns a bias score for each.
96
+ */
97
+ public List<SentenceBiasScore> analyseText(String text) {
98
+ List<SentenceBiasScore> results = new ArrayList<>();
99
+ if (text == null || text.isBlank()) return results;
100
+
101
+ for (String sentence : splitter.split(text)) {
102
+ results.add(analyseSentence(sentence));
103
+ }
104
+ return results;
105
+ }
106
+
107
+ /**
108
+ * Analyses a single pre-split sentence.
109
+ *
110
+ */
111
+ public SentenceBiasScore analyseSentence(String sentence) {
112
+ // --- Tokenise --------------------------------------------------
113
+ String lower = sentence.toLowerCase();
114
+ String[] rawTokens = lower.split("\\s+");
115
+
116
+ // Build clean token list and a parallel lookup list
117
+ // We attempt multi-word matches (bigrams, trigrams) first
118
+ List<String> cleanTokens = new ArrayList<>(); // word-only tokens
119
+ List<BiasEntry> matched = new ArrayList<>(); // parallel match (null=no match)
120
+
121
+ int i = 0;
122
+ while (i < rawTokens.length) {
123
+ // Try trigram (3-word multi-word entry)
124
+ if (i + 2 < rawTokens.length) {
125
+ String tri = clean(rawTokens[i]) + " "
126
+ + clean(rawTokens[i + 1]) + " "
127
+ + clean(rawTokens[i + 2]);
128
+ BiasEntry e = lexicon.lookup(tri);
129
+ if (e != null) {
130
+ // Represent as 3 tokens (positions), all pointing to same entry
131
+ for (int k = 0; k < 3; k++) {
132
+ cleanTokens.add(clean(rawTokens[i + k]));
133
+ matched.add(e);
134
+ }
135
+ i += 3;
136
+ continue;
137
+ }
138
+ }
139
+ // Try bigram
140
+ if (i + 1 < rawTokens.length) {
141
+ String bi = clean(rawTokens[i]) + " " + clean(rawTokens[i + 1]);
142
+ BiasEntry e = lexicon.lookup(bi);
143
+ if (e != null) {
144
+ for (int k = 0; k < 2; k++) {
145
+ cleanTokens.add(clean(rawTokens[i + k]));
146
+ matched.add(e);
147
+ }
148
+ i += 2;
149
+ continue;
150
+ }
151
+ }
152
+ // Unigram
153
+ String tok = clean(rawTokens[i]);
154
+ if (!tok.isEmpty()) {
155
+ cleanTokens.add(tok);
156
+ matched.add(lexicon.lookup(tok));
157
+ }
158
+ i++;
159
+ }
160
+
161
+ int totalWords = cleanTokens.size();
162
+
163
+ String[] biasTypes = SentenceBiasScore.BIAS_TYPES;
164
+
165
+ Map<String, Integer> signalCount = new HashMap<>();
166
+ Map<String, Integer> evaluatorCount = new HashMap<>();
167
+ Map<String, Double> pairCoverage = new HashMap<>();
168
+
169
+ for (String type : biasTypes) {
170
+ signalCount.put(type, 0);
171
+ evaluatorCount.put(type, 0);
172
+ pairCoverage.put(type, 0.0);
173
+ }
174
+
175
+ List<String> matchedLemmas = new ArrayList<>();
176
+ int totalBiasWords = 0;
177
+ int totalDerogatory = 0;
178
+ int totalColloquial = 0;
179
+
180
+ if (totalWords < MIN_WORDS) {
181
+ // Return zero-score result for very short sentences
182
+ return new SentenceBiasScore(sentence, totalWords,
183
+ pairCoverage, signalCount, evaluatorCount,
184
+ matchedLemmas, 0, 0, 0, false);
185
+ }
186
+
187
+ // --- Collect matched positions ---------------------------------
188
+ Set<String> seenLemmas = new HashSet<>();
189
+
190
+ // signalPositions[type] = list of token indices that are signals for that type
191
+ Map<String, List<Integer>> signalPos = new HashMap<>();
192
+ // evalPositions[type] = list of token indices that are evaluators for that type
193
+ Map<String, List<Integer>> evalPos = new HashMap<>();
194
+
195
+ for (String type : biasTypes) {
196
+ signalPos.put(type, new ArrayList<>());
197
+ evalPos.put(type, new ArrayList<>());
198
+ }
199
+
200
+ for (int ti = 0; ti < totalWords; ti++) {
201
+ BiasEntry entry = matched.get(ti);
202
+ if (entry == null) continue;
203
+
204
+ String lemma = entry.getWord();
205
+
206
+ // Count each unique lemma only once (avoid double-counting
207
+ // inflected-form repetitions of the same word in one sentence)
208
+ if (seenLemmas.add(lemma)) {
209
+ matchedLemmas.add(lemma);
210
+ }
211
+
212
+ if (entry.isEvaluative()) totalBiasWords++;
213
+ if (entry.isDerogatory()) totalDerogatory++;
214
+ if (entry.isColloquial()) totalColloquial++;
215
+
216
+ // Determine which types this entry applies to
217
+ List<String> applicableTypes = entry.isTyped()
218
+ ? List.of(entry.getBiasType())
219
+ : Arrays.asList(biasTypes); // general entry → all types
220
+
221
+ for (String type : applicableTypes) {
222
+ if (entry.isSignal()) {
223
+ signalPos.get(type).add(ti);
224
+ }
225
+ if (entry.isEvaluativeModifier()) {
226
+ evalPos.get(type).add(ti);
227
+ }
228
+ }
229
+ }
230
+
231
+ // --- Pair detection & score computation -----------------------
232
+ Map<String, Set<Integer>> pairTokens = new HashMap<>();
233
+ for (String type : biasTypes) pairTokens.put(type, new HashSet<>());
234
+
235
+ for (String type : biasTypes) {
236
+ List<Integer> signals = signalPos.get(type);
237
+ List<Integer> evaluators = evalPos.get(type);
238
+
239
+ for (int sIdx : signals) {
240
+ boolean paired = false;
241
+
242
+ // Self-pair: signal is itself evaluative
243
+ BiasEntry sEntry = matched.get(sIdx);
244
+ if (sEntry != null && sEntry.isEvaluativeModifier()) {
245
+ pairTokens.get(type).add(sIdx);
246
+ paired = true;
247
+ }
248
+
249
+ // Pair with a distinct evaluator within window
250
+ for (int eIdx : evaluators) {
251
+ if (eIdx == sIdx) continue;
252
+ if (Math.abs(sIdx - eIdx) <= PAIR_WINDOW) {
253
+ pairTokens.get(type).add(sIdx);
254
+ pairTokens.get(type).add(eIdx);
255
+ paired = true;
256
+ }
257
+ }
258
+ }
259
+
260
+ int sigCount = signals.size();
261
+ int evalCount = (int) evaluators.stream()
262
+ .filter(eIdx -> pairTokens.get(type).contains(eIdx))
263
+ .count();
264
+
265
+ signalCount.put(type, sigCount);
266
+ evaluatorCount.put(type, evalCount);
267
+
268
+ double coverage = totalWords > 0
269
+ ? (double) pairTokens.get(type).size() / totalWords
270
+ : 0.0;
271
+ pairCoverage.put(type, coverage);
272
+ }
273
+
274
+ // --- Multi-type flag ------------------------------------------
275
+ int typesWithPairs = 0;
276
+ for (String type : biasTypes)
277
+ if (!pairTokens.get(type).isEmpty()) typesWithPairs++;
278
+ boolean multiType = typesWithPairs >= 2;
279
+
280
+ return new SentenceBiasScore(
281
+ sentence, totalWords,
282
+ pairCoverage, signalCount, evaluatorCount,
283
+ matchedLemmas, totalBiasWords, totalDerogatory, totalColloquial,
284
+ multiType);
285
+ }
286
+
287
+
288
+
289
+ /**
290
+ * Analyses all .txt files
291
+ */
292
+ public void analyseDirectory(String corpusDir, String resultPath) {
293
+ try {
294
+ FileHandler fh = new FileHandler();
295
+
296
+ try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
297
+ new FileOutputStream(resultPath, false), StandardCharsets.UTF_8))) {
298
+
299
+ bw.write(SentenceBiasScore.tsvHeader());
300
+ bw.newLine();
301
+
302
+ int filesProcessed = 0;
303
+ int sentencesWritten = 0;
304
+
305
+ for (File f : fh.getFileListing(new File(corpusDir))) {
306
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
307
+
308
+ System.out.println("[BiasAnalyser] Processing: " + f.getName());
309
+
310
+ StringBuilder text = new StringBuilder();
311
+ try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
312
+ while (sc.hasNextLine()) {
313
+ text.append(sc.nextLine()).append(' ');
314
+ }
315
+ }
316
+
317
+ for (SentenceBiasScore score : analyseText(text.toString())) {
318
+ if (score.isBiased()) {
319
+ bw.write(f.getName() + "\t" + score.toTsv());
320
+ bw.newLine();
321
+ sentencesWritten++;
322
+ }
323
+ }
324
+ filesProcessed++;
325
+ }
326
+
327
+ System.out.printf("[BiasAnalyser] Done. Files: %d Biased sentences written: %d%n",
328
+ filesProcessed, sentencesWritten);
329
+ }
330
+
331
+ } catch (Exception e) {
332
+ e.printStackTrace();
333
+ }
334
+ }
335
+
336
+ // -----------------------------------------------------------------------
337
+ // Helper
338
+ // -----------------------------------------------------------------------
339
+
340
+
341
+ private String clean(String token) {
342
+ return token.replaceAll("[^\\p{L}\\s]", "").trim();
343
+ }
344
+ }
java/bg/bas/dcl/LLMs/BiasDetectorDemo.java ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs;
2
+
3
+ import java.util.List;
4
+
5
+ /**
6
+ * BiasDetectorDemo
7
+ *
8
+ *
9
+ * -----------------------------------------------------------------------
10
+ * MAVEN DEPENDENCIES (add to pom.xml):
11
+ *
12
+ * <!-- OpenNLP toolkit -->
13
+ * <dependency>
14
+ * <groupId>org.apache.opennlp</groupId>
15
+ * <artifactId>opennlp-tools</artifactId>
16
+ * <version>2.4.0</version>
17
+ * </dependency>
18
+ *
19
+ * <!-- Bulgarian sentence-detection model (UD 2.14, Apache 2.0) -->
20
+ * <dependency>
21
+ * <groupId>org.apache.opennlp</groupId>
22
+ * <artifactId>opennlp-models-sentdetect-bg</artifactId>
23
+ * <version>1.2</version>
24
+ * </dependency>
25
+ */
26
+ public class BiasDetectorDemo {
27
+
28
+ public static void main(String[] args) {
29
+
30
+ // ------------------------------------------------------------------
31
+ // 1. Load the Bulgarian sentence splitter
32
+ // (loads bundled model from the Maven JAR automatically)
33
+ // ------------------------------------------------------------------
34
+ BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
35
+
36
+ // Alternatively, supply an explicit model file path:
37
+ // BulgarianSentenceSplitter splitter =
38
+ // new BulgarianSentenceSplitter("/path/to/bg-sent.bin");
39
+
40
+
41
+ // ------------------------------------------------------------------
42
+ // 2. Load the bias lexicon
43
+ // ------------------------------------------------------------------
44
+ String dictPath = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
45
+ + "bulgarian_bias_dictionary_v4.tsv";
46
+
47
+ BiasLexicon lexicon = new BiasLexicon(dictPath);
48
+ System.out.printf("Lexicon loaded: %d entries%n%n", lexicon.size());
49
+
50
+
51
+ // ------------------------------------------------------------------
52
+ // 3. Build the analyser
53
+ // ------------------------------------------------------------------
54
+ BiasAnalyser analyser = new BiasAnalyser(lexicon, splitter);
55
+
56
+
57
+ // ------------------------------------------------------------------
58
+ // 4a. Analyse a block of text in memory
59
+ // ------------------------------------------------------------------
60
+ String sampleText =
61
+ "Слепите хора трудно могат да се справят сами в живота. " +
62
+ "Времето днес е слънчево и приятно.";
63
+
64
+ System.out.println("=== Sentence-level bias scores ===");
65
+ System.out.println(SentenceBiasScore.tsvHeader());
66
+ System.out.println();
67
+
68
+ List<SentenceBiasScore> scores = analyser.analyseText(sampleText);
69
+
70
+ for (SentenceBiasScore score : scores) {
71
+ System.out.println("Sentence : " + score.getSentence());
72
+ System.out.printf ("Words : %d%n", score.getTotalWords());
73
+ System.out.printf ("Biased : %b%n", score.isBiased());
74
+
75
+ double[] cov = score.coverageArray();
76
+ String[] types = SentenceBiasScore.BIAS_TYPES;
77
+ for (int i = 0; i < types.length; i++) {
78
+ if (cov[i] > 0)
79
+ System.out.printf(" %-18s %.2f%% pair coverage%n",
80
+ types[i] + ":", cov[i] * 100);
81
+ }
82
+ System.out.printf ("Total : %.2f%% overall coverage%n", score.totalCoverage() * 100);
83
+ System.out.println("Lemmas : " + score.getMatchedLemmas());
84
+ System.out.println();
85
+ }
86
+
87
+
88
+ // ------------------------------------------------------------------
89
+ // 4b. Analyse a corpus directory — writes a TSV results file
90
+ // (only biased sentences are written; zero-coverage sentences
91
+ // are filtered out automatically by analyseDirectory)
92
+ // ------------------------------------------------------------------
93
+ String corpusDir = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/WIKI/";
94
+ String resultTsv = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/bias_results.tsv";
95
+
96
+ // analyser.analyseDirectory(corpusDir, resultTsv);
97
+
98
+
99
+ // ------------------------------------------------------------------
100
+ // 4c. Sentence splitting only — using the splitter standalone
101
+ // ------------------------------------------------------------------
102
+ String text = "Това е първото изречение. Второто е по-дълго и сложно! " +
103
+ "А третото задава въпрос?";
104
+
105
+ String[] sentences = splitter.split(text);
106
+ System.out.println("=== Sentence splitting demo ===");
107
+ for (int i = 0; i < sentences.length; i++) {
108
+ System.out.printf(" [%d] %s%n", i + 1, sentences[i]);
109
+ }
110
+ }
111
+ }
java/bg/bas/dcl/LLMs/BiasEntry.java ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs;
2
+
3
+ import java.util.Collections;
4
+ import java.util.HashSet;
5
+ import java.util.Set;
6
+
7
/**
 * BiasEntry
 *
 * Immutable value object for one row of the bias dictionary.
 *
 * TSV column order (0-based, tab-separated):
 *   0  word           - canonical lemma
 *   1  POS            - part of speech (N, A, V, ...)
 *   2  signal         - "true" / "false" : marks identity-group signals
 *   3  biasType       - gender | race_ethnicity | religion | disability | appearance | "" (general)
 *   4  biasValue      - positive | negative | neutral | ""
 *   5  derogatory     - "true" / "false"
 *   6  colloquial     - "true" / "false"
 *   7  forms          - "true" / "false" (unused flag; inflected forms are in col 10)
 *   8  positivity     - double in [0,1]
 *   9  negativity     - double in [0,1]
 *   10 inflectedForms - pipe-separated list of surface forms, or empty
 */
public class BiasEntry {

    /** Polarity label that carries no evaluative information. */
    private static final String NEUTRAL = "neutral";

    /** A positivity/negativity score above this makes the entry an evaluative modifier. */
    private static final double STRONG_SENTIMENT_THRESHOLD = 0.5;

    // -----------------------------------------------------------------------
    // Fields
    // -----------------------------------------------------------------------

    private final String word;
    private final String pos;
    private final boolean signal;
    private final String biasType;   // "" means general / not type-specific
    private final String biasValue;  // "" means unscored
    private final boolean derogatory;
    private final boolean colloquial;
    private final double positivity;
    private final double negativity;

    /** Surface forms exactly as supplied by the caller (a private, unmodifiable copy). */
    private final Set<String> forms;

    // -----------------------------------------------------------------------
    // Constructor
    // -----------------------------------------------------------------------

    /**
     * Creates an entry. String arguments are trimmed; {@code null} strings
     * become the empty string and a {@code null} form set becomes an empty set.
     *
     * @param word       canonical lemma
     * @param pos        part-of-speech tag
     * @param signal     whether the word is an identity-group signal
     * @param biasType   bias category, or empty/null for a general entry
     * @param biasValue  polarity label, or empty/null when unscored
     * @param derogatory whether the word is marked derogatory
     * @param colloquial whether the word is marked colloquial
     * @param positivity positivity score
     * @param negativity negativity score
     * @param forms      surface forms; copied, so later changes to the
     *                   argument do not affect this entry
     */
    public BiasEntry(String word, String pos,
                     boolean signal, String biasType, String biasValue,
                     boolean derogatory, boolean colloquial,
                     double positivity, double negativity,
                     Set<String> forms) {
        this.word = word == null ? "" : word.trim();
        this.pos = pos == null ? "" : pos.trim();
        this.signal = signal;
        this.biasType = biasType == null ? "" : biasType.trim();
        this.biasValue = biasValue == null ? "" : biasValue.trim();
        this.derogatory = derogatory;
        this.colloquial = colloquial;
        this.positivity = positivity;
        this.negativity = negativity;

        // Defensive copy: the entry must not change when the caller's set does.
        Set<String> copy = new HashSet<>();
        if (forms != null) {
            copy.addAll(forms);
        }
        this.forms = Collections.unmodifiableSet(copy);
    }

    // -----------------------------------------------------------------------
    // Accessors
    // -----------------------------------------------------------------------

    /** Canonical lemma as it appears in the dictionary (trimmed). */
    public String getWord() { return word; }

    /** Part-of-speech tag (N, A, V, ...). */
    public String getPos() { return pos; }

    /**
     * True if this entry marks an identity-group signal word -
     * i.e. a term that identifies a person by a protected attribute
     * (for example the Bulgarian words for "woman" or "Muslim").
     */
    public boolean isSignal() { return signal; }

    /**
     * Bias category, or empty string if applicable to all categories.
     * Values: "gender", "race_ethnicity", "religion", "disability", "appearance".
     */
    public String getBiasType() { return biasType; }

    /**
     * Evaluative polarity of the word in a bias context.
     * Values: "positive", "negative", "neutral", or "" (unscored).
     */
    public String getBiasValue() { return biasValue; }

    /** True if the word is explicitly marked as derogatory / pejorative. */
    public boolean isDerogatory() { return derogatory; }

    /** True if the word is marked as colloquial / informal. */
    public boolean isColloquial() { return colloquial; }

    /** Positivity score; the dictionary format specifies a value in [0, 1]. */
    public double getPositivity() { return positivity; }

    /** Negativity score; the dictionary format specifies a value in [0, 1]. */
    public double getNegativity() { return negativity; }

    /**
     * Unmodifiable set of the surface forms given at construction time.
     * No case normalisation is applied here; the loader is expected to
     * supply the forms already lower-cased.
     */
    public Set<String> getForms() { return forms; }

    // -----------------------------------------------------------------------
    // Convenience predicates
    // -----------------------------------------------------------------------

    /**
     * True if this entry carries a polarity: {@code biasValue} is non-empty
     * and is not "neutral" (compared case-insensitively).
     */
    public boolean isEvaluative() {
        return !biasValue.isEmpty() && !NEUTRAL.equalsIgnoreCase(biasValue);
    }

    /** True if biasType is non-empty (i.e. assigned to a specific category). */
    public boolean isTyped() {
        return !biasType.isEmpty();
    }

    /**
     * True if this entry can act as an evaluative modifier in a bias pair -
     * i.e. it has a non-neutral polarity, it is derogatory or colloquial, or
     * its positivity or negativity score is greater than 0.5.
     */
    public boolean isEvaluativeModifier() {
        return isEvaluative() || derogatory || colloquial
                || positivity > STRONG_SENTIMENT_THRESHOLD
                || negativity > STRONG_SENTIMENT_THRESHOLD;
    }

    // -----------------------------------------------------------------------
    // Object overrides
    // -----------------------------------------------------------------------

    /** Compact debug representation; prints the number of forms, not the forms. */
    @Override
    public String toString() {
        return String.format("BiasEntry{word='%s', signal=%b, type='%s', value='%s', "
                + "pos+neg=[%.2f,%.2f], derog=%b, coll=%b, forms=%d}",
                word, signal, biasType, biasValue,
                positivity, negativity, derogatory, colloquial, forms.size());
    }
}
java/bg/bas/dcl/LLMs/BiasLexicon.java ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs;
2
+
3
+ import java.io.BufferedReader;
4
+ import java.io.FileInputStream;
5
+ import java.io.InputStreamReader;
6
+ import java.nio.charset.StandardCharsets;
7
+ import java.util.ArrayList;
8
+ import java.util.Arrays;
9
+ import java.util.Collection;
10
+ import java.util.Collections;
11
+ import java.util.HashMap;
12
+ import java.util.HashSet;
13
+ import java.util.List;
14
+ import java.util.Map;
15
+ import java.util.Set;
16
+
17
+ /**
18
+ * BiasLexicon
19
+ *
20
+ * Loads the Bulgarian bias dictionary (bulgarian_bias_dictionary_v4.tsv) and
21
+ * provides fast O(1) form-level lookup for use by the bias detector.
22
+ *
23
+ * -----------------------------------------------------------------------
24
+ * TSV FORMAT (tab-separated, first row is header):
25
+ *
26
+ * Col 0 word canonical lemma
27
+ * Col 1 POS N | A | V | …
28
+ * Col 2 signal true | false
29
+ * Col 3 biasType gender | race_ethnicity | religion | disability | appearance | ""
30
+ * Col 4 biasValue positive | negative | neutral | ""
31
+ * Col 5 derogatory true | false
32
+ * Col 6 colloquial true | false
33
+ * Col 7 forms (boolean flag — ignored; inflected forms in col 10)
34
+ * Col 8 positivity double [0,1]
35
+ * Col 9 negativity double [0,1]
36
+ * Col 10 inflectedForms pipe-separated surface forms, or empty
37
+ *
38
+ *
39
+ */
40
+ public class BiasLexicon {
41
+
42
+ // -----------------------------------------------------------------------
43
+ // Indexes
44
+ // -----------------------------------------------------------------------
45
+
46
+ /**
47
+ * Primary form index: lowercased surface form → BiasEntry.
48
+ * A single form can only map to one entry (first one wins if there are
49
+ * duplicates — extremely rare in the dictionary).
50
+ */
51
+ private final Map<String, BiasEntry> formIndex = new HashMap<>();
52
+
53
+ /**
54
+ * Canonical word index: lowercased lemma → BiasEntry.
55
+ * Useful when you already have the base form.
56
+ */
57
+ private final Map<String, BiasEntry> wordIndex = new HashMap<>();
58
+
59
+ /** All entries in load order. */
60
+ private final List<BiasEntry> entries = new ArrayList<>();
61
+
62
+ // -----------------------------------------------------------------------
63
+ // Loading statistics
64
+ // -----------------------------------------------------------------------
65
+
66
+ private int loadedEntries = 0;
67
+ private int skippedLines = 0;
68
+ private int formConflicts = 0;
69
+
70
+ // -----------------------------------------------------------------------
71
+ // Constructor
72
+ // -----------------------------------------------------------------------
73
+
74
+ /**
75
+ * Loads the bias dictionary from a TSV file.
76
+ *
77
+ * @param tsvPath absolute path to the TSV file
78
+ * @throws RuntimeException if the file cannot be read
79
+ */
80
+ public BiasLexicon(String tsvPath) {
81
+ load(tsvPath);
82
+ System.out.printf("[BiasLexicon] Loaded %d entries, %d form keys, "
83
+ + "%d skipped lines, %d form conflicts.%n",
84
+ loadedEntries, formIndex.size(), skippedLines, formConflicts);
85
+ }
86
+
87
+ // -----------------------------------------------------------------------
88
+ // Lookup API
89
+ // -----------------------------------------------------------------------
90
+
91
+ /**
92
+ * Looks up a surface token (case-insensitive) and returns the
93
+ * matching {@link BiasEntry}, or {@code null} if not found.
94
+ *
95
+ * @param token any surface form (inflected or base)
96
+ */
97
+ public BiasEntry lookup(String token) {
98
+ if (token == null || token.isBlank()) return null;
99
+ return formIndex.get(token.toLowerCase().trim());
100
+ }
101
+
102
+ /**
103
+ * Returns true if the token (any form) is present in the lexicon.
104
+ *
105
+ * @param token surface form to check
106
+ */
107
+ public boolean contains(String token) {
108
+ return lookup(token) != null;
109
+ }
110
+
111
+ /**
112
+ * Looks up a canonical lemma directly.
113
+ *
114
+ * @param lemma the base/dictionary form
115
+ */
116
+ public BiasEntry lookupLemma(String lemma) {
117
+ if (lemma == null || lemma.isBlank()) return null;
118
+ return wordIndex.get(lemma.toLowerCase().trim());
119
+ }
120
+
121
+ // -----------------------------------------------------------------------
122
+ // Filtered views
123
+ // -----------------------------------------------------------------------
124
+
125
+ /**
126
+ * Returns all entries whose {@code biasType} matches the given category
127
+ * (case-insensitive), plus all general entries (empty biasType).
128
+ *
129
+ * @param biasType e.g. "gender", "disability"
130
+ */
131
+ public List<BiasEntry> getByType(String biasType) {
132
+ List<BiasEntry> result = new ArrayList<>();
133
+ String target = biasType == null ? "" : biasType.toLowerCase().trim();
134
+ for (BiasEntry e : entries)
135
+ if (e.getBiasType().equalsIgnoreCase(target) || e.getBiasType().isEmpty())
136
+ result.add(e);
137
+ return result;
138
+ }
139
+
140
+ /**
141
+ * Returns all entries that are marked as signals (signal=true) for
142
+ * the given bias category, or all signal entries if biasType is null/empty.
143
+ */
144
+ public List<BiasEntry> getSignals(String biasType) {
145
+ List<BiasEntry> result = new ArrayList<>();
146
+ for (BiasEntry e : entries) {
147
+ if (!e.isSignal()) continue;
148
+ if (biasType == null || biasType.isBlank()
149
+ || e.getBiasType().isEmpty()
150
+ || e.getBiasType().equalsIgnoreCase(biasType))
151
+ result.add(e);
152
+ }
153
+ return result;
154
+ }
155
+
156
+ /** Returns an unmodifiable view of all loaded entries. */
157
+ public Collection<BiasEntry> getAll() {
158
+ return Collections.unmodifiableList(entries);
159
+ }
160
+
161
+ /** Number of loaded dictionary entries. */
162
+ public int size() { return entries.size(); }
163
+
164
+ // -----------------------------------------------------------------------
165
+ // Internal loading
166
+ // -----------------------------------------------------------------------
167
+
168
+ private void load(String tsvPath) {
169
+ try (BufferedReader br = new BufferedReader(
170
+ new InputStreamReader(new FileInputStream(tsvPath),
171
+ StandardCharsets.UTF_8))) {
172
+
173
+ String headerLine = br.readLine(); // skip header
174
+ if (headerLine == null) {
175
+ System.err.println("[BiasLexicon] Empty file: " + tsvPath);
176
+ return;
177
+ }
178
+
179
+ String line;
180
+ int lineNum = 1; // already read header as line 1
181
+
182
+ while ((line = br.readLine()) != null) {
183
+ lineNum++;
184
+ if (line.isBlank()) { skippedLines++; continue; }
185
+
186
+ String[] cols = line.split("\t", -1);
187
+
188
+ // Minimum viable: need at least 10 columns
189
+ if (cols.length < 10) {
190
+ System.err.printf("[BiasLexicon] Line %d: only %d columns, skipping.%n",
191
+ lineNum, cols.length);
192
+ skippedLines++;
193
+ continue;
194
+ }
195
+
196
+ try {
197
+ String word = cols[0].trim();
198
+ String pos = cols[1].trim();
199
+ boolean signal = "true".equalsIgnoreCase(cols[2].trim());
200
+ String biasType = cols[3].trim();
201
+ String biasValue = cols[4].trim();
202
+ boolean derog = "true".equalsIgnoreCase(cols[5].trim());
203
+ boolean coll = "true".equalsIgnoreCase(cols[6].trim());
204
+ // cols[7] is a boolean forms-flag (ignored)
205
+ double positivity = parseDouble(cols[8], lineNum);
206
+ double negativity = parseDouble(cols[9], lineNum);
207
+
208
+ // Inflected forms: pipe-separated in col 10 (if present)
209
+ Set<String> formsSet = new HashSet<>();
210
+ formsSet.add(word.toLowerCase()); // always include the lemma
211
+
212
+ if (cols.length > 10 && !cols[10].isBlank()) {
213
+ for (String f : cols[10].split("\\|")) {
214
+ String fc = f.trim().toLowerCase();
215
+ if (!fc.isEmpty()) formsSet.add(fc);
216
+ }
217
+ }
218
+
219
+ BiasEntry entry = new BiasEntry(word, pos, signal,
220
+ biasType, biasValue, derog, coll,
221
+ positivity, negativity, formsSet);
222
+
223
+ entries.add(entry);
224
+ wordIndex.put(word.toLowerCase(), entry);
225
+
226
+ for (String form : formsSet) {
227
+ if (formIndex.containsKey(form)) {
228
+ formConflicts++;
229
+ // Keep first entry — do not overwrite
230
+ } else {
231
+ formIndex.put(form, entry);
232
+ }
233
+ }
234
+
235
+ loadedEntries++;
236
+
237
+ } catch (Exception e) {
238
+ System.err.printf("[BiasLexicon] Line %d: parse error — %s%n",
239
+ lineNum, e.getMessage());
240
+ skippedLines++;
241
+ }
242
+ }
243
+
244
+ } catch (Exception e) {
245
+ throw new RuntimeException("Failed to load bias lexicon from: " + tsvPath, e);
246
+ }
247
+ }
248
+
249
+ private double parseDouble(String s, int lineNum) {
250
+ try {
251
+ return Double.parseDouble(s.trim());
252
+ } catch (NumberFormatException e) {
253
+ System.err.printf("[BiasLexicon] Line %d: cannot parse double '%s', using 0.0%n",
254
+ lineNum, s);
255
+ return 0.0;
256
+ }
257
+ }
258
+ }
java/bg/bas/dcl/LLMs/BulgarianSentenceSplitter.java ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs;
2
+
3
+ import java.io.File;
4
+ import java.io.FileInputStream;
5
+ import java.io.InputStream;
6
+ import java.util.ArrayList;
7
+ import java.util.Arrays;
8
+ import java.util.List;
9
+
10
+ import opennlp.tools.sentdetect.SentenceDetectorME;
11
+ import opennlp.tools.sentdetect.SentenceModel;
12
+
13
+ /**
14
+ * BulgarianSentenceSplitter
15
+ *
16
+ * Wraps the Apache OpenNLP sentence detection model for Bulgarian, providing
17
+ * a clean, reusable API for all other pipeline components.
18
+ *
19
+ * -----------------------------------------------------------------------
20
+ * MAVEN DEPENDENCIES (add to pom.xml):
21
+ *
22
+ * <!-- OpenNLP toolkit -->
23
+ * <dependency>
24
+ * <groupId>org.apache.opennlp</groupId>
25
+ * <artifactId>opennlp-tools</artifactId>
26
+ * <version>2.4.0</version>
27
+ * </dependency>
28
+ *
29
+ * <!-- Bulgarian sentence-detection model (UD-based, Apache 2.0) -->
30
+ * <dependency>
31
+ * <groupId>org.apache.opennlp</groupId>
32
+ * <artifactId>opennlp-models-sentdetect-bg</artifactId>
33
+ * <version>1.2</version>
34
+ * </dependency>
35
+ *
36
+ * The model JAR bundles the binary model at:
37
+ * opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin
38
+ * You can also supply an external model file via the two-argument constructor.
39
+ *
40
+ * -------------------------------------------------
41
+ */
42
+ public class BulgarianSentenceSplitter {
43
+
44
+ // -----------------------------------------------------------------------
45
+ // Constants
46
+ // -----------------------------------------------------------------------
47
+
48
+ /**
49
+ * Classpath location of the bundled Bulgarian sentence-detection model.
50
+ * Matches the path inside the opennlp-models-sentdetect-bg JAR.
51
+ */
52
+ private static final String BUNDLED_MODEL_PATH =
53
+ "opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin";
54
+
55
+ /**
56
+ * Minimum character length for a string to be considered a valid sentence.
57
+ * Shorter strings are returned as-is without splitting.
58
+ */
59
+ private static final int MIN_TEXT_LENGTH = 5;
60
+
61
+ // -----------------------------------------------------------------------
62
+ // State
63
+ // -----------------------------------------------------------------------
64
+
65
+ private final SentenceDetectorME detector;
66
+
67
+ // -----------------------------------------------------------------------
68
+ // Constructors
69
+ // -----------------------------------------------------------------------
70
+
71
+ /**
72
+ * Loads the Bulgarian sentence-detection model from the bundled Maven JAR.
73
+ * Requires the opennlp-models-sentdetect-bg artifact on the classpath.
74
+ *
75
+ * @throws RuntimeException if the model cannot be loaded
76
+ */
77
+ public BulgarianSentenceSplitter() {
78
+ this(null);
79
+ }
80
+
81
+ /**
82
+ * Loads the Bulgarian sentence-detection model.
83
+ *
84
+ * @param modelPath absolute path to a .bin OpenNLP sentence-detection model,
85
+ * or {@code null} / empty string to load from the classpath JAR
86
+ * @throws RuntimeException if the model cannot be loaded
87
+ */
88
+ public BulgarianSentenceSplitter(String modelPath) {
89
+ try {
90
+ InputStream stream;
91
+
92
+ if (modelPath == null || modelPath.isBlank()) {
93
+ // Load from the bundled JAR on the classpath
94
+ stream = getClass().getClassLoader()
95
+ .getResourceAsStream(BUNDLED_MODEL_PATH);
96
+ if (stream == null) {
97
+ throw new IllegalStateException(
98
+ "Bulgarian sentence model not found .");
99
+ }
100
+ System.out.println("[SentenceSplitter] Loaded bundled model: " + BUNDLED_MODEL_PATH);
101
+ } else {
102
+ File f = new File(modelPath);
103
+ if (!f.exists())
104
+ throw new IllegalArgumentException(
105
+ "Sentence model file not found: " + modelPath);
106
+ stream = new FileInputStream(f);
107
+ System.out.println("[SentenceSplitter] Loaded external model: " + modelPath);
108
+ }
109
+
110
+ SentenceModel model = new SentenceModel(stream);
111
+ stream.close();
112
+ detector = new SentenceDetectorME(model);
113
+
114
+ } catch (Exception e) {
115
+ throw new RuntimeException("Failed to load Bulgarian sentence model", e);
116
+ }
117
+ }
118
+
119
+ // -----------------------------------------------------------------------
120
+ // Core API
121
+ // -----------------------------------------------------------------------
122
+
123
+
124
+ public String[] split(String text) {
125
+ if (text == null) return new String[0];
126
+ String trimmed = text.trim();
127
+ if (trimmed.length() < MIN_TEXT_LENGTH) {
128
+ return trimmed.isEmpty() ? new String[0] : new String[]{trimmed};
129
+ }
130
+ return detector.sentDetect(trimmed);
131
+ }
132
+
133
+
134
+ public List<String> splitToList(String text) {
135
+ return new ArrayList<>(Arrays.asList(split(text)));
136
+ }
137
+
138
+
139
+ public List<String> splitParagraphs(String[] paragraphs) {
140
+ List<String> all = new ArrayList<>();
141
+ if (paragraphs == null) return all;
142
+ for (String para : paragraphs) {
143
+ if (para != null && !para.isBlank())
144
+ all.addAll(splitToList(para));
145
+ }
146
+ return all;
147
+ }
148
+
149
+
150
+ public double[] getSentenceProbabilities() {
151
+ return detector.getSentenceProbabilities();
152
+ }
153
+
154
+
155
+ public List<String> splitAndFilter(String text, int minWords) {
156
+ List<String> result = new ArrayList<>();
157
+ for (String sent : split(text)) {
158
+ if (sent.split("\\s+").length >= minWords)
159
+ result.add(sent);
160
+ }
161
+ return result;
162
+ }
163
+ }
java/bg/bas/dcl/LLMs/DeduplicationProcessor.java ADDED
@@ -0,0 +1,571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.File;
4
+ import java.io.FileOutputStream;
5
+ import java.io.OutputStreamWriter;
6
+ import java.io.PrintWriter;
7
+ import java.io.Writer;
8
+ import java.nio.file.Files;
9
+ import java.nio.file.StandardCopyOption;
10
+ import java.util.ArrayList;
11
+ import java.util.Collections;
12
+ import java.util.HashMap;
13
+ import java.util.HashSet;
14
+ import java.util.LinkedHashMap;
15
+ import java.util.List;
16
+ import java.util.Map;
17
+ import java.util.Scanner;
18
+ import java.util.Set;
19
+ import java.util.TreeSet;
20
+
21
+ import info.debatty.java.lsh.MinHash;
22
+
23
+ import bg.bas.dcl.general.FileHandler;
24
+
25
+ /**
26
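+ * DeduplicationProcessor — sentence-level near-duplicate detection
27
+ * using MinHash signatures (estimated Jaccard similarity). Each new sentence is compared against every indexed sentence; no LSH band index is used yet.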
28
+ *
29
+ * -----------------------------------------------------------------------
30
+ * MAVEN DEPENDENCY (add to pom.xml):
31
+ *
32
+ * <dependency>
33
+ * <groupId>info.debatty</groupId>
34
+ * <artifactId>java-lsh</artifactId>
35
+ * <version>0.12</version>
36
+ * </dependency>
37
+ *
38
+ * -----------------------------------------------------------------------
39
+ * HOW IT WORKS
40
+ *
41
+ * 1. INDEX phase — reads all .txt files in the "full corpus" directory.
42
+ * Each sentence is shingled into character n-grams, converted to a
43
+ * boolean vector over a shared vocabulary, and a MinHash signature
44
+ * is computed. All signatures are stored in an in-memory index keyed
45
+ * by (file, lineNumber).
46
+ *
47
+ * 2. QUERY phase — reads every sentence in the "new folder".
48
+ * For each sentence its MinHash signature is compared against every
49
+ * indexed corpus signature (approximate Jaccard via signature similarity).
50
+ * Pairs whose estimated Jaccard similarity ≥ threshold are reported.
51
+ *
52
+ * 3. REPORT — a TSV report is written listing every duplicate pair:
53
+ * new-file | new-line | corpus-file | corpus-line | similarity | new-sentence | corpus-sentence
54
+ *
55
+ * 4. OPTIONAL REMOVE — sentences in the new folder that are duplicates of
56
+ * corpus sentences are stripped from their file (originals backed up).
57
+ * Files that become empty after removal are deleted.
58
+ *
59
+ * -----------------------------------------------------------------------
60
+ * PARAMETERS
61
+ *
62
+ * threshold — Jaccard similarity to call a near-duplicate (default 0.90)
63
+ * shingleSize — character n-gram size for shingling (default 5)
64
+ * numHashes — number of hash functions for MinHash (default 200)
65
+ * More hashes → better accuracy, slower index.
66
+ *
67
+ * -----------------------------------------------------------------------
68
+ * USAGE
69
+ *
70
+ * DeduplicationProcessor dp = new DeduplicationProcessor(0.90);
71
+ * dp.indexCorpus("/path/to/full/corpus/");
72
+ * dp.detectDuplicates("/path/to/new/folder/", "/path/to/report.tsv");
73
+ * dp.removeDuplicatesFromNewFolder("/path/to/new/folder/", true); // true=keep .bak
74
+ */
75
+ public class DeduplicationProcessor {
76
+
77
+ // -----------------------------------------------------------------------
78
+ // Configuration
79
+ // -----------------------------------------------------------------------
80
+
81
+ private final double threshold; // Jaccard similarity cut-off
82
+ private final int shingleSize; // character n-gram size
83
+ private final int numHashes; // MinHash signature length
84
+
85
+ // -----------------------------------------------------------------------
86
+ // Index state (built during indexCorpus)
87
+ // -----------------------------------------------------------------------
88
+
89
+ /** Shared vocabulary: every distinct shingle seen across all corpus sentences. */
90
+ private final Set<String> vocabulary = new HashSet<>();
91
+
92
+ /**
93
+ * Corpus index: maps SentenceKey → raw sentence text + MinHash signature.
94
+ * Built in two passes to allow vocabulary to be finalised before signing.
95
+ */
96
+ private final Map<SentenceKey, IndexedSentence> corpusIndex = new LinkedHashMap<>();
97
+
98
+ /** MinHash object — initialised once vocabulary size is known. */
99
+ private MinHash minHash;
100
+
101
+ // -----------------------------------------------------------------------
102
+ // Duplicate results (populated by detectDuplicates)
103
+ // -----------------------------------------------------------------------
104
+
105
+ /** All duplicate pairs found in the last detectDuplicates run. */
106
+ private final List<DuplicatePair> duplicatePairs = new ArrayList<>();
107
+
108
+ /**
109
+ * Set of SentenceKeys in the NEW folder that are duplicates.
110
+ * Used by removeDuplicatesFromNewFolder.
111
+ */
112
+ private final Set<SentenceKey> duplicateNewSentences = new HashSet<>();
113
+
114
+ // -----------------------------------------------------------------------
115
+ // Constructor
116
+ // -----------------------------------------------------------------------
117
+
118
/**
 * Creates a processor with shingle size 5 and 200 hash functions.
 *
 * @param threshold similarity cut-off in [0, 1]
 * @throws IllegalArgumentException if {@code threshold} is NaN or outside [0, 1]
 */
public DeduplicationProcessor(double threshold) {
    this(threshold, 5, 200);
}

/**
 * Creates a processor with explicit parameters.
 *
 * @param threshold   similarity cut-off in [0, 1]
 * @param shingleSize character n-gram length, at least 1
 * @param numHashes   MinHash signature length, at least 1
 * @throws IllegalArgumentException if any argument is out of range
 */
public DeduplicationProcessor(double threshold, int shingleSize, int numHashes) {
    // Negated range test so that NaN (which fails every comparison) is rejected too
    if (!(threshold >= 0 && threshold <= 1))
        throw new IllegalArgumentException("Threshold must be in [0, 1].");
    // A non-positive size would make shingling produce empty strings or fail in substring
    if (shingleSize < 1)
        throw new IllegalArgumentException(
                "Shingle size must be at least 1, got " + shingleSize);
    // Fail here rather than later, when the MinHash object is created during indexing
    if (numHashes < 1)
        throw new IllegalArgumentException(
                "Number of hashes must be at least 1, got " + numHashes);
    this.threshold = threshold;
    this.shingleSize = shingleSize;
    this.numHashes = numHashes;
}
129
+
130
+ // -----------------------------------------------------------------------
131
+ // Phase 1 — Index the full corpus
132
+ // -----------------------------------------------------------------------
133
+
134
+ /**
135
+ * Reads all .txt files in {@code corpusDir}, shingles every sentence,
136
+ * builds a shared vocabulary, and computes MinHash signatures.
137
+ *
138
+ * This must be called before {@link #detectDuplicates}.
139
+ *
140
+ * @param corpusDir directory of clean .txt files representing the full corpus
141
+ */
142
+ public void indexCorpus(String corpusDir) {
143
+ System.out.println("[Index] Scanning corpus: " + corpusDir);
144
+ try {
145
+ FileHandler fh = new FileHandler();
146
+
147
+ // --- Pass 1: collect sentences and build vocabulary ---
148
+ // Temporary store: key → raw text + shingle set (signatures computed later)
149
+ Map<SentenceKey, Set<String>> rawShingles = new LinkedHashMap<>();
150
+
151
+ for (File f : fh.getFileListing(new File(corpusDir))) {
152
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
153
+
154
+ Scanner sc = new Scanner(f, "UTF-8");
155
+ int lineNum = 0;
156
+ while (sc.hasNextLine()) {
157
+ String line = sc.nextLine().trim();
158
+ lineNum++;
159
+ if (line.length() < shingleSize) continue;
160
+
161
+ Set<String> shingles = shingle(line);
162
+ vocabulary.addAll(shingles);
163
+ rawShingles.put(new SentenceKey(f.getName(), lineNum), shingles);
164
+ }
165
+ sc.close();
166
+ }
167
+
168
+ System.out.println("[Index] Vocabulary size: " + vocabulary.size()
169
+ + " Sentences: " + rawShingles.size());
170
+
171
+ if (vocabulary.isEmpty()) {
172
+ System.err.println("[Index] No sentences found — aborting.");
173
+ return;
174
+ }
175
+
176
+ // --- Initialise MinHash with finalised vocabulary size ---
177
+ // Error parameter 0.05 → ~400 hashes needed; we use numHashes directly.
178
+ // The debatty MinHash constructor accepts (error, dictSize).
179
+ // We use the lower-level approach: fix numHashes via the signature size.
180
+ // info.debatty MinHash(double error, int dictSize) chooses hash count itself.
181
+ // For explicit control we pass a small error so it aligns with numHashes.
182
+ minHash = new MinHash(numHashes, vocabulary.size());
183
+
184
+ // --- Pass 2: compute and store signatures ---
185
+ List<String> vocabList = new ArrayList<>(vocabulary);
186
+ corpusIndex.clear();
187
+
188
+ // Also keep a raw-text map for the report
189
+ Map<SentenceKey, String> rawTexts = new HashMap<>();
190
+ // re-scan to get raw text (we only stored shingles above)
191
+ for (File f : fh.getFileListing(new File(corpusDir))) {
192
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
193
+ Scanner sc = new Scanner(f, "UTF-8");
194
+ int lineNum = 0;
195
+ while (sc.hasNextLine()) {
196
+ String line = sc.nextLine().trim();
197
+ lineNum++;
198
+ if (line.length() < shingleSize) continue;
199
+ rawTexts.put(new SentenceKey(f.getName(), lineNum), line);
200
+ }
201
+ sc.close();
202
+ }
203
+
204
+ for (Map.Entry<SentenceKey, Set<String>> entry : rawShingles.entrySet()) {
205
+ SentenceKey key = entry.getKey();
206
+ boolean[] vector = toVector(entry.getValue(), vocabList);
207
+ int[] sig = minHash.signature(vector);
208
+ String rawText = rawTexts.getOrDefault(key, "");
209
+ corpusIndex.put(key, new IndexedSentence(rawText, sig));
210
+ }
211
+
212
+ System.out.println("[Index] Corpus index built: "
213
+ + corpusIndex.size() + " sentences.");
214
+
215
+ } catch (Exception e) {
216
+ e.printStackTrace();
217
+ }
218
+ }
219
+
220
+ // -----------------------------------------------------------------------
221
+ // Phase 2 — Detect duplicates in new folder
222
+ // -----------------------------------------------------------------------
223
+
224
+ /**
225
+ * Compares every sentence in {@code newDir} against the corpus index.
226
+ * Pairs with estimated Jaccard ≥ threshold are recorded as duplicates
227
+ * and written to {@code reportPath}.
228
+ *
229
+ * Call {@link #indexCorpus} first.
230
+ *
231
+ * @param newDir directory of new .txt files to check
232
+ * @param reportPath destination TSV report file
233
+ */
234
+ public void detectDuplicates(String newDir, String reportPath) {
235
+ if (corpusIndex.isEmpty()) {
236
+ System.err.println("[Detect] Corpus index is empty. Call indexCorpus() first.");
237
+ return;
238
+ }
239
+
240
+ System.out.println("[Detect] Comparing new folder against corpus index...");
241
+ duplicatePairs.clear();
242
+ duplicateNewSentences.clear();
243
+
244
+ List<String> vocabList = new ArrayList<>(vocabulary);
245
+
246
+ try {
247
+ FileHandler fh = new FileHandler();
248
+
249
+ for (File f : fh.getFileListing(new File(newDir))) {
250
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
251
+
252
+ System.out.println("[Detect] Checking: " + f.getName());
253
+
254
+ Scanner sc = new Scanner(f, "UTF-8");
255
+ int lineNum = 0;
256
+
257
+ while (sc.hasNextLine()) {
258
+ String line = sc.nextLine().trim();
259
+ lineNum++;
260
+ if (line.length() < shingleSize) continue;
261
+
262
+ Set<String> shingles = shingle(line);
263
+
264
+ // Only shingles already in vocabulary are meaningful
265
+ Set<String> filtered = new HashSet<>(shingles);
266
+ filtered.retainAll(vocabulary);
267
+
268
+ // If almost none of the shingles are in vocab → skip
269
+ // (the sentence is likely from a very different domain)
270
+ if (filtered.isEmpty()) continue;
271
+
272
+ boolean[] newVec = toVector(filtered, vocabList);
273
+ int[] newSig = minHash.signature(newVec);
274
+
275
+ SentenceKey newKey = new SentenceKey(f.getName(), lineNum);
276
+
277
+ // Compare against all corpus sentences
278
+ // For large corpora, replace this loop with an LSH band index
279
+ for (Map.Entry<SentenceKey, IndexedSentence> entry : corpusIndex.entrySet()) {
280
+ double sim = minHash.similarity(newSig, entry.getValue().signature);
281
+ if (sim >= threshold) {
282
+ DuplicatePair pair = new DuplicatePair(
283
+ newKey, line,
284
+ entry.getKey(), entry.getValue().text,
285
+ sim);
286
+ duplicatePairs.add(pair);
287
+ duplicateNewSentences.add(newKey);
288
+ // Don't break — report ALL corpus matches for transparency
289
+ }
290
+ }
291
+ }
292
+ sc.close();
293
+ }
294
+
295
+ System.out.println("[Detect] Duplicate sentence pairs found: "
296
+ + duplicatePairs.size());
297
+ System.out.println("[Detect] Unique new sentences flagged: "
298
+ + duplicateNewSentences.size());
299
+
300
+ writeReport(reportPath);
301
+
302
+ } catch (Exception e) {
303
+ e.printStackTrace();
304
+ }
305
+ }
306
+
307
+ // -----------------------------------------------------------------------
308
+ // Phase 3 — Optionally remove duplicates from new folder
309
+ // -----------------------------------------------------------------------
310
+
311
+ /**
312
+ * Removes from every file in {@code newDir} any sentence whose
313
+ * (file, lineNumber) is in the duplicate set detected by
314
+ * {@link #detectDuplicates}.
315
+ *
316
+ * Files that become empty after removal are deleted.
317
+ * Must be called after {@link #detectDuplicates}.
318
+ *
319
+ * @param newDir directory of new .txt files to clean
320
+ * @param keepBackup if true, originals are renamed to *.bak first
321
+ */
322
+ public void removeDuplicatesFromNewFolder(String newDir, boolean keepBackup) {
323
+ if (duplicateNewSentences.isEmpty()) {
324
+ System.out.println("[Remove] No duplicates to remove.");
325
+ return;
326
+ }
327
+
328
+ System.out.println("[Remove] Removing "
329
+ + duplicateNewSentences.size() + " duplicate sentences...");
330
+
331
+ try {
332
+ FileHandler fh = new FileHandler();
333
+ int filesModified = 0;
334
+ int totalRemoved = 0;
335
+
336
+ for (File f : fh.getFileListing(new File(newDir))) {
337
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
338
+
339
+ List<String> inputLines = new ArrayList<>();
340
+ Scanner sc = new Scanner(f, "UTF-8");
341
+ int lineNum = 0;
342
+ while (sc.hasNextLine()) {
343
+ inputLines.add(sc.nextLine());
344
+ lineNum++;
345
+ }
346
+ sc.close();
347
+
348
+ List<String> outputLines = new ArrayList<>();
349
+ int removed = 0;
350
+
351
+ for (int i = 0; i < inputLines.size(); i++) {
352
+ String trimmed = inputLines.get(i).trim();
353
+ // +1 because lineNum was 1-based during indexing
354
+ SentenceKey key = new SentenceKey(f.getName(), i + 1);
355
+
356
+ if (trimmed.length() >= shingleSize
357
+ && duplicateNewSentences.contains(key)) {
358
+ removed++;
359
+ } else {
360
+ outputLines.add(inputLines.get(i));
361
+ }
362
+ }
363
+
364
+ if (removed > 0) {
365
+ if (keepBackup) {
366
+ Files.copy(f.toPath(),
367
+ new File(f.getAbsolutePath() + ".bak").toPath(),
368
+ StandardCopyOption.REPLACE_EXISTING);
369
+ }
370
+
371
+ // Check if file would become empty (only blank lines)
372
+ boolean allBlank = outputLines.stream()
373
+ .allMatch(String::isBlank);
374
+
375
+ if (allBlank) {
376
+ f.delete();
377
+ System.out.println("[Remove] Deleted (empty after dedup): "
378
+ + f.getName());
379
+ } else {
380
+ Writer w = new OutputStreamWriter(
381
+ new FileOutputStream(f), "UTF-8");
382
+ for (String l : outputLines) {
383
+ w.write(l + "\n");
384
+ }
385
+ w.flush();
386
+ w.close();
387
+ System.out.println("[Remove] " + f.getName()
388
+ + " — removed " + removed + " sentences.");
389
+ }
390
+
391
+ filesModified++;
392
+ totalRemoved += removed;
393
+ }
394
+ }
395
+
396
+ System.out.println("[Remove] Done. Files modified: " + filesModified
397
+ + " Sentences removed: " + totalRemoved);
398
+
399
+ } catch (Exception e) {
400
+ e.printStackTrace();
401
+ }
402
+ }
403
+
404
+ // -----------------------------------------------------------------------
405
+ // Report writer
406
+ // -----------------------------------------------------------------------
407
+
408
+ private void writeReport(String reportPath) throws Exception {
409
+ try (PrintWriter pw = new PrintWriter(
410
+ new OutputStreamWriter(new FileOutputStream(reportPath), "UTF-8"))) {
411
+
412
+ // Header
413
+ pw.println("# DeduplicationProcessor report");
414
+ pw.println("# Threshold: " + threshold
415
+ + " ShingleSize: " + shingleSize
416
+ + " NumHashes: " + numHashes);
417
+ pw.println("# Duplicate pairs: " + duplicatePairs.size());
418
+ pw.println("# Unique new sentences flagged: " + duplicateNewSentences.size());
419
+ pw.println();
420
+ pw.println("NEW_FILE\tNEW_LINE\tCORPUS_FILE\tCORPUS_LINE\tSIMILARITY\tNEW_SENTENCE\tCORPUS_SENTENCE");
421
+
422
+ // Sort by similarity descending, then new file, then line
423
+ List<DuplicatePair> sorted = new ArrayList<>(duplicatePairs);
424
+ sorted.sort((a, b) -> {
425
+ int cmp = Double.compare(b.similarity, a.similarity);
426
+ if (cmp != 0) return cmp;
427
+ cmp = a.newKey.fileName.compareTo(b.newKey.fileName);
428
+ if (cmp != 0) return cmp;
429
+ return Integer.compare(a.newKey.lineNumber, b.newKey.lineNumber);
430
+ });
431
+
432
+ for (DuplicatePair p : sorted) {
433
+ pw.printf("%s\t%d\t%s\t%d\t%.4f\t%s\t%s%n",
434
+ p.newKey.fileName,
435
+ p.newKey.lineNumber,
436
+ p.corpusKey.fileName,
437
+ p.corpusKey.lineNumber,
438
+ p.similarity,
439
+ sanitiseTsv(p.newText),
440
+ sanitiseTsv(p.corpusText));
441
+ }
442
+ }
443
+ System.out.println("[Report] Written to: " + reportPath);
444
+ }
445
+
446
+ // -----------------------------------------------------------------------
447
+ // Shingling and vectorisation helpers
448
+ // -----------------------------------------------------------------------
449
+
450
+ /**
451
+ * Produces the set of character n-grams (shingles) for a sentence.
452
+ * Lowercased so matching is case-insensitive.
453
+ */
454
+ private Set<String> shingle(String text) {
455
+ Set<String> shingles = new TreeSet<>();
456
+ String lower = text.toLowerCase();
457
+ for (int i = 0; i <= lower.length() - shingleSize; i++) {
458
+ shingles.add(lower.substring(i, i + shingleSize));
459
+ }
460
+ return shingles;
461
+ }
462
+
463
+ /**
464
+ * Converts a shingle set to a boolean presence vector over the shared vocabulary.
465
+ *
466
+ * @param shingles shingle set for this sentence
467
+ * @param vocabList ordered list of all vocabulary shingles
468
+ * @return boolean[] where true = shingle present
469
+ */
470
+ private boolean[] toVector(Set<String> shingles, List<String> vocabList) {
471
+ boolean[] vector = new boolean[vocabList.size()];
472
+ for (int i = 0; i < vocabList.size(); i++) {
473
+ vector[i] = shingles.contains(vocabList.get(i));
474
+ }
475
+ return vector;
476
+ }
477
+
478
+ // -----------------------------------------------------------------------
479
+ // Utility
480
+ // -----------------------------------------------------------------------
481
+
482
+ private String sanitiseTsv(String s) {
483
+ if (s == null) return "";
484
+ return s.replace("\t", " ").replace("\n", " ").replace("\r", "");
485
+ }
486
+
487
+ /** Returns an unmodifiable view of all detected duplicate pairs. */
488
+ public List<DuplicatePair> getDuplicatePairs() {
489
+ return Collections.unmodifiableList(duplicatePairs);
490
+ }
491
+
492
+ /** Returns the number of corpus sentences indexed. */
493
+ public int getCorpusSize() {
494
+ return corpusIndex.size();
495
+ }
496
+
497
+ // -----------------------------------------------------------------------
498
+ // Inner data classes
499
+ // -----------------------------------------------------------------------
500
+
501
+ /**
502
+ * Uniquely identifies a sentence by its source file name and line number.
503
+ */
504
+ public static class SentenceKey {
505
+ public final String fileName;
506
+ public final int lineNumber;
507
+
508
+ public SentenceKey(String fileName, int lineNumber) {
509
+ this.fileName = fileName;
510
+ this.lineNumber = lineNumber;
511
+ }
512
+
513
+ @Override
514
+ public boolean equals(Object o) {
515
+ if (!(o instanceof SentenceKey)) return false;
516
+ SentenceKey other = (SentenceKey) o;
517
+ return lineNumber == other.lineNumber
518
+ && fileName.equals(other.fileName);
519
+ }
520
+
521
+ @Override
522
+ public int hashCode() {
523
+ return 31 * fileName.hashCode() + lineNumber;
524
+ }
525
+
526
+ @Override
527
+ public String toString() {
528
+ return fileName + ":" + lineNumber;
529
+ }
530
+ }
531
+
532
+ /**
533
+ * Holds the raw text and MinHash signature for an indexed corpus sentence.
534
+ */
535
+ private static class IndexedSentence {
536
+ final String text;
537
+ final int[] signature;
538
+
539
+ IndexedSentence(String text, int[] signature) {
540
+ this.text = text;
541
+ this.signature = signature;
542
+ }
543
+ }
544
+
545
+ /**
546
+ * Represents a detected near-duplicate pair between a new sentence
547
+ * and a corpus sentence.
548
+ */
549
+ public static class DuplicatePair {
550
+ public final SentenceKey newKey;
551
+ public final String newText;
552
+ public final SentenceKey corpusKey;
553
+ public final String corpusText;
554
+ public final double similarity;
555
+
556
+ public DuplicatePair(SentenceKey newKey, String newText,
557
+ SentenceKey corpusKey, String corpusText,
558
+ double similarity) {
559
+ this.newKey = newKey;
560
+ this.newText = newText;
561
+ this.corpusKey = corpusKey;
562
+ this.corpusText = corpusText;
563
+ this.similarity = similarity;
564
+ }
565
+
566
+ @Override
567
+ public String toString() {
568
+ return String.format("[%.2f] %s ↔ %s", similarity, newKey, corpusKey);
569
+ }
570
+ }
571
+ }
java/bg/bas/dcl/LLMs/FileCleanProcessor.java ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.File;
4
+ import java.io.FileOutputStream;
5
+ import java.io.OutputStreamWriter;
6
+ import java.io.PrintWriter;
7
+ import java.io.Writer;
8
+ import java.nio.file.Files;
9
+ import java.nio.file.StandardCopyOption;
10
+ import java.util.ArrayList;
11
+ import java.util.Arrays;
12
+ import java.util.HashMap;
13
+ import java.util.HashSet;
14
+ import java.util.LinkedHashMap;
15
+ import java.util.List;
16
+ import java.util.Map;
17
+ import java.util.Scanner;
18
+ import java.util.Set;
19
+ import java.util.regex.Pattern;
20
+
21
+ import bg.bas.dcl.general.FileHandler;
22
+
23
+ /**
24
+ * FileCleanProcessor — corpus boilerplate remover.
25
+ *
26
+ * Two-phase cleaning:
27
+ *
28
+ * Phase 1 — LEARN (from a sample directory):
29
+ * Scans every .txt file in the sample dir and records how many files each
30
+ * non-empty line appears in. Lines that appear in ≥ THRESHOLD of the
31
+ * sample files are added to the "common lines" blocklist.
32
+ * The blocklist is also saved to disk for inspection / reuse.
33
+ *
34
+ * Phase 2 — CLEAN (over the full data directory):
35
+ * For every .txt file, removes lines that:
36
+ * (a) appear in the learned common-lines blocklist, OR
37
+ * (b) match any of the hardcoded boilerplate regex patterns
38
+ * (HTML/XML tags, PHP markers, navigation patterns,
39
+ * URLs, e-mail addresses, cookie/GDPR banners).
40
+ * Cleaned files overwrite the originals (a .bak backup is kept by default).
41
+ *
42
+ * Usage:
43
+ * FileCleanProcessor fcp = new FileCleanProcessor(0.50); // 50 % threshold
44
+ * fcp.learnFromSample("/path/to/sample/dir/");
45
+ * fcp.saveBlocklist("/path/to/blocklist.txt"); // optional
46
+ * fcp.cleanDirectory("/path/to/full/data/dir/", true); // true = keep .bak
47
+ */
48
+ public class FileCleanProcessor {
49
+
50
+ // -----------------------------------------------------------------------
51
+ // Configuration
52
+ // -----------------------------------------------------------------------
53
+
54
+ /** Fraction of sample files a line must appear in to be considered boilerplate. */
55
+ private final double threshold;
56
+
57
+ /** Minimum non-whitespace characters a line must have to be evaluated (avoids
58
+ * treating every blank separator the same way). */
59
+ private static final int MIN_LINE_LENGTH = 3;
60
+
61
+ // -----------------------------------------------------------------------
62
+ // State
63
+ // -----------------------------------------------------------------------
64
+
65
+ /** Lines found to be common across the sample (Phase 1 output). */
66
+ private final Set<String> commonLines = new HashSet<>();
67
+
68
+ /** Diagnostic: line → number of sample files it appeared in. */
69
+ private final Map<String, Integer> lineFrequency = new LinkedHashMap<>();
70
+
71
+ // -----------------------------------------------------------------------
72
+ // Hardcoded boilerplate patterns (always applied regardless of frequency)
73
+ // -----------------------------------------------------------------------
74
+
75
+ private static final List<Pattern> BOILERPLATE_PATTERNS = Arrays.asList(
76
+
77
+ // ---- HTML / XML tags ------------------------------------------------
78
+ Pattern.compile("(?i)^\\s*<[^>]+>\\s*$"), // whole-line tag
79
+ Pattern.compile("(?i).*<(script|style|head|meta|link|iframe)[^>]*>.*"),
80
+ Pattern.compile("(?i).*</(script|style|head|body|html)>.*"),
81
+ Pattern.compile("(?i).*<!--.*-->.*"), // HTML comment
82
+ Pattern.compile("(?i).*&(nbsp|amp|lt|gt|quot|apos);.*"), // HTML entities
83
+
84
+ // ---- PHP / server-side markers --------------------------------------
85
+ Pattern.compile("(?i).*<\\?php.*"),
86
+ Pattern.compile("(?i).*\\?>\\s*"),
87
+ Pattern.compile("(?i).*<%.*%>.*"), // ASP-style tags
88
+
89
+ // ---- Navigation / menu patterns ------------------------------------
90
+ Pattern.compile("(?i)^\\s*(home|начало|меню|menu|навигация|navigation"
91
+ + "|търсене|search|вход|login|изход|logout"
92
+ + "|регистрация|register|контакти|contacts"
93
+ + "|за нас|about us|sitemap|карта на сайта)\\s*$"),
94
+ Pattern.compile("(?i)^\\s*(next|prev|previous|следващ|предишен"
95
+ + "|напред|назад|нагоре|back|forward|top|горе)\\s*$"),
96
+ Pattern.compile("(?i)^\\s*\\|\\s*(.*\\|\\s*)+$"), // pipe-separated nav bars
97
+ Pattern.compile("(?i)^\\s*(>\\s*){2,}"), // breadcrumb: A > B > C
98
+ Pattern.compile("(?i)^\\s*(\\d+\\.?\\s+){3,}$"), // numbered nav lists
99
+
100
+ // ---- URLs ----------------------------------------------------------
101
+ Pattern.compile("(?i)\\bhttps?://\\S+"),
102
+ Pattern.compile("(?i)\\bwww\\.\\S+\\.\\S+"),
103
+ Pattern.compile("(?i)\\bftp://\\S+"),
104
+
105
+ // ---- E-mail addresses ----------------------------------------------
106
+ Pattern.compile("[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}"),
107
+
108
+ // ---- Cookie / GDPR banners -----------------------------------------
109
+ Pattern.compile("(?i).*(бисквитки|cookies|gdpr|privacy policy|поверителност"
110
+ + "|приемам|accept all|отхвърлям|decline|consent"
111
+ + "|лични данни|personal data|условия за ползване"
112
+ + "|terms of (use|service)|политика за).*"),
113
+
114
+ // ---- Social / sharing buttons --------------------------------------
115
+ Pattern.compile("(?i)^\\s*(share|сподели|like|харесай|tweet|retweet"
116
+ + "|pinterest|linkedin|facebook|twitter|instagram"
117
+ + "|google\\+?|youtube|tiktok|viber|whatsapp)\\s*$"),
118
+
119
+ // ---- Counters / analytics snippets ---------------------------------
120
+ Pattern.compile("(?i).*google.analytics.*"),
121
+ Pattern.compile("(?i).*ga\\s*\\(\\s*['\"].*"),
122
+ Pattern.compile("(?i).*gtag\\s*\\(.*"),
123
+ Pattern.compile("(?i).*_gaq\\.push.*"),
124
+
125
+ // ---- Print / date / page artefacts ---------------------------------
126
+ Pattern.compile("(?i)^\\s*страница\\s+\\d+\\s*(от\\s+\\d+)?\\s*$"), // "страница 1 от 5"
127
+ Pattern.compile("(?i)^\\s*page\\s+\\d+\\s*(of\\s+\\d+)?\\s*$"),
128
+ Pattern.compile("(?i)^\\s*©.*$"), // copyright line
129
+ Pattern.compile("(?i)^\\s*all rights reserved.*$"),
130
+ Pattern.compile("(?i)^\\s*права запазени.*$"),
131
+
132
+ // ---- Lines that are purely punctuation / symbols -------------------
133
+ Pattern.compile("^[\\s\\p{Punct}\\|\\-_=*~`^]+$")
134
+ );
135
+
136
+ // -----------------------------------------------------------------------
137
+ // Constructor
138
+ // -----------------------------------------------------------------------
139
+
140
/**
 * Creates a processor with the given boilerplate threshold.
 *
 * @param threshold fraction [0,1] of sample files a line must appear in
 *                  to be added to the blocklist (e.g. 0.50 for 50 %).
 * @throws IllegalArgumentException if {@code threshold} is NaN or lies
 *                                  outside [0, 1]
 */
public FileCleanProcessor(double threshold) {
    // Negated range check: every comparison with NaN is false, so the
    // previous "threshold < 0 || threshold > 1" test let NaN through.
    if (!(threshold >= 0 && threshold <= 1)) {
        throw new IllegalArgumentException("Threshold must be in [0, 1].");
    }
    this.threshold = threshold;
}
149
+
150
+ // -----------------------------------------------------------------------
151
+ // Phase 1 — Learn from sample
152
+ // -----------------------------------------------------------------------
153
+
154
+ /**
155
+ * Scans all .txt files in {@code sampleDir}, counts how many files each
156
+ * trimmed non-empty line appears in, and populates {@link #commonLines}
157
+ * with those meeting the threshold.
158
+ *
159
+ * @param sampleDir directory containing representative sample .txt files
160
+ */
161
+ public void learnFromSample(String sampleDir) {
162
+ try {
163
+ FileHandler fh = new FileHandler();
164
+ List<File> sampleFiles = new ArrayList<>();
165
+
166
+ for (File f : fh.getFileListing(new File(sampleDir))) {
167
+ if (f.isFile() && f.getName().endsWith(".txt"))
168
+ sampleFiles.add(f);
169
+ }
170
+
171
+ int total = sampleFiles.size();
172
+ if (total == 0) {
173
+ System.err.println("[LearnPhase] No .txt files found in: " + sampleDir);
174
+ return;
175
+ }
176
+ System.out.println("[LearnPhase] Scanning " + total + " sample files...");
177
+
178
+ // For each file, collect the *distinct* lines it contains so a
179
+ // repeated line inside one document only counts once.
180
+ Map<String, Integer> fileCount = new HashMap<>();
181
+
182
+ for (File f : sampleFiles) {
183
+ Set<String> seenInFile = new HashSet<>();
184
+ Scanner s = new Scanner(f, "UTF-8");
185
+ while (s.hasNextLine()) {
186
+ String line = s.nextLine().trim();
187
+ if (line.length() < MIN_LINE_LENGTH) continue;
188
+ if (seenInFile.add(line)) { // first occurrence in this file
189
+ fileCount.merge(line, 1, Integer::sum);
190
+ }
191
+ }
192
+ s.close();
193
+ }
194
+
195
+ // Apply threshold
196
+ commonLines.clear();
197
+ lineFrequency.clear();
198
+
199
+ double cutoff = threshold * total;
200
+ for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
201
+ lineFrequency.put(entry.getKey(), entry.getValue());
202
+ if (entry.getValue() >= cutoff) {
203
+ commonLines.add(entry.getKey());
204
+ }
205
+ }
206
+
207
+ System.out.println("[LearnPhase] Common lines identified: " + commonLines.size()
208
+ + " (threshold=" + (int)(threshold * 100) + "%, files=" + total + ")");
209
+
210
+ } catch (Exception e) {
211
+ e.printStackTrace();
212
+ }
213
+ }
214
+
215
+ /**
216
+ * Replaces the learned common-lines set with a pre-built one.
217
+ * Useful when loading a previously saved blocklist.
218
+ *
219
+ * @param lines set of exact line strings to treat as boilerplate
220
+ */
221
+ public void setCommonLines(Set<String> lines) {
222
+ commonLines.clear();
223
+ commonLines.addAll(lines);
224
+ }
225
+
226
+ // -----------------------------------------------------------------------
227
+ // Blocklist persistence
228
+ // -----------------------------------------------------------------------
229
+
230
+ /**
231
+ * Saves the learned blocklist to a plain-text file (one line per entry),
232
+ * preceded by a frequency comment for human review.
233
+ *
234
+ * @param outPath destination file path
235
+ */
236
+ public void saveBlocklist(String outPath) {
237
+ try (PrintWriter pw = new PrintWriter(
238
+ new OutputStreamWriter(new FileOutputStream(outPath), "UTF-8"))) {
239
+
240
+ pw.println("# FileCleanProcessor blocklist");
241
+ pw.println("# threshold=" + threshold
242
+ + " entries=" + commonLines.size());
243
+ pw.println("# Format: <frequency TAB line>");
244
+ pw.println();
245
+
246
+ // Sort by descending frequency for readability
247
+ lineFrequency.entrySet().stream()
248
+ .filter(e -> commonLines.contains(e.getKey()))
249
+ .sorted((a, b) -> b.getValue() - a.getValue())
250
+ .forEach(e -> pw.println(e.getValue() + "\t" + e.getKey()));
251
+
252
+ System.out.println("[Blocklist] Saved " + commonLines.size()
253
+ + " entries to: " + outPath);
254
+
255
+ } catch (Exception e) {
256
+ e.printStackTrace();
257
+ }
258
+ }
259
+
260
+ /**
261
+ * Loads a blocklist previously saved by {@link #saveBlocklist}.
262
+ * Comment lines (starting with #) and blank lines are skipped.
263
+ *
264
+ * @param blocklistPath path to the blocklist file
265
+ */
266
+ public void loadBlocklist(String blocklistPath) {
267
+ try {
268
+ commonLines.clear();
269
+ Scanner sc = new Scanner(new File(blocklistPath), "UTF-8");
270
+ while (sc.hasNextLine()) {
271
+ String line = sc.nextLine();
272
+ if (line.startsWith("#") || line.isBlank()) continue;
273
+ // Format: "<freq>\t<content>" or bare "<content>"
274
+ int tab = line.indexOf('\t');
275
+ String content = (tab >= 0) ? line.substring(tab + 1) : line;
276
+ if (!content.isBlank()) commonLines.add(content.trim());
277
+ }
278
+ sc.close();
279
+ System.out.println("[Blocklist] Loaded " + commonLines.size()
280
+ + " entries from: " + blocklistPath);
281
+ } catch (Exception e) {
282
+ e.printStackTrace();
283
+ }
284
+ }
285
+
286
+ // -----------------------------------------------------------------------
287
+ // Phase 2 — Clean full directory
288
+ // -----------------------------------------------------------------------
289
+
290
+ /**
291
+ * Cleans every .txt file in {@code dataDir} by removing lines that are
292
+ * in the learned blocklist or match a hardcoded boilerplate pattern.
293
+ *
294
+ * @param dataDir directory containing corpus .txt files to clean
295
+ * @param keepBackup if true, originals are renamed to *.bak before overwriting
296
+ */
297
+ public void cleanDirectory(String dataDir, boolean keepBackup) {
298
+ try {
299
+ if (commonLines.isEmpty()) {
300
+ System.out.println("[CleanPhase] Warning: no common lines loaded. "
301
+ + "Only regex patterns will be applied.");
302
+ }
303
+
304
+ FileHandler fh = new FileHandler();
305
+ int processed = 0, linesRemoved = 0;
306
+
307
+ for (File f : fh.getFileListing(new File(dataDir))) {
308
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
309
+
310
+ CleanResult result = cleanFile(f, keepBackup);
311
+ processed++;
312
+ linesRemoved += result.linesRemoved;
313
+
314
+ if (result.linesRemoved > 0) {
315
+ System.out.println("[CleanPhase] " + f.getName()
316
+ + " — removed " + result.linesRemoved + " lines.");
317
+ }
318
+ }
319
+
320
+ System.out.println("[CleanPhase] Done. Files processed: " + processed
321
+ + " Total lines removed: " + linesRemoved);
322
+
323
+ } catch (Exception e) {
324
+ e.printStackTrace();
325
+ }
326
+ }
327
+
328
+ /**
329
+ * Cleans a single file in place.
330
+ *
331
+ * @param file the .txt file to clean
332
+ * @param keepBackup if true, a .bak copy of the original is kept
333
+ * @return CleanResult with statistics
334
+ */
335
+ public CleanResult cleanFile(File file, boolean keepBackup) {
336
+ int removed = 0;
337
+ try {
338
+ // Read all lines
339
+ List<String> inputLines = new ArrayList<>();
340
+ Scanner sc = new Scanner(file, "UTF-8");
341
+ while (sc.hasNextLine()) inputLines.add(sc.nextLine());
342
+ sc.close();
343
+
344
+ // Filter
345
+ List<String> outputLines = new ArrayList<>();
346
+ for (String line : inputLines) {
347
+ if (shouldRemove(line)) {
348
+ removed++;
349
+ } else {
350
+ outputLines.add(line);
351
+ }
352
+ }
353
+
354
+ if (removed > 0) {
355
+ // Backup
356
+ if (keepBackup) {
357
+ File bak = new File(file.getAbsolutePath() + ".bak");
358
+ Files.copy(file.toPath(), bak.toPath(),
359
+ StandardCopyOption.REPLACE_EXISTING);
360
+ }
361
+
362
+ // Overwrite
363
+ Writer w = new OutputStreamWriter(
364
+ new FileOutputStream(file), "UTF-8");
365
+ for (String l : outputLines) {
366
+ w.write(l + "\n");
367
+ }
368
+ w.flush();
369
+ w.close();
370
+ }
371
+
372
+ } catch (Exception e) {
373
+ e.printStackTrace();
374
+ }
375
+ return new CleanResult(file, removed);
376
+ }
377
+
378
+ // -----------------------------------------------------------------------
379
+ // Core line decision
380
+ // -----------------------------------------------------------------------
381
+
382
+ /**
383
+ * Returns true if the line should be removed.
384
+ *
385
+ * A line is removed if:
386
+ * 1. Its trimmed form is in the learned common-lines blocklist, OR
387
+ * 2. It matches any hardcoded boilerplate regex pattern.
388
+ *
389
+ * Blank lines shorter than MIN_LINE_LENGTH are always kept so that
390
+ * paragraph structure is preserved.
391
+ *
392
+ * @param rawLine the original line from the file (not yet trimmed)
393
+ */
394
+ public boolean shouldRemove(String rawLine) {
395
+ String trimmed = rawLine.trim();
396
+
397
+ // Always keep blank/very-short lines (paragraph separators)
398
+ if (trimmed.length() < MIN_LINE_LENGTH) return false;
399
+
400
+ // 1. Exact-match blocklist
401
+ if (commonLines.contains(trimmed)) return true;
402
+
403
+ // 2. Regex boilerplate patterns
404
+ for (Pattern p : BOILERPLATE_PATTERNS) {
405
+ if (p.matcher(trimmed).matches() || p.matcher(trimmed).find()) {
406
+ return true;
407
+ }
408
+ }
409
+
410
+ return false;
411
+ }
412
+
413
+ // -----------------------------------------------------------------------
414
+ // Diagnostic helpers
415
+ // -----------------------------------------------------------------------
416
+
417
/**
 * Returns an unmodifiable view of the learned common-lines set.
 * It is a view, not a snapshot: later changes to the blocklist show through.
 */
public Set<String> getCommonLines() {
    return java.util.Collections.unmodifiableSet(commonLines);
}
421
+
422
/**
 * Returns an unmodifiable view (not a copy) of the frequency map
 * (line → number of sample files); later changes to the map show through.
 */
public Map<String, Integer> getLineFrequency() {
    return java.util.Collections.unmodifiableMap(lineFrequency);
}
426
+
427
+ /**
428
+ * Prints a summary of the top {@code n} most-frequent common lines to stdout.
429
+ */
430
+ public void printTopCommonLines(int n) {
431
+ System.out.println("--- Top " + n + " common lines (by sample frequency) ---");
432
+ lineFrequency.entrySet().stream()
433
+ .filter(e -> commonLines.contains(e.getKey()))
434
+ .sorted((a, b) -> b.getValue() - a.getValue())
435
+ .limit(n)
436
+ .forEach(e -> System.out.printf(" [%4d] %s%n", e.getValue(), e.getKey()));
437
+ }
438
+
439
+ // -----------------------------------------------------------------------
440
+ // Inner result class
441
+ // -----------------------------------------------------------------------
442
+
443
/** Simple immutable value object returned by {@link #cleanFile}. */
public static class CleanResult {
    /** The file that was passed to {@link #cleanFile}. */
    public final File file;
    /** Number of lines reported as removed from {@link #file}. */
    public final int linesRemoved;

    /**
     * @param file         the processed file
     * @param linesRemoved number of lines removed from it
     */
    public CleanResult(File file, int linesRemoved) {
        this.file = file;
        this.linesRemoved = linesRemoved;
    }
}
453
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/.BulNCProcessor.java.kate-swp ADDED
Binary file (348 Bytes). View file
 
java/bg/bas/dcl/LLMs/IfGPTDataset/.CurlicatProcessor.java.kate-swp ADDED
Binary file (98 Bytes). View file
 
java/bg/bas/dcl/LLMs/IfGPTDataset/BaseSourceProcessor.java ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.FileWriter;
4
+ import java.io.PrintWriter;
5
+ import java.util.ArrayList;
6
+ import java.util.LinkedHashSet;
7
+
8
+ import org.json.simple.JSONArray;
9
+ import org.json.simple.JSONObject;
10
+
11
+ import bg.bas.dcl.general.JSONProcessor;
12
+
13
+ import java.io.File;
14
+
15
+ /**
16
+ * Abstract base for all source processors.
17
+ *
18
+ * Provides shared utilities:
19
+ * - convertJsonToCSV: write a metadata JSONObject to a CSV file
20
+ * - estimateTokenCount: simple punctuation-aware token estimator
21
+ *
22
+ * Each concrete subclass implements {@link SourceProcessor#process(String, String)}
23
+ * with source-specific parsing logic.
24
+ */
25
+ public abstract class BaseSourceProcessor implements SourceProcessor {
26
+
27
+ // -----------------------------------------------------------------------
28
+ // CSV export
29
+ // -----------------------------------------------------------------------
30
+
31
+ /**
32
+ * Reads a metadata.json file from disk and writes a CSV alongside it.
33
+ *
34
+ * @param metadataJsonPath path to the metadata JSON file
35
+ */
36
+ public void convertJsonToCSV(String metadataJsonPath) {
37
+ try {
38
+ JSONProcessor pr = new JSONProcessor();
39
+ JSONObject json = pr.readJSON(new File(metadataJsonPath));
40
+ convertJsonToCSV(json, metadataJsonPath + "_CSV.csv");
41
+ } catch (Exception e) {
42
+ e.printStackTrace();
43
+ }
44
+ }
45
+
46
+ /**
47
+ * Writes the "metadata" array inside {@code json} to a CSV at {@code outCsvPath}.
48
+ * Reports structural inconsistencies (missing/extra fields) to stderr.
49
+ *
50
+ * @param json JSONObject that contains a "metadata" JSONArray
51
+ * @param outCsvPath destination CSV file path
52
+ */
53
+ public void convertJsonToCSV(JSONObject json, String outCsvPath) {
54
+ try {
55
+ JSONArray array = (JSONArray) json.get("metadata");
56
+
57
+ if (array == null || array.isEmpty()) {
58
+ System.err.println("[INCONSISTENCY] 'metadata' array is null or empty in: " + outCsvPath);
59
+ return;
60
+ }
61
+
62
+ // Collect all unique field names, preserving insertion order
63
+ LinkedHashSet<String> headersSet = new LinkedHashSet<>();
64
+ for (Object obj : array) {
65
+ if (obj instanceof JSONObject) {
66
+ headersSet.addAll(((JSONObject) obj).keySet());
67
+ } else {
68
+ System.err.println("[INCONSISTENCY] Non-JSONObject entry found in metadata array.");
69
+ }
70
+ }
71
+
72
+ ArrayList<String> headers = new ArrayList<>(headersSet);
73
+
74
+ try (PrintWriter writer = new PrintWriter(new FileWriter(outCsvPath))) {
75
+
76
+ // Header row
77
+ writer.println(String.join(",", headers));
78
+
79
+ // Data rows
80
+ for (int i = 0; i < array.size(); i++) {
81
+ Object obj = array.get(i);
82
+
83
+ if (!(obj instanceof JSONObject)) {
84
+ System.err.println("[INCONSISTENCY] Row " + i + " is not a JSONObject, skipping.");
85
+ continue;
86
+ }
87
+
88
+ JSONObject row = (JSONObject) obj;
89
+
90
+ // Structural checks
91
+ for (String header : headers) {
92
+ if (!row.containsKey(header)) {
93
+ System.err.println("[INCONSISTENCY] Row " + i + " missing field: '" + header + "'");
94
+ }
95
+ }
96
+ for (Object key : row.keySet()) {
97
+ if (!headersSet.contains(key.toString())) {
98
+ System.err.println("[INCONSISTENCY] Row " + i + " has unexpected field: '" + key + "'");
99
+ }
100
+ }
101
+
102
+ // Build CSV line with RFC-4180 escaping
103
+ ArrayList<String> values = new ArrayList<>();
104
+ for (String header : headers) {
105
+ Object value = row.get(header);
106
+ if (value == null) {
107
+ values.add("");
108
+ } else {
109
+ String strVal = value.toString();
110
+ if (strVal.contains(",") || strVal.contains("\"") || strVal.contains("\n")) {
111
+ strVal = "\"" + strVal.replace("\"", "\"\"") + "\"";
112
+ }
113
+ values.add(strVal);
114
+ }
115
+ }
116
+ writer.println(String.join(",", values));
117
+ }
118
+ }
119
+
120
+ System.out.println("CSV written to: " + outCsvPath);
121
+
122
+ } catch (Exception e) {
123
+ e.printStackTrace();
124
+ }
125
+ }
126
+
127
+ // -----------------------------------------------------------------------
128
+ // Shared helpers
129
+ // -----------------------------------------------------------------------
130
+
131
+ /**
132
+ * Estimates the number of tokens in a sentence by counting words plus
133
+ * standalone punctuation characters (.,;:?!()-).
134
+ *
135
+ * @param sentence whitespace-tokenised sentence string
136
+ * @return estimated token count
137
+ */
138
+ protected int estimateTokenCount(String sentence) {
139
+ String[] words = sentence.split(" ");
140
+ int punctCount = sentence.length()
141
+ - sentence.replaceAll("[.,;:()?!\\-]", "").length();
142
+ return words.length + punctCount;
143
+ }
144
+
145
+ /**
146
+ * Creates a JSONObject pre-populated with the metadata fields that are
147
+ * common to every source (counts start at 0).
148
+ *
149
+ * @param identifier unique document identifier
150
+ * @return partially initialised JSONObject
151
+ */
152
+ @SuppressWarnings("unchecked")
153
+ protected JSONObject newBaseDescriptor(String identifier) {
154
+ JSONObject fdescr = new JSONObject();
155
+ fdescr.put("Identifier", identifier);
156
+ fdescr.put("Licence", "");
157
+ fdescr.put("LicenceLink", "");
158
+ fdescr.put("PublicationDate", "");
159
+ fdescr.put("DocumentTitle", "");
160
+ fdescr.put("Source", "");
161
+ fdescr.put("Author", "");
162
+ fdescr.put("Style", "");
163
+ fdescr.put("Type", "");
164
+ fdescr.put("Subdomain", "");
165
+ fdescr.put("TranslatedDocument", "");
166
+ fdescr.put("CollectionDate", "");
167
+ fdescr.put("Medium", "text");
168
+ fdescr.put("Url", "");
169
+ fdescr.put("Domain", "");
170
+ fdescr.put("Keywords", "");
171
+ fdescr.put("PersonallyIdentifiableInformation", "");
172
+ fdescr.put("BiasedInformation", "");
173
+ fdescr.put("TaskCategories", "");
174
+ fdescr.put("NumberWords", 0);
175
+ fdescr.put("NumberSentences", 0);
176
+ fdescr.put("NumberParagraphs", 0);
177
+ fdescr.put("NumberTokens", 0);
178
+ return fdescr;
179
+ }
180
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCProcessor.java ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.File;
4
+ import java.io.FileOutputStream;
5
+ import java.io.OutputStreamWriter;
6
+ import java.io.Writer;
7
+ import java.util.Scanner;
8
+
9
+ import org.json.simple.JSONArray;
10
+ import org.json.simple.JSONObject;
11
+
12
+ import bg.bas.dcl.monolingual.bg.TextProcessor;
13
+
14
+ /**
15
+ * Processes the Bulgarian National Corpus (BulNC) — general subcorpora.
16
+ *
17
+ * Unlike MARCELL/CURLICAT, BulNC metadata is supplied via an external
18
+ * tab-separated description file (BulNC-description.txt) rather than
19
+ * inline CoNLL-UP comments. Plain-text source files are read directly.
20
+ *
21
+ * Subcorpora included (controlled by {@link #isIncluded}):
22
+ * A-Administrative, B-Science, C-MassMedia, D-Fiction
23
+ * (only C-MassMedia is currently enabled in the code; edit the method to adjust the filter)
24
+ *
25
+ * SETimes articles are excluded regardless of subcorpus.
26
+ *
27
+ * Licence rules:
28
+ * A-Administrative → CC0
29
+ * B-Science → Restricted
30
+ * C-MassMedia → Restricted
31
+ * D-Fiction → Restricted
32
+ *
33
+ * Description file column indices (0-based):
34
+ * 0 filename stem | 1 relative path | 2 collection date
35
+ * 4 author | 8 title | 9 publication date
36
+ * 12 url | 13 translated | 17 type
37
+ * 19 domain | 21 subdomain (optional)
38
+ */
39
+ public class BulNCProcessor extends BaseSourceProcessor {
40
+
41
+ private static final String CC0_LICENCE = "CC0";
42
+ private static final String CC0_LICENCE_LINK =
43
+ "https://creativecommons.org/public-domain/cc0/";
44
+ private static final String RESTRICTED = "Restricted";
45
+
46
+ private final String metaFilePath; // path to BulNC-description.txt
47
+ private final TextProcessor tp = new TextProcessor();
48
+
49
+ /**
50
+ * @param metaFilePath absolute path to BulNC-description.txt
51
+ */
52
+ public BulNCProcessor(String metaFilePath) {
53
+ this.metaFilePath = metaFilePath;
54
+ }
55
+
56
+ /**
57
+ * @param indir root directory of the BulNC corpus
58
+ * @param outdir output directory for .txt files and metadata
59
+ */
60
+ @Override
61
+ public void process(String indir, String outdir) {
62
+ try {
63
+ JSONObject json = new JSONObject();
64
+ JSONArray descrArray = new JSONArray();
65
+
66
+ Scanner sme = new Scanner(new File(metaFilePath), "UTF-8");
67
+ while (sme.hasNextLine()) {
68
+ String[] dat = sme.nextLine().split("\t");
69
+
70
+ String relativePath = dat[1];
71
+ System.out.println("Checking: " + relativePath);
72
+
73
+ // --- Subcorpus filter ---
74
+ if (!isIncluded(relativePath)) continue;
75
+
76
+ // --- SETimes exclusion ---
77
+ if (dat[12].contains("setimes")) continue;
78
+
79
+ String fname = indir + relativePath;
80
+ File f = new File(fname);
81
+ if (!f.exists()) {
82
+ System.err.println("[MISSING] " + fname);
83
+ continue;
84
+ }
85
+
86
+ String tfname = "bg_bnc_" + dat[0];
87
+
88
+ JSONObject fdescr = newBaseDescriptor(tfname);
89
+ applyLicence(fdescr, relativePath);
90
+
91
+ fdescr.put("PublicationDate", dat[9].replaceAll("\\.", "-"));
92
+ fdescr.put("DocumentTitle", dat[8]);
93
+ fdescr.put("Author", dat[4]);
94
+ fdescr.put("Style", "Administrative");
95
+ fdescr.put("Type", dat[17]);
96
+ fdescr.put("Subdomain", dat.length > 21 ? dat[21] : "");
97
+ fdescr.put("TranslatedDocument", dat[13]);
98
+ fdescr.put("CollectionDate", dat[2]);
99
+ fdescr.put("Url", dat[12]);
100
+ fdescr.put("Domain", dat[19]);
101
+
102
+ Writer out = new OutputStreamWriter(
103
+ new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
104
+
105
+ Scanner s = new Scanner(f, "UTF-8");
106
+ int nw = 0, ns = 0, np = 0, nt = 0;
107
+
108
+ while (s.hasNextLine()) {
109
+ String text = s.nextLine();
110
+ np++;
111
+
112
+ out.write(text + "\n");
113
+ out.flush();
114
+
115
+ for (String sent : tp.splitToSentences(text)) {
116
+ ns++;
117
+ String[] words = sent.split(" ");
118
+ nw += words.length;
119
+ nt += estimateTokenCount(sent);
120
+ }
121
+ }
122
+
123
+ s.close();
124
+ out.flush();
125
+ out.close();
126
+
127
+ fdescr.put("NumberWords", nw);
128
+ fdescr.put("NumberSentences", ns);
129
+ fdescr.put("NumberParagraphs", np);
130
+ fdescr.put("NumberTokens", nt);
131
+
132
+ descrArray.add(fdescr);
133
+ }
134
+ sme.close();
135
+
136
+ json.put("metadata", descrArray);
137
+
138
+ System.out.println("Total documents processed: " + descrArray.size());
139
+ writeMetadata(json, outdir, "metadata_BNC_mm.json");
140
+
141
+ } catch (Exception e) {
142
+ e.printStackTrace();
143
+ }
144
+ }
145
+
146
+ // -----------------------------------------------------------------------
147
+ // Helpers
148
+ // -----------------------------------------------------------------------
149
+
150
/**
 * Returns true for subcorpora that should be processed.
 * As written, only paths containing "C-MassMedia/" are accepted.
 * Edit this method to change the filter.
 *
 * @param relativePath relative path of the document, taken from the
 *                     description file
 * @return true if the document belongs to an enabled subcorpus
 */
protected boolean isIncluded(String relativePath) {
    return relativePath.contains("C-MassMedia/");
    // Uncomment to add more subcorpora:
    // || relativePath.contains("A-Administrative/")
    // || relativePath.contains("B-Science/")
    // || relativePath.contains("D-Fiction/")
}
161
+
162
+ @SuppressWarnings("unchecked")
163
+ private void applyLicence(JSONObject fdescr, String relativePath) {
164
+ if (relativePath.contains("B-Science/")
165
+ || relativePath.contains("C-MassMedia/")
166
+ || relativePath.contains("D-Fiction/")) {
167
+ fdescr.put("Licence", RESTRICTED);
168
+ fdescr.put("LicenceLink", "");
169
+ } else {
170
+ fdescr.put("Licence", CC0_LICENCE);
171
+ fdescr.put("LicenceLink", CC0_LICENCE_LINK);
172
+ }
173
+ }
174
+
175
+ @SuppressWarnings("unchecked")
176
+ private void writeMetadata(JSONObject json, String outdir, String filename)
177
+ throws Exception {
178
+ String outMetaPath = outdir + filename;
179
+ Writer outMeta = new OutputStreamWriter(
180
+ new FileOutputStream(outMetaPath), "UTF-8");
181
+ json.writeJSONString(outMeta);
182
+ outMeta.flush();
183
+ outMeta.close();
184
+
185
+ convertJsonToCSV(json, outMetaPath + "_CSV.csv");
186
+ System.out.println("Metadata written to: " + outMetaPath);
187
+ }
188
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCWikiProcessor.java ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.File;
4
+ import java.io.FileOutputStream;
5
+ import java.io.OutputStreamWriter;
6
+ import java.io.Writer;
7
+ import java.util.Scanner;
8
+
9
+ import org.json.simple.JSONArray;
10
+ import org.json.simple.JSONObject;
11
+
12
+ import bg.bas.dcl.general.JSONProcessor;
13
+ import bg.bas.dcl.monolingual.bg.TextProcessor;
14
+
15
+ /**
16
+ * Processes the BulNC "F-InformalFiction" (Wiki/Informal) subcorpus.
17
+ *
18
+
19
+ */
20
+ public class BulNCWikiProcessor extends BaseSourceProcessor {
21
+
22
+ private static final String CC0_LICENCE = "CC0";
23
+ private static final String CC0_LICENCE_LINK =
24
+ "https://creativecommons.org/public-domain/cc0/";
25
+
26
+ private final String metaFilePath;
27
+ private final String existingMetaJson; // may be null
28
+ private final TextProcessor tp = new TextProcessor();
29
+
30
+ public BulNCWikiProcessor(String metaFilePath, String existingMetaJson) {
31
+ this.metaFilePath = metaFilePath;
32
+ this.existingMetaJson = existingMetaJson;
33
+ }
34
+
35
+ /**
36
+
37
+ */
38
+ @Override
39
+ public void process(String indir, String outdir) {
40
+ try {
41
+ // Load existing metadata if provided, otherwise start fresh
42
+ JSONObject json;
43
+ JSONArray descrArray;
44
+
45
+ if (existingMetaJson != null && new File(existingMetaJson).exists()) {
46
+ JSONProcessor jp = new JSONProcessor();
47
+ json = jp.readJSON(new File(existingMetaJson));
48
+ descrArray = (JSONArray) json.get("metadata");
49
+ System.out.println("Loaded existing metadata with "
50
+ + descrArray.size() + " entries.");
51
+ } else {
52
+ json = new JSONObject();
53
+ descrArray = new JSONArray();
54
+ json.put("metadata", descrArray);
55
+ }
56
+
57
+ int newDocs = 0;
58
+ long totalTokens = 0;
59
+
60
+ Scanner sme = new Scanner(new File(metaFilePath), "UTF-8");
61
+ while (sme.hasNextLine()) {
62
+ String[] dat = sme.nextLine().split("\t");
63
+
64
+ String relativePath = dat[1];
65
+ System.out.println("Checking: " + relativePath);
66
+
67
+ if (!relativePath.contains("F-InformalFiction")) continue;
68
+
69
+ String fname = indir + relativePath;
70
+ File f = new File(fname);
71
+ if (!f.exists()) {
72
+ System.err.println("[MISSING] " + fname);
73
+ continue;
74
+ }
75
+
76
+ String tfname = "bg_bnc_" + dat[0];
77
+
78
+ JSONObject fdescr = newBaseDescriptor(tfname);
79
+ fdescr.put("Licence", CC0_LICENCE);
80
+ fdescr.put("LicenceLink", CC0_LICENCE_LINK);
81
+ fdescr.put("PublicationDate", dat[9].replaceAll("\\.", "-"));
82
+ fdescr.put("DocumentTitle", dat[8]);
83
+ fdescr.put("Author", dat[4]);
84
+ fdescr.put("Style", "Administrative");
85
+ fdescr.put("Type", dat[17]);
86
+ fdescr.put("Subdomain", dat.length > 21 ? dat[21] : "");
87
+ fdescr.put("TranslatedDocument", dat[13]);
88
+ fdescr.put("CollectionDate", dat[2]);
89
+ fdescr.put("Url", dat[12]);
90
+ fdescr.put("Domain", dat[19]);
91
+
92
+ Writer out = new OutputStreamWriter(
93
+ new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
94
+
95
+ Scanner s = new Scanner(f, "UTF-8");
96
+ int nw = 0, ns = 0, np = 0, nt = 0;
97
+
98
+ while (s.hasNextLine()) {
99
+ String text = s.nextLine();
100
+ np++;
101
+
102
+ out.write(text + "\n");
103
+ out.flush();
104
+
105
+ for (String sent : tp.splitToSentences(text)) {
106
+ ns++;
107
+ String[] words = sent.split(" ");
108
+ nw += words.length;
109
+ nt += estimateTokenCount(sent);
110
+ }
111
+ }
112
+
113
+ s.close();
114
+ out.flush();
115
+ out.close();
116
+
117
+ fdescr.put("NumberWords", nw);
118
+ fdescr.put("NumberSentences", ns);
119
+ fdescr.put("NumberParagraphs", np);
120
+ fdescr.put("NumberTokens", nt);
121
+
122
+ descrArray.add(fdescr);
123
+ newDocs++;
124
+ totalTokens += nt;
125
+ }
126
+ sme.close();
127
+
128
+ System.out.println("New F-InformalFiction documents added: " + newDocs);
129
+ System.out.println("Total tokens in new documents: " + totalTokens);
130
+ System.out.println("Merged metadata total entries: " + descrArray.size());
131
+
132
+ writeMetadata(json, outdir, "metadata.json");
133
+
134
+ } catch (Exception e) {
135
+ e.printStackTrace();
136
+ }
137
+ }
138
+
139
+ // -----------------------------------------------------------------------
140
+
141
+ @SuppressWarnings("unchecked")
142
+ private void writeMetadata(JSONObject json, String outdir, String filename)
143
+ throws Exception {
144
+ String outMetaPath = outdir + filename;
145
+ Writer outMeta = new OutputStreamWriter(
146
+ new FileOutputStream(outMetaPath), "UTF-8");
147
+ json.writeJSONString(outMeta);
148
+ outMeta.flush();
149
+ outMeta.close();
150
+
151
+ System.out.println("Merged metadata written to: " + outMetaPath);
152
+
153
+ }
154
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/CurlicatProcessor.java ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.File;
4
+ import java.io.FileOutputStream;
5
+ import java.io.OutputStreamWriter;
6
+ import java.io.Writer;
7
+ import java.util.Scanner;
8
+
9
+ import org.json.simple.JSONArray;
10
+ import org.json.simple.JSONObject;
11
+
12
+ import bg.bas.dcl.general.FileHandler;
13
+
14
+ /**
15
+ * Processes the CURLICAT Bulgarian corpus.
16
+ *
17
+ * Input: CoNLL-UP files (.conllup) with richer inline metadata than MARCELL.
18
+ * Output: One plain-text .txt per document + metadata.json + metadata CSV.
19
+ *
20
+ * Metadata comment prefixes recognised:
21
+ * # PublicationDate = → PublicationDate
22
+ * # DocumentTitle = → DocumentTitle
23
+ * # Author = → Author
24
+ * # DocumentType = → Type
25
+ * # Url = → Url
26
+ * # Style = → Style
27
+ * # Domain = → Domain
28
+ * # Subdomain = → Subdomain
29
+ * # CollectionDate = → CollectionDate
30
+ * # License = → Licence (overrides default if present)
31
+ *
32
+ * Default licence: CC-BY-SA-4.0.
33
+ */
34
+ public class CurlicatProcessor extends BaseSourceProcessor {
35
+
36
+ private static final String DEFAULT_LICENCE = "CC-BY-SA-4.0";
37
+ private static final String DEFAULT_LICENCE_LINK =
38
+ "https://elrc-share.eu/static/metashare/licences/CC-BY-SA-4.0.pdf";
39
+ private static final String PREFIX = "bg_CURLICAT_";
40
+ private static final String EXT = ".conllup";
41
+
42
+ @Override
43
+ public void process(String indir, String outdir) {
44
+ try {
45
+ FileHandler fh = new FileHandler();
46
+ JSONObject json = new JSONObject();
47
+ JSONArray descrArray = new JSONArray();
48
+
49
+ for (File f : fh.getFileListing(new File(indir))) {
50
+ if (!f.isFile()) continue;
51
+
52
+ System.out.println("Processing: " + f.getAbsolutePath());
53
+
54
+ String tfname = PREFIX + f.getName().replace(EXT, "");
55
+
56
+ JSONObject fdescr = newBaseDescriptor(tfname);
57
+ fdescr.put("Licence", DEFAULT_LICENCE);
58
+ fdescr.put("LicenceLink", DEFAULT_LICENCE_LINK);
59
+
60
+ Writer out = new OutputStreamWriter(
61
+ new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
62
+
63
+ Scanner s = new Scanner(f, "UTF-8");
64
+ int nw = 0, ns = 0, np = 0, nt = 0;
65
+
66
+ while (s.hasNextLine()) {
67
+ String line = s.nextLine();
68
+
69
+ // --- Metadata extraction ---
70
+ if (line.startsWith("# PublicationDate =")) {
71
+ fdescr.put("PublicationDate",
72
+ line.replace("# PublicationDate =", "").trim());
73
+ } else if (line.startsWith("# DocumentTitle =")) {
74
+ fdescr.put("DocumentTitle",
75
+ line.replace("# DocumentTitle =", "").trim());
76
+ } else if (line.startsWith("# Author =")) {
77
+ fdescr.put("Author",
78
+ line.replace("# Author =", "").trim());
79
+ } else if (line.startsWith("# DocumentType =")) {
80
+ fdescr.put("Type",
81
+ line.replace("# DocumentType =", "").trim());
82
+ } else if (line.startsWith("# Url =")) {
83
+ fdescr.put("Url",
84
+ line.replace("# Url =", "").trim());
85
+ } else if (line.startsWith("# Style =")) {
86
+ fdescr.put("Style",
87
+ line.replace("# Style =", "").trim());
88
+ } else if (line.startsWith("# Domain =")) {
89
+ fdescr.put("Domain",
90
+ line.replace("# Domain =", "").trim());
91
+ } else if (line.startsWith("# Subdomain =")) {
92
+ fdescr.put("Subdomain",
93
+ line.replace("# Subdomain =", "").trim());
94
+ } else if (line.startsWith("# CollectionDate =")) {
95
+ fdescr.put("CollectionDate",
96
+ line.replace("# CollectionDate =", "").trim());
97
+ } else if (line.startsWith("# License =")) {
98
+ // Override default licence if the file declares one
99
+ fdescr.put("Licence",
100
+ line.replace("# License =", "").trim());
101
+ }
102
+
103
+ // --- Structure counting ---
104
+ else if (line.startsWith("# sent_id =")) {
105
+ ns++;
106
+ } else if (line.startsWith("# newpar id =")) {
107
+ np++;
108
+ out.write("\n");
109
+ }
110
+
111
+ // --- Text output ---
112
+ else if (line.startsWith("# text =")) {
113
+ out.write(line.replace("# text =", "").trim() + "\n");
114
+ out.flush();
115
+ } else {
116
+ // CoNLL-UP token line
117
+ String[] cols = line.split("\t");
118
+ if (cols.length > 5) {
119
+ nt++;
120
+ if (!cols[3].equals("PUNCT")) nw++;
121
+ }
122
+ }
123
+ }
124
+
125
+ s.close();
126
+ out.flush();
127
+ out.close();
128
+
129
+ fdescr.put("NumberWords", nw);
130
+ fdescr.put("NumberSentences", ns);
131
+ fdescr.put("NumberParagraphs", np);
132
+ fdescr.put("NumberTokens", nt);
133
+
134
+ descrArray.add(fdescr);
135
+ }
136
+
137
+ json.put("metadata", descrArray);
138
+ writeMetadata(json, outdir, "metadata_CC.json");
139
+
140
+ } catch (Exception e) {
141
+ e.printStackTrace();
142
+ }
143
+ }
144
+
145
+ // -----------------------------------------------------------------------
146
+
147
+ @SuppressWarnings("unchecked")
148
+ private void writeMetadata(JSONObject json, String outdir, String filename)
149
+ throws Exception {
150
+ String outMetaPath = outdir + filename;
151
+ Writer outMeta = new OutputStreamWriter(
152
+ new FileOutputStream(outMetaPath), "UTF-8");
153
+ json.writeJSONString(outMeta);
154
+ outMeta.flush();
155
+ outMeta.close();
156
+
157
+ convertJsonToCSV(json, outMetaPath + "_CSV.csv");
158
+ System.out.println("Metadata written to: " + outMetaPath);
159
+ }
160
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/DocumentMetadata.java ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.util.ArrayList;
4
+ import java.util.Arrays;
5
+ import java.util.Collections;
6
+ import java.util.List;
7
+
8
+ import org.json.simple.JSONArray;
9
+ import org.json.simple.JSONObject;
10
+
11
+ /**
12
+ * DocumentMetadata
13
+ *
14
+ * Canonical in-memory representation of the ifGPT dataset metadata schema.
15
+
16
+ */
17
+ @SuppressWarnings("unchecked")
18
+ public class DocumentMetadata {
19
+
20
+ // -----------------------------------------------------------------------
21
+ // ── MANDATORY (15) ──────────────────────────────────────────────────────
22
+ // -----------------------------------------------------------------------
23
+
24
+ /** Unique document identifier with the language prefix "bg". */
25
+ private String identifier = "";
26
+
27
+ /** Licence name (open, restricted, …). */
28
+ private String licence = "";
29
+
30
+ /** Publication date yyyy-mm-dd. */
31
+ private String publicationDate = "";
32
+
33
+ /** Title of the document. */
34
+ private String documentTitle = "";
35
+
36
+ /** Publishing organisation / media outlet / institutional originator. */
37
+ private String source = "";
38
+
39
+ /** Modality: "textual" | "multimodal". */
40
+ private String medium = "textual";
41
+
42
+ /** Original web address. */
43
+ private String url = "";
44
+
45
+ /** Up to six subject-area labels from a controlled vocabulary. */
46
+ private List<String> domain = new ArrayList<>();
47
+
48
+ /** Up to six free-text keywords. */
49
+ private List<String> keywords = new ArrayList<>();
50
+
51
+ /** Total word count (non-punctuation tokens). */
52
+ private int numberWords = 0;
53
+
54
+ /** Total sentence count. */
55
+ private int numberSentences = 0;
56
+
57
+ /** Total paragraph count. */
58
+ private int numberParagraphs = 0;
59
+
60
+ /** Total token count (words + punctuation). */
61
+ private int numberTokens = 0;
62
+
63
+ /**
64
+ * Per-sentence PII coverage vector.
65
+ * Entry i = proportion of tokens in sentence i flagged as PII ∈ [0,1].
66
+ * Length == numberSentences after pipeline completion.
67
+ */
68
+ private List<Double> piiVector = new ArrayList<>();
69
+
70
+ /**
71
+ * Per-sentence bias coverage vector.
72
+ * Entry i = proportion of tokens in sentence i flagged as biased ∈ [0,1].
73
+ * Length == numberSentences after pipeline completion.
74
+ */
75
+ private List<Double> biasVector = new ArrayList<>();
76
+
77
+ // -----------------------------------------------------------------------
78
+ // ── OPTIONAL (8) ────────────────────────────────────────────────────────
79
+ // -----------------------------------------------------------------------
80
+
81
+ /** Name(s) of the author(s). */
82
+ private List<String> author = new ArrayList<>();
83
+
84
+ /** Stylistic register: legal | journalistic | administrative | … */
85
+ private String style = "";
86
+
87
+ /** Document genre: book | document | article | … */
88
+ private String type = "";
89
+
90
+ /** Narrower thematic classification, hierarchically linked to Domain. */
91
+ private List<String> subdomain = new ArrayList<>();
92
+
93
+ /** true = translation, false = original Bulgarian text. */
94
+ private Boolean translatedDocument = null; // null = unknown
95
+
96
+ /** Date of acquisition yyyy-mm-dd. */
97
+ private String collectionDate = "";
98
+
99
+ /** URL of the licence text. */
100
+ private String licenceLink = "";
101
+
102
+ /** Anticipated NLP applications from a predefined list. */
103
+ private List<String> taskCategories = new ArrayList<>();
104
+
105
+ // -----------------------------------------------------------------------
106
+ // Constructor
107
+ // -----------------------------------------------------------------------
108
+
109
+ public DocumentMetadata() {}
110
+
111
+ public DocumentMetadata(String identifier) {
112
+ this.identifier = identifier;
113
+ }
114
+
115
+ // -----------------------------------------------------------------------
116
+ // Fluent setters — mandatory
117
+ // -----------------------------------------------------------------------
118
+
119
+ public DocumentMetadata setIdentifier(String v) { identifier = v; return this; }
120
+ public DocumentMetadata setLicence(String v) { licence = v; return this; }
121
+ public DocumentMetadata setPublicationDate(String v) { publicationDate = v; return this; }
122
+ public DocumentMetadata setDocumentTitle(String v) { documentTitle = v; return this; }
123
+ public DocumentMetadata setSource(String v) { source = v; return this; }
124
+ public DocumentMetadata setMedium(String v) { medium = v; return this; }
125
+ public DocumentMetadata setUrl(String v) { url = v; return this; }
126
+ public DocumentMetadata setDomain(List<String> v) { domain = v != null ? v : new ArrayList<>(); return this; }
127
+ public DocumentMetadata addDomain(String v) { domain.add(v); return this; }
128
+ public DocumentMetadata setKeywords(List<String> v) { keywords = v != null ? v : new ArrayList<>(); return this; }
129
+ public DocumentMetadata addKeyword(String v) { keywords.add(v); return this; }
130
+ public DocumentMetadata setNumberWords(int v) { numberWords = v; return this; }
131
+ public DocumentMetadata setNumberSentences(int v) { numberSentences = v; return this; }
132
+ public DocumentMetadata setNumberParagraphs(int v) { numberParagraphs = v; return this; }
133
+ public DocumentMetadata setNumberTokens(int v) { numberTokens = v; return this; }
134
+ public DocumentMetadata setPiiVector(List<Double> v) { piiVector = v != null ? v : new ArrayList<>(); return this; }
135
+ public DocumentMetadata setBiasVector(List<Double> v) { biasVector = v != null ? v : new ArrayList<>(); return this; }
136
+
137
+ // Fluent setters — optional
138
+ public DocumentMetadata setAuthor(List<String> v) { author = v != null ? v : new ArrayList<>(); return this; }
139
+ public DocumentMetadata addAuthor(String v) { author.add(v); return this; }
140
+ public DocumentMetadata setStyle(String v) { style = v; return this; }
141
+ public DocumentMetadata setType(String v) { type = v; return this; }
142
+ public DocumentMetadata setSubdomain(List<String> v) { subdomain = v != null ? v : new ArrayList<>(); return this; }
143
+ public DocumentMetadata addSubdomain(String v) { subdomain.add(v); return this; }
144
+ public DocumentMetadata setTranslatedDocument(Boolean v) { translatedDocument= v; return this; }
145
+ public DocumentMetadata setCollectionDate(String v) { collectionDate = v; return this; }
146
+ public DocumentMetadata setLicenceLink(String v) { licenceLink = v; return this; }
147
+ public DocumentMetadata setTaskCategories(List<String> v) { taskCategories = v != null ? v : new ArrayList<>(); return this; }
148
+ public DocumentMetadata addTaskCategory(String v) { taskCategories.add(v); return this; }
149
+
150
+ // -----------------------------------------------------------------------
151
+ // Getters
152
+ // -----------------------------------------------------------------------
153
+
154
+ public String getIdentifier() { return identifier; }
155
+ public String getLicence() { return licence; }
156
+ public String getPublicationDate() { return publicationDate; }
157
+ public String getDocumentTitle() { return documentTitle; }
158
+ public String getSource() { return source; }
159
+ public String getMedium() { return medium; }
160
+ public String getUrl() { return url; }
161
+ public List<String> getDomain() { return Collections.unmodifiableList(domain); }
162
+ public List<String> getKeywords() { return Collections.unmodifiableList(keywords); }
163
+ public int getNumberWords() { return numberWords; }
164
+ public int getNumberSentences() { return numberSentences; }
165
+ public int getNumberParagraphs() { return numberParagraphs; }
166
+ public int getNumberTokens() { return numberTokens; }
167
+ public List<Double> getPiiVector() { return Collections.unmodifiableList(piiVector); }
168
+ public List<Double> getBiasVector() { return Collections.unmodifiableList(biasVector); }
169
+
170
+ public List<String> getAuthor() { return Collections.unmodifiableList(author); }
171
+ public String getStyle() { return style; }
172
+ public String getType() { return type; }
173
+ public List<String> getSubdomain() { return Collections.unmodifiableList(subdomain); }
174
+ public Boolean getTranslatedDocument(){ return translatedDocument; }
175
+ public String getCollectionDate() { return collectionDate; }
176
+ public String getLicenceLink() { return licenceLink; }
177
+ public List<String> getTaskCategories() { return Collections.unmodifiableList(taskCategories); }
178
+
179
+ // -----------------------------------------------------------------------
180
+ // Validation
181
+ // -----------------------------------------------------------------------
182
+
183
+ /**
184
+ * Returns a list of missing mandatory fields.
185
+ * An empty list means the record is complete.
186
+ */
187
+ public List<String> missingMandatoryFields() {
188
+ List<String> missing = new ArrayList<>();
189
+ if (identifier.isBlank()) missing.add("Identifier");
190
+ if (licence.isBlank()) missing.add("Licence");
191
+ if (medium.isBlank()) missing.add("Medium");
192
+ if (numberWords == 0) missing.add("NumberWords");
193
+ if (numberSentences == 0) missing.add("NumberSentences");
194
+ if (numberParagraphs == 0) missing.add("NumberParagraphs");
195
+ if (numberTokens == 0) missing.add("NumberTokens");
196
+ // piiVector and biasVector may legitimately be empty for clean docs
197
+ return missing;
198
+ }
199
+
200
+ // -----------------------------------------------------------------------
201
+ // JSON serialisation (json-simple)
202
+ // -----------------------------------------------------------------------
203
+
204
+ /** Serialises this record to a json-simple JSONObject. */
205
+ public JSONObject toJson() {
206
+ JSONObject o = new JSONObject();
207
+
208
+ // Mandatory
209
+ o.put("Identifier", identifier);
210
+ o.put("Licence", licence);
211
+ o.put("PublicationDate", publicationDate);
212
+ o.put("DocumentTitle", documentTitle);
213
+ o.put("Source", source);
214
+ o.put("Medium", medium);
215
+ o.put("Url", url);
216
+ o.put("Domain", toJsonArray(domain));
217
+ o.put("Keywords", toJsonArray(keywords));
218
+ o.put("NumberWords", numberWords);
219
+ o.put("NumberSentences", numberSentences);
220
+ o.put("NumberParagraphs", numberParagraphs);
221
+ o.put("NumberTokens", numberTokens);
222
+ o.put("PersonallyIdentifiableInformation",toJsonDoubleArray(piiVector));
223
+ o.put("BiasedInformation", toJsonDoubleArray(biasVector));
224
+
225
+ // Optional
226
+ o.put("Author", toJsonArray(author));
227
+ o.put("Style", style);
228
+ o.put("Type", type);
229
+ o.put("Subdomain", toJsonArray(subdomain));
230
+ o.put("TranslatedDocument",
231
+ translatedDocument == null ? "" : translatedDocument.toString());
232
+ o.put("CollectionDate", collectionDate);
233
+ o.put("LicenceLink", licenceLink);
234
+ o.put("TaskCategories", toJsonArray(taskCategories));
235
+
236
+ return o;
237
+ }
238
+
239
+ /**
240
+ * Populates a DocumentMetadata from a json-simple JSONObject previously
241
+ * produced by {@link #toJson()}.
242
+ */
243
+ public static DocumentMetadata fromJson(JSONObject o) {
244
+ DocumentMetadata m = new DocumentMetadata();
245
+
246
+ m.identifier = str(o, "Identifier");
247
+ m.licence = str(o, "Licence");
248
+ m.publicationDate = str(o, "PublicationDate");
249
+ m.documentTitle = str(o, "DocumentTitle");
250
+ m.source = str(o, "Source");
251
+ m.medium = str(o, "Medium");
252
+ m.url = str(o, "Url");
253
+ m.domain = strList(o, "Domain");
254
+ m.keywords = strList(o, "Keywords");
255
+ m.numberWords = intVal(o, "NumberWords");
256
+ m.numberSentences = intVal(o, "NumberSentences");
257
+ m.numberParagraphs = intVal(o, "NumberParagraphs");
258
+ m.numberTokens = intVal(o, "NumberTokens");
259
+ m.piiVector = doubleList(o, "PersonallyIdentifiableInformation");
260
+ m.biasVector = doubleList(o, "BiasedInformation");
261
+
262
+ m.author = strList(o, "Author");
263
+ m.style = str(o, "Style");
264
+ m.type = str(o, "Type");
265
+ m.subdomain = strList(o, "Subdomain");
266
+ String td = str(o, "TranslatedDocument");
267
+ m.translatedDocument= td.isBlank() ? null : Boolean.parseBoolean(td);
268
+ m.collectionDate = str(o, "CollectionDate");
269
+ m.licenceLink = str(o, "LicenceLink");
270
+ m.taskCategories = strList(o, "TaskCategories");
271
+
272
+ return m;
273
+ }
274
+
275
+ // -----------------------------------------------------------------------
276
+ // Interop with legacy JSONObject format (used by source processors)
277
+ // -----------------------------------------------------------------------
278
+
279
+ /**
280
+ * Merges fields from a legacy source-processor JSONObject (the format
281
+ * produced by MarcellProcessor, BulNCProcessor, etc.) into this record.
282
+ * Fields already set on {@code this} are NOT overwritten.
283
+ */
284
+ public void mergeLegacy(JSONObject legacy) {
285
+ if (identifier.isBlank()) setIdentifier(str(legacy, "Identifier"));
286
+ if (licence.isBlank()) setLicence(str(legacy, "Licence"));
287
+ if (licenceLink.isBlank()) setLicenceLink(str(legacy, "LicenceLink"));
288
+ if (publicationDate.isBlank()) setPublicationDate(str(legacy, "PublicationDate"));
289
+ if (documentTitle.isBlank()) setDocumentTitle(str(legacy, "DocumentTitle"));
290
+ if (source.isBlank()) setSource(str(legacy, "Source"));
291
+ if (url.isBlank()) setUrl(str(legacy, "Url"));
292
+ if (style.isBlank()) setStyle(str(legacy, "Style"));
293
+ if (type.isBlank()) setType(str(legacy, "Type"));
294
+ if (collectionDate.isBlank()) setCollectionDate(str(legacy, "CollectionDate"));
295
+
296
+ if (author.isEmpty()) {
297
+ String a = str(legacy, "Author");
298
+ if (!a.isBlank()) author.add(a);
299
+ }
300
+ if (domain.isEmpty()) {
301
+ String d = str(legacy, "Domain");
302
+ if (!d.isBlank()) domain.add(d);
303
+ }
304
+ if (subdomain.isEmpty()) {
305
+ String s = str(legacy, "Subdomain");
306
+ if (!s.isBlank()) subdomain.add(s);
307
+ }
308
+ if (numberWords == 0) numberWords = intVal(legacy, "NumberWords");
309
+ if (numberSentences == 0) numberSentences = intVal(legacy, "NumberSentences");
310
+ if (numberParagraphs == 0) numberParagraphs = intVal(legacy, "NumberParagraphs");
311
+ if (numberTokens == 0) numberTokens = intVal(legacy, "NumberTokens");
312
+
313
+ String translated = str(legacy, "TranslatedDocument");
314
+ if (translatedDocument == null && !translated.isBlank())
315
+ translatedDocument = Boolean.parseBoolean(translated);
316
+ }
317
+
318
+ // -----------------------------------------------------------------------
319
+ // Private helpers
320
+ // -----------------------------------------------------------------------
321
+
322
+ private static String str(JSONObject o, String key) {
323
+ Object v = o.get(key);
324
+ return v == null ? "" : v.toString().trim();
325
+ }
326
+
327
+ private static int intVal(JSONObject o, String key) {
328
+ Object v = o.get(key);
329
+ if (v == null) return 0;
330
+ try { return Integer.parseInt(v.toString().trim()); }
331
+ catch (NumberFormatException e) { return 0; }
332
+ }
333
+
334
+ private static List<String> strList(JSONObject o, String key) {
335
+ Object v = o.get(key);
336
+ List<String> list = new ArrayList<>();
337
+ if (v instanceof JSONArray) {
338
+ for (Object item : (JSONArray) v)
339
+ if (item != null) list.add(item.toString());
340
+ } else if (v != null && !v.toString().isBlank()) {
341
+ list.add(v.toString().trim());
342
+ }
343
+ return list;
344
+ }
345
+
346
+ private static List<Double> doubleList(JSONObject o, String key) {
347
+ Object v = o.get(key);
348
+ List<Double> list = new ArrayList<>();
349
+ if (v instanceof JSONArray) {
350
+ for (Object item : (JSONArray) v) {
351
+ try { list.add(Double.parseDouble(item.toString())); }
352
+ catch (NumberFormatException ignored) {}
353
+ }
354
+ }
355
+ return list;
356
+ }
357
+
358
+ private JSONArray toJsonArray(List<String> list) {
359
+ JSONArray a = new JSONArray();
360
+ if (list != null) a.addAll(list);
361
+ return a;
362
+ }
363
+
364
+ private JSONArray toJsonDoubleArray(List<Double> list) {
365
+ JSONArray a = new JSONArray();
366
+ if (list != null) a.addAll(list);
367
+ return a;
368
+ }
369
+
370
+ @Override
371
+ public String toString() {
372
+ return String.format(
373
+ "DocumentMetadata{id='%s', sentences=%d, words=%d, piiEntries=%d, biasEntries=%d}",
374
+ identifier, numberSentences, numberWords, piiVector.size(), biasVector.size());
375
+ }
376
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTDatasetProcessor.java ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ /**
4
+ * IfGPTDatasetProcessor
5
+ *
6
+
7
+ */
8
+ public class IfGPTDatasetProcessor {
9
+
10
+ // -----------------------------------------------------------------------
11
+ // Shared paths
12
+ // -----------------------------------------------------------------------
13
+
14
+ // New batch being ingested
15
+ static final String NEW_DATA_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/data/";
16
+ static final String NEW_META_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/metadata/";
17
+ static final String SAMPLE_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/sample/";
18
+ static final String BLOCKLIST_FILE = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/blocklist.txt";
19
+ static final String DEDUP_REPORT = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/dedup_report.tsv";
20
+
21
+ // Shared resources
22
+ static final String BULNC_META_FILE = "/home/ivelina/SVN_CORPUS/BulNC/BulNC-description.txt";
23
+ static final String BIAS_DICT = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
24
+ + "bulgarian_bias_dictionary_v4.tsv";
25
+
26
+ // -----------------------------------------------------------------------
27
+ // Main
28
+ // -----------------------------------------------------------------------
29
+
30
+ public static void main(String[] args) {
31
+
32
+ // ==================================================================
33
+ // MODE A — FULL PIPELINE (one call runs all 8 stages)
34
+ // ==================================================================
35
+ // Choose the source processor that matches the new batch format,
36
+ // then call pipeline.run().
37
+
38
+ // --- BulNC Mass Media batch ---
39
+ runBulNCPipeline();
40
+
41
+ // --- MARCELL batch ---
42
+ // runMarcellPipeline();
43
+
44
+ // --- CURLICAT batch ---
45
+ // runCurlicatPipeline();
46
+
47
+ // --- BulNC Wiki/InformalFiction batch ---
48
+ // runBulNCWikiPipeline();
49
+
50
+
51
+ // ==================================================================
52
+ // MODE B — INDIVIDUAL STAGES
53
+ // ==================================================================
54
+
55
+ // --- 1. Extract only ---
56
+ // new BulNCProcessor(BULNC_META_FILE).process(NEW_DATA_DIR, NEW_META_DIR);
57
+
58
+ // --- 3. Clean only (learn + apply) ---
59
+ // FileCleanProcessor fcp = new FileCleanProcessor(0.50);
60
+ // fcp.learnFromSample(SAMPLE_DIR);
61
+ // fcp.printTopCommonLines(30);
62
+ // fcp.saveBlocklist(BLOCKLIST_FILE);
63
+ // fcp.cleanDirectory(NEW_DATA_DIR, true);
64
+
65
+ // --- 4. Deduplication only ---
66
+ // DeduplicationProcessor dp = new DeduplicationProcessor(0.90, 5, 200);
67
+ // dp.indexCorpus(IfGPTPipeline.FULL_DATA_DIR);
68
+ // dp.detectDuplicates(NEW_DATA_DIR, DEDUP_REPORT);
69
+ // dp.removeDuplicatesFromNewFolder(NEW_DATA_DIR, true); // optional
70
+
71
+ // --- 5/6. PII + Bias annotation only (on already-split sentences) ---
72
+ // bg.bas.dcl.LLMs.BulgarianSentenceSplitter splitter =
73
+ // new bg.bas.dcl.LLMs.BulgarianSentenceSplitter();
74
+ // bg.bas.dcl.LLMs.PIIDetector pii = new bg.bas.dcl.LLMs.PIIDetector(splitter);
75
+ // pii.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "pii_report.tsv");
76
+ //
77
+ // bg.bas.dcl.LLMs.BiasLexicon lex =
78
+ // new bg.bas.dcl.LLMs.BiasLexicon(BIAS_DICT);
79
+ // bg.bas.dcl.LLMs.BiasAnalyser bias =
80
+ // new bg.bas.dcl.LLMs.BiasAnalyser(lex, splitter);
81
+ // bias.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "bias_report.tsv");
82
+
83
+
84
+ // ==================================================================
85
+ // MODE C — UTILITIES
86
+ // ==================================================================
87
+
88
+ // Convert an existing metadata JSON to CSV
89
+ // new MarcellProcessor().convertJsonToCSV(
90
+ // IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json");
91
+ }
92
+
93
+ // -----------------------------------------------------------------------
94
+ // Pipeline factory methods (one per source type)
95
+ // -----------------------------------------------------------------------
96
+
97
+ private static void runBulNCPipeline() {
98
+ new IfGPTPipeline()
99
+ .setSourceProcessor(new BulNCProcessor(BULNC_META_FILE))
100
+ .setNewDataDir(NEW_DATA_DIR)
101
+ .setSampleDir(SAMPLE_DIR)
102
+ .setNewMetaDir(NEW_META_DIR)
103
+ .setBlocklistFile(BLOCKLIST_FILE)
104
+ .setDedupReport(DEDUP_REPORT)
105
+ .setBiasDictPath(BIAS_DICT)
106
+ .setBoilerplateThreshold(0.50)
107
+ .setDedupThreshold(0.90)
108
+ .setRemoveDuplicates(false) // set true to delete dup sentences
109
+ .setKeepBackups(true)
110
+ .run();
111
+ }
112
+
113
+ private static void runMarcellPipeline() {
114
+ String indirMarcell = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/bg-annotated/";
115
+ String outdirMarcell= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/texts/";
116
+
117
+ new IfGPTPipeline()
118
+ .setSourceProcessor(new MarcellProcessor())
119
+ .setNewDataDir(outdirMarcell)
120
+ .setSampleDir(SAMPLE_DIR)
121
+ .setNewMetaDir(NEW_META_DIR)
122
+ .setBlocklistFile(BLOCKLIST_FILE)
123
+ .setDedupReport(DEDUP_REPORT)
124
+ .setBiasDictPath(BIAS_DICT)
125
+ .setSkipClean(false)
126
+ .setSkipDedup(false)
127
+ .run();
128
+ }
129
+
130
+ private static void runCurlicatPipeline() {
131
+ String indirCurlicat = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/archive/"
132
+ + "Bulgarian_Curlicat_corpus/";
133
+ String outdirCurlicat= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/texts/";
134
+
135
+ new IfGPTPipeline()
136
+ .setSourceProcessor(new CurlicatProcessor())
137
+ .setNewDataDir(outdirCurlicat)
138
+ .setSampleDir(SAMPLE_DIR)
139
+ .setNewMetaDir(NEW_META_DIR)
140
+ .setBlocklistFile(BLOCKLIST_FILE)
141
+ .setDedupReport(DEDUP_REPORT)
142
+ .setBiasDictPath(BIAS_DICT)
143
+ .run();
144
+ }
145
+
146
+ private static void runBulNCWikiPipeline() {
147
+ String existingMeta = IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json";
148
+ String outdirWiki = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/BulNC/wiki-texts/";
149
+
150
+ new IfGPTPipeline()
151
+ .setSourceProcessor(new BulNCWikiProcessor(BULNC_META_FILE, existingMeta))
152
+ .setNewDataDir(outdirWiki)
153
+ .setSampleDir(SAMPLE_DIR)
154
+ .setNewMetaDir(NEW_META_DIR)
155
+ .setBlocklistFile(BLOCKLIST_FILE)
156
+ .setDedupReport(DEDUP_REPORT)
157
+ .setBiasDictPath(BIAS_DICT)
158
+ .run();
159
+ }
160
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTPipeline.java ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.File;
4
+ import java.io.FileOutputStream;
5
+ import java.io.OutputStreamWriter;
6
+ import java.io.Writer;
7
+ import java.nio.charset.StandardCharsets;
8
+ import java.nio.file.Files;
9
+ import java.nio.file.StandardCopyOption;
10
+ import java.util.ArrayList;
11
+ import java.util.List;
12
+ import java.util.Properties;
13
+ import java.util.Scanner;
14
+
15
+ import org.json.simple.JSONArray;
16
+ import org.json.simple.JSONObject;
17
+
18
+ import bg.bas.dcl.LLMs.BiasAnalyser;
19
+ import bg.bas.dcl.LLMs.BiasLexicon;
20
+ import bg.bas.dcl.LLMs.BulgarianSentenceSplitter;
21
+ import bg.bas.dcl.LLMs.PIIDetector;
22
+ import bg.bas.dcl.LLMs.SentenceBiasScore;
23
+ import bg.bas.dcl.general.FileHandler;
24
+ import bg.bas.dcl.general.JSONProcessor;
25
+
26
+ /**
27
+ * IfGPTPipeline
28
+ *
29
+ * Pipeline for the ifGPT Bulgarian language dataset.
30
+ *
31
+ * -----------------------------------------------------------------------
32
+ -----------------------------------------------------------------------
33
+ * PIPELINE STAGES (executed in order by {@link #run()})
34
+ *
35
+ * 1. EXTRACT
36
+ * 2. SPLIT
37
+ * 3. CLEAN
38
+ * 4. DEDUPLICATE
39
+ * 5. PII
40
+ * 6. BIAS
41
+ * 7. COUNTS — word / sentence / token counts are recomputed on the cleaned, deduplicated text
42
+ * FULL_DATA_DIR / FULL_META_DIR
43
+ *
44
+ * -----------------------------------------------------------------------
45
+
46
+ */
47
+ @SuppressWarnings("unchecked")
48
+ public class IfGPTPipeline {
49
+
50
+ // -----------------------------------------------------------------------
51
+ // Fixed paths
52
+ // -----------------------------------------------------------------------
53
+
54
+ public static final String FULL_DATA_DIR =
55
+ "/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-DATA/";
56
+ public static final String FULL_META_DIR =
57
+ "/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-METADATA/";
58
+
59
+ // -----------------------------------------------------------------------
60
+ // Configurable paths and options
61
+ // -----------------------------------------------------------------------
62
+
63
+ private SourceProcessor sourceProcessor; // mandatory
64
+ private String newDataDir; // mandatory: incoming texts
65
+ private String sampleDir; // mandatory: boilerplate sample
66
+ private String newMetaDir; // mandatory: output metadata
67
+ private String blocklistFile; // boilerplate blocklist file
68
+ private String dedupReport; // dedup TSV report path
69
+ private String biasDictPath; // bias dictionary TSV
70
+ private String openNlpModelPath = null; // null = bundled JAR model
71
+ private double boilerplateThreshold = 0.50; // FileCleanProcessor threshold
72
+ private double dedupThreshold = 0.90; // DeduplicationProcessor threshold
73
+ private int dedupShingleSize = 5;
74
+ private int dedupNumHashes = 200;
75
+ private boolean removeDuplicates = false; // whether to strip dup sentences
76
+ private boolean keepBackups = true; // keep .bak on file modification
77
+ private boolean skipClean = false; // skip boilerplate cleaning
78
+ private boolean skipDedup = false; // skip deduplication
79
+ private boolean skipPii = false; // skip PII scoring
80
+ private boolean skipBias = false; // skip bias scoring
81
+
82
+ // -----------------------------------------------------------------------
83
+ //
84
+ // -----------------------------------------------------------------------
85
+
86
+ public IfGPTPipeline setSourceProcessor(SourceProcessor p) { sourceProcessor = p; return this; }
87
+ public IfGPTPipeline setNewDataDir(String p) { newDataDir = p; return this; }
88
+ public IfGPTPipeline setSampleDir(String p) { sampleDir = p; return this; }
89
+ public IfGPTPipeline setNewMetaDir(String p) { newMetaDir = p; return this; }
90
+ public IfGPTPipeline setBlocklistFile(String p) { blocklistFile = p; return this; }
91
+ public IfGPTPipeline setDedupReport(String p) { dedupReport = p; return this; }
92
+ public IfGPTPipeline setBiasDictPath(String p) { biasDictPath = p; return this; }
93
+ public IfGPTPipeline setOpenNlpModelPath(String p) { openNlpModelPath = p; return this; }
94
+ public IfGPTPipeline setBoilerplateThreshold(double t) { boilerplateThreshold = t; return this; }
95
+ public IfGPTPipeline setDedupThreshold(double t) { dedupThreshold = t; return this; }
96
+ public IfGPTPipeline setDedupShingleSize(int n) { dedupShingleSize = n; return this; }
97
+ public IfGPTPipeline setDedupNumHashes(int n) { dedupNumHashes = n; return this; }
98
+ public IfGPTPipeline setRemoveDuplicates(boolean b) { removeDuplicates = b; return this; }
99
+ public IfGPTPipeline setKeepBackups(boolean b) { keepBackups = b; return this; }
100
+ public IfGPTPipeline setSkipClean(boolean b) { skipClean = b; return this; }
101
+ public IfGPTPipeline setSkipDedup(boolean b) { skipDedup = b; return this; }
102
+ public IfGPTPipeline setSkipPii(boolean b) { skipPii = b; return this; }
103
+ public IfGPTPipeline setSkipBias(boolean b) { skipBias = b; return this; }
104
+
105
+ // -----------------------------------------------------------------------
106
+ // -----------------------------------------------------------------------
107
+
108
+ /**
109
+ * Executes all stages in order.
110
+ * Throws {@link IllegalStateException} if mandatory configuration is missing.
111
+ */
112
+ public void run() {
113
+ validateConfig();
114
+ ensureDirs(newMetaDir, FULL_DATA_DIR, FULL_META_DIR);
115
+
116
+ banner("STAGE 1 — SOURCE EXTRACTION");
117
+ runExtraction();
118
+
119
+ // Shared NLP components (initialised once, reused across stages)
120
+ BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter(openNlpModelPath);
121
+
122
+ banner("STAGE 2 — SENTENCE SPLITTING & INITIAL METADATA");
123
+ runSentenceSplitting(splitter);
124
+
125
+ if (!skipClean) {
126
+ banner("STAGE 3 — BOILERPLATE CLEANING");
127
+ runCleaning();
128
+ } else {
129
+ log("STAGE 3 skipped (skipClean=true)");
130
+ }
131
+
132
+ if (!skipDedup) {
133
+ banner("STAGE 4 — DEDUPLICATION");
134
+ runDeduplication();
135
+ } else {
136
+ log("STAGE 4 skipped (skipDedup=true)");
137
+ }
138
+
139
+ PIIDetector piiDetector = skipPii ? null : new PIIDetector(splitter);
140
+ BiasAnalyser biasAnalyser = skipBias ? null : buildBiasAnalyser(splitter);
141
+
142
+ banner("STAGES 5-7 — PII, BIAS & FINAL COUNTS");
143
+ runAnnotationAndCounts(splitter, piiDetector, biasAnalyser);
144
+
145
+ banner("STAGE 8 — PERSIST TO FULL CORPUS");
146
+ runPersist();
147
+
148
+ banner("PIPELINE COMPLETE");
149
+ }
150
+
151
+ // -----------------------------------------------------------------------
152
+ // Stage 1 — Extraction
153
+ // -----------------------------------------------------------------------
154
+
155
+ private void runExtraction() {
156
+ // The source processor writes plain-text files to newDataDir and
157
+ // seed metadata JSON to newMetaDir.
158
+ sourceProcessor.process(newDataDir, newMetaDir);
159
+ log("Extraction complete → " + newDataDir);
160
+ }
161
+
162
+ // -----------------------------------------------------------------------
163
+ // Stage 2 — Sentence splitting
164
+ // -----------------------------------------------------------------------
165
+
166
+ /**
167
+ * Reads each metadata JSON produced by the source processor, then for
168
+ * each document text file counts sentences properly using the OpenNLP
169
+ * splitter and writes the sentence list to a parallel .sentences file
170
+ * (one sentence per line) used by later stages.
171
+ */
172
+ private void runSentenceSplitting(BulgarianSentenceSplitter splitter) {
173
+ try {
174
+ FileHandler fh = new FileHandler();
175
+ int docs = 0;
176
+
177
+ for (File txtFile : fh.getFileListing(new File(newDataDir))) {
178
+ if (!txtFile.isFile() || !txtFile.getName().endsWith(".txt")) continue;
179
+
180
+ // Read document text
181
+ StringBuilder sb = new StringBuilder();
182
+ try (Scanner sc = new Scanner(txtFile, StandardCharsets.UTF_8)) {
183
+ while (sc.hasNextLine()) sb.append(sc.nextLine()).append('\n');
184
+ }
185
+ String text = sb.toString().trim();
186
+
187
+ // Split into sentences and persist to .sentences sidecar file
188
+ String[] sentences = splitter.split(text);
189
+ File sentFile = new File(newDataDir, txtFile.getName()
190
+ .replace(".txt", ".sentences"));
191
+
192
+ try (Writer w = new OutputStreamWriter(
193
+ new FileOutputStream(sentFile), StandardCharsets.UTF_8)) {
194
+ for (String sent : sentences) {
195
+ if (!sent.isBlank()) {
196
+ w.write(sent.trim());
197
+ w.write('\n');
198
+ }
199
+ }
200
+ }
201
+ docs++;
202
+ }
203
+ log("Sentence splitting complete. Documents: " + docs);
204
+ } catch (Exception e) {
205
+ e.printStackTrace();
206
+ }
207
+ }
208
+
209
+ // -----------------------------------------------------------------------
210
+ // Stage 3 — Boilerplate cleaning
211
+ // -----------------------------------------------------------------------
212
+
213
+ private void runCleaning() {
214
+ FileCleanProcessor fcp = new FileCleanProcessor(boilerplateThreshold);
215
+
216
+ // Learn from sample
217
+ fcp.learnFromSample(sampleDir);
218
+ fcp.printTopCommonLines(20);
219
+
220
+ // Save blocklist for audit / reproducibility
221
+ if (blocklistFile != null && !blocklistFile.isBlank()) {
222
+ fcp.saveBlocklist(blocklistFile);
223
+ }
224
+
225
+ // Clean the new data directory
226
+ fcp.cleanDirectory(newDataDir, keepBackups);
227
+ log("Boilerplate cleaning complete → " + newDataDir);
228
+ }
229
+
230
+ // -----------------------------------------------------------------------
231
+ // Stage 4 — Deduplication
232
+ // -----------------------------------------------------------------------
233
+
234
+ private void runDeduplication() {
235
+ DeduplicationProcessor dp = new DeduplicationProcessor(
236
+ dedupThreshold, dedupShingleSize, dedupNumHashes);
237
+
238
+ // Index the full existing corpus
239
+ log("Indexing full corpus for deduplication…");
240
+ dp.indexCorpus(FULL_DATA_DIR);
241
+ log("Corpus indexed. Sentences: " + dp.getCorpusSize());
242
+
243
+ // Detect near-duplicates in new data
244
+ String report = dedupReport != null
245
+ ? dedupReport
246
+ : newMetaDir + "dedup_report.tsv";
247
+ dp.detectDuplicates(newDataDir, report);
248
+
249
+ if (removeDuplicates) {
250
+ dp.removeDuplicatesFromNewFolder(newDataDir, keepBackups);
251
+ }
252
+ }
253
+
254
+ // -----------------------------------------------------------------------
255
+ // Stages 5-7 — PII, Bias annotation + final counts
256
+ // -----------------------------------------------------------------------
257
+
258
+ /**
259
+ * For each document:
260
+ * a) reads the (cleaned, deduplicated) .sentences sidecar file,
261
+ * b) runs PII and/or Bias scoring per sentence,
262
+ * c) recomputes word/sentence/token counts on the surviving text,
263
+ * d) merges all computed values into a DocumentMetadata and writes
264
+ * the final metadata JSON to newMetaDir.
265
+ */
266
+ private void runAnnotationAndCounts(BulgarianSentenceSplitter splitter,
267
+ PIIDetector piiDetector,
268
+ BiasAnalyser biasAnalyser) {
269
+ try {
270
+ FileHandler fh = new FileHandler();
271
+ JSONProcessor jp = new JSONProcessor();
272
+ int docs = 0, errors = 0;
273
+
274
+ for (File sentFile : fh.getFileListing(new File(newDataDir))) {
275
+ if (!sentFile.isFile()
276
+ || !sentFile.getName().endsWith(".sentences")) continue;
277
+
278
+ String stem = sentFile.getName().replace(".sentences", "");
279
+
280
+ // --- Load sentences ---
281
+ List<String> sentences = new ArrayList<>();
282
+ try (Scanner sc = new Scanner(sentFile, StandardCharsets.UTF_8)) {
283
+ while (sc.hasNextLine()) {
284
+ String s = sc.nextLine().trim();
285
+ if (!s.isBlank()) sentences.add(s);
286
+ }
287
+ }
288
+
289
+ if (sentences.isEmpty()) {
290
+ log("[WARN] No sentences for: " + stem);
291
+ errors++;
292
+ continue;
293
+ }
294
+
295
+ // --- Load or create DocumentMetadata ---
296
+ DocumentMetadata meta = loadOrCreateMetadata(jp, stem);
297
+
298
+ // --- PII per sentence ---
299
+ List<Double> piiVec = new ArrayList<>();
300
+ if (piiDetector != null) {
301
+ int sentIdx = 0;
302
+ for (String sent : sentences) {
303
+ PIIDetector.SentencePIIScore score =
304
+ piiDetector.analyseSentence(sent, stem + "-" + sentIdx++);
305
+ piiVec.add(score.getPiiCoverage());
306
+ }
307
+ }
308
+ meta.setPiiVector(piiVec);
309
+
310
+ // --- Bias per sentence ---
311
+ List<Double> biasVec = new ArrayList<>();
312
+ if (biasAnalyser != null) {
313
+ for (String sent : sentences) {
314
+ SentenceBiasScore score = biasAnalyser.analyseSentence(sent);
315
+ biasVec.add(score.totalCoverage());
316
+ }
317
+ }
318
+ meta.setBiasVector(biasVec);
319
+
320
+ // --- Recompute counts from surviving sentences ---
321
+ int nSentences = sentences.size();
322
+ int nWords = 0;
323
+ int nTokens = 0;
324
+
325
+ for (String sent : sentences) {
326
+ String[] toks = sent.split("\\s+");
327
+ nWords += toks.length;
328
+ // estimate tokens: words + punctuation characters
329
+ nTokens += toks.length + sent.length()
330
+ - sent.replaceAll("[.,;:!?()\\-]", "").length();
331
+ }
332
+
333
+ // Paragraphs: count blank-line groups in the original text file
334
+ int nParagraphs = countParagraphs(new File(newDataDir, stem + ".txt"));
335
+
336
+ meta.setNumberSentences(nSentences)
337
+ .setNumberWords(nWords)
338
+ .setNumberTokens(nTokens)
339
+ .setNumberParagraphs(nParagraphs);
340
+
341
+ // --- Persist metadata JSON ---
342
+ writeMetadata(meta, newMetaDir + stem + "_meta.json");
343
+ docs++;
344
+ }
345
+
346
+ log("Annotation & counts complete. Documents: " + docs
347
+ + " Errors: " + errors);
348
+
349
+ } catch (Exception e) {
350
+ e.printStackTrace();
351
+ }
352
+ }
353
+
354
+ // -----------------------------------------------------------------------
355
+ // Stage 8
356
+ // -----------------------------------------------------------------------
357
+
358
+ /**
359
+ */
360
+ private void runPersist() {
361
+ try {
362
+ FileHandler fh = new FileHandler();
363
+ int dataCopied = 0, metaCopied = 0;
364
+
365
+ // Copy text files
366
+ for (File f : fh.getFileListing(new File(newDataDir))) {
367
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
368
+ File dest = new File(FULL_DATA_DIR, f.getName());
369
+ if (!dest.exists()) {
370
+ Files.copy(f.toPath(), dest.toPath(),
371
+ StandardCopyOption.REPLACE_EXISTING);
372
+ dataCopied++;
373
+ }
374
+ }
375
+
376
+ // Copy metadata JSON files
377
+ for (File f : fh.getFileListing(new File(newMetaDir))) {
378
+ if (!f.isFile() || !f.getName().endsWith("_meta.json")) continue;
379
+ File dest = new File(FULL_META_DIR, f.getName());
380
+ if (!dest.exists()) {
381
+ Files.copy(f.toPath(), dest.toPath(),
382
+ StandardCopyOption.REPLACE_EXISTING);
383
+ metaCopied++;
384
+ }
385
+ }
386
+
387
+ log("Persist complete. Text files copied: " + dataCopied
388
+ + " Metadata files copied: " + metaCopied);
389
+ log("FULL_DATA_DIR : " + FULL_DATA_DIR);
390
+ log("FULL_META_DIR : " + FULL_META_DIR);
391
+
392
+ } catch (Exception e) {
393
+ e.printStackTrace();
394
+ }
395
+ }
396
+
397
+ // -----------------------------------------------------------------------
398
+ // Helpers
399
+ // -----------------------------------------------------------------------
400
+
401
+ private DocumentMetadata loadOrCreateMetadata(JSONProcessor jp, String stem) {
402
+ // Try to find a seed metadata JSON written by the source processor
403
+ // Filename conventions: stem + ".json" or stem + "_meta.json"
404
+ String[] candidates = {
405
+ newMetaDir + stem + "_meta.json",
406
+ newMetaDir + stem + ".json"
407
+ };
408
+ for (String path : candidates) {
409
+ File f = new File(path);
410
+ if (f.exists()) {
411
+ try {
412
+ JSONObject raw = jp.readJSON(f);
413
+ // First try full schema, then legacy format
414
+ if (raw.containsKey("Identifier")) {
415
+ return DocumentMetadata.fromJson(raw);
416
+ } else {
417
+ DocumentMetadata m = new DocumentMetadata(stem);
418
+ m.mergeLegacy(raw);
419
+ return m;
420
+ }
421
+ } catch (Exception e) {
422
+ log("[WARN] Could not parse metadata JSON for " + stem + ": " + e.getMessage());
423
+ }
424
+ }
425
+ }
426
+ // Fall back to empty skeleton
427
+ return new DocumentMetadata(stem);
428
+ }
429
+
430
+ private void writeMetadata(DocumentMetadata meta, String outPath) throws Exception {
431
+ JSONObject json = meta.toJson();
432
+ try (Writer w = new OutputStreamWriter(
433
+ new FileOutputStream(outPath), StandardCharsets.UTF_8)) {
434
+ json.writeJSONString(w);
435
+ }
436
+ }
437
+
438
+ private int countParagraphs(File txtFile) {
439
+ if (!txtFile.exists()) return 0;
440
+ int count = 0;
441
+ boolean inPara = false;
442
+ try (Scanner sc = new Scanner(txtFile, StandardCharsets.UTF_8)) {
443
+ while (sc.hasNextLine()) {
444
+ String line = sc.nextLine();
445
+ if (line.isBlank()) {
446
+ inPara = false;
447
+ } else {
448
+ if (!inPara) { count++; inPara = true; }
449
+ }
450
+ }
451
+ } catch (Exception e) { /* ignored */ }
452
+ return Math.max(count, 1);
453
+ }
454
+
455
+ private BiasAnalyser buildBiasAnalyser(BulgarianSentenceSplitter splitter) {
456
+ if (biasDictPath == null || biasDictPath.isBlank()) {
457
+ log("[WARN] No bias dictionary path set — bias scoring disabled.");
458
+ return null;
459
+ }
460
+ BiasLexicon lexicon = new BiasLexicon(biasDictPath);
461
+ return new BiasAnalyser(lexicon, splitter);
462
+ }
463
+
464
+ private void validateConfig() {
465
+ List<String> missing = new ArrayList<>();
466
+ if (sourceProcessor == null) missing.add("sourceProcessor");
467
+ if (newDataDir == null || newDataDir.isBlank()) missing.add("newDataDir");
468
+ if (sampleDir == null || sampleDir.isBlank()) missing.add("sampleDir");
469
+ if (newMetaDir == null || newMetaDir.isBlank()) missing.add("newMetaDir");
470
+ if (!missing.isEmpty())
471
+ throw new IllegalStateException(
472
+ "Pipeline configuration missing: " + missing);
473
+ }
474
+
475
+ private void ensureDirs(String... paths) {
476
+ for (String p : paths) {
477
+ if (p != null) new File(p).mkdirs();
478
+ }
479
+ }
480
+
481
+ private void banner(String msg) {
482
+ System.out.println("\n" + "=".repeat(60));
483
+ System.out.println(" " + msg);
484
+ System.out.println("=".repeat(60));
485
+ }
486
+
487
+ private void log(String msg) {
488
+ System.out.println("[Pipeline] " + msg);
489
+ }
490
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/MarcellProcessor.java ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
+ import java.io.File;
4
+ import java.io.FileOutputStream;
5
+ import java.io.OutputStreamWriter;
6
+ import java.io.Writer;
7
+ import java.util.Scanner;
8
+
9
+ import org.json.simple.JSONArray;
10
+ import org.json.simple.JSONObject;
11
+
12
+ import bg.bas.dcl.general.FileHandler;
13
+
14
+ /**
15
+ * Processes the MARCELL Bulgarian annotated corpus.
16
+ *
17
+ * Licence: CC0-1.0 (fixed for all MARCELL documents).
18
+ * Domain: "Държавно управление" (State governance).
19
+ * Style: "Административен".
20
+ */
21
+ public class MarcellProcessor extends BaseSourceProcessor {
22
+
23
+ private static final String LICENCE = "CC0-1.0";
24
+ private static final String LICENCE_LINK =
25
+ "https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf";
26
+ private static final String DOMAIN = "Държавно управление";
27
+ private static final String STYLE = "Административен";
28
+ private static final String PREFIX = "bg_MARCELL_";
29
+ private static final String EXT = ".conllup";
30
+
31
+ @Override
32
+ public void process(String indir, String outdir) {
33
+ try {
34
+ FileHandler fh = new FileHandler();
35
+ JSONObject json = new JSONObject();
36
+ JSONArray descrArray = new JSONArray();
37
+
38
+ for (File f : fh.getFileListing(new File(indir))) {
39
+ if (!f.isFile()) continue;
40
+
41
+ System.out.println("Processing: " + f.getAbsolutePath());
42
+
43
+ String tfname = PREFIX + f.getName().replace(EXT, "");
44
+
45
+ JSONObject fdescr = newBaseDescriptor(tfname);
46
+ fdescr.put("Licence", LICENCE);
47
+ fdescr.put("LicenceLink", LICENCE_LINK);
48
+ fdescr.put("Domain", DOMAIN);
49
+ fdescr.put("Style", STYLE);
50
+
51
+ Writer out = new OutputStreamWriter(
52
+ new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
53
+
54
+ Scanner s = new Scanner(f, "UTF-8");
55
+ int nw = 0, ns = 0, np = 0, nt = 0;
56
+
57
+ while (s.hasNextLine()) {
58
+ String line = s.nextLine();
59
+
60
+ // --- Metadata extraction ---
61
+ if (line.startsWith("# date =")) {
62
+ fdescr.put("PublicationDate", line.replace("# date =", "").trim());
63
+ } else if (line.startsWith("# title =")) {
64
+ fdescr.put("DocumentTitle", line.replace("# title =", "").trim());
65
+ } else if (line.startsWith("# issuer =")) {
66
+ fdescr.put("Author", line.replace("# issuer =", "").trim());
67
+ } else if (line.startsWith("# type =")) {
68
+ fdescr.put("Type", line.replace("# type =", "").trim());
69
+ } else if (line.startsWith("# url =")) {
70
+ fdescr.put("Url", line.replace("# url =", "").trim());
71
+ }
72
+
73
+ // --- Structure counting ---
74
+ else if (line.startsWith("# sent_id =")) {
75
+ ns++;
76
+ } else if (line.startsWith("# newpar id =")) {
77
+ np++;
78
+ out.write("\n");
79
+ }
80
+
81
+ // --- Text output ---
82
+ else if (line.startsWith("# text =")) {
83
+ out.write(line.replace("# text =", "").trim() + "\n");
84
+ out.flush();
85
+ } else {
86
+ // CoNLL-UP token line: count words and tokens
87
+ String[] cols = line.split("\t");
88
+ if (cols.length > 5) {
89
+ nt++;
90
+ if (!cols[3].equals("PUNCT")) nw++;
91
+ }
92
+ }
93
+ }
94
+
95
+ s.close();
96
+ out.flush();
97
+ out.close();
98
+
99
+ fdescr.put("NumberWords", nw);
100
+ fdescr.put("NumberSentences", ns);
101
+ fdescr.put("NumberParagraphs", np);
102
+ fdescr.put("NumberTokens", nt);
103
+
104
+ descrArray.add(fdescr);
105
+ }
106
+
107
+ json.put("metadata", descrArray);
108
+ writeMetadata(json, outdir, "metadata.json");
109
+
110
+ } catch (Exception e) {
111
+ e.printStackTrace();
112
+ }
113
+ }
114
+
115
+ // -----------------------------------------------------------------------
116
+
117
+ @SuppressWarnings("unchecked")
118
+ private void writeMetadata(JSONObject json, String outdir, String filename)
119
+ throws Exception {
120
+ String outMetaPath = outdir + filename;
121
+ Writer outMeta = new OutputStreamWriter(
122
+ new FileOutputStream(outMetaPath), "UTF-8");
123
+ json.writeJSONString(outMeta);
124
+ outMeta.flush();
125
+ outMeta.close();
126
+
127
+ convertJsonToCSV(json, outMetaPath + "_CSV.csv");
128
+ System.out.println("Metadata written to: " + outMetaPath);
129
+ }
130
+ }
java/bg/bas/dcl/LLMs/IfGPTDataset/SourceProcessor.java ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs.IfGPTDataset;
2
+
3
/**
 * Contract for a corpus-specific extraction step.
 *
 * The pipeline calls {@link #process(String, String)} once, before any other
 * stage runs.
 */
public interface SourceProcessor {

    /**
     * Runs the extraction for one source.
     *
     * @param indir  first directory path handed over by the caller; presumably
     *               the location of the source files — confirm per implementation
     * @param outdir second directory path handed over by the caller; presumably
     *               where results are written — confirm per implementation
     */
    void process(String indir, String outdir);
}
java/bg/bas/dcl/LLMs/PIIDetector.java ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs;
2
+
3
+ import java.io.BufferedWriter;
4
+ import java.io.File;
5
+ import java.io.FileOutputStream;
6
+ import java.io.OutputStreamWriter;
7
+ import java.nio.charset.StandardCharsets;
8
+ import java.util.ArrayList;
9
+ import java.util.Collections;
10
+ import java.util.LinkedHashMap;
11
+ import java.util.List;
12
+ import java.util.Map;
13
+ import java.util.Properties;
14
+ import java.util.Scanner;
15
+
16
+ import ai.philterd.phileas.model.configuration.PhileasConfiguration;
17
+ import ai.philterd.phileas.model.policy.Policy;
18
+ import ai.philterd.phileas.model.responses.FilterResponse;
19
+ import ai.philterd.phileas.model.responses.Span;
20
+ import ai.philterd.phileas.services.PlainTextFilterService;
21
+
22
+ import bg.bas.dcl.general.FileHandler;
23
+
24
+ /**
25
+ * PIIDetector
26
+ *
27
+ * Detects Personally Identifiable Information (PII) in Bulgarian text at
28
+ * sentence level using the <b>Phileas</b> library (ai.philterd:phileas).
29
+ *
30
+ * -----------------------------------------------------------------------
31
+ * NOTE ON "PIISA"
32
+ * PIISA (https://piisa.org) is a Python-only PII framework with no Java
33
+ * bindings. The closest Java-native equivalent with a compatible
34
+ * detection scope is Phileas (Apache 2.0, Maven Central, actively
35
+ * maintained as of 2025). This component uses Phileas and documents
36
+ * all places where a future PIISA Java binding could be substituted.
37
+ * -----------------------------------------------------------------------
38
+ *
39
+ * MAVEN DEPENDENCY (pom.xml):
40
+ * <pre>
41
+ * &lt;dependency&gt;
42
+ * &lt;groupId&gt;ai.philterd&lt;/groupId&gt;
43
+ * &lt;artifactId&gt;phileas&lt;/artifactId&gt;
44
+ * &lt;version&gt;3.1.0&lt;/version&gt;
45
+ * &lt;/dependency&gt;
46
+ * </pre>
47
+ *
48
+ * -----------------------------------------------------------------------
49
+ * PII TYPES DETECTED (Phileas built-in, language-agnostic unless noted):
50
+ *
51
+ * Person names (NER + census dictionary) | Ages | Email addresses
52
+ * Phone numbers | IP addresses (v4 + v6) | URLs | Credit card numbers
53
+ * SSN / TIN | IBAN codes | Bank account numbers | Dates | Zip codes
54
+ * MAC addresses | Bitcoin addresses | VINs | Passport numbers
55
+ * Driver licence numbers | Medical conditions
56
+ *
57
+ * Language note: NER-based person-name detection uses English models by
58
+ * default. For Bulgarian names, supply a custom dictionary filter
59
+ * (see {@link #buildPolicy()}) or integrate a Bulgarian NER model.
60
+ * Regex-based filters (emails, phones, IPs, etc.) are language-independent
61
+ * and work directly on Bulgarian text.
62
+ *
63
+ * -----------------------------------------------------------------------
64
+ * ALGORITHM (per sentence):
65
+ *
66
+ * 1. Phileas scans the sentence and returns a list of PII *spans*, each
67
+ * carrying a character start/end offset and a PII type label.
68
+ * 2. We map spans back to word tokens by checking which token positions
69
+ * overlap any detected span.
70
+ * 3. piiCoverage = |tokens overlapping PII spans| / |total word tokens|
71
+ *
72
+ * -----------------------------------------------------------------------
73
+ * USAGE
74
+ *
75
+ * BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
76
+ * PIIDetector detector = new PIIDetector(splitter);
77
+ *
78
+ * List&lt;SentencePIIScore&gt; scores = detector.analyseText("Иван Петров живее на ул. Роза 5.");
79
+ * for (SentencePIIScore s : scores) {
80
+ * System.out.printf("%.1f%% PII — %s%n", s.getPiiCoveragePercent(), s.getSentence());
81
+ * }
82
+ *
83
+ * // Corpus-level processing with TSV output
84
+ * detector.analyseDirectory("/path/to/corpus/", "/path/to/pii_report.tsv");
85
+ */
86
+ public class PIIDetector {
87
+
88
+ // -----------------------------------------------------------------------
89
+ // Constants
90
+ // -----------------------------------------------------------------------
91
+
92
+ /** Context string passed to Phileas (arbitrary; used for logging/caching). */
93
+ private static final String CONTEXT = "bg-corpus";
94
+
95
+ /** Document ID prefix; a counter suffix is appended per sentence. */
96
+ private static final String DOC_ID = "sent-";
97
+
98
+ /** Minimum word count for a sentence to be analysed. */
99
+ private static final int MIN_WORDS = 3;
100
+
101
+ // -----------------------------------------------------------------------
102
+ // Dependencies
103
+ // -----------------------------------------------------------------------
104
+
105
+ private final BulgarianSentenceSplitter splitter;
106
+ private final PlainTextFilterService filterService;
107
+ private final List<Policy> policies;
108
+
109
+ // -----------------------------------------------------------------------
110
+ // Constructors
111
+ // -----------------------------------------------------------------------
112
+
113
+ /**
114
+ * Creates a PIIDetector with the default policy (all built-in Phileas
115
+ * filters active, REDACT strategy so spans are easy to count).
116
+ *
117
+ * @param splitter an initialised {@link BulgarianSentenceSplitter}
118
+ */
119
+ public PIIDetector(BulgarianSentenceSplitter splitter) {
120
+ this(splitter, null);
121
+ }
122
+
123
+ /**
124
+ * Creates a PIIDetector with a custom Phileas {@link Policy}.
125
+ * Pass {@code null} to use the built-in all-PII policy.
126
+ *
127
+ * @param splitter an initialised {@link BulgarianSentenceSplitter}
128
+ * @param customPolicy a pre-built Phileas Policy, or null for default
129
+ */
130
+ public PIIDetector(BulgarianSentenceSplitter splitter, Policy customPolicy) {
131
+ if (splitter == null)
132
+ throw new IllegalArgumentException("splitter must not be null");
133
+
134
+ this.splitter = splitter;
135
+
136
+ try {
137
+ Properties props = new Properties();
138
+ PhileasConfiguration config = new PhileasConfiguration(props);
139
+ this.filterService = new PlainTextFilterService(config);
140
+ this.policies = List.of(customPolicy != null ? customPolicy : buildPolicy());
141
+ System.out.println("[PIIDetector] Phileas filter service initialised.");
142
+ } catch (Exception e) {
143
+ throw new RuntimeException("Failed to initialise Phileas filter service", e);
144
+ }
145
+ }
146
+
147
+ // -----------------------------------------------------------------------
148
+ // Public API
149
+ // -----------------------------------------------------------------------
150
+
151
+ /**
152
+ * Splits {@code text} into sentences and returns a {@link SentencePIIScore}
153
+ * for each sentence.
154
+ *
155
+ * Sentences shorter than {@link #MIN_WORDS} words receive a zero score
156
+ * without calling Phileas (to avoid spurious detections on fragments).
157
+ *
158
+ * @param text any Bulgarian plain text (may span multiple paragraphs)
159
+ * @return one score per detected sentence, in order; never null
160
+ */
161
+ public List<SentencePIIScore> analyseText(String text) {
162
+ List<SentencePIIScore> results = new ArrayList<>();
163
+ if (text == null || text.isBlank()) return results;
164
+
165
+ int docCounter = 0;
166
+ for (String sentence : splitter.split(text)) {
167
+ results.add(analyseSentence(sentence, DOC_ID + (docCounter++)));
168
+ }
169
+ return results;
170
+ }
171
+
172
+ /**
173
+ * Analyses a single pre-split sentence.
174
+ *
175
+ * @param sentence the sentence string (not null)
176
+ * @param docId a document/sentence identifier string for Phileas context
177
+ * @return a fully populated {@link SentencePIIScore}
178
+ */
179
+ public SentencePIIScore analyseSentence(String sentence, String docId) {
180
+
181
+ // --- Tokenise ---
182
+ String[] rawTokens = sentence.trim().split("\\s+");
183
+ List<String> tokens = new ArrayList<>();
184
+ for (String t : rawTokens) {
185
+ String clean = t.replaceAll("[^\\p{L}\\p{N}@._+\\-]", "");
186
+ if (!clean.isEmpty()) tokens.add(clean);
187
+ }
188
+ int totalWords = tokens.size();
189
+
190
+ if (totalWords < MIN_WORDS) {
191
+ return SentencePIIScore.empty(sentence, totalWords);
192
+ }
193
+
194
+ // --- Run Phileas ---
195
+ List<Span> spans;
196
+ try {
197
+ FilterResponse response = filterService.filter(
198
+ policies, CONTEXT, docId, sentence, null);
199
+ spans = response.getSpans() != null ? response.getSpans() : List.of();
200
+ } catch (Exception e) {
201
+ System.err.println("[PIIDetector] Phileas error on sentence: " + e.getMessage());
202
+ return SentencePIIScore.error(sentence, totalWords, e.getMessage());
203
+ }
204
+
205
+ // --- Map character-level spans back to token positions ---
206
+ // Build token character offsets from the original sentence string
207
+ int[] tokenStart = new int[tokens.size()];
208
+ int[] tokenEnd = new int[tokens.size()];
209
+ int cursor = 0;
210
+ for (int ti = 0; ti < tokens.size(); ti++) {
211
+ String tok = tokens.get(ti);
212
+ int idx = sentence.indexOf(tok, cursor);
213
+ if (idx < 0) {
214
+ // Fallback: token not found at expected position (normalisation artefact)
215
+ tokenStart[ti] = cursor;
216
+ tokenEnd[ti] = cursor + tok.length();
217
+ } else {
218
+ tokenStart[ti] = idx;
219
+ tokenEnd[ti] = idx + tok.length();
220
+ cursor = idx + tok.length();
221
+ }
222
+ }
223
+
224
+ // Count distinct PII tokens and collect type labels per token
225
+ Map<Integer, String> piiTokenType = new LinkedHashMap<>(); // tokenIndex → PII type
226
+ for (Span span : spans) {
227
+ int spanStart = span.getStart();
228
+ int spanEnd = span.getEnd();
229
+ String type = span.getFilterType() != null
230
+ ? span.getFilterType().name()
231
+ : "UNKNOWN";
232
+
233
+ for (int ti = 0; ti < tokens.size(); ti++) {
234
+ // Overlap: token and span share at least one character
235
+ if (tokenStart[ti] < spanEnd && tokenEnd[ti] > spanStart) {
236
+ piiTokenType.put(ti, type);
237
+ }
238
+ }
239
+ }
240
+
241
+ // --- Build type frequency map ---
242
+ Map<String, Integer> typeCounts = new LinkedHashMap<>();
243
+ for (String type : piiTokenType.values()) {
244
+ typeCounts.merge(type, 1, Integer::sum);
245
+ }
246
+
247
+ int piiTokenCount = piiTokenType.size();
248
+ double coverage = totalWords > 0
249
+ ? (double) piiTokenCount / totalWords
250
+ : 0.0;
251
+
252
+ return new SentencePIIScore(
253
+ sentence, totalWords, piiTokenCount, coverage,
254
+ new ArrayList<>(piiTokenType.values()),
255
+ typeCounts, spans, null);
256
+ }
257
+
258
+ // -----------------------------------------------------------------------
259
+ // Corpus-level processing
260
+ // -----------------------------------------------------------------------
261
+
262
+ /**
263
+ * Analyses all .txt files in {@code corpusDir} sentence by sentence and
264
+ * writes results to a TSV file at {@code reportPath}.
265
+ *
266
+ * Only sentences with at least one PII token are written to the report.
267
+ *
268
+ * @param corpusDir directory of plain-text .txt files
269
+ * @param reportPath destination TSV report file path
270
+ */
271
+ public void analyseDirectory(String corpusDir, String reportPath) {
272
+ try {
273
+ FileHandler fh = new FileHandler();
274
+ int filesProcessed = 0, sentencesWritten = 0;
275
+
276
+ try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
277
+ new FileOutputStream(reportPath, false), StandardCharsets.UTF_8))) {
278
+
279
+ bw.write("file\t" + SentencePIIScore.tsvHeader());
280
+ bw.newLine();
281
+
282
+ for (File f : fh.getFileListing(new File(corpusDir))) {
283
+ if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
284
+
285
+ System.out.println("[PIIDetector] Processing: " + f.getName());
286
+
287
+ StringBuilder text = new StringBuilder();
288
+ try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
289
+ while (sc.hasNextLine()) text.append(sc.nextLine()).append(' ');
290
+ }
291
+
292
+ int docCounter = 0;
293
+ for (SentencePIIScore score : analyseText(text.toString())) {
294
+ if (score.hasPII()) {
295
+ bw.write(f.getName() + "\t" + score.toTsv());
296
+ bw.newLine();
297
+ sentencesWritten++;
298
+ }
299
+ docCounter++;
300
+ }
301
+ filesProcessed++;
302
+ }
303
+ }
304
+
305
+ System.out.printf("[PIIDetector] Done. Files: %d Sentences with PII written: %d%n",
306
+ filesProcessed, sentencesWritten);
307
+
308
+ } catch (Exception e) {
309
+ e.printStackTrace();
310
+ }
311
+ }
312
+
313
+ // -----------------------------------------------------------------------
314
+ // Policy builder
315
+ // -----------------------------------------------------------------------
316
+
317
+ /**
318
+ * Builds the default Phileas {@link Policy} that activates all
319
+ * language-agnostic PII filters with a REDACT strategy (so that
320
+ * span positions remain stable for overlap calculation).
321
+ *
322
+ * To customise, edit the JSON string below or deserialise your own
323
+ * policy from a .json file with:
324
+ * Policy policy = Policy.fromJson(new String(Files.readAllBytes(path)));
325
+ *
326
+ * To add a Bulgarian names dictionary, add an "identifiers.dictionary"
327
+ * block pointing to a file of Bulgarian given names and surnames.
328
+ */
329
+ private Policy buildPolicy() throws Exception {
330
+ String policyJson = "{"
331
+ + "\"name\": \"pii-all\","
332
+ + "\"identifiers\": {"
333
+ + "\"emailAddress\": {\"emailAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
334
+ + "\"phoneNumber\": {\"phoneNumberFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
335
+ + "\"ipAddress\": {\"ipAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
336
+ + "\"url\": {\"urlFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
337
+ + "\"creditCard\": {\"creditCardFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
338
+ + "\"ssn\": {\"ssnFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
339
+ + "\"ibanCode\": {\"ibanCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
340
+ + "\"bankAccountNumber\":{\"bankAccountNumberFilterStrategies\":[{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
341
+ + "\"date\": {\"dateFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
342
+ + "\"age\": {\"ageFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
343
+ + "\"macAddress\": {\"macAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
344
+ + "\"bitcoinAddress\": {\"bitcoinAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
345
+ + "\"vin\": {\"vinFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
346
+ + "\"zipCode\": {\"zipCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
347
+ + "\"person\": {\"personFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}"
348
+ + "}"
349
+ + "}";
350
+ return Policy.fromJson(policyJson);
351
+ }
352
+
353
+ // -----------------------------------------------------------------------
354
+ // Inner result class
355
+ // -----------------------------------------------------------------------
356
+
357
+ /**
358
+ * Immutable result object for one sentence's PII analysis.
359
+ */
360
+ public static class SentencePIIScore {
361
+
362
+ private final String sentence;
363
+ private final int totalWords;
364
+ private final int piiTokenCount;
365
+ /** PII coverage: piiTokenCount / totalWords in [0, 1]. */
366
+ private final double piiCoverage;
367
+ /** Ordered list of PII type labels for each PII token found. */
368
+ private final List<String> piiTypes;
369
+ /** Frequency of each PII type in this sentence. */
370
+ private final Map<String, Integer> typeFrequency;
371
+ /** Raw Phileas spans (character-level). */
372
+ private final List<Span> spans;
373
+ /** Non-null if Phileas threw an exception for this sentence. */
374
+ private final String errorMessage;
375
+
376
+ SentencePIIScore(String sentence, int totalWords, int piiTokenCount,
377
+ double piiCoverage, List<String> piiTypes,
378
+ Map<String, Integer> typeFrequency,
379
+ List<Span> spans, String errorMessage) {
380
+ this.sentence = sentence;
381
+ this.totalWords = totalWords;
382
+ this.piiTokenCount = piiTokenCount;
383
+ this.piiCoverage = piiCoverage;
384
+ this.piiTypes = Collections.unmodifiableList(piiTypes);
385
+ this.typeFrequency = Collections.unmodifiableMap(typeFrequency);
386
+ this.spans = spans != null
387
+ ? Collections.unmodifiableList(spans)
388
+ : List.of();
389
+ this.errorMessage = errorMessage;
390
+ }
391
+
392
+ static SentencePIIScore empty(String sentence, int totalWords) {
393
+ return new SentencePIIScore(sentence, totalWords, 0, 0.0,
394
+ List.of(), Map.of(), List.of(), null);
395
+ }
396
+
397
+ static SentencePIIScore error(String sentence, int totalWords, String msg) {
398
+ return new SentencePIIScore(sentence, totalWords, 0, 0.0,
399
+ List.of(), Map.of(), List.of(), msg);
400
+ }
401
+
402
+ // --- Accessors ---
403
+
404
+ public String getSentence() { return sentence; }
405
+ public int getTotalWords() { return totalWords; }
406
+ public int getPiiTokenCount() { return piiTokenCount; }
407
+ /** PII coverage ratio in [0, 1]. */
408
+ public double getPiiCoverage() { return piiCoverage; }
409
+ /** PII coverage expressed as a percentage [0, 100]. */
410
+ public double getPiiCoveragePercent() { return piiCoverage * 100.0; }
411
+ public List<String> getPiiTypes() { return piiTypes; }
412
+ public Map<String, Integer> getTypeFrequency() { return typeFrequency; }
413
+ public List<Span> getSpans() { return spans; }
414
+ public boolean hasPII() { return piiTokenCount > 0; }
415
+ public boolean hasError() { return errorMessage != null; }
416
+ public String getErrorMessage() { return errorMessage; }
417
+
418
+ /** Number of distinct PII categories detected in this sentence. */
419
+ public int distinctPiiTypes() { return typeFrequency.size(); }
420
+
421
+ // --- TSV export ---
422
+
423
+ /**
424
+ * TSV row: sentence | totalWords | piiTokens | coverage% | distinctTypes | typeFrequency
425
+ */
426
+ public String toTsv() {
427
+ return String.format("%s\t%d\t%d\t%.4f\t%.2f\t%d\t%s",
428
+ sentence.replace('\t', ' '),
429
+ totalWords,
430
+ piiTokenCount,
431
+ piiCoverage,
432
+ getPiiCoveragePercent(),
433
+ distinctPiiTypes(),
434
+ typeFrequency.toString());
435
+ }
436
+
437
+ public static String tsvHeader() {
438
+ return "sentence\ttotalWords\tpiiTokens\tpiiCoverage\tpiiCoverage%\tdistinctPiiTypes\ttypeFrequency";
439
+ }
440
+
441
+ @Override
442
+ public String toString() {
443
+ return String.format("SentencePIIScore{words=%d, piiTokens=%d, coverage=%.1f%%, types=%s}",
444
+ totalWords, piiTokenCount, getPiiCoveragePercent(), typeFrequency.keySet());
445
+ }
446
+ }
447
+ }
java/bg/bas/dcl/LLMs/SentenceBiasScore.java ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package bg.bas.dcl.LLMs;
2
+
3
+ import java.util.Collections;
4
+ import java.util.List;
5
+ import java.util.Map;
6
+
7
/**
 * SentenceBiasScore
 *
 * Immutable result of the bias analysis of one sentence: per-bias-type
 * pair-coverage values and signal/evaluator counts, the matched dictionary
 * lemmas, and a few sentence-level totals. The per-type maps are looked up
 * with the lower-case keys listed in {@link #BIAS_TYPES}.
 */
public class SentenceBiasScore {

    /** Bias categories, in the column order used by {@link #toTsv()} and {@link #tsvHeader()}. */
    public static final String[] BIAS_TYPES = {
        "gender", "race_ethnicity", "religion", "disability", "appearance"
    };

    /** The analysed sentence. */
    private final String sentence;

    /** Number of word tokens in the sentence. */
    private final int totalWords;

    /** Pair-coverage value per bias type. */
    private final Map<String, Double> pairCoverage;

    /** Signal count per bias type. */
    private final Map<String, Integer> signalCount;

    /** Evaluator count per bias type. */
    private final Map<String, Integer> evaluatorCount;

    /** All dictionary entries matched in this sentence (lemma strings). */
    private final List<String> matchedLemmas;

    /** Total matched bias words (evaluative, non-neutral). */
    private final int totalBiasWords;

    /** Count of matched derogatory terms. */
    private final int totalDerogatory;

    /** Count of matched colloquial terms. */
    private final int totalColloquial;

    /** Flag supplied by the analyser; presumably set when more than one bias type matched (TODO confirm). */
    private final boolean multiType;

    /**
     * Creates a score. The maps and the list are wrapped in unmodifiable views
     * (not copied) and must not be null.
     */
    SentenceBiasScore(String sentence,
                      int totalWords,
                      Map<String, Double> pairCoverage,
                      Map<String, Integer> signalCount,
                      Map<String, Integer> evaluatorCount,
                      List<String> matchedLemmas,
                      int totalBiasWords,
                      int totalDerogatory,
                      int totalColloquial,
                      boolean multiType) {
        this.sentence = sentence;
        this.totalWords = totalWords;
        this.pairCoverage = Collections.unmodifiableMap(pairCoverage);
        this.signalCount = Collections.unmodifiableMap(signalCount);
        this.evaluatorCount = Collections.unmodifiableMap(evaluatorCount);
        this.matchedLemmas = Collections.unmodifiableList(matchedLemmas);
        this.totalBiasWords = totalBiasWords;
        this.totalDerogatory = totalDerogatory;
        this.totalColloquial = totalColloquial;
        this.multiType = multiType;
    }

    /** Lower-cases a bias-type key independently of the JVM's default locale. */
    private static String normaliseType(String type) {
        return type.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Pair coverage for one bias type.
     *
     * @param biasType bias type name (case-insensitive); {@code null} or blank
     *                 selects the total over all types
     * @return the stored coverage, {@code 0.0} for an unknown type, or
     *         {@link #totalCoverage()} when no type is given
     */
    public double getPairCoverage(String biasType) {
        if (biasType == null || biasType.isBlank()) return totalCoverage();
        return pairCoverage.getOrDefault(normaliseType(biasType), 0.0);
    }

    /** Sum of the pair-coverage values of all bias types. */
    public double totalCoverage() {
        double sum = 0;
        for (double v : pairCoverage.values()) sum += v;
        return sum;
    }

    /** Pair-coverage values as an array aligned with {@link #BIAS_TYPES}. */
    public double[] coverageArray() {
        double[] arr = new double[BIAS_TYPES.length];
        for (int i = 0; i < BIAS_TYPES.length; i++)
            arr[i] = getPairCoverage(BIAS_TYPES[i]);
        return arr;
    }

    /** True if any bias type has a non-zero pair-coverage score. */
    public boolean isBiased() {
        for (double v : pairCoverage.values())
            if (v > 0) return true;
        return false;
    }

    public String getSentence() { return sentence; }
    public int getTotalWords() { return totalWords; }

    /** Signal count for a bias type (case-insensitive); 0 for {@code null} or an unknown type. */
    public int getSignalCount(String type) {
        return type == null ? 0 : signalCount.getOrDefault(normaliseType(type), 0);
    }

    /** Evaluator count for a bias type (case-insensitive); 0 for {@code null} or an unknown type. */
    public int getEvaluatorCount(String type) {
        return type == null ? 0 : evaluatorCount.getOrDefault(normaliseType(type), 0);
    }

    public List<String> getMatchedLemmas() { return matchedLemmas; }
    public int getTotalBiasWords() { return totalBiasWords; }
    public int getTotalDerogatory() { return totalDerogatory; }
    public int getTotalColloquial() { return totalColloquial; }
    public boolean isMultiType() { return multiType; }

    /**
     * TSV row in the column order of {@link #tsvHeader()}.
     *
     * <p>Tabs and line breaks inside the sentence are replaced by spaces so the
     * sentence cannot shift or split the columns. Decimals are formatted with
     * {@link java.util.Locale#ROOT}, so the separator is always a dot.
     */
    public String toTsv() {
        StringBuilder sb = new StringBuilder();
        sb.append(sentence.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')).append('\t');
        sb.append(totalWords).append('\t');
        sb.append(matchedLemmas).append('\t');

        for (String type : BIAS_TYPES) {
            sb.append(signalCount.getOrDefault(type, 0)).append('\t');
            sb.append(evaluatorCount.getOrDefault(type, 0)).append('\t');
            sb.append(String.format(java.util.Locale.ROOT, "%.4f", getPairCoverage(type))).append('\t');
        }

        sb.append(totalBiasWords).append('\t');
        sb.append(totalDerogatory).append('\t');
        sb.append(totalColloquial).append('\t');
        sb.append(multiType ? 1 : 0).append('\t');
        sb.append(String.format(java.util.Locale.ROOT, "%.4f", totalCoverage()));

        return sb.toString();
    }

    /** Tab-separated column names matching {@link #toTsv()}. */
    public static String tsvHeader() {
        StringBuilder sb = new StringBuilder();
        sb.append("sentence\ttotalWords\tmatchedLemmas\t");
        for (String type : BIAS_TYPES)
            sb.append(type).append("_signals\t")
              .append(type).append("_evaluators\t")
              .append(type).append("_coverage\t");
        sb.append("totalBiasWords\ttotalDerogatory\ttotalColloquial\t")
          .append("multiType\ttotalCoverage");
        return sb.toString();
    }

    @Override
    public String toString() {
        return String.format(java.util.Locale.ROOT,
                "SentenceBiasScore{words=%d, coverage=%.3f, biased=%b, sentence='%s'}",
                totalWords, totalCoverage(), isBiased(),
                sentence.length() > 80 ? sentence.substring(0, 80) + "…" : sentence);
    }
}
resources/bulgarian_bias_dictionary_v4.tsv ADDED
The diff for this file is too large to render. See raw diff
 
resources/metadata_schema.json ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://dcl.bas.bg/ifgpt/metadata-schema/v1.0",
4
+ "title": "IfGPT Document Metadata Schema",
5
+ "description": "Metadata schema for textual documents in the IfGPT Bulgarian language dataset. 15 mandatory fields + 8 optional fields.",
6
+ "type": "object",
7
+
8
+ "required": [
9
+ "Identifier",
10
+ "Licence",
11
+ "PublicationDate",
12
+ "DocumentTitle",
13
+ "Source",
14
+ "Medium",
15
+ "Url",
16
+ "Domain",
17
+ "Keywords",
18
+ "NumberWords",
19
+ "NumberSentences",
20
+ "NumberParagraphs",
21
+ "NumberTokens",
22
+ "PersonallyIdentifiableInformation",
23
+ "BiasedInformation"
24
+ ],
25
+
26
+ "properties": {
27
+
28
+ "Identifier": {
29
+ "type": "string",
30
+ "description": "Unique document identifier with the language prefix 'bg'.",
31
+ "pattern": "^bg_",
32
+ "examples": ["bg_bnc_12345", "bg_MARCELL_doc001", "bg_CURLICAT_xyz"]
33
+ },
34
+
35
+ "Licence": {
36
+ "type": "string",
37
+ "description": "Licence name with classification by type (open, restricted, etc.).",
38
+ "enum": [
39
+ "CC0",
40
+ "CC0-1.0",
41
+ "CC-BY-4.0",
42
+ "CC-BY-SA-4.0",
43
+ "CC-BY-NC-4.0",
44
+ "CC-BY-NC-SA-4.0",
45
+ "Restricted",
46
+ "Proprietary",
47
+ "Unknown"
48
+ ]
49
+ },
50
+
51
+ "PublicationDate": {
52
+ "type": "string",
53
+ "description": "Date of publication of the text (yyyy-mm-dd).",
54
+ "pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
55
+ "examples": ["2023-04-15", "2019-01-01", ""]
56
+ },
57
+
58
+ "DocumentTitle": {
59
+ "type": "string",
60
+ "description": "Title of the document.",
61
+ "examples": ["Закон за защита на данните", "Статия за климатичните промени"]
62
+ },
63
+
64
+ "Source": {
65
+ "type": "string",
66
+ "description": "Publishing organisation, media outlet or institutional originator.",
67
+ "examples": ["Министерски съвет", "БНР", "Сега"]
68
+ },
69
+
70
+ "Medium": {
71
+ "type": "string",
72
+ "description": "Modality of the resource.",
73
+ "enum": ["textual", "multimodal"]
74
+ },
75
+
76
+ "Url": {
77
+ "type": "string",
78
+ "description": "Original web address of the document.",
79
+ "format": "uri",
80
+ "examples": ["https://www.lex.bg/laws/ldoc/123", ""]
81
+ },
82
+
83
+ "Domain": {
84
+ "type": "array",
85
+ "description": "Up to six subject areas from a controlled vocabulary.",
86
+ "maxItems": 6,
87
+ "items": {
88
+ "type": "string",
89
+ "enum": [
90
+ "Държавно управление",
91
+ "Право и законодателство",
92
+ "Икономика и финанси",
93
+ "Образование",
94
+ "Наука и технологии",
95
+ "Здравеопазване",
96
+ "Култура и изкуство",
97
+ "Спорт",
98
+ "Медии и журналистика",
99
+ "Общество и политика",
100
+ "Околна среда",
101
+ "Религия",
102
+ "История",
103
+ "Литература и художествена проза",
104
+ "Неформална комуникация",
105
+ "Друго"
106
+ ]
107
+ },
108
+ "examples": [["Държавно управление"], ["Медии и журналистика", "Общество и политика"]]
109
+ },
110
+
111
+ "Keywords": {
112
+ "type": "array",
113
+ "description": "Up to six free-text keywords characterising the content.",
114
+ "maxItems": 6,
115
+ "items": { "type": "string" },
116
+ "examples": [["климат", "законодателство", "ЕС"]]
117
+ },
118
+
119
+ "NumberWords": {
120
+ "type": "integer",
121
+ "description": "Total number of words (non-punctuation tokens).",
122
+ "minimum": 0
123
+ },
124
+
125
+ "NumberSentences": {
126
+ "type": "integer",
127
+ "description": "Total number of sentences.",
128
+ "minimum": 0
129
+ },
130
+
131
+ "NumberParagraphs": {
132
+ "type": "integer",
133
+ "description": "Total number of paragraphs.",
134
+ "minimum": 0
135
+ },
136
+
137
+ "NumberTokens": {
138
+ "type": "integer",
139
+ "description": "Total number of tokens (words + punctuation).",
140
+ "minimum": 0
141
+ },
142
+
143
+ "PersonallyIdentifiableInformation": {
144
+ "type": "array",
145
+ "description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as personally identifiable information, in [0,1]. Length equals NumberSentences.",
146
+ "items": {
147
+ "type": "number",
148
+ "minimum": 0.0,
149
+ "maximum": 1.0
150
+ },
151
+ "examples": [[0.0, 0.0, 0.15, 0.0, 0.05]]
152
+ },
153
+
154
+ "BiasedInformation": {
155
+ "type": "array",
156
+ "description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as potentially biased (signal-evaluator pair coverage), in [0,1]. Length equals NumberSentences.",
157
+ "items": {
158
+ "type": "number",
159
+ "minimum": 0.0,
160
+ "maximum": 1.0
161
+ },
162
+ "examples": [[0.0, 0.0, 0.0, 0.10, 0.0]]
163
+ },
164
+
165
+ "Author": {
166
+ "type": "array",
167
+ "description": "[Optional] Name(s) of the author(s).",
168
+ "items": { "type": "string" },
169
+ "examples": [["Иван Иванов"], ["Агенция БТА"]]
170
+ },
171
+
172
+ "Style": {
173
+ "type": "string",
174
+ "description": "[Optional] Stylistic register of the document.",
175
+ "enum": [
176
+ "Административен",
177
+ "Журналистически",
178
+ "Научен",
179
+ "Художествен",
180
+ "Разговорен",
181
+ "Правен",
182
+ "Технически",
183
+ "Неформален",
184
+ ""
185
+ ]
186
+ },
187
+
188
+ "Type": {
189
+ "type": "string",
190
+ "description": "[Optional] Document genre.",
191
+ "enum": [
192
+ "Закон",
193
+ "Наредба",
194
+ "Решение",
195
+ "Статия",
196
+ "Книга",
197
+ "Доклад",
198
+ "Интервю",
199
+ "Коментар",
200
+ "Форум",
201
+ "Блог",
202
+ "Уикипедия",
203
+ "Друго",
204
+ ""
205
+ ]
206
+ },
207
+
208
+ "Subdomain": {
209
+ "type": "array",
210
+ "description": "[Optional] Narrower thematic classification, hierarchically linked to Domain.",
211
+ "maxItems": 6,
212
+ "items": { "type": "string" },
213
+ "examples": [["Европейско законодателство"], ["Климатична политика"]]
214
+ },
215
+
216
+ "TranslatedDocument": {
217
+ "type": ["boolean", "string"],
218
+ "description": "[Optional] true = translation into Bulgarian; false = original Bulgarian text.",
219
+ "examples": [false, true, ""]
220
+ },
221
+
222
+ "CollectionDate": {
223
+ "type": "string",
224
+ "description": "[Optional] Date of acquisition into the collection (yyyy-mm-dd).",
225
+ "pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
226
+ "examples": ["2024-03-10", ""]
227
+ },
228
+
229
+ "LicenceLink": {
230
+ "type": "string",
231
+ "description": "[Optional] URL of the licence text.",
232
+ "format": "uri",
233
+ "examples": [
234
+ "https://creativecommons.org/public-domain/cc0/",
235
+ "https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf",
236
+ ""
237
+ ]
238
+ },
239
+
240
+ "TaskCategories": {
241
+ "type": "array",
242
+ "description": "[Optional] Anticipated NLP applications from a predefined list.",
243
+ "items": {
244
+ "type": "string",
245
+ "enum": [
246
+ "Language Modelling",
247
+ "Text Classification",
248
+ "Named Entity Recognition",
249
+ "Machine Translation",
250
+ "Summarisation",
251
+ "Question Answering",
252
+ "Sentiment Analysis",
253
+ "Bias Detection",
254
+ "PII Detection",
255
+ "Information Extraction",
256
+ "Coreference Resolution",
257
+ "Dependency Parsing",
258
+ "Other"
259
+ ]
260
+ },
261
+ "examples": [["Language Modelling", "Named Entity Recognition"]]
262
+ }
263
+
264
+ },
265
+
266
+ "additionalProperties": false
267
+ }