Upload 22 files
Browse files- java/bg/bas/dcl/LLMs/BiasAnalyser.java +344 -0
- java/bg/bas/dcl/LLMs/BiasDetectorDemo.java +111 -0
- java/bg/bas/dcl/LLMs/BiasEntry.java +151 -0
- java/bg/bas/dcl/LLMs/BiasLexicon.java +258 -0
- java/bg/bas/dcl/LLMs/BulgarianSentenceSplitter.java +163 -0
- java/bg/bas/dcl/LLMs/DeduplicationProcessor.java +571 -0
- java/bg/bas/dcl/LLMs/FileCleanProcessor.java +453 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/.BulNCProcessor.java.kate-swp +0 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/.CurlicatProcessor.java.kate-swp +0 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/BaseSourceProcessor.java +180 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCProcessor.java +188 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCWikiProcessor.java +154 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/CurlicatProcessor.java +160 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/DocumentMetadata.java +376 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTDatasetProcessor.java +160 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTPipeline.java +490 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/MarcellProcessor.java +130 -0
- java/bg/bas/dcl/LLMs/IfGPTDataset/SourceProcessor.java +10 -0
- java/bg/bas/dcl/LLMs/PIIDetector.java +447 -0
- java/bg/bas/dcl/LLMs/SentenceBiasScore.java +150 -0
- resources/bulgarian_bias_dictionary_v4.tsv +0 -0
- resources/metadata_schema.json +267 -0
java/bg/bas/dcl/LLMs/BiasAnalyser.java
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs;
|
| 2 |
+
|
| 3 |
+
import java.io.BufferedWriter;
|
| 4 |
+
import java.io.File;
|
| 5 |
+
import java.io.FileOutputStream;
|
| 6 |
+
import java.io.OutputStreamWriter;
|
| 7 |
+
import java.nio.charset.StandardCharsets;
|
| 8 |
+
import java.util.ArrayList;
|
| 9 |
+
import java.util.Arrays;
|
| 10 |
+
import java.util.HashMap;
|
| 11 |
+
import java.util.HashSet;
|
| 12 |
+
import java.util.List;
|
| 13 |
+
import java.util.Map;
|
| 14 |
+
import java.util.Scanner;
|
| 15 |
+
import java.util.Set;
|
| 16 |
+
|
| 17 |
+
import bg.bas.dcl.general.FileHandler;
|
| 18 |
+
|
| 19 |
+
/**
|
| 20 |
+
* BiasAnalyser
|
| 21 |
+
*
|
| 22 |
+
* Detects linguistic bias in Bulgarian text using the Bulgarian Bias Dictionary
|
| 23 |
+
* (v4 TSV format). Works at sentence level: for each sentence it returns a
|
| 24 |
+
* {@link SentenceBiasScore} whose primary metric is the pair-coverage percentage —
|
| 25 |
+
* the fraction of word tokens in the sentence that participate in at least one
|
| 26 |
+
* signal–evaluator pair for each bias category.
|
| 27 |
+
*
|
| 28 |
+
* -----------------------------------------------------------------------
|
| 29 |
+
* ALGORITHM (per sentence)
|
| 30 |
+
*
|
| 31 |
+
* 1. TOKENISE — split on whitespace, strip non-letter characters per token.
|
| 32 |
+
* 2. MATCH — look each token up in the {@link BiasLexicon} (form index,
|
| 33 |
+
* case-insensitive). Multi-word entries are tried first via a
|
| 34 |
+
* forward-scan for bigrams and trigrams.
|
| 35 |
+
* 3. PAIR — for every signal token, search within ±PAIR_WINDOW tokens for
|
| 36 |
+
* an evaluator token of the same bias type (or a general one).
|
| 37 |
+
* Each unique (signal position, evaluator position) is a pair.
|
| 38 |
+
* 4. SCORE — pairCoverage[type] = distinctPairTokens[type] / totalWords
|
| 39 |
+
* where distinctPairTokens = set of positions involved in
|
| 40 |
+
* at least one confirmed pair for that type.
|
| 41 |
+
*
|
| 42 |
+
|
| 43 |
+
*/
|
| 44 |
+
public class BiasAnalyser {
|
| 45 |
+
|
| 46 |
+
// -----------------------------------------------------------------------
|
| 47 |
+
// Constants
|
| 48 |
+
// -----------------------------------------------------------------------
|
| 49 |
+
|
| 50 |
+
/**
|
| 51 |
+
* Maximum token distance between a signal and an evaluator for them to
|
| 52 |
+
* be counted as a pair. 10 matches the window used in the original
|
| 53 |
+
* BiasDetector.
|
| 54 |
+
*/
|
| 55 |
+
public static final int PAIR_WINDOW = 10;
|
| 56 |
+
|
| 57 |
+
/**
|
| 58 |
+
* Sentences with fewer words than this are skipped entirely.
|
| 59 |
+
*/
|
| 60 |
+
public static final int MIN_WORDS = 6;
|
| 61 |
+
|
| 62 |
+
/**
|
| 63 |
+
* Sentences with more words than this are still processed but a warning
|
| 64 |
+
* is printed (very long sentences may inflate scores).
|
| 65 |
+
*/
|
| 66 |
+
public static final int MAX_WORDS = 200;
|
| 67 |
+
|
| 68 |
+
// -----------------------------------------------------------------------
|
| 69 |
+
// Dependencies
|
| 70 |
+
// -----------------------------------------------------------------------
|
| 71 |
+
|
| 72 |
+
private final BiasLexicon lexicon;
|
| 73 |
+
private final BulgarianSentenceSplitter splitter;
|
| 74 |
+
|
| 75 |
+
// -----------------------------------------------------------------------
|
| 76 |
+
// Constructor
|
| 77 |
+
// -----------------------------------------------------------------------
|
| 78 |
+
|
| 79 |
+
/**
|
| 80 |
+
* @param lexicon the loaded bias dictionary
|
| 81 |
+
* @param splitter an initialised Bulgarian sentence splitter
|
| 82 |
+
*/
|
| 83 |
+
public BiasAnalyser(BiasLexicon lexicon, BulgarianSentenceSplitter splitter) {
|
| 84 |
+
if (lexicon == null) throw new IllegalArgumentException("lexicon must not be null");
|
| 85 |
+
if (splitter == null) throw new IllegalArgumentException("splitter must not be null");
|
| 86 |
+
this.lexicon = lexicon;
|
| 87 |
+
this.splitter = splitter;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
// -----------------------------------------------------------------------
|
| 91 |
+
// Public API
|
| 92 |
+
// -----------------------------------------------------------------------
|
| 93 |
+
|
| 94 |
+
/**
|
| 95 |
+
* Splits {@code text} into sentences and returns a bias score for each.
|
| 96 |
+
*/
|
| 97 |
+
public List<SentenceBiasScore> analyseText(String text) {
|
| 98 |
+
List<SentenceBiasScore> results = new ArrayList<>();
|
| 99 |
+
if (text == null || text.isBlank()) return results;
|
| 100 |
+
|
| 101 |
+
for (String sentence : splitter.split(text)) {
|
| 102 |
+
results.add(analyseSentence(sentence));
|
| 103 |
+
}
|
| 104 |
+
return results;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
/**
|
| 108 |
+
* Analyses a single pre-split sentence.
|
| 109 |
+
*
|
| 110 |
+
*/
|
| 111 |
+
public SentenceBiasScore analyseSentence(String sentence) {
|
| 112 |
+
// --- Tokenise --------------------------------------------------
|
| 113 |
+
String lower = sentence.toLowerCase();
|
| 114 |
+
String[] rawTokens = lower.split("\\s+");
|
| 115 |
+
|
| 116 |
+
// Build clean token list and a parallel lookup list
|
| 117 |
+
// We attempt multi-word matches (bigrams, trigrams) first
|
| 118 |
+
List<String> cleanTokens = new ArrayList<>(); // word-only tokens
|
| 119 |
+
List<BiasEntry> matched = new ArrayList<>(); // parallel match (null=no match)
|
| 120 |
+
|
| 121 |
+
int i = 0;
|
| 122 |
+
while (i < rawTokens.length) {
|
| 123 |
+
// Try trigram (3-word multi-word entry)
|
| 124 |
+
if (i + 2 < rawTokens.length) {
|
| 125 |
+
String tri = clean(rawTokens[i]) + " "
|
| 126 |
+
+ clean(rawTokens[i + 1]) + " "
|
| 127 |
+
+ clean(rawTokens[i + 2]);
|
| 128 |
+
BiasEntry e = lexicon.lookup(tri);
|
| 129 |
+
if (e != null) {
|
| 130 |
+
// Represent as 3 tokens (positions), all pointing to same entry
|
| 131 |
+
for (int k = 0; k < 3; k++) {
|
| 132 |
+
cleanTokens.add(clean(rawTokens[i + k]));
|
| 133 |
+
matched.add(e);
|
| 134 |
+
}
|
| 135 |
+
i += 3;
|
| 136 |
+
continue;
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
// Try bigram
|
| 140 |
+
if (i + 1 < rawTokens.length) {
|
| 141 |
+
String bi = clean(rawTokens[i]) + " " + clean(rawTokens[i + 1]);
|
| 142 |
+
BiasEntry e = lexicon.lookup(bi);
|
| 143 |
+
if (e != null) {
|
| 144 |
+
for (int k = 0; k < 2; k++) {
|
| 145 |
+
cleanTokens.add(clean(rawTokens[i + k]));
|
| 146 |
+
matched.add(e);
|
| 147 |
+
}
|
| 148 |
+
i += 2;
|
| 149 |
+
continue;
|
| 150 |
+
}
|
| 151 |
+
}
|
| 152 |
+
// Unigram
|
| 153 |
+
String tok = clean(rawTokens[i]);
|
| 154 |
+
if (!tok.isEmpty()) {
|
| 155 |
+
cleanTokens.add(tok);
|
| 156 |
+
matched.add(lexicon.lookup(tok));
|
| 157 |
+
}
|
| 158 |
+
i++;
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
int totalWords = cleanTokens.size();
|
| 162 |
+
|
| 163 |
+
String[] biasTypes = SentenceBiasScore.BIAS_TYPES;
|
| 164 |
+
|
| 165 |
+
Map<String, Integer> signalCount = new HashMap<>();
|
| 166 |
+
Map<String, Integer> evaluatorCount = new HashMap<>();
|
| 167 |
+
Map<String, Double> pairCoverage = new HashMap<>();
|
| 168 |
+
|
| 169 |
+
for (String type : biasTypes) {
|
| 170 |
+
signalCount.put(type, 0);
|
| 171 |
+
evaluatorCount.put(type, 0);
|
| 172 |
+
pairCoverage.put(type, 0.0);
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
List<String> matchedLemmas = new ArrayList<>();
|
| 176 |
+
int totalBiasWords = 0;
|
| 177 |
+
int totalDerogatory = 0;
|
| 178 |
+
int totalColloquial = 0;
|
| 179 |
+
|
| 180 |
+
if (totalWords < MIN_WORDS) {
|
| 181 |
+
// Return zero-score result for very short sentences
|
| 182 |
+
return new SentenceBiasScore(sentence, totalWords,
|
| 183 |
+
pairCoverage, signalCount, evaluatorCount,
|
| 184 |
+
matchedLemmas, 0, 0, 0, false);
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
// --- Collect matched positions ---------------------------------
|
| 188 |
+
Set<String> seenLemmas = new HashSet<>();
|
| 189 |
+
|
| 190 |
+
// signalPositions[type] = list of token indices that are signals for that type
|
| 191 |
+
Map<String, List<Integer>> signalPos = new HashMap<>();
|
| 192 |
+
// evalPositions[type] = list of token indices that are evaluators for that type
|
| 193 |
+
Map<String, List<Integer>> evalPos = new HashMap<>();
|
| 194 |
+
|
| 195 |
+
for (String type : biasTypes) {
|
| 196 |
+
signalPos.put(type, new ArrayList<>());
|
| 197 |
+
evalPos.put(type, new ArrayList<>());
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
for (int ti = 0; ti < totalWords; ti++) {
|
| 201 |
+
BiasEntry entry = matched.get(ti);
|
| 202 |
+
if (entry == null) continue;
|
| 203 |
+
|
| 204 |
+
String lemma = entry.getWord();
|
| 205 |
+
|
| 206 |
+
// Count each unique lemma only once (avoid double-counting
|
| 207 |
+
// inflected-form repetitions of the same word in one sentence)
|
| 208 |
+
if (seenLemmas.add(lemma)) {
|
| 209 |
+
matchedLemmas.add(lemma);
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
if (entry.isEvaluative()) totalBiasWords++;
|
| 213 |
+
if (entry.isDerogatory()) totalDerogatory++;
|
| 214 |
+
if (entry.isColloquial()) totalColloquial++;
|
| 215 |
+
|
| 216 |
+
// Determine which types this entry applies to
|
| 217 |
+
List<String> applicableTypes = entry.isTyped()
|
| 218 |
+
? List.of(entry.getBiasType())
|
| 219 |
+
: Arrays.asList(biasTypes); // general entry → all types
|
| 220 |
+
|
| 221 |
+
for (String type : applicableTypes) {
|
| 222 |
+
if (entry.isSignal()) {
|
| 223 |
+
signalPos.get(type).add(ti);
|
| 224 |
+
}
|
| 225 |
+
if (entry.isEvaluativeModifier()) {
|
| 226 |
+
evalPos.get(type).add(ti);
|
| 227 |
+
}
|
| 228 |
+
}
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
// --- Pair detection & score computation -----------------------
|
| 232 |
+
Map<String, Set<Integer>> pairTokens = new HashMap<>();
|
| 233 |
+
for (String type : biasTypes) pairTokens.put(type, new HashSet<>());
|
| 234 |
+
|
| 235 |
+
for (String type : biasTypes) {
|
| 236 |
+
List<Integer> signals = signalPos.get(type);
|
| 237 |
+
List<Integer> evaluators = evalPos.get(type);
|
| 238 |
+
|
| 239 |
+
for (int sIdx : signals) {
|
| 240 |
+
boolean paired = false;
|
| 241 |
+
|
| 242 |
+
// Self-pair: signal is itself evaluative
|
| 243 |
+
BiasEntry sEntry = matched.get(sIdx);
|
| 244 |
+
if (sEntry != null && sEntry.isEvaluativeModifier()) {
|
| 245 |
+
pairTokens.get(type).add(sIdx);
|
| 246 |
+
paired = true;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
// Pair with a distinct evaluator within window
|
| 250 |
+
for (int eIdx : evaluators) {
|
| 251 |
+
if (eIdx == sIdx) continue;
|
| 252 |
+
if (Math.abs(sIdx - eIdx) <= PAIR_WINDOW) {
|
| 253 |
+
pairTokens.get(type).add(sIdx);
|
| 254 |
+
pairTokens.get(type).add(eIdx);
|
| 255 |
+
paired = true;
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
int sigCount = signals.size();
|
| 261 |
+
int evalCount = (int) evaluators.stream()
|
| 262 |
+
.filter(eIdx -> pairTokens.get(type).contains(eIdx))
|
| 263 |
+
.count();
|
| 264 |
+
|
| 265 |
+
signalCount.put(type, sigCount);
|
| 266 |
+
evaluatorCount.put(type, evalCount);
|
| 267 |
+
|
| 268 |
+
double coverage = totalWords > 0
|
| 269 |
+
? (double) pairTokens.get(type).size() / totalWords
|
| 270 |
+
: 0.0;
|
| 271 |
+
pairCoverage.put(type, coverage);
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
// --- Multi-type flag ------------------------------------------
|
| 275 |
+
int typesWithPairs = 0;
|
| 276 |
+
for (String type : biasTypes)
|
| 277 |
+
if (!pairTokens.get(type).isEmpty()) typesWithPairs++;
|
| 278 |
+
boolean multiType = typesWithPairs >= 2;
|
| 279 |
+
|
| 280 |
+
return new SentenceBiasScore(
|
| 281 |
+
sentence, totalWords,
|
| 282 |
+
pairCoverage, signalCount, evaluatorCount,
|
| 283 |
+
matchedLemmas, totalBiasWords, totalDerogatory, totalColloquial,
|
| 284 |
+
multiType);
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
/**
|
| 290 |
+
* Analyses all .txt files
|
| 291 |
+
*/
|
| 292 |
+
public void analyseDirectory(String corpusDir, String resultPath) {
|
| 293 |
+
try {
|
| 294 |
+
FileHandler fh = new FileHandler();
|
| 295 |
+
|
| 296 |
+
try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
|
| 297 |
+
new FileOutputStream(resultPath, false), StandardCharsets.UTF_8))) {
|
| 298 |
+
|
| 299 |
+
bw.write(SentenceBiasScore.tsvHeader());
|
| 300 |
+
bw.newLine();
|
| 301 |
+
|
| 302 |
+
int filesProcessed = 0;
|
| 303 |
+
int sentencesWritten = 0;
|
| 304 |
+
|
| 305 |
+
for (File f : fh.getFileListing(new File(corpusDir))) {
|
| 306 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 307 |
+
|
| 308 |
+
System.out.println("[BiasAnalyser] Processing: " + f.getName());
|
| 309 |
+
|
| 310 |
+
StringBuilder text = new StringBuilder();
|
| 311 |
+
try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
|
| 312 |
+
while (sc.hasNextLine()) {
|
| 313 |
+
text.append(sc.nextLine()).append(' ');
|
| 314 |
+
}
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
for (SentenceBiasScore score : analyseText(text.toString())) {
|
| 318 |
+
if (score.isBiased()) {
|
| 319 |
+
bw.write(f.getName() + "\t" + score.toTsv());
|
| 320 |
+
bw.newLine();
|
| 321 |
+
sentencesWritten++;
|
| 322 |
+
}
|
| 323 |
+
}
|
| 324 |
+
filesProcessed++;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
System.out.printf("[BiasAnalyser] Done. Files: %d Biased sentences written: %d%n",
|
| 328 |
+
filesProcessed, sentencesWritten);
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
} catch (Exception e) {
|
| 332 |
+
e.printStackTrace();
|
| 333 |
+
}
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
// -----------------------------------------------------------------------
|
| 337 |
+
// Helper
|
| 338 |
+
// -----------------------------------------------------------------------
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
private String clean(String token) {
|
| 342 |
+
return token.replaceAll("[^\\p{L}\\s]", "").trim();
|
| 343 |
+
}
|
| 344 |
+
}
|
java/bg/bas/dcl/LLMs/BiasDetectorDemo.java
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs;
|
| 2 |
+
|
| 3 |
+
import java.util.List;
|
| 4 |
+
|
| 5 |
+
/**
|
| 6 |
+
* BiasDetectorDemo
|
| 7 |
+
*
|
| 8 |
+
*
|
| 9 |
+
* -----------------------------------------------------------------------
|
| 10 |
+
* MAVEN DEPENDENCIES (add to pom.xml):
|
| 11 |
+
*
|
| 12 |
+
* <!-- OpenNLP toolkit -->
|
| 13 |
+
* <dependency>
|
| 14 |
+
* <groupId>org.apache.opennlp</groupId>
|
| 15 |
+
* <artifactId>opennlp-tools</artifactId>
|
| 16 |
+
* <version>2.4.0</version>
|
| 17 |
+
* </dependency>
|
| 18 |
+
*
|
| 19 |
+
* <!-- Bulgarian sentence-detection model (UD 2.14, Apache 2.0) -->
|
| 20 |
+
* <dependency>
|
| 21 |
+
* <groupId>org.apache.opennlp</groupId>
|
| 22 |
+
* <artifactId>opennlp-models-sentdetect-bg</artifactId>
|
| 23 |
+
* <version>1.2</version>
|
| 24 |
+
* </dependency>
|
| 25 |
+
*/
|
| 26 |
+
public class BiasDetectorDemo {
|
| 27 |
+
|
| 28 |
+
public static void main(String[] args) {
|
| 29 |
+
|
| 30 |
+
// ------------------------------------------------------------------
|
| 31 |
+
// 1. Load the Bulgarian sentence splitter
|
| 32 |
+
// (loads bundled model from the Maven JAR automatically)
|
| 33 |
+
// ------------------------------------------------------------------
|
| 34 |
+
BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
|
| 35 |
+
|
| 36 |
+
// Alternatively, supply an explicit model file path:
|
| 37 |
+
// BulgarianSentenceSplitter splitter =
|
| 38 |
+
// new BulgarianSentenceSplitter("/path/to/bg-sent.bin");
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
// ------------------------------------------------------------------
|
| 42 |
+
// 2. Load the bias lexicon
|
| 43 |
+
// ------------------------------------------------------------------
|
| 44 |
+
String dictPath = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
|
| 45 |
+
+ "bulgarian_bias_dictionary_v4.tsv";
|
| 46 |
+
|
| 47 |
+
BiasLexicon lexicon = new BiasLexicon(dictPath);
|
| 48 |
+
System.out.printf("Lexicon loaded: %d entries%n%n", lexicon.size());
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
// ------------------------------------------------------------------
|
| 52 |
+
// 3. Build the analyser
|
| 53 |
+
// ------------------------------------------------------------------
|
| 54 |
+
BiasAnalyser analyser = new BiasAnalyser(lexicon, splitter);
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
// ------------------------------------------------------------------
|
| 58 |
+
// 4a. Analyse a block of text in memory
|
| 59 |
+
// ------------------------------------------------------------------
|
| 60 |
+
String sampleText =
|
| 61 |
+
"Слепите хора трудно могат да се справят сами в живота. " +
|
| 62 |
+
"Времето днес е слънчево и приятно.";
|
| 63 |
+
|
| 64 |
+
System.out.println("=== Sentence-level bias scores ===");
|
| 65 |
+
System.out.println(SentenceBiasScore.tsvHeader());
|
| 66 |
+
System.out.println();
|
| 67 |
+
|
| 68 |
+
List<SentenceBiasScore> scores = analyser.analyseText(sampleText);
|
| 69 |
+
|
| 70 |
+
for (SentenceBiasScore score : scores) {
|
| 71 |
+
System.out.println("Sentence : " + score.getSentence());
|
| 72 |
+
System.out.printf ("Words : %d%n", score.getTotalWords());
|
| 73 |
+
System.out.printf ("Biased : %b%n", score.isBiased());
|
| 74 |
+
|
| 75 |
+
double[] cov = score.coverageArray();
|
| 76 |
+
String[] types = SentenceBiasScore.BIAS_TYPES;
|
| 77 |
+
for (int i = 0; i < types.length; i++) {
|
| 78 |
+
if (cov[i] > 0)
|
| 79 |
+
System.out.printf(" %-18s %.2f%% pair coverage%n",
|
| 80 |
+
types[i] + ":", cov[i] * 100);
|
| 81 |
+
}
|
| 82 |
+
System.out.printf ("Total : %.2f%% overall coverage%n", score.totalCoverage() * 100);
|
| 83 |
+
System.out.println("Lemmas : " + score.getMatchedLemmas());
|
| 84 |
+
System.out.println();
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
// ------------------------------------------------------------------
|
| 89 |
+
// 4b. Analyse a corpus directory — writes a TSV results file
|
| 90 |
+
// (only biased sentences are written; zero-coverage sentences
|
| 91 |
+
// are filtered out automatically by analyseDirectory)
|
| 92 |
+
// ------------------------------------------------------------------
|
| 93 |
+
String corpusDir = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/WIKI/";
|
| 94 |
+
String resultTsv = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/bias_results.tsv";
|
| 95 |
+
|
| 96 |
+
// analyser.analyseDirectory(corpusDir, resultTsv);
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
// ------------------------------------------------------------------
|
| 100 |
+
// 4c. Sentence splitting only — using the splitter standalone
|
| 101 |
+
// ------------------------------------------------------------------
|
| 102 |
+
String text = "Това е първото изречение. Второто е по-дълго и сложно! " +
|
| 103 |
+
"А третото задава въпрос?";
|
| 104 |
+
|
| 105 |
+
String[] sentences = splitter.split(text);
|
| 106 |
+
System.out.println("=== Sentence splitting demo ===");
|
| 107 |
+
for (int i = 0; i < sentences.length; i++) {
|
| 108 |
+
System.out.printf(" [%d] %s%n", i + 1, sentences[i]);
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
}
|
java/bg/bas/dcl/LLMs/BiasEntry.java
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs;
|
| 2 |
+
|
| 3 |
+
import java.util.Collections;
|
| 4 |
+
import java.util.HashSet;
|
| 5 |
+
import java.util.Set;
|
| 6 |
+
|
| 7 |
+
/**
 * BiasEntry
 *
 * Immutable value object holding one row of the Bulgarian bias dictionary.
 *
 * TSV column order (0-based, tab-separated):
 *   0  word            — canonical lemma
 *   1  POS             — part of speech (N, A, V, …)
 *   2  signal          — "true" / "false" : marks identity-group signals
 *   3  biasType        — gender | race_ethnicity | religion | disability | appearance | "" (general)
 *   4  biasValue       — positive | negative | neutral | ""
 *   5  derogatory      — "true" / "false"
 *   6  colloquial      — "true" / "false"
 *   7  forms           — "true" / "false" (unused flag; inflected forms are in col 10)
 *   8  positivity      — double in [0,1]
 *   9  negativity      — double in [0,1]
 *  10  inflectedForms  — pipe-separated list of surface forms, or empty
 */
public class BiasEntry {

    // -----------------------------------------------------------------------
    // Fields
    // -----------------------------------------------------------------------

    private final String  word;
    private final String  pos;
    private final boolean signal;
    private final String  biasType;    // "" means general / not type-specific
    private final String  biasValue;   // "" means unscored
    private final boolean derogatory;
    private final boolean colloquial;
    private final double  positivity;
    private final double  negativity;

    /**
     * All known surface forms of this entry. The set is stored exactly as
     * supplied; it is expected to arrive already lower-cased.
     */
    private final Set<String> forms;

    // -----------------------------------------------------------------------
    // Constructor — called by BiasLexicon during TSV loading
    // -----------------------------------------------------------------------

    /**
     * Creates an entry. String arguments are trimmed and {@code null} strings
     * become {@code ""}.
     *
     * @param word       canonical lemma
     * @param pos        part-of-speech tag
     * @param signal     whether the word is an identity-group signal
     * @param biasType   bias category, or empty/{@code null} for a general entry
     * @param biasValue  evaluative polarity, or empty/{@code null} if unscored
     * @param derogatory whether the word is marked derogatory
     * @param colloquial whether the word is marked colloquial
     * @param positivity positivity score
     * @param negativity negativity score
     * @param forms      surface forms; copied, so later changes to the
     *                   caller's set do not affect this entry; {@code null}
     *                   is treated as an empty set
     */
    public BiasEntry(String word, String pos,
                     boolean signal, String biasType, String biasValue,
                     boolean derogatory, boolean colloquial,
                     double positivity, double negativity,
                     Set<String> forms) {
        this.word       = word      == null ? "" : word.trim();
        this.pos        = pos       == null ? "" : pos.trim();
        this.signal     = signal;
        this.biasType   = biasType  == null ? "" : biasType.trim();
        this.biasValue  = biasValue == null ? "" : biasValue.trim();
        this.derogatory = derogatory;
        this.colloquial = colloquial;
        this.positivity = positivity;
        this.negativity = negativity;
        // Defensive copy: unmodifiableSet alone is only a view, so without the
        // copy the caller could still change this entry through its own set.
        this.forms      = Collections.unmodifiableSet(
                forms == null ? new HashSet<>() : new HashSet<>(forms));
    }

    // -----------------------------------------------------------------------
    // Accessors
    // -----------------------------------------------------------------------

    /** Canonical lemma as it appears in the dictionary. */
    public String getWord() { return word; }

    /** Part-of-speech tag (N, A, V, …). */
    public String getPos() { return pos; }

    /**
     * True if this entry marks an identity-group signal word —
     * i.e. a term that identifies a person by a protected attribute
     * (e.g. "жена", "мюсюлманин").
     */
    public boolean isSignal() { return signal; }

    /**
     * Bias category, or empty string if applicable to all categories.
     * Values: "gender", "race_ethnicity", "religion", "disability", "appearance".
     */
    public String getBiasType() { return biasType; }

    /**
     * Evaluative polarity of the word in a bias context.
     * Values: "positive", "negative", "neutral", or "" (unscored).
     */
    public String getBiasValue() { return biasValue; }

    /** True if the word is explicitly marked as derogatory / pejorative. */
    public boolean isDerogatory() { return derogatory; }

    /** True if the word is marked as colloquial / informal. */
    public boolean isColloquial() { return colloquial; }

    /**
     * Positivity score in [0, 1] derived from BulNet synset sentiment.
     * Higher = more positive connotation.
     */
    public double getPositivity() { return positivity; }

    /**
     * Negativity score in [0, 1] derived from BulNet synset sentiment.
     * Higher = more negative connotation.
     */
    public double getNegativity() { return negativity; }

    /**
     * Unmodifiable snapshot of the surface forms supplied at construction
     * time. Iteration order is unspecified.
     */
    public Set<String> getForms() { return forms; }

    // -----------------------------------------------------------------------
    // Convenience predicates
    // -----------------------------------------------------------------------

    /**
     * True if this entry carries a polarity: its biasValue is neither empty
     * nor "neutral".
     */
    public boolean isEvaluative() {
        return !biasValue.isEmpty() && !biasValue.equals("neutral");
    }

    /** True if biasType is non-empty (i.e. assigned to a specific category). */
    public boolean isTyped() {
        return !biasType.isEmpty();
    }

    /**
     * True if this entry can act as an evaluative modifier in a bias pair —
     * i.e. it is evaluative, derogatory or colloquial, or its positivity or
     * negativity score exceeds 0.5.
     */
    public boolean isEvaluativeModifier() {
        return isEvaluative() || derogatory || colloquial
                || positivity > 0.5 || negativity > 0.5;
    }

    // -----------------------------------------------------------------------
    // Object overrides
    // -----------------------------------------------------------------------

    @Override
    public String toString() {
        return String.format("BiasEntry{word='%s', signal=%b, type='%s', value='%s', "
                + "pos+neg=[%.2f,%.2f], derog=%b, coll=%b, forms=%d}",
                word, signal, biasType, biasValue,
                positivity, negativity, derogatory, colloquial, forms.size());
    }
}
|
java/bg/bas/dcl/LLMs/BiasLexicon.java
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs;
|
| 2 |
+
|
| 3 |
+
import java.io.BufferedReader;
|
| 4 |
+
import java.io.FileInputStream;
|
| 5 |
+
import java.io.InputStreamReader;
|
| 6 |
+
import java.nio.charset.StandardCharsets;
|
| 7 |
+
import java.util.ArrayList;
|
| 8 |
+
import java.util.Arrays;
|
| 9 |
+
import java.util.Collection;
|
| 10 |
+
import java.util.Collections;
|
| 11 |
+
import java.util.HashMap;
|
| 12 |
+
import java.util.HashSet;
|
| 13 |
+
import java.util.List;
|
| 14 |
+
import java.util.Map;
|
| 15 |
+
import java.util.Set;
|
| 16 |
+
|
| 17 |
+
/**
|
| 18 |
+
* BiasLexicon
|
| 19 |
+
*
|
| 20 |
+
* Loads the Bulgarian bias dictionary (bulgarian_bias_dictionary_v4.tsv) and
|
| 21 |
+
* provides fast O(1) form-level lookup for use by the bias detector.
|
| 22 |
+
*
|
| 23 |
+
* -----------------------------------------------------------------------
|
| 24 |
+
* TSV FORMAT (tab-separated, first row is header):
|
| 25 |
+
*
|
| 26 |
+
* Col 0 word canonical lemma
|
| 27 |
+
* Col 1 POS N | A | V | …
|
| 28 |
+
* Col 2 signal true | false
|
| 29 |
+
* Col 3 biasType gender | race_ethnicity | religion | disability | appearance | ""
|
| 30 |
+
* Col 4 biasValue positive | negative | neutral | ""
|
| 31 |
+
* Col 5 derogatory true | false
|
| 32 |
+
* Col 6 colloquial true | false
|
| 33 |
+
* Col 7 forms (boolean flag — ignored; inflected forms in col 10)
|
| 34 |
+
* Col 8 positivity double [0,1]
|
| 35 |
+
* Col 9 negativity double [0,1]
|
| 36 |
+
* Col 10 inflectedForms pipe-separated surface forms, or empty
|
| 37 |
+
*
|
| 38 |
+
*
|
| 39 |
+
*/
|
| 40 |
+
public class BiasLexicon {
|
| 41 |
+
|
| 42 |
+
// -----------------------------------------------------------------------
|
| 43 |
+
// Indexes
|
| 44 |
+
// -----------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
/**
|
| 47 |
+
* Primary form index: lowercased surface form → BiasEntry.
|
| 48 |
+
* A single form can only map to one entry (first one wins if there are
|
| 49 |
+
* duplicates — extremely rare in the dictionary).
|
| 50 |
+
*/
|
| 51 |
+
private final Map<String, BiasEntry> formIndex = new HashMap<>();
|
| 52 |
+
|
| 53 |
+
/**
|
| 54 |
+
* Canonical word index: lowercased lemma → BiasEntry.
|
| 55 |
+
* Useful when you already have the base form.
|
| 56 |
+
*/
|
| 57 |
+
private final Map<String, BiasEntry> wordIndex = new HashMap<>();
|
| 58 |
+
|
| 59 |
+
/** All entries in load order. */
|
| 60 |
+
private final List<BiasEntry> entries = new ArrayList<>();
|
| 61 |
+
|
| 62 |
+
// -----------------------------------------------------------------------
|
| 63 |
+
// Loading statistics
|
| 64 |
+
// -----------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
private int loadedEntries = 0;
|
| 67 |
+
private int skippedLines = 0;
|
| 68 |
+
private int formConflicts = 0;
|
| 69 |
+
|
| 70 |
+
// -----------------------------------------------------------------------
|
| 71 |
+
// Constructor
|
| 72 |
+
// -----------------------------------------------------------------------
|
| 73 |
+
|
| 74 |
+
/**
|
| 75 |
+
* Loads the bias dictionary from a TSV file.
|
| 76 |
+
*
|
| 77 |
+
* @param tsvPath absolute path to the TSV file
|
| 78 |
+
* @throws RuntimeException if the file cannot be read
|
| 79 |
+
*/
|
| 80 |
+
public BiasLexicon(String tsvPath) {
|
| 81 |
+
load(tsvPath);
|
| 82 |
+
System.out.printf("[BiasLexicon] Loaded %d entries, %d form keys, "
|
| 83 |
+
+ "%d skipped lines, %d form conflicts.%n",
|
| 84 |
+
loadedEntries, formIndex.size(), skippedLines, formConflicts);
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
// -----------------------------------------------------------------------
|
| 88 |
+
// Lookup API
|
| 89 |
+
// -----------------------------------------------------------------------
|
| 90 |
+
|
| 91 |
+
/**
|
| 92 |
+
* Looks up a surface token (case-insensitive) and returns the
|
| 93 |
+
* matching {@link BiasEntry}, or {@code null} if not found.
|
| 94 |
+
*
|
| 95 |
+
* @param token any surface form (inflected or base)
|
| 96 |
+
*/
|
| 97 |
+
public BiasEntry lookup(String token) {
|
| 98 |
+
if (token == null || token.isBlank()) return null;
|
| 99 |
+
return formIndex.get(token.toLowerCase().trim());
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
/**
|
| 103 |
+
* Returns true if the token (any form) is present in the lexicon.
|
| 104 |
+
*
|
| 105 |
+
* @param token surface form to check
|
| 106 |
+
*/
|
| 107 |
+
public boolean contains(String token) {
|
| 108 |
+
return lookup(token) != null;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
/**
|
| 112 |
+
* Looks up a canonical lemma directly.
|
| 113 |
+
*
|
| 114 |
+
* @param lemma the base/dictionary form
|
| 115 |
+
*/
|
| 116 |
+
public BiasEntry lookupLemma(String lemma) {
|
| 117 |
+
if (lemma == null || lemma.isBlank()) return null;
|
| 118 |
+
return wordIndex.get(lemma.toLowerCase().trim());
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
// -----------------------------------------------------------------------
|
| 122 |
+
// Filtered views
|
| 123 |
+
// -----------------------------------------------------------------------
|
| 124 |
+
|
| 125 |
+
/**
|
| 126 |
+
* Returns all entries whose {@code biasType} matches the given category
|
| 127 |
+
* (case-insensitive), plus all general entries (empty biasType).
|
| 128 |
+
*
|
| 129 |
+
* @param biasType e.g. "gender", "disability"
|
| 130 |
+
*/
|
| 131 |
+
public List<BiasEntry> getByType(String biasType) {
|
| 132 |
+
List<BiasEntry> result = new ArrayList<>();
|
| 133 |
+
String target = biasType == null ? "" : biasType.toLowerCase().trim();
|
| 134 |
+
for (BiasEntry e : entries)
|
| 135 |
+
if (e.getBiasType().equalsIgnoreCase(target) || e.getBiasType().isEmpty())
|
| 136 |
+
result.add(e);
|
| 137 |
+
return result;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
/**
|
| 141 |
+
* Returns all entries that are marked as signals (signal=true) for
|
| 142 |
+
* the given bias category, or all signal entries if biasType is null/empty.
|
| 143 |
+
*/
|
| 144 |
+
public List<BiasEntry> getSignals(String biasType) {
|
| 145 |
+
List<BiasEntry> result = new ArrayList<>();
|
| 146 |
+
for (BiasEntry e : entries) {
|
| 147 |
+
if (!e.isSignal()) continue;
|
| 148 |
+
if (biasType == null || biasType.isBlank()
|
| 149 |
+
|| e.getBiasType().isEmpty()
|
| 150 |
+
|| e.getBiasType().equalsIgnoreCase(biasType))
|
| 151 |
+
result.add(e);
|
| 152 |
+
}
|
| 153 |
+
return result;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
/** Returns an unmodifiable view of all loaded entries. */
|
| 157 |
+
public Collection<BiasEntry> getAll() {
|
| 158 |
+
return Collections.unmodifiableList(entries);
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
/** Number of loaded dictionary entries. */
|
| 162 |
+
public int size() { return entries.size(); }
|
| 163 |
+
|
| 164 |
+
// -----------------------------------------------------------------------
|
| 165 |
+
// Internal loading
|
| 166 |
+
// -----------------------------------------------------------------------
|
| 167 |
+
|
| 168 |
+
private void load(String tsvPath) {
|
| 169 |
+
try (BufferedReader br = new BufferedReader(
|
| 170 |
+
new InputStreamReader(new FileInputStream(tsvPath),
|
| 171 |
+
StandardCharsets.UTF_8))) {
|
| 172 |
+
|
| 173 |
+
String headerLine = br.readLine(); // skip header
|
| 174 |
+
if (headerLine == null) {
|
| 175 |
+
System.err.println("[BiasLexicon] Empty file: " + tsvPath);
|
| 176 |
+
return;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
String line;
|
| 180 |
+
int lineNum = 1; // already read header as line 1
|
| 181 |
+
|
| 182 |
+
while ((line = br.readLine()) != null) {
|
| 183 |
+
lineNum++;
|
| 184 |
+
if (line.isBlank()) { skippedLines++; continue; }
|
| 185 |
+
|
| 186 |
+
String[] cols = line.split("\t", -1);
|
| 187 |
+
|
| 188 |
+
// Minimum viable: need at least 10 columns
|
| 189 |
+
if (cols.length < 10) {
|
| 190 |
+
System.err.printf("[BiasLexicon] Line %d: only %d columns, skipping.%n",
|
| 191 |
+
lineNum, cols.length);
|
| 192 |
+
skippedLines++;
|
| 193 |
+
continue;
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
try {
|
| 197 |
+
String word = cols[0].trim();
|
| 198 |
+
String pos = cols[1].trim();
|
| 199 |
+
boolean signal = "true".equalsIgnoreCase(cols[2].trim());
|
| 200 |
+
String biasType = cols[3].trim();
|
| 201 |
+
String biasValue = cols[4].trim();
|
| 202 |
+
boolean derog = "true".equalsIgnoreCase(cols[5].trim());
|
| 203 |
+
boolean coll = "true".equalsIgnoreCase(cols[6].trim());
|
| 204 |
+
// cols[7] is a boolean forms-flag (ignored)
|
| 205 |
+
double positivity = parseDouble(cols[8], lineNum);
|
| 206 |
+
double negativity = parseDouble(cols[9], lineNum);
|
| 207 |
+
|
| 208 |
+
// Inflected forms: pipe-separated in col 10 (if present)
|
| 209 |
+
Set<String> formsSet = new HashSet<>();
|
| 210 |
+
formsSet.add(word.toLowerCase()); // always include the lemma
|
| 211 |
+
|
| 212 |
+
if (cols.length > 10 && !cols[10].isBlank()) {
|
| 213 |
+
for (String f : cols[10].split("\\|")) {
|
| 214 |
+
String fc = f.trim().toLowerCase();
|
| 215 |
+
if (!fc.isEmpty()) formsSet.add(fc);
|
| 216 |
+
}
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
BiasEntry entry = new BiasEntry(word, pos, signal,
|
| 220 |
+
biasType, biasValue, derog, coll,
|
| 221 |
+
positivity, negativity, formsSet);
|
| 222 |
+
|
| 223 |
+
entries.add(entry);
|
| 224 |
+
wordIndex.put(word.toLowerCase(), entry);
|
| 225 |
+
|
| 226 |
+
for (String form : formsSet) {
|
| 227 |
+
if (formIndex.containsKey(form)) {
|
| 228 |
+
formConflicts++;
|
| 229 |
+
// Keep first entry — do not overwrite
|
| 230 |
+
} else {
|
| 231 |
+
formIndex.put(form, entry);
|
| 232 |
+
}
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
loadedEntries++;
|
| 236 |
+
|
| 237 |
+
} catch (Exception e) {
|
| 238 |
+
System.err.printf("[BiasLexicon] Line %d: parse error — %s%n",
|
| 239 |
+
lineNum, e.getMessage());
|
| 240 |
+
skippedLines++;
|
| 241 |
+
}
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
} catch (Exception e) {
|
| 245 |
+
throw new RuntimeException("Failed to load bias lexicon from: " + tsvPath, e);
|
| 246 |
+
}
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
private double parseDouble(String s, int lineNum) {
|
| 250 |
+
try {
|
| 251 |
+
return Double.parseDouble(s.trim());
|
| 252 |
+
} catch (NumberFormatException e) {
|
| 253 |
+
System.err.printf("[BiasLexicon] Line %d: cannot parse double '%s', using 0.0%n",
|
| 254 |
+
lineNum, s);
|
| 255 |
+
return 0.0;
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
}
|
java/bg/bas/dcl/LLMs/BulgarianSentenceSplitter.java
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileInputStream;
|
| 5 |
+
import java.io.InputStream;
|
| 6 |
+
import java.util.ArrayList;
|
| 7 |
+
import java.util.Arrays;
|
| 8 |
+
import java.util.List;
|
| 9 |
+
|
| 10 |
+
import opennlp.tools.sentdetect.SentenceDetectorME;
|
| 11 |
+
import opennlp.tools.sentdetect.SentenceModel;
|
| 12 |
+
|
| 13 |
+
/**
|
| 14 |
+
* BulgarianSentenceSplitter
|
| 15 |
+
*
|
| 16 |
+
* Wraps the Apache OpenNLP sentence detection model for Bulgarian, providing
|
| 17 |
+
* a clean, reusable API for all other pipeline components.
|
| 18 |
+
*
|
| 19 |
+
* -----------------------------------------------------------------------
|
| 20 |
+
* MAVEN DEPENDENCIES (add to pom.xml):
|
| 21 |
+
*
|
| 22 |
+
* <!-- OpenNLP toolkit -->
|
| 23 |
+
* <dependency>
|
| 24 |
+
* <groupId>org.apache.opennlp</groupId>
|
| 25 |
+
* <artifactId>opennlp-tools</artifactId>
|
| 26 |
+
* <version>2.4.0</version>
|
| 27 |
+
* </dependency>
|
| 28 |
+
*
|
| 29 |
+
* <!-- Bulgarian sentence-detection model (UD-based, Apache 2.0) -->
|
| 30 |
+
* <dependency>
|
| 31 |
+
* <groupId>org.apache.opennlp</groupId>
|
| 32 |
+
* <artifactId>opennlp-models-sentdetect-bg</artifactId>
|
| 33 |
+
* <version>1.2</version>
|
| 34 |
+
* </dependency>
|
| 35 |
+
*
|
| 36 |
+
* The model JAR bundles the binary model at:
|
| 37 |
+
* opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin
|
| 38 |
+
* You can also supply an external model file via the two-argument constructor.
|
| 39 |
+
*
|
| 40 |
+
* -------------------------------------------------
|
| 41 |
+
*/
|
| 42 |
+
public class BulgarianSentenceSplitter {
|
| 43 |
+
|
| 44 |
+
// -----------------------------------------------------------------------
|
| 45 |
+
// Constants
|
| 46 |
+
// -----------------------------------------------------------------------
|
| 47 |
+
|
| 48 |
+
/**
|
| 49 |
+
* Classpath location of the bundled Bulgarian sentence-detection model.
|
| 50 |
+
* Matches the path inside the opennlp-models-sentdetect-bg JAR.
|
| 51 |
+
*/
|
| 52 |
+
private static final String BUNDLED_MODEL_PATH =
|
| 53 |
+
"opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin";
|
| 54 |
+
|
| 55 |
+
/**
|
| 56 |
+
* Minimum character length for a string to be considered a valid sentence.
|
| 57 |
+
* Shorter strings are returned as-is without splitting.
|
| 58 |
+
*/
|
| 59 |
+
private static final int MIN_TEXT_LENGTH = 5;
|
| 60 |
+
|
| 61 |
+
// -----------------------------------------------------------------------
|
| 62 |
+
// State
|
| 63 |
+
// -----------------------------------------------------------------------
|
| 64 |
+
|
| 65 |
+
private final SentenceDetectorME detector;
|
| 66 |
+
|
| 67 |
+
// -----------------------------------------------------------------------
|
| 68 |
+
// Constructors
|
| 69 |
+
// -----------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Loads the Bulgarian sentence-detection model from the bundled Maven JAR.
|
| 73 |
+
* Requires the opennlp-models-sentdetect-bg artifact on the classpath.
|
| 74 |
+
*
|
| 75 |
+
* @throws RuntimeException if the model cannot be loaded
|
| 76 |
+
*/
|
| 77 |
+
public BulgarianSentenceSplitter() {
|
| 78 |
+
this(null);
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
/**
|
| 82 |
+
* Loads the Bulgarian sentence-detection model.
|
| 83 |
+
*
|
| 84 |
+
* @param modelPath absolute path to a .bin OpenNLP sentence-detection model,
|
| 85 |
+
* or {@code null} / empty string to load from the classpath JAR
|
| 86 |
+
* @throws RuntimeException if the model cannot be loaded
|
| 87 |
+
*/
|
| 88 |
+
public BulgarianSentenceSplitter(String modelPath) {
|
| 89 |
+
try {
|
| 90 |
+
InputStream stream;
|
| 91 |
+
|
| 92 |
+
if (modelPath == null || modelPath.isBlank()) {
|
| 93 |
+
// Load from the bundled JAR on the classpath
|
| 94 |
+
stream = getClass().getClassLoader()
|
| 95 |
+
.getResourceAsStream(BUNDLED_MODEL_PATH);
|
| 96 |
+
if (stream == null) {
|
| 97 |
+
throw new IllegalStateException(
|
| 98 |
+
"Bulgarian sentence model not found .");
|
| 99 |
+
}
|
| 100 |
+
System.out.println("[SentenceSplitter] Loaded bundled model: " + BUNDLED_MODEL_PATH);
|
| 101 |
+
} else {
|
| 102 |
+
File f = new File(modelPath);
|
| 103 |
+
if (!f.exists())
|
| 104 |
+
throw new IllegalArgumentException(
|
| 105 |
+
"Sentence model file not found: " + modelPath);
|
| 106 |
+
stream = new FileInputStream(f);
|
| 107 |
+
System.out.println("[SentenceSplitter] Loaded external model: " + modelPath);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
SentenceModel model = new SentenceModel(stream);
|
| 111 |
+
stream.close();
|
| 112 |
+
detector = new SentenceDetectorME(model);
|
| 113 |
+
|
| 114 |
+
} catch (Exception e) {
|
| 115 |
+
throw new RuntimeException("Failed to load Bulgarian sentence model", e);
|
| 116 |
+
}
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
// -----------------------------------------------------------------------
|
| 120 |
+
// Core API
|
| 121 |
+
// -----------------------------------------------------------------------
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
public String[] split(String text) {
|
| 125 |
+
if (text == null) return new String[0];
|
| 126 |
+
String trimmed = text.trim();
|
| 127 |
+
if (trimmed.length() < MIN_TEXT_LENGTH) {
|
| 128 |
+
return trimmed.isEmpty() ? new String[0] : new String[]{trimmed};
|
| 129 |
+
}
|
| 130 |
+
return detector.sentDetect(trimmed);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
public List<String> splitToList(String text) {
|
| 135 |
+
return new ArrayList<>(Arrays.asList(split(text)));
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
public List<String> splitParagraphs(String[] paragraphs) {
|
| 140 |
+
List<String> all = new ArrayList<>();
|
| 141 |
+
if (paragraphs == null) return all;
|
| 142 |
+
for (String para : paragraphs) {
|
| 143 |
+
if (para != null && !para.isBlank())
|
| 144 |
+
all.addAll(splitToList(para));
|
| 145 |
+
}
|
| 146 |
+
return all;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
public double[] getSentenceProbabilities() {
|
| 151 |
+
return detector.getSentenceProbabilities();
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
public List<String> splitAndFilter(String text, int minWords) {
|
| 156 |
+
List<String> result = new ArrayList<>();
|
| 157 |
+
for (String sent : split(text)) {
|
| 158 |
+
if (sent.split("\\s+").length >= minWords)
|
| 159 |
+
result.add(sent);
|
| 160 |
+
}
|
| 161 |
+
return result;
|
| 162 |
+
}
|
| 163 |
+
}
|
java/bg/bas/dcl/LLMs/DeduplicationProcessor.java
ADDED
|
@@ -0,0 +1,571 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileOutputStream;
|
| 5 |
+
import java.io.OutputStreamWriter;
|
| 6 |
+
import java.io.PrintWriter;
|
| 7 |
+
import java.io.Writer;
|
| 8 |
+
import java.nio.file.Files;
|
| 9 |
+
import java.nio.file.StandardCopyOption;
|
| 10 |
+
import java.util.ArrayList;
|
| 11 |
+
import java.util.Collections;
|
| 12 |
+
import java.util.HashMap;
|
| 13 |
+
import java.util.HashSet;
|
| 14 |
+
import java.util.LinkedHashMap;
|
| 15 |
+
import java.util.List;
|
| 16 |
+
import java.util.Map;
|
| 17 |
+
import java.util.Scanner;
|
| 18 |
+
import java.util.Set;
|
| 19 |
+
import java.util.TreeSet;
|
| 20 |
+
|
| 21 |
+
import info.debatty.java.lsh.MinHash;
|
| 22 |
+
|
| 23 |
+
import bg.bas.dcl.general.FileHandler;
|
| 24 |
+
|
| 25 |
+
/**
|
| 26 |
+
* DeduplicationProcessor — sentence-level near-duplicate detection
|
| 27 |
+
* using MinHash + LSH (Jaccard similarity).
|
| 28 |
+
*
|
| 29 |
+
* -----------------------------------------------------------------------
|
| 30 |
+
* MAVEN DEPENDENCY (add to pom.xml):
|
| 31 |
+
*
|
| 32 |
+
* <dependency>
|
| 33 |
+
* <groupId>info.debatty</groupId>
|
| 34 |
+
* <artifactId>java-lsh</artifactId>
|
| 35 |
+
* <version>0.12</version>
|
| 36 |
+
* </dependency>
|
| 37 |
+
*
|
| 38 |
+
* -----------------------------------------------------------------------
|
| 39 |
+
* HOW IT WORKS
|
| 40 |
+
*
|
| 41 |
+
* 1. INDEX phase — reads all .txt files in the "full corpus" directory.
|
| 42 |
+
* Each sentence is shingled into character n-grams, converted to a
|
| 43 |
+
* boolean vector over a shared vocabulary, and a MinHash signature
|
| 44 |
+
* is computed. All signatures are stored in an in-memory index keyed
|
| 45 |
+
* by (file, lineNumber).
|
| 46 |
+
*
|
| 47 |
+
* 2. QUERY phase — reads every sentence in the "new folder".
|
| 48 |
+
* For each sentence its MinHash signature is compared against every
|
| 49 |
+
* indexed corpus signature (approximate Jaccard via signature similarity).
|
| 50 |
+
* Pairs whose estimated Jaccard similarity ≥ threshold are reported.
|
| 51 |
+
*
|
| 52 |
+
* 3. REPORT — a TSV report is written listing every duplicate pair:
|
| 53 |
+
* new-file | new-line | corpus-file | corpus-line | similarity | sentence
|
| 54 |
+
*
|
| 55 |
+
* 4. OPTIONAL REMOVE — sentences in the new folder that are duplicates of
|
| 56 |
+
* corpus sentences are stripped from their file (originals backed up).
|
| 57 |
+
* Files that become empty after removal are deleted.
|
| 58 |
+
*
|
| 59 |
+
* -----------------------------------------------------------------------
|
| 60 |
+
* PARAMETERS
|
| 61 |
+
*
|
| 62 |
+
* threshold — Jaccard similarity to call a near-duplicate (default 0.90)
|
| 63 |
+
* shingleSize — character n-gram size for shingling (default 5)
|
| 64 |
+
* numHashes — number of hash functions for MinHash (default 200)
|
| 65 |
+
* More hashes → better accuracy, slower index.
|
| 66 |
+
*
|
| 67 |
+
* -----------------------------------------------------------------------
|
| 68 |
+
* USAGE
|
| 69 |
+
*
|
| 70 |
+
* DeduplicationProcessor dp = new DeduplicationProcessor(0.90);
|
| 71 |
+
* dp.indexCorpus("/path/to/full/corpus/");
|
| 72 |
+
* dp.detectDuplicates("/path/to/new/folder/", "/path/to/report.tsv");
|
| 73 |
+
* dp.removeDuplicatesFromNewFolder("/path/to/new/folder/", true); // true=keep .bak
|
| 74 |
+
*/
|
| 75 |
+
public class DeduplicationProcessor {
|
| 76 |
+
|
| 77 |
+
// -----------------------------------------------------------------------
|
| 78 |
+
// Configuration
|
| 79 |
+
// -----------------------------------------------------------------------
|
| 80 |
+
|
| 81 |
+
private final double threshold; // Jaccard similarity cut-off
|
| 82 |
+
private final int shingleSize; // character n-gram size
|
| 83 |
+
private final int numHashes; // MinHash signature length
|
| 84 |
+
|
| 85 |
+
// -----------------------------------------------------------------------
|
| 86 |
+
// Index state (built during indexCorpus)
|
| 87 |
+
// -----------------------------------------------------------------------
|
| 88 |
+
|
| 89 |
+
/** Shared vocabulary: every distinct shingle seen across all corpus sentences. */
|
| 90 |
+
private final Set<String> vocabulary = new HashSet<>();
|
| 91 |
+
|
| 92 |
+
/**
|
| 93 |
+
* Corpus index: maps SentenceKey → raw sentence text + MinHash signature.
|
| 94 |
+
* Built in two passes to allow vocabulary to be finalised before signing.
|
| 95 |
+
*/
|
| 96 |
+
private final Map<SentenceKey, IndexedSentence> corpusIndex = new LinkedHashMap<>();
|
| 97 |
+
|
| 98 |
+
/** MinHash object — initialised once vocabulary size is known. */
|
| 99 |
+
private MinHash minHash;
|
| 100 |
+
|
| 101 |
+
// -----------------------------------------------------------------------
|
| 102 |
+
// Duplicate results (populated by detectDuplicates)
|
| 103 |
+
// -----------------------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
/** All duplicate pairs found in the last detectDuplicates run. */
|
| 106 |
+
private final List<DuplicatePair> duplicatePairs = new ArrayList<>();
|
| 107 |
+
|
| 108 |
+
/**
|
| 109 |
+
* Set of SentenceKeys in the NEW folder that are duplicates.
|
| 110 |
+
* Used by removeDuplicatesFromNewFolder.
|
| 111 |
+
*/
|
| 112 |
+
private final Set<SentenceKey> duplicateNewSentences = new HashSet<>();
|
| 113 |
+
|
| 114 |
+
// -----------------------------------------------------------------------
|
| 115 |
+
// Constructor
|
| 116 |
+
// -----------------------------------------------------------------------
|
| 117 |
+
|
| 118 |
+
/**
 * Creates a processor with the default shingle size (5) and
 * MinHash signature length (200).
 *
 * @param threshold similarity cut-off in [0, 1]
 * @throws IllegalArgumentException if {@code threshold} is NaN or outside [0, 1]
 */
public DeduplicationProcessor(double threshold) {
    this(threshold, 5, 200);
}

/**
 * Creates a processor with explicit parameters.
 *
 * @param threshold   similarity cut-off in [0, 1]
 * @param shingleSize character n-gram size; must be at least 1
 * @param numHashes   MinHash signature length; must be at least 1
 * @throws IllegalArgumentException if {@code threshold} is NaN or outside
 *         [0, 1], or if {@code shingleSize} or {@code numHashes} is not positive
 */
public DeduplicationProcessor(double threshold, int shingleSize, int numHashes) {
    // Written as a negated range test so that NaN is rejected as well.
    if (!(threshold >= 0.0 && threshold <= 1.0))
        throw new IllegalArgumentException("Threshold must be in [0, 1].");
    if (shingleSize < 1)
        throw new IllegalArgumentException("Shingle size must be at least 1, got " + shingleSize);
    if (numHashes < 1)
        throw new IllegalArgumentException("Number of hashes must be at least 1, got " + numHashes);
    this.threshold   = threshold;
    this.shingleSize = shingleSize;
    this.numHashes   = numHashes;
}
|
| 129 |
+
|
| 130 |
+
// -----------------------------------------------------------------------
|
| 131 |
+
// Phase 1 — Index the full corpus
|
| 132 |
+
// -----------------------------------------------------------------------
|
| 133 |
+
|
| 134 |
+
/**
|
| 135 |
+
* Reads all .txt files in {@code corpusDir}, shingles every sentence,
|
| 136 |
+
* builds a shared vocabulary, and computes MinHash signatures.
|
| 137 |
+
*
|
| 138 |
+
* This must be called before {@link #detectDuplicates}.
|
| 139 |
+
*
|
| 140 |
+
* @param corpusDir directory of clean .txt files representing the full corpus
|
| 141 |
+
*/
|
| 142 |
+
public void indexCorpus(String corpusDir) {
|
| 143 |
+
System.out.println("[Index] Scanning corpus: " + corpusDir);
|
| 144 |
+
try {
|
| 145 |
+
FileHandler fh = new FileHandler();
|
| 146 |
+
|
| 147 |
+
// --- Pass 1: collect sentences and build vocabulary ---
|
| 148 |
+
// Temporary store: key → raw text + shingle set (signatures computed later)
|
| 149 |
+
Map<SentenceKey, Set<String>> rawShingles = new LinkedHashMap<>();
|
| 150 |
+
|
| 151 |
+
for (File f : fh.getFileListing(new File(corpusDir))) {
|
| 152 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 153 |
+
|
| 154 |
+
Scanner sc = new Scanner(f, "UTF-8");
|
| 155 |
+
int lineNum = 0;
|
| 156 |
+
while (sc.hasNextLine()) {
|
| 157 |
+
String line = sc.nextLine().trim();
|
| 158 |
+
lineNum++;
|
| 159 |
+
if (line.length() < shingleSize) continue;
|
| 160 |
+
|
| 161 |
+
Set<String> shingles = shingle(line);
|
| 162 |
+
vocabulary.addAll(shingles);
|
| 163 |
+
rawShingles.put(new SentenceKey(f.getName(), lineNum), shingles);
|
| 164 |
+
}
|
| 165 |
+
sc.close();
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
System.out.println("[Index] Vocabulary size: " + vocabulary.size()
|
| 169 |
+
+ " Sentences: " + rawShingles.size());
|
| 170 |
+
|
| 171 |
+
if (vocabulary.isEmpty()) {
|
| 172 |
+
System.err.println("[Index] No sentences found — aborting.");
|
| 173 |
+
return;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
// --- Initialise MinHash with finalised vocabulary size ---
|
| 177 |
+
// Error parameter 0.05 → ~400 hashes needed; we use numHashes directly.
|
| 178 |
+
// The debatty MinHash constructor accepts (error, dictSize).
|
| 179 |
+
// We use the lower-level approach: fix numHashes via the signature size.
|
| 180 |
+
// info.debatty MinHash(double error, int dictSize) chooses hash count itself.
|
| 181 |
+
// For explicit control we pass a small error so it aligns with numHashes.
|
| 182 |
+
minHash = new MinHash(numHashes, vocabulary.size());
|
| 183 |
+
|
| 184 |
+
// --- Pass 2: compute and store signatures ---
|
| 185 |
+
List<String> vocabList = new ArrayList<>(vocabulary);
|
| 186 |
+
corpusIndex.clear();
|
| 187 |
+
|
| 188 |
+
// Also keep a raw-text map for the report
|
| 189 |
+
Map<SentenceKey, String> rawTexts = new HashMap<>();
|
| 190 |
+
// re-scan to get raw text (we only stored shingles above)
|
| 191 |
+
for (File f : fh.getFileListing(new File(corpusDir))) {
|
| 192 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 193 |
+
Scanner sc = new Scanner(f, "UTF-8");
|
| 194 |
+
int lineNum = 0;
|
| 195 |
+
while (sc.hasNextLine()) {
|
| 196 |
+
String line = sc.nextLine().trim();
|
| 197 |
+
lineNum++;
|
| 198 |
+
if (line.length() < shingleSize) continue;
|
| 199 |
+
rawTexts.put(new SentenceKey(f.getName(), lineNum), line);
|
| 200 |
+
}
|
| 201 |
+
sc.close();
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
for (Map.Entry<SentenceKey, Set<String>> entry : rawShingles.entrySet()) {
|
| 205 |
+
SentenceKey key = entry.getKey();
|
| 206 |
+
boolean[] vector = toVector(entry.getValue(), vocabList);
|
| 207 |
+
int[] sig = minHash.signature(vector);
|
| 208 |
+
String rawText = rawTexts.getOrDefault(key, "");
|
| 209 |
+
corpusIndex.put(key, new IndexedSentence(rawText, sig));
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
System.out.println("[Index] Corpus index built: "
|
| 213 |
+
+ corpusIndex.size() + " sentences.");
|
| 214 |
+
|
| 215 |
+
} catch (Exception e) {
|
| 216 |
+
e.printStackTrace();
|
| 217 |
+
}
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
// -----------------------------------------------------------------------
|
| 221 |
+
// Phase 2 — Detect duplicates in new folder
|
| 222 |
+
// -----------------------------------------------------------------------
|
| 223 |
+
|
| 224 |
+
/**
|
| 225 |
+
* Compares every sentence in {@code newDir} against the corpus index.
|
| 226 |
+
* Pairs with estimated Jaccard ≥ threshold are recorded as duplicates
|
| 227 |
+
* and written to {@code reportPath}.
|
| 228 |
+
*
|
| 229 |
+
* Call {@link #indexCorpus} first.
|
| 230 |
+
*
|
| 231 |
+
* @param newDir directory of new .txt files to check
|
| 232 |
+
* @param reportPath destination TSV report file
|
| 233 |
+
*/
|
| 234 |
+
public void detectDuplicates(String newDir, String reportPath) {
|
| 235 |
+
if (corpusIndex.isEmpty()) {
|
| 236 |
+
System.err.println("[Detect] Corpus index is empty. Call indexCorpus() first.");
|
| 237 |
+
return;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
System.out.println("[Detect] Comparing new folder against corpus index...");
|
| 241 |
+
duplicatePairs.clear();
|
| 242 |
+
duplicateNewSentences.clear();
|
| 243 |
+
|
| 244 |
+
List<String> vocabList = new ArrayList<>(vocabulary);
|
| 245 |
+
|
| 246 |
+
try {
|
| 247 |
+
FileHandler fh = new FileHandler();
|
| 248 |
+
|
| 249 |
+
for (File f : fh.getFileListing(new File(newDir))) {
|
| 250 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 251 |
+
|
| 252 |
+
System.out.println("[Detect] Checking: " + f.getName());
|
| 253 |
+
|
| 254 |
+
Scanner sc = new Scanner(f, "UTF-8");
|
| 255 |
+
int lineNum = 0;
|
| 256 |
+
|
| 257 |
+
while (sc.hasNextLine()) {
|
| 258 |
+
String line = sc.nextLine().trim();
|
| 259 |
+
lineNum++;
|
| 260 |
+
if (line.length() < shingleSize) continue;
|
| 261 |
+
|
| 262 |
+
Set<String> shingles = shingle(line);
|
| 263 |
+
|
| 264 |
+
// Only shingles already in vocabulary are meaningful
|
| 265 |
+
Set<String> filtered = new HashSet<>(shingles);
|
| 266 |
+
filtered.retainAll(vocabulary);
|
| 267 |
+
|
| 268 |
+
// If almost none of the shingles are in vocab → skip
|
| 269 |
+
// (the sentence is likely from a very different domain)
|
| 270 |
+
if (filtered.isEmpty()) continue;
|
| 271 |
+
|
| 272 |
+
boolean[] newVec = toVector(filtered, vocabList);
|
| 273 |
+
int[] newSig = minHash.signature(newVec);
|
| 274 |
+
|
| 275 |
+
SentenceKey newKey = new SentenceKey(f.getName(), lineNum);
|
| 276 |
+
|
| 277 |
+
// Compare against all corpus sentences
|
| 278 |
+
// For large corpora, replace this loop with an LSH band index
|
| 279 |
+
for (Map.Entry<SentenceKey, IndexedSentence> entry : corpusIndex.entrySet()) {
|
| 280 |
+
double sim = minHash.similarity(newSig, entry.getValue().signature);
|
| 281 |
+
if (sim >= threshold) {
|
| 282 |
+
DuplicatePair pair = new DuplicatePair(
|
| 283 |
+
newKey, line,
|
| 284 |
+
entry.getKey(), entry.getValue().text,
|
| 285 |
+
sim);
|
| 286 |
+
duplicatePairs.add(pair);
|
| 287 |
+
duplicateNewSentences.add(newKey);
|
| 288 |
+
// Don't break — report ALL corpus matches for transparency
|
| 289 |
+
}
|
| 290 |
+
}
|
| 291 |
+
}
|
| 292 |
+
sc.close();
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
System.out.println("[Detect] Duplicate sentence pairs found: "
|
| 296 |
+
+ duplicatePairs.size());
|
| 297 |
+
System.out.println("[Detect] Unique new sentences flagged: "
|
| 298 |
+
+ duplicateNewSentences.size());
|
| 299 |
+
|
| 300 |
+
writeReport(reportPath);
|
| 301 |
+
|
| 302 |
+
} catch (Exception e) {
|
| 303 |
+
e.printStackTrace();
|
| 304 |
+
}
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
// -----------------------------------------------------------------------
|
| 308 |
+
// Phase 3 — Optionally remove duplicates from new folder
|
| 309 |
+
// -----------------------------------------------------------------------
|
| 310 |
+
|
| 311 |
+
/**
|
| 312 |
+
* Removes from every file in {@code newDir} any sentence whose
|
| 313 |
+
* (file, lineNumber) is in the duplicate set detected by
|
| 314 |
+
* {@link #detectDuplicates}.
|
| 315 |
+
*
|
| 316 |
+
* Files that become empty after removal are deleted.
|
| 317 |
+
* Must be called after {@link #detectDuplicates}.
|
| 318 |
+
*
|
| 319 |
+
* @param newDir directory of new .txt files to clean
|
| 320 |
+
* @param keepBackup if true, originals are renamed to *.bak first
|
| 321 |
+
*/
|
| 322 |
+
public void removeDuplicatesFromNewFolder(String newDir, boolean keepBackup) {
|
| 323 |
+
if (duplicateNewSentences.isEmpty()) {
|
| 324 |
+
System.out.println("[Remove] No duplicates to remove.");
|
| 325 |
+
return;
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
System.out.println("[Remove] Removing "
|
| 329 |
+
+ duplicateNewSentences.size() + " duplicate sentences...");
|
| 330 |
+
|
| 331 |
+
try {
|
| 332 |
+
FileHandler fh = new FileHandler();
|
| 333 |
+
int filesModified = 0;
|
| 334 |
+
int totalRemoved = 0;
|
| 335 |
+
|
| 336 |
+
for (File f : fh.getFileListing(new File(newDir))) {
|
| 337 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 338 |
+
|
| 339 |
+
List<String> inputLines = new ArrayList<>();
|
| 340 |
+
Scanner sc = new Scanner(f, "UTF-8");
|
| 341 |
+
int lineNum = 0;
|
| 342 |
+
while (sc.hasNextLine()) {
|
| 343 |
+
inputLines.add(sc.nextLine());
|
| 344 |
+
lineNum++;
|
| 345 |
+
}
|
| 346 |
+
sc.close();
|
| 347 |
+
|
| 348 |
+
List<String> outputLines = new ArrayList<>();
|
| 349 |
+
int removed = 0;
|
| 350 |
+
|
| 351 |
+
for (int i = 0; i < inputLines.size(); i++) {
|
| 352 |
+
String trimmed = inputLines.get(i).trim();
|
| 353 |
+
// +1 because lineNum was 1-based during indexing
|
| 354 |
+
SentenceKey key = new SentenceKey(f.getName(), i + 1);
|
| 355 |
+
|
| 356 |
+
if (trimmed.length() >= shingleSize
|
| 357 |
+
&& duplicateNewSentences.contains(key)) {
|
| 358 |
+
removed++;
|
| 359 |
+
} else {
|
| 360 |
+
outputLines.add(inputLines.get(i));
|
| 361 |
+
}
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
if (removed > 0) {
|
| 365 |
+
if (keepBackup) {
|
| 366 |
+
Files.copy(f.toPath(),
|
| 367 |
+
new File(f.getAbsolutePath() + ".bak").toPath(),
|
| 368 |
+
StandardCopyOption.REPLACE_EXISTING);
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
// Check if file would become empty (only blank lines)
|
| 372 |
+
boolean allBlank = outputLines.stream()
|
| 373 |
+
.allMatch(String::isBlank);
|
| 374 |
+
|
| 375 |
+
if (allBlank) {
|
| 376 |
+
f.delete();
|
| 377 |
+
System.out.println("[Remove] Deleted (empty after dedup): "
|
| 378 |
+
+ f.getName());
|
| 379 |
+
} else {
|
| 380 |
+
Writer w = new OutputStreamWriter(
|
| 381 |
+
new FileOutputStream(f), "UTF-8");
|
| 382 |
+
for (String l : outputLines) {
|
| 383 |
+
w.write(l + "\n");
|
| 384 |
+
}
|
| 385 |
+
w.flush();
|
| 386 |
+
w.close();
|
| 387 |
+
System.out.println("[Remove] " + f.getName()
|
| 388 |
+
+ " — removed " + removed + " sentences.");
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
filesModified++;
|
| 392 |
+
totalRemoved += removed;
|
| 393 |
+
}
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
System.out.println("[Remove] Done. Files modified: " + filesModified
|
| 397 |
+
+ " Sentences removed: " + totalRemoved);
|
| 398 |
+
|
| 399 |
+
} catch (Exception e) {
|
| 400 |
+
e.printStackTrace();
|
| 401 |
+
}
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
// -----------------------------------------------------------------------
|
| 405 |
+
// Report writer
|
| 406 |
+
// -----------------------------------------------------------------------
|
| 407 |
+
|
| 408 |
+
private void writeReport(String reportPath) throws Exception {
|
| 409 |
+
try (PrintWriter pw = new PrintWriter(
|
| 410 |
+
new OutputStreamWriter(new FileOutputStream(reportPath), "UTF-8"))) {
|
| 411 |
+
|
| 412 |
+
// Header
|
| 413 |
+
pw.println("# DeduplicationProcessor report");
|
| 414 |
+
pw.println("# Threshold: " + threshold
|
| 415 |
+
+ " ShingleSize: " + shingleSize
|
| 416 |
+
+ " NumHashes: " + numHashes);
|
| 417 |
+
pw.println("# Duplicate pairs: " + duplicatePairs.size());
|
| 418 |
+
pw.println("# Unique new sentences flagged: " + duplicateNewSentences.size());
|
| 419 |
+
pw.println();
|
| 420 |
+
pw.println("NEW_FILE\tNEW_LINE\tCORPUS_FILE\tCORPUS_LINE\tSIMILARITY\tNEW_SENTENCE\tCORPUS_SENTENCE");
|
| 421 |
+
|
| 422 |
+
// Sort by similarity descending, then new file, then line
|
| 423 |
+
List<DuplicatePair> sorted = new ArrayList<>(duplicatePairs);
|
| 424 |
+
sorted.sort((a, b) -> {
|
| 425 |
+
int cmp = Double.compare(b.similarity, a.similarity);
|
| 426 |
+
if (cmp != 0) return cmp;
|
| 427 |
+
cmp = a.newKey.fileName.compareTo(b.newKey.fileName);
|
| 428 |
+
if (cmp != 0) return cmp;
|
| 429 |
+
return Integer.compare(a.newKey.lineNumber, b.newKey.lineNumber);
|
| 430 |
+
});
|
| 431 |
+
|
| 432 |
+
for (DuplicatePair p : sorted) {
|
| 433 |
+
pw.printf("%s\t%d\t%s\t%d\t%.4f\t%s\t%s%n",
|
| 434 |
+
p.newKey.fileName,
|
| 435 |
+
p.newKey.lineNumber,
|
| 436 |
+
p.corpusKey.fileName,
|
| 437 |
+
p.corpusKey.lineNumber,
|
| 438 |
+
p.similarity,
|
| 439 |
+
sanitiseTsv(p.newText),
|
| 440 |
+
sanitiseTsv(p.corpusText));
|
| 441 |
+
}
|
| 442 |
+
}
|
| 443 |
+
System.out.println("[Report] Written to: " + reportPath);
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
// -----------------------------------------------------------------------
|
| 447 |
+
// Shingling and vectorisation helpers
|
| 448 |
+
// -----------------------------------------------------------------------
|
| 449 |
+
|
| 450 |
+
/**
|
| 451 |
+
* Produces the set of character n-grams (shingles) for a sentence.
|
| 452 |
+
* Lowercased so matching is case-insensitive.
|
| 453 |
+
*/
|
| 454 |
+
private Set<String> shingle(String text) {
|
| 455 |
+
Set<String> shingles = new TreeSet<>();
|
| 456 |
+
String lower = text.toLowerCase();
|
| 457 |
+
for (int i = 0; i <= lower.length() - shingleSize; i++) {
|
| 458 |
+
shingles.add(lower.substring(i, i + shingleSize));
|
| 459 |
+
}
|
| 460 |
+
return shingles;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
/**
|
| 464 |
+
* Converts a shingle set to a boolean presence vector over the shared vocabulary.
|
| 465 |
+
*
|
| 466 |
+
* @param shingles shingle set for this sentence
|
| 467 |
+
* @param vocabList ordered list of all vocabulary shingles
|
| 468 |
+
* @return boolean[] where true = shingle present
|
| 469 |
+
*/
|
| 470 |
+
private boolean[] toVector(Set<String> shingles, List<String> vocabList) {
|
| 471 |
+
boolean[] vector = new boolean[vocabList.size()];
|
| 472 |
+
for (int i = 0; i < vocabList.size(); i++) {
|
| 473 |
+
vector[i] = shingles.contains(vocabList.get(i));
|
| 474 |
+
}
|
| 475 |
+
return vector;
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
// -----------------------------------------------------------------------
|
| 479 |
+
// Utility
|
| 480 |
+
// -----------------------------------------------------------------------
|
| 481 |
+
|
| 482 |
+
private String sanitiseTsv(String s) {
|
| 483 |
+
if (s == null) return "";
|
| 484 |
+
return s.replace("\t", " ").replace("\n", " ").replace("\r", "");
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
/** Returns an unmodifiable view of all detected duplicate pairs. */
|
| 488 |
+
public List<DuplicatePair> getDuplicatePairs() {
|
| 489 |
+
return Collections.unmodifiableList(duplicatePairs);
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
/** Returns the number of corpus sentences indexed. */
|
| 493 |
+
public int getCorpusSize() {
|
| 494 |
+
return corpusIndex.size();
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
// -----------------------------------------------------------------------
|
| 498 |
+
// Inner data classes
|
| 499 |
+
// -----------------------------------------------------------------------
|
| 500 |
+
|
| 501 |
+
/**
|
| 502 |
+
* Uniquely identifies a sentence by its source file name and line number.
|
| 503 |
+
*/
|
| 504 |
+
public static class SentenceKey {
|
| 505 |
+
public final String fileName;
|
| 506 |
+
public final int lineNumber;
|
| 507 |
+
|
| 508 |
+
public SentenceKey(String fileName, int lineNumber) {
|
| 509 |
+
this.fileName = fileName;
|
| 510 |
+
this.lineNumber = lineNumber;
|
| 511 |
+
}
|
| 512 |
+
|
| 513 |
+
@Override
|
| 514 |
+
public boolean equals(Object o) {
|
| 515 |
+
if (!(o instanceof SentenceKey)) return false;
|
| 516 |
+
SentenceKey other = (SentenceKey) o;
|
| 517 |
+
return lineNumber == other.lineNumber
|
| 518 |
+
&& fileName.equals(other.fileName);
|
| 519 |
+
}
|
| 520 |
+
|
| 521 |
+
@Override
|
| 522 |
+
public int hashCode() {
|
| 523 |
+
return 31 * fileName.hashCode() + lineNumber;
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
@Override
|
| 527 |
+
public String toString() {
|
| 528 |
+
return fileName + ":" + lineNumber;
|
| 529 |
+
}
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
/**
|
| 533 |
+
* Holds the raw text and MinHash signature for an indexed corpus sentence.
|
| 534 |
+
*/
|
| 535 |
+
private static class IndexedSentence {
|
| 536 |
+
final String text;
|
| 537 |
+
final int[] signature;
|
| 538 |
+
|
| 539 |
+
IndexedSentence(String text, int[] signature) {
|
| 540 |
+
this.text = text;
|
| 541 |
+
this.signature = signature;
|
| 542 |
+
}
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
/**
|
| 546 |
+
* Represents a detected near-duplicate pair between a new sentence
|
| 547 |
+
* and a corpus sentence.
|
| 548 |
+
*/
|
| 549 |
+
public static class DuplicatePair {
|
| 550 |
+
public final SentenceKey newKey;
|
| 551 |
+
public final String newText;
|
| 552 |
+
public final SentenceKey corpusKey;
|
| 553 |
+
public final String corpusText;
|
| 554 |
+
public final double similarity;
|
| 555 |
+
|
| 556 |
+
public DuplicatePair(SentenceKey newKey, String newText,
|
| 557 |
+
SentenceKey corpusKey, String corpusText,
|
| 558 |
+
double similarity) {
|
| 559 |
+
this.newKey = newKey;
|
| 560 |
+
this.newText = newText;
|
| 561 |
+
this.corpusKey = corpusKey;
|
| 562 |
+
this.corpusText = corpusText;
|
| 563 |
+
this.similarity = similarity;
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
@Override
|
| 567 |
+
public String toString() {
|
| 568 |
+
return String.format("[%.2f] %s ↔ %s", similarity, newKey, corpusKey);
|
| 569 |
+
}
|
| 570 |
+
}
|
| 571 |
+
}
|
java/bg/bas/dcl/LLMs/FileCleanProcessor.java
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileOutputStream;
|
| 5 |
+
import java.io.OutputStreamWriter;
|
| 6 |
+
import java.io.PrintWriter;
|
| 7 |
+
import java.io.Writer;
|
| 8 |
+
import java.nio.file.Files;
|
| 9 |
+
import java.nio.file.StandardCopyOption;
|
| 10 |
+
import java.util.ArrayList;
|
| 11 |
+
import java.util.Arrays;
|
| 12 |
+
import java.util.HashMap;
|
| 13 |
+
import java.util.HashSet;
|
| 14 |
+
import java.util.LinkedHashMap;
|
| 15 |
+
import java.util.List;
|
| 16 |
+
import java.util.Map;
|
| 17 |
+
import java.util.Scanner;
|
| 18 |
+
import java.util.Set;
|
| 19 |
+
import java.util.regex.Pattern;
|
| 20 |
+
|
| 21 |
+
import bg.bas.dcl.general.FileHandler;
|
| 22 |
+
|
| 23 |
+
/**
|
| 24 |
+
* FileCleanProcessor — corpus boilerplate remover.
|
| 25 |
+
*
|
| 26 |
+
* Two-phase cleaning:
|
| 27 |
+
*
|
| 28 |
+
* Phase 1 — LEARN (from a sample directory):
|
| 29 |
+
* Scans every .txt file in the sample dir and records how many files each
|
| 30 |
+
* non-empty line appears in. Lines that appear in ≥ THRESHOLD of the
|
| 31 |
+
* sample files are added to the "common lines" blocklist.
|
| 32 |
+
* The blocklist is also saved to disk for inspection / reuse.
|
| 33 |
+
*
|
| 34 |
+
* Phase 2 — CLEAN (over the full data directory):
|
| 35 |
+
* For every .txt file, removes lines that:
|
| 36 |
+
* (a) appear in the learned common-lines blocklist, OR
|
| 37 |
+
* (b) match any of the hardcoded boilerplate regex patterns
|
| 38 |
+
* (HTML/XML tags, PHP markers, navigation patterns,
|
| 39 |
+
* URLs, e-mail addresses, cookie/GDPR banners).
|
| 40 |
+
* Cleaned files overwrite the originals (a .bak backup is kept by default).
|
| 41 |
+
*
|
| 42 |
+
* Usage:
|
| 43 |
+
* FileCleanProcessor fcp = new FileCleanProcessor(0.50); // 50 % threshold
|
| 44 |
+
* fcp.learnFromSample("/path/to/sample/dir/");
|
| 45 |
+
* fcp.saveBlocklist("/path/to/blocklist.txt"); // optional
|
| 46 |
+
* fcp.cleanDirectory("/path/to/full/data/dir/", true); // true = keep .bak
|
| 47 |
+
*/
|
| 48 |
+
public class FileCleanProcessor {
|
| 49 |
+
|
| 50 |
+
// -----------------------------------------------------------------------
|
| 51 |
+
// Configuration
|
| 52 |
+
// -----------------------------------------------------------------------
|
| 53 |
+
|
| 54 |
+
/** Fraction of sample files a line must appear in to be considered boilerplate. */
|
| 55 |
+
private final double threshold;
|
| 56 |
+
|
| 57 |
+
/** Minimum non-whitespace characters a line must have to be evaluated (avoids
|
| 58 |
+
* treating every blank separator the same way). */
|
| 59 |
+
private static final int MIN_LINE_LENGTH = 3;
|
| 60 |
+
|
| 61 |
+
// -----------------------------------------------------------------------
|
| 62 |
+
// State
|
| 63 |
+
// -----------------------------------------------------------------------
|
| 64 |
+
|
| 65 |
+
/** Lines found to be common across the sample (Phase 1 output). */
|
| 66 |
+
private final Set<String> commonLines = new HashSet<>();
|
| 67 |
+
|
| 68 |
+
/** Diagnostic: line → number of sample files it appeared in. */
|
| 69 |
+
private final Map<String, Integer> lineFrequency = new LinkedHashMap<>();
|
| 70 |
+
|
| 71 |
+
// -----------------------------------------------------------------------
|
| 72 |
+
// Hardcoded boilerplate patterns (always applied regardless of frequency)
|
| 73 |
+
// -----------------------------------------------------------------------
|
| 74 |
+
|
| 75 |
+
private static final List<Pattern> BOILERPLATE_PATTERNS = Arrays.asList(
|
| 76 |
+
|
| 77 |
+
// ---- HTML / XML tags ------------------------------------------------
|
| 78 |
+
Pattern.compile("(?i)^\\s*<[^>]+>\\s*$"), // whole-line tag
|
| 79 |
+
Pattern.compile("(?i).*<(script|style|head|meta|link|iframe)[^>]*>.*"),
|
| 80 |
+
Pattern.compile("(?i).*</(script|style|head|body|html)>.*"),
|
| 81 |
+
Pattern.compile("(?i).*<!--.*-->.*"), // HTML comment
|
| 82 |
+
Pattern.compile("(?i).*&(nbsp|amp|lt|gt|quot|apos);.*"), // HTML entities
|
| 83 |
+
|
| 84 |
+
// ---- PHP / server-side markers --------------------------------------
|
| 85 |
+
Pattern.compile("(?i).*<\\?php.*"),
|
| 86 |
+
Pattern.compile("(?i).*\\?>\\s*"),
|
| 87 |
+
Pattern.compile("(?i).*<%.*%>.*"), // ASP-style tags
|
| 88 |
+
|
| 89 |
+
// ---- Navigation / menu patterns ------------------------------------
|
| 90 |
+
Pattern.compile("(?i)^\\s*(home|начало|меню|menu|навигация|navigation"
|
| 91 |
+
+ "|търсене|search|вход|login|изход|logout"
|
| 92 |
+
+ "|регистрация|register|контакти|contacts"
|
| 93 |
+
+ "|за нас|about us|sitemap|карта на сайта)\\s*$"),
|
| 94 |
+
Pattern.compile("(?i)^\\s*(next|prev|previous|следващ|предишен"
|
| 95 |
+
+ "|напред|назад|нагоре|back|forward|top|горе)\\s*$"),
|
| 96 |
+
Pattern.compile("(?i)^\\s*\\|\\s*(.*\\|\\s*)+$"), // pipe-separated nav bars
|
| 97 |
+
Pattern.compile("(?i)^\\s*(>\\s*){2,}"), // breadcrumb: A > B > C
|
| 98 |
+
Pattern.compile("(?i)^\\s*(\\d+\\.?\\s+){3,}$"), // numbered nav lists
|
| 99 |
+
|
| 100 |
+
// ---- URLs ----------------------------------------------------------
|
| 101 |
+
Pattern.compile("(?i)\\bhttps?://\\S+"),
|
| 102 |
+
Pattern.compile("(?i)\\bwww\\.\\S+\\.\\S+"),
|
| 103 |
+
Pattern.compile("(?i)\\bftp://\\S+"),
|
| 104 |
+
|
| 105 |
+
// ---- E-mail addresses ----------------------------------------------
|
| 106 |
+
Pattern.compile("[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}"),
|
| 107 |
+
|
| 108 |
+
// ---- Cookie / GDPR banners -----------------------------------------
|
| 109 |
+
Pattern.compile("(?i).*(бисквитки|cookies|gdpr|privacy policy|поверителност"
|
| 110 |
+
+ "|приемам|accept all|отхвърлям|decline|consent"
|
| 111 |
+
+ "|лични данни|personal data|условия за ползване"
|
| 112 |
+
+ "|terms of (use|service)|политика за).*"),
|
| 113 |
+
|
| 114 |
+
// ---- Social / sharing buttons --------------------------------------
|
| 115 |
+
Pattern.compile("(?i)^\\s*(share|сподели|like|харесай|tweet|retweet"
|
| 116 |
+
+ "|pinterest|linkedin|facebook|twitter|instagram"
|
| 117 |
+
+ "|google\\+?|youtube|tiktok|viber|whatsapp)\\s*$"),
|
| 118 |
+
|
| 119 |
+
// ---- Counters / analytics snippets ---------------------------------
|
| 120 |
+
Pattern.compile("(?i).*google.analytics.*"),
|
| 121 |
+
Pattern.compile("(?i).*ga\\s*\\(\\s*['\"].*"),
|
| 122 |
+
Pattern.compile("(?i).*gtag\\s*\\(.*"),
|
| 123 |
+
Pattern.compile("(?i).*_gaq\\.push.*"),
|
| 124 |
+
|
| 125 |
+
// ---- Print / date / page artefacts ---------------------------------
|
| 126 |
+
Pattern.compile("(?i)^\\s*страница\\s+\\d+\\s*(от\\s+\\d+)?\\s*$"), // "страница 1 от 5"
|
| 127 |
+
Pattern.compile("(?i)^\\s*page\\s+\\d+\\s*(of\\s+\\d+)?\\s*$"),
|
| 128 |
+
Pattern.compile("(?i)^\\s*©.*$"), // copyright line
|
| 129 |
+
Pattern.compile("(?i)^\\s*all rights reserved.*$"),
|
| 130 |
+
Pattern.compile("(?i)^\\s*права запазени.*$"),
|
| 131 |
+
|
| 132 |
+
// ---- Lines that are purely punctuation / symbols -------------------
|
| 133 |
+
Pattern.compile("^[\\s\\p{Punct}\\|\\-_=*~`^]+$")
|
| 134 |
+
);
|
| 135 |
+
|
| 136 |
+
// -----------------------------------------------------------------------
|
| 137 |
+
// Constructor
|
| 138 |
+
// -----------------------------------------------------------------------
|
| 139 |
+
|
| 140 |
+
/**
 * Creates a processor with the given boilerplate threshold.
 *
 * @param threshold fraction [0,1] of sample files a line must appear in
 *                  to be added to the blocklist (e.g. 0.50 for 50 %).
 * @throws IllegalArgumentException if {@code threshold} is outside [0, 1]
 *                                  or is NaN
 */
public FileCleanProcessor(double threshold) {
    // Phrased as a negated range check: every comparison with NaN is false,
    // so "threshold < 0 || threshold > 1" would let NaN through.
    if (!(threshold >= 0 && threshold <= 1)) {
        throw new IllegalArgumentException("Threshold must be in [0, 1].");
    }
    this.threshold = threshold;
}
|
| 149 |
+
|
| 150 |
+
// -----------------------------------------------------------------------
|
| 151 |
+
// Phase 1 — Learn from sample
|
| 152 |
+
// -----------------------------------------------------------------------
|
| 153 |
+
|
| 154 |
+
/**
|
| 155 |
+
* Scans all .txt files in {@code sampleDir}, counts how many files each
|
| 156 |
+
* trimmed non-empty line appears in, and populates {@link #commonLines}
|
| 157 |
+
* with those meeting the threshold.
|
| 158 |
+
*
|
| 159 |
+
* @param sampleDir directory containing representative sample .txt files
|
| 160 |
+
*/
|
| 161 |
+
public void learnFromSample(String sampleDir) {
|
| 162 |
+
try {
|
| 163 |
+
FileHandler fh = new FileHandler();
|
| 164 |
+
List<File> sampleFiles = new ArrayList<>();
|
| 165 |
+
|
| 166 |
+
for (File f : fh.getFileListing(new File(sampleDir))) {
|
| 167 |
+
if (f.isFile() && f.getName().endsWith(".txt"))
|
| 168 |
+
sampleFiles.add(f);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
int total = sampleFiles.size();
|
| 172 |
+
if (total == 0) {
|
| 173 |
+
System.err.println("[LearnPhase] No .txt files found in: " + sampleDir);
|
| 174 |
+
return;
|
| 175 |
+
}
|
| 176 |
+
System.out.println("[LearnPhase] Scanning " + total + " sample files...");
|
| 177 |
+
|
| 178 |
+
// For each file, collect the *distinct* lines it contains so a
|
| 179 |
+
// repeated line inside one document only counts once.
|
| 180 |
+
Map<String, Integer> fileCount = new HashMap<>();
|
| 181 |
+
|
| 182 |
+
for (File f : sampleFiles) {
|
| 183 |
+
Set<String> seenInFile = new HashSet<>();
|
| 184 |
+
Scanner s = new Scanner(f, "UTF-8");
|
| 185 |
+
while (s.hasNextLine()) {
|
| 186 |
+
String line = s.nextLine().trim();
|
| 187 |
+
if (line.length() < MIN_LINE_LENGTH) continue;
|
| 188 |
+
if (seenInFile.add(line)) { // first occurrence in this file
|
| 189 |
+
fileCount.merge(line, 1, Integer::sum);
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
s.close();
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
// Apply threshold
|
| 196 |
+
commonLines.clear();
|
| 197 |
+
lineFrequency.clear();
|
| 198 |
+
|
| 199 |
+
double cutoff = threshold * total;
|
| 200 |
+
for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
|
| 201 |
+
lineFrequency.put(entry.getKey(), entry.getValue());
|
| 202 |
+
if (entry.getValue() >= cutoff) {
|
| 203 |
+
commonLines.add(entry.getKey());
|
| 204 |
+
}
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
System.out.println("[LearnPhase] Common lines identified: " + commonLines.size()
|
| 208 |
+
+ " (threshold=" + (int)(threshold * 100) + "%, files=" + total + ")");
|
| 209 |
+
|
| 210 |
+
} catch (Exception e) {
|
| 211 |
+
e.printStackTrace();
|
| 212 |
+
}
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
/**
|
| 216 |
+
* Replaces the learned common-lines set with a pre-built one.
|
| 217 |
+
* Useful when loading a previously saved blocklist.
|
| 218 |
+
*
|
| 219 |
+
* @param lines set of exact line strings to treat as boilerplate
|
| 220 |
+
*/
|
| 221 |
+
public void setCommonLines(Set<String> lines) {
|
| 222 |
+
commonLines.clear();
|
| 223 |
+
commonLines.addAll(lines);
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
// -----------------------------------------------------------------------
|
| 227 |
+
// Blocklist persistence
|
| 228 |
+
// -----------------------------------------------------------------------
|
| 229 |
+
|
| 230 |
+
/**
|
| 231 |
+
* Saves the learned blocklist to a plain-text file (one line per entry),
|
| 232 |
+
* preceded by a frequency comment for human review.
|
| 233 |
+
*
|
| 234 |
+
* @param outPath destination file path
|
| 235 |
+
*/
|
| 236 |
+
public void saveBlocklist(String outPath) {
|
| 237 |
+
try (PrintWriter pw = new PrintWriter(
|
| 238 |
+
new OutputStreamWriter(new FileOutputStream(outPath), "UTF-8"))) {
|
| 239 |
+
|
| 240 |
+
pw.println("# FileCleanProcessor blocklist");
|
| 241 |
+
pw.println("# threshold=" + threshold
|
| 242 |
+
+ " entries=" + commonLines.size());
|
| 243 |
+
pw.println("# Format: <frequency TAB line>");
|
| 244 |
+
pw.println();
|
| 245 |
+
|
| 246 |
+
// Sort by descending frequency for readability
|
| 247 |
+
lineFrequency.entrySet().stream()
|
| 248 |
+
.filter(e -> commonLines.contains(e.getKey()))
|
| 249 |
+
.sorted((a, b) -> b.getValue() - a.getValue())
|
| 250 |
+
.forEach(e -> pw.println(e.getValue() + "\t" + e.getKey()));
|
| 251 |
+
|
| 252 |
+
System.out.println("[Blocklist] Saved " + commonLines.size()
|
| 253 |
+
+ " entries to: " + outPath);
|
| 254 |
+
|
| 255 |
+
} catch (Exception e) {
|
| 256 |
+
e.printStackTrace();
|
| 257 |
+
}
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
/**
|
| 261 |
+
* Loads a blocklist previously saved by {@link #saveBlocklist}.
|
| 262 |
+
* Comment lines (starting with #) and blank lines are skipped.
|
| 263 |
+
*
|
| 264 |
+
* @param blocklistPath path to the blocklist file
|
| 265 |
+
*/
|
| 266 |
+
public void loadBlocklist(String blocklistPath) {
|
| 267 |
+
try {
|
| 268 |
+
commonLines.clear();
|
| 269 |
+
Scanner sc = new Scanner(new File(blocklistPath), "UTF-8");
|
| 270 |
+
while (sc.hasNextLine()) {
|
| 271 |
+
String line = sc.nextLine();
|
| 272 |
+
if (line.startsWith("#") || line.isBlank()) continue;
|
| 273 |
+
// Format: "<freq>\t<content>" or bare "<content>"
|
| 274 |
+
int tab = line.indexOf('\t');
|
| 275 |
+
String content = (tab >= 0) ? line.substring(tab + 1) : line;
|
| 276 |
+
if (!content.isBlank()) commonLines.add(content.trim());
|
| 277 |
+
}
|
| 278 |
+
sc.close();
|
| 279 |
+
System.out.println("[Blocklist] Loaded " + commonLines.size()
|
| 280 |
+
+ " entries from: " + blocklistPath);
|
| 281 |
+
} catch (Exception e) {
|
| 282 |
+
e.printStackTrace();
|
| 283 |
+
}
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
// -----------------------------------------------------------------------
|
| 287 |
+
// Phase 2 — Clean full directory
|
| 288 |
+
// -----------------------------------------------------------------------
|
| 289 |
+
|
| 290 |
+
/**
|
| 291 |
+
* Cleans every .txt file in {@code dataDir} by removing lines that are
|
| 292 |
+
* in the learned blocklist or match a hardcoded boilerplate pattern.
|
| 293 |
+
*
|
| 294 |
+
* @param dataDir directory containing corpus .txt files to clean
|
| 295 |
+
* @param keepBackup if true, originals are renamed to *.bak before overwriting
|
| 296 |
+
*/
|
| 297 |
+
public void cleanDirectory(String dataDir, boolean keepBackup) {
|
| 298 |
+
try {
|
| 299 |
+
if (commonLines.isEmpty()) {
|
| 300 |
+
System.out.println("[CleanPhase] Warning: no common lines loaded. "
|
| 301 |
+
+ "Only regex patterns will be applied.");
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
FileHandler fh = new FileHandler();
|
| 305 |
+
int processed = 0, linesRemoved = 0;
|
| 306 |
+
|
| 307 |
+
for (File f : fh.getFileListing(new File(dataDir))) {
|
| 308 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 309 |
+
|
| 310 |
+
CleanResult result = cleanFile(f, keepBackup);
|
| 311 |
+
processed++;
|
| 312 |
+
linesRemoved += result.linesRemoved;
|
| 313 |
+
|
| 314 |
+
if (result.linesRemoved > 0) {
|
| 315 |
+
System.out.println("[CleanPhase] " + f.getName()
|
| 316 |
+
+ " — removed " + result.linesRemoved + " lines.");
|
| 317 |
+
}
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
System.out.println("[CleanPhase] Done. Files processed: " + processed
|
| 321 |
+
+ " Total lines removed: " + linesRemoved);
|
| 322 |
+
|
| 323 |
+
} catch (Exception e) {
|
| 324 |
+
e.printStackTrace();
|
| 325 |
+
}
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
/**
|
| 329 |
+
* Cleans a single file in place.
|
| 330 |
+
*
|
| 331 |
+
* @param file the .txt file to clean
|
| 332 |
+
* @param keepBackup if true, a .bak copy of the original is kept
|
| 333 |
+
* @return CleanResult with statistics
|
| 334 |
+
*/
|
| 335 |
+
public CleanResult cleanFile(File file, boolean keepBackup) {
|
| 336 |
+
int removed = 0;
|
| 337 |
+
try {
|
| 338 |
+
// Read all lines
|
| 339 |
+
List<String> inputLines = new ArrayList<>();
|
| 340 |
+
Scanner sc = new Scanner(file, "UTF-8");
|
| 341 |
+
while (sc.hasNextLine()) inputLines.add(sc.nextLine());
|
| 342 |
+
sc.close();
|
| 343 |
+
|
| 344 |
+
// Filter
|
| 345 |
+
List<String> outputLines = new ArrayList<>();
|
| 346 |
+
for (String line : inputLines) {
|
| 347 |
+
if (shouldRemove(line)) {
|
| 348 |
+
removed++;
|
| 349 |
+
} else {
|
| 350 |
+
outputLines.add(line);
|
| 351 |
+
}
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
if (removed > 0) {
|
| 355 |
+
// Backup
|
| 356 |
+
if (keepBackup) {
|
| 357 |
+
File bak = new File(file.getAbsolutePath() + ".bak");
|
| 358 |
+
Files.copy(file.toPath(), bak.toPath(),
|
| 359 |
+
StandardCopyOption.REPLACE_EXISTING);
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
// Overwrite
|
| 363 |
+
Writer w = new OutputStreamWriter(
|
| 364 |
+
new FileOutputStream(file), "UTF-8");
|
| 365 |
+
for (String l : outputLines) {
|
| 366 |
+
w.write(l + "\n");
|
| 367 |
+
}
|
| 368 |
+
w.flush();
|
| 369 |
+
w.close();
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
} catch (Exception e) {
|
| 373 |
+
e.printStackTrace();
|
| 374 |
+
}
|
| 375 |
+
return new CleanResult(file, removed);
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
// -----------------------------------------------------------------------
|
| 379 |
+
// Core line decision
|
| 380 |
+
// -----------------------------------------------------------------------
|
| 381 |
+
|
| 382 |
+
/**
|
| 383 |
+
* Returns true if the line should be removed.
|
| 384 |
+
*
|
| 385 |
+
* A line is removed if:
|
| 386 |
+
* 1. Its trimmed form is in the learned common-lines blocklist, OR
|
| 387 |
+
* 2. It matches any hardcoded boilerplate regex pattern.
|
| 388 |
+
*
|
| 389 |
+
* Blank lines shorter than MIN_LINE_LENGTH are always kept so that
|
| 390 |
+
* paragraph structure is preserved.
|
| 391 |
+
*
|
| 392 |
+
* @param rawLine the original line from the file (not yet trimmed)
|
| 393 |
+
*/
|
| 394 |
+
public boolean shouldRemove(String rawLine) {
|
| 395 |
+
String trimmed = rawLine.trim();
|
| 396 |
+
|
| 397 |
+
// Always keep blank/very-short lines (paragraph separators)
|
| 398 |
+
if (trimmed.length() < MIN_LINE_LENGTH) return false;
|
| 399 |
+
|
| 400 |
+
// 1. Exact-match blocklist
|
| 401 |
+
if (commonLines.contains(trimmed)) return true;
|
| 402 |
+
|
| 403 |
+
// 2. Regex boilerplate patterns
|
| 404 |
+
for (Pattern p : BOILERPLATE_PATTERNS) {
|
| 405 |
+
if (p.matcher(trimmed).matches() || p.matcher(trimmed).find()) {
|
| 406 |
+
return true;
|
| 407 |
+
}
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
return false;
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
// -----------------------------------------------------------------------
|
| 414 |
+
// Diagnostic helpers
|
| 415 |
+
// -----------------------------------------------------------------------
|
| 416 |
+
|
| 417 |
+
/** Returns an unmodifiable view of the learned common-lines set. */
|
| 418 |
+
public Set<String> getCommonLines() {
|
| 419 |
+
return java.util.Collections.unmodifiableSet(commonLines);
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
/** Returns a copy of the frequency map (line → number of sample files). */
|
| 423 |
+
public Map<String, Integer> getLineFrequency() {
|
| 424 |
+
return java.util.Collections.unmodifiableMap(lineFrequency);
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
/**
|
| 428 |
+
* Prints a summary of the top {@code n} most-frequent common lines to stdout.
|
| 429 |
+
*/
|
| 430 |
+
public void printTopCommonLines(int n) {
|
| 431 |
+
System.out.println("--- Top " + n + " common lines (by sample frequency) ---");
|
| 432 |
+
lineFrequency.entrySet().stream()
|
| 433 |
+
.filter(e -> commonLines.contains(e.getKey()))
|
| 434 |
+
.sorted((a, b) -> b.getValue() - a.getValue())
|
| 435 |
+
.limit(n)
|
| 436 |
+
.forEach(e -> System.out.printf(" [%4d] %s%n", e.getValue(), e.getKey()));
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
// -----------------------------------------------------------------------
|
| 440 |
+
// Inner result class
|
| 441 |
+
// -----------------------------------------------------------------------
|
| 442 |
+
|
| 443 |
+
/** Simple value object returned by {@link #cleanFile}. */
|
| 444 |
+
public static class CleanResult {
|
| 445 |
+
public final File file;
|
| 446 |
+
public final int linesRemoved;
|
| 447 |
+
|
| 448 |
+
public CleanResult(File file, int linesRemoved) {
|
| 449 |
+
this.file = file;
|
| 450 |
+
this.linesRemoved = linesRemoved;
|
| 451 |
+
}
|
| 452 |
+
}
|
| 453 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/.BulNCProcessor.java.kate-swp
ADDED
|
Binary file (348 Bytes). View file
|
|
|
java/bg/bas/dcl/LLMs/IfGPTDataset/.CurlicatProcessor.java.kate-swp
ADDED
|
Binary file (98 Bytes). View file
|
|
|
java/bg/bas/dcl/LLMs/IfGPTDataset/BaseSourceProcessor.java
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.FileWriter;
|
| 4 |
+
import java.io.PrintWriter;
|
| 5 |
+
import java.util.ArrayList;
|
| 6 |
+
import java.util.LinkedHashSet;
|
| 7 |
+
|
| 8 |
+
import org.json.simple.JSONArray;
|
| 9 |
+
import org.json.simple.JSONObject;
|
| 10 |
+
|
| 11 |
+
import bg.bas.dcl.general.JSONProcessor;
|
| 12 |
+
|
| 13 |
+
import java.io.File;
|
| 14 |
+
|
| 15 |
+
/**
|
| 16 |
+
* Abstract base for all source processors.
|
| 17 |
+
*
|
| 18 |
+
* Provides shared utilities:
|
| 19 |
+
* - convertJsonToCSV: write a metadata JSONObject to a CSV file
|
| 20 |
+
* - estimateTokenCount: simple punctuation-aware token estimator
|
| 21 |
+
*
|
| 22 |
+
* Each concrete subclass implements {@link SourceProcessor#process(String, String)}
|
| 23 |
+
* with source-specific parsing logic.
|
| 24 |
+
*/
|
| 25 |
+
public abstract class BaseSourceProcessor implements SourceProcessor {
|
| 26 |
+
|
| 27 |
+
// -----------------------------------------------------------------------
|
| 28 |
+
// CSV export
|
| 29 |
+
// -----------------------------------------------------------------------
|
| 30 |
+
|
| 31 |
+
/**
|
| 32 |
+
* Reads a metadata.json file from disk and writes a CSV alongside it.
|
| 33 |
+
*
|
| 34 |
+
* @param metadataJsonPath path to the metadata JSON file
|
| 35 |
+
*/
|
| 36 |
+
public void convertJsonToCSV(String metadataJsonPath) {
|
| 37 |
+
try {
|
| 38 |
+
JSONProcessor pr = new JSONProcessor();
|
| 39 |
+
JSONObject json = pr.readJSON(new File(metadataJsonPath));
|
| 40 |
+
convertJsonToCSV(json, metadataJsonPath + "_CSV.csv");
|
| 41 |
+
} catch (Exception e) {
|
| 42 |
+
e.printStackTrace();
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
/**
|
| 47 |
+
* Writes the "metadata" array inside {@code json} to a CSV at {@code outCsvPath}.
|
| 48 |
+
* Reports structural inconsistencies (missing/extra fields) to stderr.
|
| 49 |
+
*
|
| 50 |
+
* @param json JSONObject that contains a "metadata" JSONArray
|
| 51 |
+
* @param outCsvPath destination CSV file path
|
| 52 |
+
*/
|
| 53 |
+
public void convertJsonToCSV(JSONObject json, String outCsvPath) {
|
| 54 |
+
try {
|
| 55 |
+
JSONArray array = (JSONArray) json.get("metadata");
|
| 56 |
+
|
| 57 |
+
if (array == null || array.isEmpty()) {
|
| 58 |
+
System.err.println("[INCONSISTENCY] 'metadata' array is null or empty in: " + outCsvPath);
|
| 59 |
+
return;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
// Collect all unique field names, preserving insertion order
|
| 63 |
+
LinkedHashSet<String> headersSet = new LinkedHashSet<>();
|
| 64 |
+
for (Object obj : array) {
|
| 65 |
+
if (obj instanceof JSONObject) {
|
| 66 |
+
headersSet.addAll(((JSONObject) obj).keySet());
|
| 67 |
+
} else {
|
| 68 |
+
System.err.println("[INCONSISTENCY] Non-JSONObject entry found in metadata array.");
|
| 69 |
+
}
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
ArrayList<String> headers = new ArrayList<>(headersSet);
|
| 73 |
+
|
| 74 |
+
try (PrintWriter writer = new PrintWriter(new FileWriter(outCsvPath))) {
|
| 75 |
+
|
| 76 |
+
// Header row
|
| 77 |
+
writer.println(String.join(",", headers));
|
| 78 |
+
|
| 79 |
+
// Data rows
|
| 80 |
+
for (int i = 0; i < array.size(); i++) {
|
| 81 |
+
Object obj = array.get(i);
|
| 82 |
+
|
| 83 |
+
if (!(obj instanceof JSONObject)) {
|
| 84 |
+
System.err.println("[INCONSISTENCY] Row " + i + " is not a JSONObject, skipping.");
|
| 85 |
+
continue;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
JSONObject row = (JSONObject) obj;
|
| 89 |
+
|
| 90 |
+
// Structural checks
|
| 91 |
+
for (String header : headers) {
|
| 92 |
+
if (!row.containsKey(header)) {
|
| 93 |
+
System.err.println("[INCONSISTENCY] Row " + i + " missing field: '" + header + "'");
|
| 94 |
+
}
|
| 95 |
+
}
|
| 96 |
+
for (Object key : row.keySet()) {
|
| 97 |
+
if (!headersSet.contains(key.toString())) {
|
| 98 |
+
System.err.println("[INCONSISTENCY] Row " + i + " has unexpected field: '" + key + "'");
|
| 99 |
+
}
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
// Build CSV line with RFC-4180 escaping
|
| 103 |
+
ArrayList<String> values = new ArrayList<>();
|
| 104 |
+
for (String header : headers) {
|
| 105 |
+
Object value = row.get(header);
|
| 106 |
+
if (value == null) {
|
| 107 |
+
values.add("");
|
| 108 |
+
} else {
|
| 109 |
+
String strVal = value.toString();
|
| 110 |
+
if (strVal.contains(",") || strVal.contains("\"") || strVal.contains("\n")) {
|
| 111 |
+
strVal = "\"" + strVal.replace("\"", "\"\"") + "\"";
|
| 112 |
+
}
|
| 113 |
+
values.add(strVal);
|
| 114 |
+
}
|
| 115 |
+
}
|
| 116 |
+
writer.println(String.join(",", values));
|
| 117 |
+
}
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
System.out.println("CSV written to: " + outCsvPath);
|
| 121 |
+
|
| 122 |
+
} catch (Exception e) {
|
| 123 |
+
e.printStackTrace();
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// -----------------------------------------------------------------------
|
| 128 |
+
// Shared helpers
|
| 129 |
+
// -----------------------------------------------------------------------
|
| 130 |
+
|
| 131 |
+
/**
|
| 132 |
+
* Estimates the number of tokens in a sentence by counting words plus
|
| 133 |
+
* standalone punctuation characters (.,;:?!()-).
|
| 134 |
+
*
|
| 135 |
+
* @param sentence whitespace-tokenised sentence string
|
| 136 |
+
* @return estimated token count
|
| 137 |
+
*/
|
| 138 |
+
protected int estimateTokenCount(String sentence) {
|
| 139 |
+
String[] words = sentence.split(" ");
|
| 140 |
+
int punctCount = sentence.length()
|
| 141 |
+
- sentence.replaceAll("[.,;:()?!\\-]", "").length();
|
| 142 |
+
return words.length + punctCount;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
/**
|
| 146 |
+
* Creates a JSONObject pre-populated with the metadata fields that are
|
| 147 |
+
* common to every source (counts start at 0).
|
| 148 |
+
*
|
| 149 |
+
* @param identifier unique document identifier
|
| 150 |
+
* @return partially initialised JSONObject
|
| 151 |
+
*/
|
| 152 |
+
@SuppressWarnings("unchecked")
|
| 153 |
+
protected JSONObject newBaseDescriptor(String identifier) {
|
| 154 |
+
JSONObject fdescr = new JSONObject();
|
| 155 |
+
fdescr.put("Identifier", identifier);
|
| 156 |
+
fdescr.put("Licence", "");
|
| 157 |
+
fdescr.put("LicenceLink", "");
|
| 158 |
+
fdescr.put("PublicationDate", "");
|
| 159 |
+
fdescr.put("DocumentTitle", "");
|
| 160 |
+
fdescr.put("Source", "");
|
| 161 |
+
fdescr.put("Author", "");
|
| 162 |
+
fdescr.put("Style", "");
|
| 163 |
+
fdescr.put("Type", "");
|
| 164 |
+
fdescr.put("Subdomain", "");
|
| 165 |
+
fdescr.put("TranslatedDocument", "");
|
| 166 |
+
fdescr.put("CollectionDate", "");
|
| 167 |
+
fdescr.put("Medium", "text");
|
| 168 |
+
fdescr.put("Url", "");
|
| 169 |
+
fdescr.put("Domain", "");
|
| 170 |
+
fdescr.put("Keywords", "");
|
| 171 |
+
fdescr.put("PersonallyIdentifiableInformation", "");
|
| 172 |
+
fdescr.put("BiasedInformation", "");
|
| 173 |
+
fdescr.put("TaskCategories", "");
|
| 174 |
+
fdescr.put("NumberWords", 0);
|
| 175 |
+
fdescr.put("NumberSentences", 0);
|
| 176 |
+
fdescr.put("NumberParagraphs", 0);
|
| 177 |
+
fdescr.put("NumberTokens", 0);
|
| 178 |
+
return fdescr;
|
| 179 |
+
}
|
| 180 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCProcessor.java
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileOutputStream;
|
| 5 |
+
import java.io.OutputStreamWriter;
|
| 6 |
+
import java.io.Writer;
|
| 7 |
+
import java.util.Scanner;
|
| 8 |
+
|
| 9 |
+
import org.json.simple.JSONArray;
|
| 10 |
+
import org.json.simple.JSONObject;
|
| 11 |
+
|
| 12 |
+
import bg.bas.dcl.monolingual.bg.TextProcessor;
|
| 13 |
+
|
| 14 |
+
/**
|
| 15 |
+
* Processes the Bulgarian National Corpus (BulNC) — general subcorpora.
|
| 16 |
+
*
|
| 17 |
+
* Unlike MARCELL/CURLICAT, BulNC metadata is supplied via an external
|
| 18 |
+
* tab-separated description file (BulNC-description.txt) rather than
|
| 19 |
+
* inline CoNLL-UP comments. Plain-text source files are read directly.
|
| 20 |
+
*
|
| 21 |
+
* Subcorpora included (controlled by {@link #isIncluded}):
|
| 22 |
+
* A-Administrative, B-Science, C-MassMedia, D-Fiction
|
| 23 |
+
* (edit the method to adjust the filter)
|
| 24 |
+
*
|
| 25 |
+
* SETimes articles are excluded regardless of subcorpus.
|
| 26 |
+
*
|
| 27 |
+
* Licence rules:
|
| 28 |
+
* A-Administrative → CC0
|
| 29 |
+
* B-Science → Restricted
|
| 30 |
+
* C-MassMedia → Restricted
|
| 31 |
+
* D-Fiction → Restricted
|
| 32 |
+
*
|
| 33 |
+
* Description file column indices (0-based):
|
| 34 |
+
* 0 filename stem | 1 relative path | 2 collection date
|
| 35 |
+
* 4 author | 8 title | 9 publication date
|
| 36 |
+
* 12 url | 13 translated | 17 type
|
| 37 |
+
* 19 domain | 21 subdomain (optional)
|
| 38 |
+
*/
|
| 39 |
+
public class BulNCProcessor extends BaseSourceProcessor {
|
| 40 |
+
|
| 41 |
+
private static final String CC0_LICENCE = "CC0";
|
| 42 |
+
private static final String CC0_LICENCE_LINK =
|
| 43 |
+
"https://creativecommons.org/public-domain/cc0/";
|
| 44 |
+
private static final String RESTRICTED = "Restricted";
|
| 45 |
+
|
| 46 |
+
private final String metaFilePath; // path to BulNC-description.txt
|
| 47 |
+
private final TextProcessor tp = new TextProcessor();
|
| 48 |
+
|
| 49 |
+
/**
|
| 50 |
+
* @param metaFilePath absolute path to BulNC-description.txt
|
| 51 |
+
*/
|
| 52 |
+
public BulNCProcessor(String metaFilePath) {
|
| 53 |
+
this.metaFilePath = metaFilePath;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
/**
|
| 57 |
+
* @param indir root directory of the BulNC corpus
|
| 58 |
+
* @param outdir output directory for .txt files and metadata
|
| 59 |
+
*/
|
| 60 |
+
@Override
|
| 61 |
+
public void process(String indir, String outdir) {
|
| 62 |
+
try {
|
| 63 |
+
JSONObject json = new JSONObject();
|
| 64 |
+
JSONArray descrArray = new JSONArray();
|
| 65 |
+
|
| 66 |
+
Scanner sme = new Scanner(new File(metaFilePath), "UTF-8");
|
| 67 |
+
while (sme.hasNextLine()) {
|
| 68 |
+
String[] dat = sme.nextLine().split("\t");
|
| 69 |
+
|
| 70 |
+
String relativePath = dat[1];
|
| 71 |
+
System.out.println("Checking: " + relativePath);
|
| 72 |
+
|
| 73 |
+
// --- Subcorpus filter ---
|
| 74 |
+
if (!isIncluded(relativePath)) continue;
|
| 75 |
+
|
| 76 |
+
// --- SETimes exclusion ---
|
| 77 |
+
if (dat[12].contains("setimes")) continue;
|
| 78 |
+
|
| 79 |
+
String fname = indir + relativePath;
|
| 80 |
+
File f = new File(fname);
|
| 81 |
+
if (!f.exists()) {
|
| 82 |
+
System.err.println("[MISSING] " + fname);
|
| 83 |
+
continue;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
String tfname = "bg_bnc_" + dat[0];
|
| 87 |
+
|
| 88 |
+
JSONObject fdescr = newBaseDescriptor(tfname);
|
| 89 |
+
applyLicence(fdescr, relativePath);
|
| 90 |
+
|
| 91 |
+
fdescr.put("PublicationDate", dat[9].replaceAll("\\.", "-"));
|
| 92 |
+
fdescr.put("DocumentTitle", dat[8]);
|
| 93 |
+
fdescr.put("Author", dat[4]);
|
| 94 |
+
fdescr.put("Style", "Administrative");
|
| 95 |
+
fdescr.put("Type", dat[17]);
|
| 96 |
+
fdescr.put("Subdomain", dat.length > 21 ? dat[21] : "");
|
| 97 |
+
fdescr.put("TranslatedDocument", dat[13]);
|
| 98 |
+
fdescr.put("CollectionDate", dat[2]);
|
| 99 |
+
fdescr.put("Url", dat[12]);
|
| 100 |
+
fdescr.put("Domain", dat[19]);
|
| 101 |
+
|
| 102 |
+
Writer out = new OutputStreamWriter(
|
| 103 |
+
new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
|
| 104 |
+
|
| 105 |
+
Scanner s = new Scanner(f, "UTF-8");
|
| 106 |
+
int nw = 0, ns = 0, np = 0, nt = 0;
|
| 107 |
+
|
| 108 |
+
while (s.hasNextLine()) {
|
| 109 |
+
String text = s.nextLine();
|
| 110 |
+
np++;
|
| 111 |
+
|
| 112 |
+
out.write(text + "\n");
|
| 113 |
+
out.flush();
|
| 114 |
+
|
| 115 |
+
for (String sent : tp.splitToSentences(text)) {
|
| 116 |
+
ns++;
|
| 117 |
+
String[] words = sent.split(" ");
|
| 118 |
+
nw += words.length;
|
| 119 |
+
nt += estimateTokenCount(sent);
|
| 120 |
+
}
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
s.close();
|
| 124 |
+
out.flush();
|
| 125 |
+
out.close();
|
| 126 |
+
|
| 127 |
+
fdescr.put("NumberWords", nw);
|
| 128 |
+
fdescr.put("NumberSentences", ns);
|
| 129 |
+
fdescr.put("NumberParagraphs", np);
|
| 130 |
+
fdescr.put("NumberTokens", nt);
|
| 131 |
+
|
| 132 |
+
descrArray.add(fdescr);
|
| 133 |
+
}
|
| 134 |
+
sme.close();
|
| 135 |
+
|
| 136 |
+
json.put("metadata", descrArray);
|
| 137 |
+
|
| 138 |
+
System.out.println("Total documents processed: " + descrArray.size());
|
| 139 |
+
writeMetadata(json, outdir, "metadata_BNC_mm.json");
|
| 140 |
+
|
| 141 |
+
} catch (Exception e) {
|
| 142 |
+
e.printStackTrace();
|
| 143 |
+
}
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
// -----------------------------------------------------------------------
|
| 147 |
+
// Helpers
|
| 148 |
+
// -----------------------------------------------------------------------
|
| 149 |
+
|
| 150 |
+
/**
|
| 151 |
+
* Returns true for subcorpora that should be processed.
|
| 152 |
+
* Edit this method to change the filter.
|
| 153 |
+
*/
|
| 154 |
+
protected boolean isIncluded(String relativePath) {
|
| 155 |
+
return relativePath.contains("C-MassMedia/");
|
| 156 |
+
// Uncomment to add more subcorpora:
|
| 157 |
+
// || relativePath.contains("A-Administrative/")
|
| 158 |
+
// || relativePath.contains("B-Science/")
|
| 159 |
+
// || relativePath.contains("D-Fiction/")
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
@SuppressWarnings("unchecked")
|
| 163 |
+
private void applyLicence(JSONObject fdescr, String relativePath) {
|
| 164 |
+
if (relativePath.contains("B-Science/")
|
| 165 |
+
|| relativePath.contains("C-MassMedia/")
|
| 166 |
+
|| relativePath.contains("D-Fiction/")) {
|
| 167 |
+
fdescr.put("Licence", RESTRICTED);
|
| 168 |
+
fdescr.put("LicenceLink", "");
|
| 169 |
+
} else {
|
| 170 |
+
fdescr.put("Licence", CC0_LICENCE);
|
| 171 |
+
fdescr.put("LicenceLink", CC0_LICENCE_LINK);
|
| 172 |
+
}
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
@SuppressWarnings("unchecked")
|
| 176 |
+
private void writeMetadata(JSONObject json, String outdir, String filename)
|
| 177 |
+
throws Exception {
|
| 178 |
+
String outMetaPath = outdir + filename;
|
| 179 |
+
Writer outMeta = new OutputStreamWriter(
|
| 180 |
+
new FileOutputStream(outMetaPath), "UTF-8");
|
| 181 |
+
json.writeJSONString(outMeta);
|
| 182 |
+
outMeta.flush();
|
| 183 |
+
outMeta.close();
|
| 184 |
+
|
| 185 |
+
convertJsonToCSV(json, outMetaPath + "_CSV.csv");
|
| 186 |
+
System.out.println("Metadata written to: " + outMetaPath);
|
| 187 |
+
}
|
| 188 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCWikiProcessor.java
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileOutputStream;
|
| 5 |
+
import java.io.OutputStreamWriter;
|
| 6 |
+
import java.io.Writer;
|
| 7 |
+
import java.util.Scanner;
|
| 8 |
+
|
| 9 |
+
import org.json.simple.JSONArray;
|
| 10 |
+
import org.json.simple.JSONObject;
|
| 11 |
+
|
| 12 |
+
import bg.bas.dcl.general.JSONProcessor;
|
| 13 |
+
import bg.bas.dcl.monolingual.bg.TextProcessor;
|
| 14 |
+
|
| 15 |
+
/**
|
| 16 |
+
* Processes the BulNC "F-InformalFiction" (Wiki/Informal) subcorpus.
|
| 17 |
+
*
|
| 18 |
+
|
| 19 |
+
*/
|
| 20 |
+
public class BulNCWikiProcessor extends BaseSourceProcessor {
|
| 21 |
+
|
| 22 |
+
private static final String CC0_LICENCE = "CC0";
|
| 23 |
+
private static final String CC0_LICENCE_LINK =
|
| 24 |
+
"https://creativecommons.org/public-domain/cc0/";
|
| 25 |
+
|
| 26 |
+
private final String metaFilePath;
|
| 27 |
+
private final String existingMetaJson; // may be null
|
| 28 |
+
private final TextProcessor tp = new TextProcessor();
|
| 29 |
+
|
| 30 |
+
public BulNCWikiProcessor(String metaFilePath, String existingMetaJson) {
|
| 31 |
+
this.metaFilePath = metaFilePath;
|
| 32 |
+
this.existingMetaJson = existingMetaJson;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
/**
|
| 36 |
+
|
| 37 |
+
*/
|
| 38 |
+
@Override
|
| 39 |
+
public void process(String indir, String outdir) {
|
| 40 |
+
try {
|
| 41 |
+
// Load existing metadata if provided, otherwise start fresh
|
| 42 |
+
JSONObject json;
|
| 43 |
+
JSONArray descrArray;
|
| 44 |
+
|
| 45 |
+
if (existingMetaJson != null && new File(existingMetaJson).exists()) {
|
| 46 |
+
JSONProcessor jp = new JSONProcessor();
|
| 47 |
+
json = jp.readJSON(new File(existingMetaJson));
|
| 48 |
+
descrArray = (JSONArray) json.get("metadata");
|
| 49 |
+
System.out.println("Loaded existing metadata with "
|
| 50 |
+
+ descrArray.size() + " entries.");
|
| 51 |
+
} else {
|
| 52 |
+
json = new JSONObject();
|
| 53 |
+
descrArray = new JSONArray();
|
| 54 |
+
json.put("metadata", descrArray);
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
int newDocs = 0;
|
| 58 |
+
long totalTokens = 0;
|
| 59 |
+
|
| 60 |
+
Scanner sme = new Scanner(new File(metaFilePath), "UTF-8");
|
| 61 |
+
while (sme.hasNextLine()) {
|
| 62 |
+
String[] dat = sme.nextLine().split("\t");
|
| 63 |
+
|
| 64 |
+
String relativePath = dat[1];
|
| 65 |
+
System.out.println("Checking: " + relativePath);
|
| 66 |
+
|
| 67 |
+
if (!relativePath.contains("F-InformalFiction")) continue;
|
| 68 |
+
|
| 69 |
+
String fname = indir + relativePath;
|
| 70 |
+
File f = new File(fname);
|
| 71 |
+
if (!f.exists()) {
|
| 72 |
+
System.err.println("[MISSING] " + fname);
|
| 73 |
+
continue;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
String tfname = "bg_bnc_" + dat[0];
|
| 77 |
+
|
| 78 |
+
JSONObject fdescr = newBaseDescriptor(tfname);
|
| 79 |
+
fdescr.put("Licence", CC0_LICENCE);
|
| 80 |
+
fdescr.put("LicenceLink", CC0_LICENCE_LINK);
|
| 81 |
+
fdescr.put("PublicationDate", dat[9].replaceAll("\\.", "-"));
|
| 82 |
+
fdescr.put("DocumentTitle", dat[8]);
|
| 83 |
+
fdescr.put("Author", dat[4]);
|
| 84 |
+
fdescr.put("Style", "Administrative");
|
| 85 |
+
fdescr.put("Type", dat[17]);
|
| 86 |
+
fdescr.put("Subdomain", dat.length > 21 ? dat[21] : "");
|
| 87 |
+
fdescr.put("TranslatedDocument", dat[13]);
|
| 88 |
+
fdescr.put("CollectionDate", dat[2]);
|
| 89 |
+
fdescr.put("Url", dat[12]);
|
| 90 |
+
fdescr.put("Domain", dat[19]);
|
| 91 |
+
|
| 92 |
+
Writer out = new OutputStreamWriter(
|
| 93 |
+
new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
|
| 94 |
+
|
| 95 |
+
Scanner s = new Scanner(f, "UTF-8");
|
| 96 |
+
int nw = 0, ns = 0, np = 0, nt = 0;
|
| 97 |
+
|
| 98 |
+
while (s.hasNextLine()) {
|
| 99 |
+
String text = s.nextLine();
|
| 100 |
+
np++;
|
| 101 |
+
|
| 102 |
+
out.write(text + "\n");
|
| 103 |
+
out.flush();
|
| 104 |
+
|
| 105 |
+
for (String sent : tp.splitToSentences(text)) {
|
| 106 |
+
ns++;
|
| 107 |
+
String[] words = sent.split(" ");
|
| 108 |
+
nw += words.length;
|
| 109 |
+
nt += estimateTokenCount(sent);
|
| 110 |
+
}
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
s.close();
|
| 114 |
+
out.flush();
|
| 115 |
+
out.close();
|
| 116 |
+
|
| 117 |
+
fdescr.put("NumberWords", nw);
|
| 118 |
+
fdescr.put("NumberSentences", ns);
|
| 119 |
+
fdescr.put("NumberParagraphs", np);
|
| 120 |
+
fdescr.put("NumberTokens", nt);
|
| 121 |
+
|
| 122 |
+
descrArray.add(fdescr);
|
| 123 |
+
newDocs++;
|
| 124 |
+
totalTokens += nt;
|
| 125 |
+
}
|
| 126 |
+
sme.close();
|
| 127 |
+
|
| 128 |
+
System.out.println("New F-InformalFiction documents added: " + newDocs);
|
| 129 |
+
System.out.println("Total tokens in new documents: " + totalTokens);
|
| 130 |
+
System.out.println("Merged metadata total entries: " + descrArray.size());
|
| 131 |
+
|
| 132 |
+
writeMetadata(json, outdir, "metadata.json");
|
| 133 |
+
|
| 134 |
+
} catch (Exception e) {
|
| 135 |
+
e.printStackTrace();
|
| 136 |
+
}
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
// -----------------------------------------------------------------------
|
| 140 |
+
|
| 141 |
+
@SuppressWarnings("unchecked")
|
| 142 |
+
private void writeMetadata(JSONObject json, String outdir, String filename)
|
| 143 |
+
throws Exception {
|
| 144 |
+
String outMetaPath = outdir + filename;
|
| 145 |
+
Writer outMeta = new OutputStreamWriter(
|
| 146 |
+
new FileOutputStream(outMetaPath), "UTF-8");
|
| 147 |
+
json.writeJSONString(outMeta);
|
| 148 |
+
outMeta.flush();
|
| 149 |
+
outMeta.close();
|
| 150 |
+
|
| 151 |
+
System.out.println("Merged metadata written to: " + outMetaPath);
|
| 152 |
+
|
| 153 |
+
}
|
| 154 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/CurlicatProcessor.java
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileOutputStream;
|
| 5 |
+
import java.io.OutputStreamWriter;
|
| 6 |
+
import java.io.Writer;
|
| 7 |
+
import java.util.Scanner;
|
| 8 |
+
|
| 9 |
+
import org.json.simple.JSONArray;
|
| 10 |
+
import org.json.simple.JSONObject;
|
| 11 |
+
|
| 12 |
+
import bg.bas.dcl.general.FileHandler;
|
| 13 |
+
|
| 14 |
+
/**
|
| 15 |
+
* Processes the CURLICAT Bulgarian corpus.
|
| 16 |
+
*
|
| 17 |
+
* Input: CoNLL-UP files (.conllup) with richer inline metadata than MARCELL.
|
| 18 |
+
* Output: One plain-text .txt per document + metadata.json + metadata CSV.
|
| 19 |
+
*
|
| 20 |
+
* Metadata comment prefixes recognised:
|
| 21 |
+
* # PublicationDate = → PublicationDate
|
| 22 |
+
* # DocumentTitle = → DocumentTitle
|
| 23 |
+
* # Author = → Author
|
| 24 |
+
* # DocumentType = → Type
|
| 25 |
+
* # Url = → Url
|
| 26 |
+
* # Style = → Style
|
| 27 |
+
* # Domain = → Domain
|
| 28 |
+
* # Subdomain = → Subdomain
|
| 29 |
+
* # CollectionDate = → CollectionDate
|
| 30 |
+
* # License = → Licence (overrides default if present)
|
| 31 |
+
*
|
| 32 |
+
* Default licence: CC-BY-SA-4.0.
|
| 33 |
+
*/
|
| 34 |
+
public class CurlicatProcessor extends BaseSourceProcessor {
|
| 35 |
+
|
| 36 |
+
private static final String DEFAULT_LICENCE = "CC-BY-SA-4.0";
|
| 37 |
+
private static final String DEFAULT_LICENCE_LINK =
|
| 38 |
+
"https://elrc-share.eu/static/metashare/licences/CC-BY-SA-4.0.pdf";
|
| 39 |
+
private static final String PREFIX = "bg_CURLICAT_";
|
| 40 |
+
private static final String EXT = ".conllup";
|
| 41 |
+
|
| 42 |
+
@Override
|
| 43 |
+
public void process(String indir, String outdir) {
|
| 44 |
+
try {
|
| 45 |
+
FileHandler fh = new FileHandler();
|
| 46 |
+
JSONObject json = new JSONObject();
|
| 47 |
+
JSONArray descrArray = new JSONArray();
|
| 48 |
+
|
| 49 |
+
for (File f : fh.getFileListing(new File(indir))) {
|
| 50 |
+
if (!f.isFile()) continue;
|
| 51 |
+
|
| 52 |
+
System.out.println("Processing: " + f.getAbsolutePath());
|
| 53 |
+
|
| 54 |
+
String tfname = PREFIX + f.getName().replace(EXT, "");
|
| 55 |
+
|
| 56 |
+
JSONObject fdescr = newBaseDescriptor(tfname);
|
| 57 |
+
fdescr.put("Licence", DEFAULT_LICENCE);
|
| 58 |
+
fdescr.put("LicenceLink", DEFAULT_LICENCE_LINK);
|
| 59 |
+
|
| 60 |
+
Writer out = new OutputStreamWriter(
|
| 61 |
+
new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
|
| 62 |
+
|
| 63 |
+
Scanner s = new Scanner(f, "UTF-8");
|
| 64 |
+
int nw = 0, ns = 0, np = 0, nt = 0;
|
| 65 |
+
|
| 66 |
+
while (s.hasNextLine()) {
|
| 67 |
+
String line = s.nextLine();
|
| 68 |
+
|
| 69 |
+
// --- Metadata extraction ---
|
| 70 |
+
if (line.startsWith("# PublicationDate =")) {
|
| 71 |
+
fdescr.put("PublicationDate",
|
| 72 |
+
line.replace("# PublicationDate =", "").trim());
|
| 73 |
+
} else if (line.startsWith("# DocumentTitle =")) {
|
| 74 |
+
fdescr.put("DocumentTitle",
|
| 75 |
+
line.replace("# DocumentTitle =", "").trim());
|
| 76 |
+
} else if (line.startsWith("# Author =")) {
|
| 77 |
+
fdescr.put("Author",
|
| 78 |
+
line.replace("# Author =", "").trim());
|
| 79 |
+
} else if (line.startsWith("# DocumentType =")) {
|
| 80 |
+
fdescr.put("Type",
|
| 81 |
+
line.replace("# DocumentType =", "").trim());
|
| 82 |
+
} else if (line.startsWith("# Url =")) {
|
| 83 |
+
fdescr.put("Url",
|
| 84 |
+
line.replace("# Url =", "").trim());
|
| 85 |
+
} else if (line.startsWith("# Style =")) {
|
| 86 |
+
fdescr.put("Style",
|
| 87 |
+
line.replace("# Style =", "").trim());
|
| 88 |
+
} else if (line.startsWith("# Domain =")) {
|
| 89 |
+
fdescr.put("Domain",
|
| 90 |
+
line.replace("# Domain =", "").trim());
|
| 91 |
+
} else if (line.startsWith("# Subdomain =")) {
|
| 92 |
+
fdescr.put("Subdomain",
|
| 93 |
+
line.replace("# Subdomain =", "").trim());
|
| 94 |
+
} else if (line.startsWith("# CollectionDate =")) {
|
| 95 |
+
fdescr.put("CollectionDate",
|
| 96 |
+
line.replace("# CollectionDate =", "").trim());
|
| 97 |
+
} else if (line.startsWith("# License =")) {
|
| 98 |
+
// Override default licence if the file declares one
|
| 99 |
+
fdescr.put("Licence",
|
| 100 |
+
line.replace("# License =", "").trim());
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
// --- Structure counting ---
|
| 104 |
+
else if (line.startsWith("# sent_id =")) {
|
| 105 |
+
ns++;
|
| 106 |
+
} else if (line.startsWith("# newpar id =")) {
|
| 107 |
+
np++;
|
| 108 |
+
out.write("\n");
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
// --- Text output ---
|
| 112 |
+
else if (line.startsWith("# text =")) {
|
| 113 |
+
out.write(line.replace("# text =", "").trim() + "\n");
|
| 114 |
+
out.flush();
|
| 115 |
+
} else {
|
| 116 |
+
// CoNLL-UP token line
|
| 117 |
+
String[] cols = line.split("\t");
|
| 118 |
+
if (cols.length > 5) {
|
| 119 |
+
nt++;
|
| 120 |
+
if (!cols[3].equals("PUNCT")) nw++;
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
s.close();
|
| 126 |
+
out.flush();
|
| 127 |
+
out.close();
|
| 128 |
+
|
| 129 |
+
fdescr.put("NumberWords", nw);
|
| 130 |
+
fdescr.put("NumberSentences", ns);
|
| 131 |
+
fdescr.put("NumberParagraphs", np);
|
| 132 |
+
fdescr.put("NumberTokens", nt);
|
| 133 |
+
|
| 134 |
+
descrArray.add(fdescr);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
json.put("metadata", descrArray);
|
| 138 |
+
writeMetadata(json, outdir, "metadata_CC.json");
|
| 139 |
+
|
| 140 |
+
} catch (Exception e) {
|
| 141 |
+
e.printStackTrace();
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
// -----------------------------------------------------------------------
|
| 146 |
+
|
| 147 |
+
@SuppressWarnings("unchecked")
|
| 148 |
+
private void writeMetadata(JSONObject json, String outdir, String filename)
|
| 149 |
+
throws Exception {
|
| 150 |
+
String outMetaPath = outdir + filename;
|
| 151 |
+
Writer outMeta = new OutputStreamWriter(
|
| 152 |
+
new FileOutputStream(outMetaPath), "UTF-8");
|
| 153 |
+
json.writeJSONString(outMeta);
|
| 154 |
+
outMeta.flush();
|
| 155 |
+
outMeta.close();
|
| 156 |
+
|
| 157 |
+
convertJsonToCSV(json, outMetaPath + "_CSV.csv");
|
| 158 |
+
System.out.println("Metadata written to: " + outMetaPath);
|
| 159 |
+
}
|
| 160 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/DocumentMetadata.java
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.util.ArrayList;
|
| 4 |
+
import java.util.Arrays;
|
| 5 |
+
import java.util.Collections;
|
| 6 |
+
import java.util.List;
|
| 7 |
+
|
| 8 |
+
import org.json.simple.JSONArray;
|
| 9 |
+
import org.json.simple.JSONObject;
|
| 10 |
+
|
| 11 |
+
/**
|
| 12 |
+
* DocumentMetadata
|
| 13 |
+
*
|
| 14 |
+
* Canonical in-memory representation of the ifGPT dataset metadata schema.
|
| 15 |
+
|
| 16 |
+
*/
|
| 17 |
+
@SuppressWarnings("unchecked")
|
| 18 |
+
public class DocumentMetadata {
|
| 19 |
+
|
| 20 |
+
// -----------------------------------------------------------------------
|
| 21 |
+
// ── MANDATORY (15) ──────────────────────────────────────────────────────
|
| 22 |
+
// -----------------------------------------------------------------------
|
| 23 |
+
|
| 24 |
+
/** Unique document identifier with the language prefix "bg". */
|
| 25 |
+
private String identifier = "";
|
| 26 |
+
|
| 27 |
+
/** Licence name (open, restricted, …). */
|
| 28 |
+
private String licence = "";
|
| 29 |
+
|
| 30 |
+
/** Publication date yyyy-mm-dd. */
|
| 31 |
+
private String publicationDate = "";
|
| 32 |
+
|
| 33 |
+
/** Title of the document. */
|
| 34 |
+
private String documentTitle = "";
|
| 35 |
+
|
| 36 |
+
/** Publishing organisation / media outlet / institutional originator. */
|
| 37 |
+
private String source = "";
|
| 38 |
+
|
| 39 |
+
/** Modality: "textual" | "multimodal". */
|
| 40 |
+
private String medium = "textual";
|
| 41 |
+
|
| 42 |
+
/** Original web address. */
|
| 43 |
+
private String url = "";
|
| 44 |
+
|
| 45 |
+
/** Up to six subject-area labels from a controlled vocabulary. */
|
| 46 |
+
private List<String> domain = new ArrayList<>();
|
| 47 |
+
|
| 48 |
+
/** Up to six free-text keywords. */
|
| 49 |
+
private List<String> keywords = new ArrayList<>();
|
| 50 |
+
|
| 51 |
+
/** Total word count (non-punctuation tokens). */
|
| 52 |
+
private int numberWords = 0;
|
| 53 |
+
|
| 54 |
+
/** Total sentence count. */
|
| 55 |
+
private int numberSentences = 0;
|
| 56 |
+
|
| 57 |
+
/** Total paragraph count. */
|
| 58 |
+
private int numberParagraphs = 0;
|
| 59 |
+
|
| 60 |
+
/** Total token count (words + punctuation). */
|
| 61 |
+
private int numberTokens = 0;
|
| 62 |
+
|
| 63 |
+
/**
|
| 64 |
+
* Per-sentence PII coverage vector.
|
| 65 |
+
* Entry i = proportion of tokens in sentence i flagged as PII ∈ [0,1].
|
| 66 |
+
* Length == numberSentences after pipeline completion.
|
| 67 |
+
*/
|
| 68 |
+
private List<Double> piiVector = new ArrayList<>();
|
| 69 |
+
|
| 70 |
+
/**
|
| 71 |
+
* Per-sentence bias coverage vector.
|
| 72 |
+
* Entry i = proportion of tokens in sentence i flagged as biased ∈ [0,1].
|
| 73 |
+
* Length == numberSentences after pipeline completion.
|
| 74 |
+
*/
|
| 75 |
+
private List<Double> biasVector = new ArrayList<>();
|
| 76 |
+
|
| 77 |
+
// -----------------------------------------------------------------------
|
| 78 |
+
// ── OPTIONAL (8) ────────────────────────────────────────────────────────
|
| 79 |
+
// -----------------------------------------------------------------------
|
| 80 |
+
|
| 81 |
+
/** Name(s) of the author(s). */
|
| 82 |
+
private List<String> author = new ArrayList<>();
|
| 83 |
+
|
| 84 |
+
/** Stylistic register: legal | journalistic | administrative | … */
|
| 85 |
+
private String style = "";
|
| 86 |
+
|
| 87 |
+
/** Document genre: book | document | article | … */
|
| 88 |
+
private String type = "";
|
| 89 |
+
|
| 90 |
+
/** Narrower thematic classification, hierarchically linked to Domain. */
|
| 91 |
+
private List<String> subdomain = new ArrayList<>();
|
| 92 |
+
|
| 93 |
+
/** true = translation, false = original Bulgarian text. */
|
| 94 |
+
private Boolean translatedDocument = null; // null = unknown
|
| 95 |
+
|
| 96 |
+
/** Date of acquisition yyyy-mm-dd. */
|
| 97 |
+
private String collectionDate = "";
|
| 98 |
+
|
| 99 |
+
/** URL of the licence text. */
|
| 100 |
+
private String licenceLink = "";
|
| 101 |
+
|
| 102 |
+
/** Anticipated NLP applications from a predefined list. */
|
| 103 |
+
private List<String> taskCategories = new ArrayList<>();
|
| 104 |
+
|
| 105 |
+
// -----------------------------------------------------------------------
|
| 106 |
+
// Constructor
|
| 107 |
+
// -----------------------------------------------------------------------
|
| 108 |
+
|
| 109 |
+
public DocumentMetadata() {}
|
| 110 |
+
|
| 111 |
+
public DocumentMetadata(String identifier) {
|
| 112 |
+
this.identifier = identifier;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// -----------------------------------------------------------------------
|
| 116 |
+
// Fluent setters — mandatory
|
| 117 |
+
// -----------------------------------------------------------------------
|
| 118 |
+
|
| 119 |
+
public DocumentMetadata setIdentifier(String v) { identifier = v; return this; }
|
| 120 |
+
public DocumentMetadata setLicence(String v) { licence = v; return this; }
|
| 121 |
+
public DocumentMetadata setPublicationDate(String v) { publicationDate = v; return this; }
|
| 122 |
+
public DocumentMetadata setDocumentTitle(String v) { documentTitle = v; return this; }
|
| 123 |
+
public DocumentMetadata setSource(String v) { source = v; return this; }
|
| 124 |
+
public DocumentMetadata setMedium(String v) { medium = v; return this; }
|
| 125 |
+
public DocumentMetadata setUrl(String v) { url = v; return this; }
|
| 126 |
+
public DocumentMetadata setDomain(List<String> v) { domain = v != null ? v : new ArrayList<>(); return this; }
|
| 127 |
+
public DocumentMetadata addDomain(String v) { domain.add(v); return this; }
|
| 128 |
+
public DocumentMetadata setKeywords(List<String> v) { keywords = v != null ? v : new ArrayList<>(); return this; }
|
| 129 |
+
public DocumentMetadata addKeyword(String v) { keywords.add(v); return this; }
|
| 130 |
+
public DocumentMetadata setNumberWords(int v) { numberWords = v; return this; }
|
| 131 |
+
public DocumentMetadata setNumberSentences(int v) { numberSentences = v; return this; }
|
| 132 |
+
public DocumentMetadata setNumberParagraphs(int v) { numberParagraphs = v; return this; }
|
| 133 |
+
public DocumentMetadata setNumberTokens(int v) { numberTokens = v; return this; }
|
| 134 |
+
public DocumentMetadata setPiiVector(List<Double> v) { piiVector = v != null ? v : new ArrayList<>(); return this; }
|
| 135 |
+
public DocumentMetadata setBiasVector(List<Double> v) { biasVector = v != null ? v : new ArrayList<>(); return this; }
|
| 136 |
+
|
| 137 |
+
// Fluent setters — optional
|
| 138 |
+
public DocumentMetadata setAuthor(List<String> v) { author = v != null ? v : new ArrayList<>(); return this; }
|
| 139 |
+
public DocumentMetadata addAuthor(String v) { author.add(v); return this; }
|
| 140 |
+
public DocumentMetadata setStyle(String v) { style = v; return this; }
|
| 141 |
+
public DocumentMetadata setType(String v) { type = v; return this; }
|
| 142 |
+
public DocumentMetadata setSubdomain(List<String> v) { subdomain = v != null ? v : new ArrayList<>(); return this; }
|
| 143 |
+
public DocumentMetadata addSubdomain(String v) { subdomain.add(v); return this; }
|
| 144 |
+
public DocumentMetadata setTranslatedDocument(Boolean v) { translatedDocument= v; return this; }
|
| 145 |
+
public DocumentMetadata setCollectionDate(String v) { collectionDate = v; return this; }
|
| 146 |
+
public DocumentMetadata setLicenceLink(String v) { licenceLink = v; return this; }
|
| 147 |
+
public DocumentMetadata setTaskCategories(List<String> v) { taskCategories = v != null ? v : new ArrayList<>(); return this; }
|
| 148 |
+
public DocumentMetadata addTaskCategory(String v) { taskCategories.add(v); return this; }
|
| 149 |
+
|
| 150 |
+
// -----------------------------------------------------------------------
|
| 151 |
+
// Getters
|
| 152 |
+
// -----------------------------------------------------------------------
|
| 153 |
+
|
| 154 |
+
public String getIdentifier() { return identifier; }
|
| 155 |
+
public String getLicence() { return licence; }
|
| 156 |
+
public String getPublicationDate() { return publicationDate; }
|
| 157 |
+
public String getDocumentTitle() { return documentTitle; }
|
| 158 |
+
public String getSource() { return source; }
|
| 159 |
+
public String getMedium() { return medium; }
|
| 160 |
+
public String getUrl() { return url; }
|
| 161 |
+
public List<String> getDomain() { return Collections.unmodifiableList(domain); }
|
| 162 |
+
public List<String> getKeywords() { return Collections.unmodifiableList(keywords); }
|
| 163 |
+
public int getNumberWords() { return numberWords; }
|
| 164 |
+
public int getNumberSentences() { return numberSentences; }
|
| 165 |
+
public int getNumberParagraphs() { return numberParagraphs; }
|
| 166 |
+
public int getNumberTokens() { return numberTokens; }
|
| 167 |
+
public List<Double> getPiiVector() { return Collections.unmodifiableList(piiVector); }
|
| 168 |
+
public List<Double> getBiasVector() { return Collections.unmodifiableList(biasVector); }
|
| 169 |
+
|
| 170 |
+
public List<String> getAuthor() { return Collections.unmodifiableList(author); }
|
| 171 |
+
public String getStyle() { return style; }
|
| 172 |
+
public String getType() { return type; }
|
| 173 |
+
public List<String> getSubdomain() { return Collections.unmodifiableList(subdomain); }
|
| 174 |
+
public Boolean getTranslatedDocument(){ return translatedDocument; }
|
| 175 |
+
public String getCollectionDate() { return collectionDate; }
|
| 176 |
+
public String getLicenceLink() { return licenceLink; }
|
| 177 |
+
public List<String> getTaskCategories() { return Collections.unmodifiableList(taskCategories); }
|
| 178 |
+
|
| 179 |
+
// -----------------------------------------------------------------------
|
| 180 |
+
// Validation
|
| 181 |
+
// -----------------------------------------------------------------------
|
| 182 |
+
|
| 183 |
+
/**
|
| 184 |
+
* Returns a list of missing mandatory fields.
|
| 185 |
+
* An empty list means the record is complete.
|
| 186 |
+
*/
|
| 187 |
+
public List<String> missingMandatoryFields() {
|
| 188 |
+
List<String> missing = new ArrayList<>();
|
| 189 |
+
if (identifier.isBlank()) missing.add("Identifier");
|
| 190 |
+
if (licence.isBlank()) missing.add("Licence");
|
| 191 |
+
if (medium.isBlank()) missing.add("Medium");
|
| 192 |
+
if (numberWords == 0) missing.add("NumberWords");
|
| 193 |
+
if (numberSentences == 0) missing.add("NumberSentences");
|
| 194 |
+
if (numberParagraphs == 0) missing.add("NumberParagraphs");
|
| 195 |
+
if (numberTokens == 0) missing.add("NumberTokens");
|
| 196 |
+
// piiVector and biasVector may legitimately be empty for clean docs
|
| 197 |
+
return missing;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
// -----------------------------------------------------------------------
|
| 201 |
+
// JSON serialisation (json-simple)
|
| 202 |
+
// -----------------------------------------------------------------------
|
| 203 |
+
|
| 204 |
+
/** Serialises this record to a json-simple JSONObject. */
|
| 205 |
+
public JSONObject toJson() {
|
| 206 |
+
JSONObject o = new JSONObject();
|
| 207 |
+
|
| 208 |
+
// Mandatory
|
| 209 |
+
o.put("Identifier", identifier);
|
| 210 |
+
o.put("Licence", licence);
|
| 211 |
+
o.put("PublicationDate", publicationDate);
|
| 212 |
+
o.put("DocumentTitle", documentTitle);
|
| 213 |
+
o.put("Source", source);
|
| 214 |
+
o.put("Medium", medium);
|
| 215 |
+
o.put("Url", url);
|
| 216 |
+
o.put("Domain", toJsonArray(domain));
|
| 217 |
+
o.put("Keywords", toJsonArray(keywords));
|
| 218 |
+
o.put("NumberWords", numberWords);
|
| 219 |
+
o.put("NumberSentences", numberSentences);
|
| 220 |
+
o.put("NumberParagraphs", numberParagraphs);
|
| 221 |
+
o.put("NumberTokens", numberTokens);
|
| 222 |
+
o.put("PersonallyIdentifiableInformation",toJsonDoubleArray(piiVector));
|
| 223 |
+
o.put("BiasedInformation", toJsonDoubleArray(biasVector));
|
| 224 |
+
|
| 225 |
+
// Optional
|
| 226 |
+
o.put("Author", toJsonArray(author));
|
| 227 |
+
o.put("Style", style);
|
| 228 |
+
o.put("Type", type);
|
| 229 |
+
o.put("Subdomain", toJsonArray(subdomain));
|
| 230 |
+
o.put("TranslatedDocument",
|
| 231 |
+
translatedDocument == null ? "" : translatedDocument.toString());
|
| 232 |
+
o.put("CollectionDate", collectionDate);
|
| 233 |
+
o.put("LicenceLink", licenceLink);
|
| 234 |
+
o.put("TaskCategories", toJsonArray(taskCategories));
|
| 235 |
+
|
| 236 |
+
return o;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
/**
|
| 240 |
+
* Populates a DocumentMetadata from a json-simple JSONObject previously
|
| 241 |
+
* produced by {@link #toJson()}.
|
| 242 |
+
*/
|
| 243 |
+
public static DocumentMetadata fromJson(JSONObject o) {
|
| 244 |
+
DocumentMetadata m = new DocumentMetadata();
|
| 245 |
+
|
| 246 |
+
m.identifier = str(o, "Identifier");
|
| 247 |
+
m.licence = str(o, "Licence");
|
| 248 |
+
m.publicationDate = str(o, "PublicationDate");
|
| 249 |
+
m.documentTitle = str(o, "DocumentTitle");
|
| 250 |
+
m.source = str(o, "Source");
|
| 251 |
+
m.medium = str(o, "Medium");
|
| 252 |
+
m.url = str(o, "Url");
|
| 253 |
+
m.domain = strList(o, "Domain");
|
| 254 |
+
m.keywords = strList(o, "Keywords");
|
| 255 |
+
m.numberWords = intVal(o, "NumberWords");
|
| 256 |
+
m.numberSentences = intVal(o, "NumberSentences");
|
| 257 |
+
m.numberParagraphs = intVal(o, "NumberParagraphs");
|
| 258 |
+
m.numberTokens = intVal(o, "NumberTokens");
|
| 259 |
+
m.piiVector = doubleList(o, "PersonallyIdentifiableInformation");
|
| 260 |
+
m.biasVector = doubleList(o, "BiasedInformation");
|
| 261 |
+
|
| 262 |
+
m.author = strList(o, "Author");
|
| 263 |
+
m.style = str(o, "Style");
|
| 264 |
+
m.type = str(o, "Type");
|
| 265 |
+
m.subdomain = strList(o, "Subdomain");
|
| 266 |
+
String td = str(o, "TranslatedDocument");
|
| 267 |
+
m.translatedDocument= td.isBlank() ? null : Boolean.parseBoolean(td);
|
| 268 |
+
m.collectionDate = str(o, "CollectionDate");
|
| 269 |
+
m.licenceLink = str(o, "LicenceLink");
|
| 270 |
+
m.taskCategories = strList(o, "TaskCategories");
|
| 271 |
+
|
| 272 |
+
return m;
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
// -----------------------------------------------------------------------
|
| 276 |
+
// Interop with legacy JSONObject format (used by source processors)
|
| 277 |
+
// -----------------------------------------------------------------------
|
| 278 |
+
|
| 279 |
+
/**
|
| 280 |
+
* Merges fields from a legacy source-processor JSONObject (the format
|
| 281 |
+
* produced by MarcellProcessor, BulNCProcessor, etc.) into this record.
|
| 282 |
+
* Fields already set on {@code this} are NOT overwritten.
|
| 283 |
+
*/
|
| 284 |
+
public void mergeLegacy(JSONObject legacy) {
|
| 285 |
+
if (identifier.isBlank()) setIdentifier(str(legacy, "Identifier"));
|
| 286 |
+
if (licence.isBlank()) setLicence(str(legacy, "Licence"));
|
| 287 |
+
if (licenceLink.isBlank()) setLicenceLink(str(legacy, "LicenceLink"));
|
| 288 |
+
if (publicationDate.isBlank()) setPublicationDate(str(legacy, "PublicationDate"));
|
| 289 |
+
if (documentTitle.isBlank()) setDocumentTitle(str(legacy, "DocumentTitle"));
|
| 290 |
+
if (source.isBlank()) setSource(str(legacy, "Source"));
|
| 291 |
+
if (url.isBlank()) setUrl(str(legacy, "Url"));
|
| 292 |
+
if (style.isBlank()) setStyle(str(legacy, "Style"));
|
| 293 |
+
if (type.isBlank()) setType(str(legacy, "Type"));
|
| 294 |
+
if (collectionDate.isBlank()) setCollectionDate(str(legacy, "CollectionDate"));
|
| 295 |
+
|
| 296 |
+
if (author.isEmpty()) {
|
| 297 |
+
String a = str(legacy, "Author");
|
| 298 |
+
if (!a.isBlank()) author.add(a);
|
| 299 |
+
}
|
| 300 |
+
if (domain.isEmpty()) {
|
| 301 |
+
String d = str(legacy, "Domain");
|
| 302 |
+
if (!d.isBlank()) domain.add(d);
|
| 303 |
+
}
|
| 304 |
+
if (subdomain.isEmpty()) {
|
| 305 |
+
String s = str(legacy, "Subdomain");
|
| 306 |
+
if (!s.isBlank()) subdomain.add(s);
|
| 307 |
+
}
|
| 308 |
+
if (numberWords == 0) numberWords = intVal(legacy, "NumberWords");
|
| 309 |
+
if (numberSentences == 0) numberSentences = intVal(legacy, "NumberSentences");
|
| 310 |
+
if (numberParagraphs == 0) numberParagraphs = intVal(legacy, "NumberParagraphs");
|
| 311 |
+
if (numberTokens == 0) numberTokens = intVal(legacy, "NumberTokens");
|
| 312 |
+
|
| 313 |
+
String translated = str(legacy, "TranslatedDocument");
|
| 314 |
+
if (translatedDocument == null && !translated.isBlank())
|
| 315 |
+
translatedDocument = Boolean.parseBoolean(translated);
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
// -----------------------------------------------------------------------
|
| 319 |
+
// Private helpers
|
| 320 |
+
// -----------------------------------------------------------------------
|
| 321 |
+
|
| 322 |
+
private static String str(JSONObject o, String key) {
|
| 323 |
+
Object v = o.get(key);
|
| 324 |
+
return v == null ? "" : v.toString().trim();
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
private static int intVal(JSONObject o, String key) {
|
| 328 |
+
Object v = o.get(key);
|
| 329 |
+
if (v == null) return 0;
|
| 330 |
+
try { return Integer.parseInt(v.toString().trim()); }
|
| 331 |
+
catch (NumberFormatException e) { return 0; }
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
private static List<String> strList(JSONObject o, String key) {
|
| 335 |
+
Object v = o.get(key);
|
| 336 |
+
List<String> list = new ArrayList<>();
|
| 337 |
+
if (v instanceof JSONArray) {
|
| 338 |
+
for (Object item : (JSONArray) v)
|
| 339 |
+
if (item != null) list.add(item.toString());
|
| 340 |
+
} else if (v != null && !v.toString().isBlank()) {
|
| 341 |
+
list.add(v.toString().trim());
|
| 342 |
+
}
|
| 343 |
+
return list;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
private static List<Double> doubleList(JSONObject o, String key) {
|
| 347 |
+
Object v = o.get(key);
|
| 348 |
+
List<Double> list = new ArrayList<>();
|
| 349 |
+
if (v instanceof JSONArray) {
|
| 350 |
+
for (Object item : (JSONArray) v) {
|
| 351 |
+
try { list.add(Double.parseDouble(item.toString())); }
|
| 352 |
+
catch (NumberFormatException ignored) {}
|
| 353 |
+
}
|
| 354 |
+
}
|
| 355 |
+
return list;
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
private JSONArray toJsonArray(List<String> list) {
|
| 359 |
+
JSONArray a = new JSONArray();
|
| 360 |
+
if (list != null) a.addAll(list);
|
| 361 |
+
return a;
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
private JSONArray toJsonDoubleArray(List<Double> list) {
|
| 365 |
+
JSONArray a = new JSONArray();
|
| 366 |
+
if (list != null) a.addAll(list);
|
| 367 |
+
return a;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
@Override
|
| 371 |
+
public String toString() {
|
| 372 |
+
return String.format(
|
| 373 |
+
"DocumentMetadata{id='%s', sentences=%d, words=%d, piiEntries=%d, biasEntries=%d}",
|
| 374 |
+
identifier, numberSentences, numberWords, piiVector.size(), biasVector.size());
|
| 375 |
+
}
|
| 376 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTDatasetProcessor.java
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
/**
|
| 4 |
+
* IfGPTDatasetProcessor
|
| 5 |
+
*
|
| 6 |
+
|
| 7 |
+
*/
|
| 8 |
+
public class IfGPTDatasetProcessor {
|
| 9 |
+
|
| 10 |
+
// -----------------------------------------------------------------------
|
| 11 |
+
// Shared paths
|
| 12 |
+
// -----------------------------------------------------------------------
|
| 13 |
+
|
| 14 |
+
// New batch being ingested
|
| 15 |
+
static final String NEW_DATA_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/data/";
|
| 16 |
+
static final String NEW_META_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/metadata/";
|
| 17 |
+
static final String SAMPLE_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/sample/";
|
| 18 |
+
static final String BLOCKLIST_FILE = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/blocklist.txt";
|
| 19 |
+
static final String DEDUP_REPORT = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/dedup_report.tsv";
|
| 20 |
+
|
| 21 |
+
// Shared resources
|
| 22 |
+
static final String BULNC_META_FILE = "/home/ivelina/SVN_CORPUS/BulNC/BulNC-description.txt";
|
| 23 |
+
static final String BIAS_DICT = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
|
| 24 |
+
+ "bulgarian_bias_dictionary_v4.tsv";
|
| 25 |
+
|
| 26 |
+
// -----------------------------------------------------------------------
|
| 27 |
+
// Main
|
| 28 |
+
// -----------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
public static void main(String[] args) {
|
| 31 |
+
|
| 32 |
+
// ==================================================================
|
| 33 |
+
// MODE A — FULL PIPELINE (one call runs all 8 stages)
|
| 34 |
+
// ==================================================================
|
| 35 |
+
// Choose the source processor that matches the new batch format,
|
| 36 |
+
// then call pipeline.run().
|
| 37 |
+
|
| 38 |
+
// --- BulNC Mass Media batch ---
|
| 39 |
+
runBulNCPipeline();
|
| 40 |
+
|
| 41 |
+
// --- MARCELL batch ---
|
| 42 |
+
// runMarcellPipeline();
|
| 43 |
+
|
| 44 |
+
// --- CURLICAT batch ---
|
| 45 |
+
// runCurlicatPipeline();
|
| 46 |
+
|
| 47 |
+
// --- BulNC Wiki/InformalFiction batch ---
|
| 48 |
+
// runBulNCWikiPipeline();
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
// ==================================================================
|
| 52 |
+
// MODE B — INDIVIDUAL STAGES
|
| 53 |
+
// ==================================================================
|
| 54 |
+
|
| 55 |
+
// --- 1. Extract only ---
|
| 56 |
+
// new BulNCProcessor(BULNC_META_FILE).process(NEW_DATA_DIR, NEW_META_DIR);
|
| 57 |
+
|
| 58 |
+
// --- 3. Clean only (learn + apply) ---
|
| 59 |
+
// FileCleanProcessor fcp = new FileCleanProcessor(0.50);
|
| 60 |
+
// fcp.learnFromSample(SAMPLE_DIR);
|
| 61 |
+
// fcp.printTopCommonLines(30);
|
| 62 |
+
// fcp.saveBlocklist(BLOCKLIST_FILE);
|
| 63 |
+
// fcp.cleanDirectory(NEW_DATA_DIR, true);
|
| 64 |
+
|
| 65 |
+
// --- 4. Deduplication only ---
|
| 66 |
+
// DeduplicationProcessor dp = new DeduplicationProcessor(0.90, 5, 200);
|
| 67 |
+
// dp.indexCorpus(IfGPTPipeline.FULL_DATA_DIR);
|
| 68 |
+
// dp.detectDuplicates(NEW_DATA_DIR, DEDUP_REPORT);
|
| 69 |
+
// dp.removeDuplicatesFromNewFolder(NEW_DATA_DIR, true); // optional
|
| 70 |
+
|
| 71 |
+
// --- 5/6. PII + Bias annotation only (on already-split sentences) ---
|
| 72 |
+
// bg.bas.dcl.LLMs.BulgarianSentenceSplitter splitter =
|
| 73 |
+
// new bg.bas.dcl.LLMs.BulgarianSentenceSplitter();
|
| 74 |
+
// bg.bas.dcl.LLMs.PIIDetector pii = new bg.bas.dcl.LLMs.PIIDetector(splitter);
|
| 75 |
+
// pii.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "pii_report.tsv");
|
| 76 |
+
//
|
| 77 |
+
// bg.bas.dcl.LLMs.BiasLexicon lex =
|
| 78 |
+
// new bg.bas.dcl.LLMs.BiasLexicon(BIAS_DICT);
|
| 79 |
+
// bg.bas.dcl.LLMs.BiasAnalyser bias =
|
| 80 |
+
// new bg.bas.dcl.LLMs.BiasAnalyser(lex, splitter);
|
| 81 |
+
// bias.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "bias_report.tsv");
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
// ==================================================================
|
| 85 |
+
// MODE C — UTILITIES
|
| 86 |
+
// ==================================================================
|
| 87 |
+
|
| 88 |
+
// Convert an existing metadata JSON to CSV
|
| 89 |
+
// new MarcellProcessor().convertJsonToCSV(
|
| 90 |
+
// IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json");
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
// -----------------------------------------------------------------------
|
| 94 |
+
// Pipeline factory methods (one per source type)
|
| 95 |
+
// -----------------------------------------------------------------------
|
| 96 |
+
|
| 97 |
+
private static void runBulNCPipeline() {
|
| 98 |
+
new IfGPTPipeline()
|
| 99 |
+
.setSourceProcessor(new BulNCProcessor(BULNC_META_FILE))
|
| 100 |
+
.setNewDataDir(NEW_DATA_DIR)
|
| 101 |
+
.setSampleDir(SAMPLE_DIR)
|
| 102 |
+
.setNewMetaDir(NEW_META_DIR)
|
| 103 |
+
.setBlocklistFile(BLOCKLIST_FILE)
|
| 104 |
+
.setDedupReport(DEDUP_REPORT)
|
| 105 |
+
.setBiasDictPath(BIAS_DICT)
|
| 106 |
+
.setBoilerplateThreshold(0.50)
|
| 107 |
+
.setDedupThreshold(0.90)
|
| 108 |
+
.setRemoveDuplicates(false) // set true to delete dup sentences
|
| 109 |
+
.setKeepBackups(true)
|
| 110 |
+
.run();
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
private static void runMarcellPipeline() {
|
| 114 |
+
String indirMarcell = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/bg-annotated/";
|
| 115 |
+
String outdirMarcell= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/texts/";
|
| 116 |
+
|
| 117 |
+
new IfGPTPipeline()
|
| 118 |
+
.setSourceProcessor(new MarcellProcessor())
|
| 119 |
+
.setNewDataDir(outdirMarcell)
|
| 120 |
+
.setSampleDir(SAMPLE_DIR)
|
| 121 |
+
.setNewMetaDir(NEW_META_DIR)
|
| 122 |
+
.setBlocklistFile(BLOCKLIST_FILE)
|
| 123 |
+
.setDedupReport(DEDUP_REPORT)
|
| 124 |
+
.setBiasDictPath(BIAS_DICT)
|
| 125 |
+
.setSkipClean(false)
|
| 126 |
+
.setSkipDedup(false)
|
| 127 |
+
.run();
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
private static void runCurlicatPipeline() {
|
| 131 |
+
String indirCurlicat = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/archive/"
|
| 132 |
+
+ "Bulgarian_Curlicat_corpus/";
|
| 133 |
+
String outdirCurlicat= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/texts/";
|
| 134 |
+
|
| 135 |
+
new IfGPTPipeline()
|
| 136 |
+
.setSourceProcessor(new CurlicatProcessor())
|
| 137 |
+
.setNewDataDir(outdirCurlicat)
|
| 138 |
+
.setSampleDir(SAMPLE_DIR)
|
| 139 |
+
.setNewMetaDir(NEW_META_DIR)
|
| 140 |
+
.setBlocklistFile(BLOCKLIST_FILE)
|
| 141 |
+
.setDedupReport(DEDUP_REPORT)
|
| 142 |
+
.setBiasDictPath(BIAS_DICT)
|
| 143 |
+
.run();
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
private static void runBulNCWikiPipeline() {
|
| 147 |
+
String existingMeta = IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json";
|
| 148 |
+
String outdirWiki = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/BulNC/wiki-texts/";
|
| 149 |
+
|
| 150 |
+
new IfGPTPipeline()
|
| 151 |
+
.setSourceProcessor(new BulNCWikiProcessor(BULNC_META_FILE, existingMeta))
|
| 152 |
+
.setNewDataDir(outdirWiki)
|
| 153 |
+
.setSampleDir(SAMPLE_DIR)
|
| 154 |
+
.setNewMetaDir(NEW_META_DIR)
|
| 155 |
+
.setBlocklistFile(BLOCKLIST_FILE)
|
| 156 |
+
.setDedupReport(DEDUP_REPORT)
|
| 157 |
+
.setBiasDictPath(BIAS_DICT)
|
| 158 |
+
.run();
|
| 159 |
+
}
|
| 160 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTPipeline.java
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileOutputStream;
|
| 5 |
+
import java.io.OutputStreamWriter;
|
| 6 |
+
import java.io.Writer;
|
| 7 |
+
import java.nio.charset.StandardCharsets;
|
| 8 |
+
import java.nio.file.Files;
|
| 9 |
+
import java.nio.file.StandardCopyOption;
|
| 10 |
+
import java.util.ArrayList;
|
| 11 |
+
import java.util.List;
|
| 12 |
+
import java.util.Properties;
|
| 13 |
+
import java.util.Scanner;
|
| 14 |
+
|
| 15 |
+
import org.json.simple.JSONArray;
|
| 16 |
+
import org.json.simple.JSONObject;
|
| 17 |
+
|
| 18 |
+
import bg.bas.dcl.LLMs.BiasAnalyser;
|
| 19 |
+
import bg.bas.dcl.LLMs.BiasLexicon;
|
| 20 |
+
import bg.bas.dcl.LLMs.BulgarianSentenceSplitter;
|
| 21 |
+
import bg.bas.dcl.LLMs.PIIDetector;
|
| 22 |
+
import bg.bas.dcl.LLMs.SentenceBiasScore;
|
| 23 |
+
import bg.bas.dcl.general.FileHandler;
|
| 24 |
+
import bg.bas.dcl.general.JSONProcessor;
|
| 25 |
+
|
| 26 |
+
/**
 * IfGPTPipeline
 *
 * Pipeline for the ifGPT Bulgarian language dataset.
 *
 * -----------------------------------------------------------------------
 * PIPELINE STAGES (executed in order by {@link #run()})
 *
 *   1. EXTRACT
 *   2. SPLIT
 *   3. CLEAN
 *   4. DEDUPLICATE
 *   5. PII
 *   6. BIAS
 *   7. COUNTS  — word / sentence / token counts are recomputed on the cleaned, deduplicated text
 *   8. PERSIST — new text and metadata files are copied to FULL_DATA_DIR / FULL_META_DIR
 *
 * -----------------------------------------------------------------------
 */
|
| 47 |
+
@SuppressWarnings("unchecked")
|
| 48 |
+
public class IfGPTPipeline {
|
| 49 |
+
|
| 50 |
+
// -----------------------------------------------------------------------
|
| 51 |
+
// Fixed paths
|
| 52 |
+
// -----------------------------------------------------------------------
|
| 53 |
+
|
| 54 |
+
public static final String FULL_DATA_DIR =
|
| 55 |
+
"/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-DATA/";
|
| 56 |
+
public static final String FULL_META_DIR =
|
| 57 |
+
"/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-METADATA/";
|
| 58 |
+
|
| 59 |
+
// -----------------------------------------------------------------------
|
| 60 |
+
// Configurable paths and options
|
| 61 |
+
// -----------------------------------------------------------------------
|
| 62 |
+
|
| 63 |
+
private SourceProcessor sourceProcessor; // mandatory
|
| 64 |
+
private String newDataDir; // mandatory: incoming texts
|
| 65 |
+
private String sampleDir; // mandatory: boilerplate sample
|
| 66 |
+
private String newMetaDir; // mandatory: output metadata
|
| 67 |
+
private String blocklistFile; // boilerplate blocklist file
|
| 68 |
+
private String dedupReport; // dedup TSV report path
|
| 69 |
+
private String biasDictPath; // bias dictionary TSV
|
| 70 |
+
private String openNlpModelPath = null; // null = bundled JAR model
|
| 71 |
+
private double boilerplateThreshold = 0.50; // FileCleanProcessor threshold
|
| 72 |
+
private double dedupThreshold = 0.90; // DeduplicationProcessor threshold
|
| 73 |
+
private int dedupShingleSize = 5;
|
| 74 |
+
private int dedupNumHashes = 200;
|
| 75 |
+
private boolean removeDuplicates = false; // whether to strip dup sentences
|
| 76 |
+
private boolean keepBackups = true; // keep .bak on file modification
|
| 77 |
+
private boolean skipClean = false; // skip boilerplate cleaning
|
| 78 |
+
private boolean skipDedup = false; // skip deduplication
|
| 79 |
+
private boolean skipPii = false; // skip PII scoring
|
| 80 |
+
private boolean skipBias = false; // skip bias scoring
|
| 81 |
+
|
| 82 |
+
// -----------------------------------------------------------------------
|
| 83 |
+
// Fluent setters (each returns this instance so calls can be chained)
|
| 84 |
+
// -----------------------------------------------------------------------
|
| 85 |
+
|
| 86 |
+
public IfGPTPipeline setSourceProcessor(SourceProcessor p) { sourceProcessor = p; return this; }
|
| 87 |
+
public IfGPTPipeline setNewDataDir(String p) { newDataDir = p; return this; }
|
| 88 |
+
public IfGPTPipeline setSampleDir(String p) { sampleDir = p; return this; }
|
| 89 |
+
public IfGPTPipeline setNewMetaDir(String p) { newMetaDir = p; return this; }
|
| 90 |
+
public IfGPTPipeline setBlocklistFile(String p) { blocklistFile = p; return this; }
|
| 91 |
+
public IfGPTPipeline setDedupReport(String p) { dedupReport = p; return this; }
|
| 92 |
+
public IfGPTPipeline setBiasDictPath(String p) { biasDictPath = p; return this; }
|
| 93 |
+
public IfGPTPipeline setOpenNlpModelPath(String p) { openNlpModelPath = p; return this; }
|
| 94 |
+
public IfGPTPipeline setBoilerplateThreshold(double t) { boilerplateThreshold = t; return this; }
|
| 95 |
+
public IfGPTPipeline setDedupThreshold(double t) { dedupThreshold = t; return this; }
|
| 96 |
+
public IfGPTPipeline setDedupShingleSize(int n) { dedupShingleSize = n; return this; }
|
| 97 |
+
public IfGPTPipeline setDedupNumHashes(int n) { dedupNumHashes = n; return this; }
|
| 98 |
+
public IfGPTPipeline setRemoveDuplicates(boolean b) { removeDuplicates = b; return this; }
|
| 99 |
+
public IfGPTPipeline setKeepBackups(boolean b) { keepBackups = b; return this; }
|
| 100 |
+
public IfGPTPipeline setSkipClean(boolean b) { skipClean = b; return this; }
|
| 101 |
+
public IfGPTPipeline setSkipDedup(boolean b) { skipDedup = b; return this; }
|
| 102 |
+
public IfGPTPipeline setSkipPii(boolean b) { skipPii = b; return this; }
|
| 103 |
+
public IfGPTPipeline setSkipBias(boolean b) { skipBias = b; return this; }
|
| 104 |
+
|
| 105 |
+
// -----------------------------------------------------------------------
// Pipeline entry point
// -----------------------------------------------------------------------
|
| 107 |
+
|
| 108 |
+
/**
|
| 109 |
+
* Executes all stages in order.
|
| 110 |
+
* Throws {@link IllegalStateException} if mandatory configuration is missing.
|
| 111 |
+
*/
|
| 112 |
+
public void run() {
|
| 113 |
+
validateConfig();
|
| 114 |
+
ensureDirs(newMetaDir, FULL_DATA_DIR, FULL_META_DIR);
|
| 115 |
+
|
| 116 |
+
banner("STAGE 1 — SOURCE EXTRACTION");
|
| 117 |
+
runExtraction();
|
| 118 |
+
|
| 119 |
+
// Shared NLP components (initialised once, reused across stages)
|
| 120 |
+
BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter(openNlpModelPath);
|
| 121 |
+
|
| 122 |
+
banner("STAGE 2 — SENTENCE SPLITTING & INITIAL METADATA");
|
| 123 |
+
runSentenceSplitting(splitter);
|
| 124 |
+
|
| 125 |
+
if (!skipClean) {
|
| 126 |
+
banner("STAGE 3 — BOILERPLATE CLEANING");
|
| 127 |
+
runCleaning();
|
| 128 |
+
} else {
|
| 129 |
+
log("STAGE 3 skipped (skipClean=true)");
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
if (!skipDedup) {
|
| 133 |
+
banner("STAGE 4 — DEDUPLICATION");
|
| 134 |
+
runDeduplication();
|
| 135 |
+
} else {
|
| 136 |
+
log("STAGE 4 skipped (skipDedup=true)");
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
PIIDetector piiDetector = skipPii ? null : new PIIDetector(splitter);
|
| 140 |
+
BiasAnalyser biasAnalyser = skipBias ? null : buildBiasAnalyser(splitter);
|
| 141 |
+
|
| 142 |
+
banner("STAGES 5-7 — PII, BIAS & FINAL COUNTS");
|
| 143 |
+
runAnnotationAndCounts(splitter, piiDetector, biasAnalyser);
|
| 144 |
+
|
| 145 |
+
banner("STAGE 8 — PERSIST TO FULL CORPUS");
|
| 146 |
+
runPersist();
|
| 147 |
+
|
| 148 |
+
banner("PIPELINE COMPLETE");
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
// -----------------------------------------------------------------------
|
| 152 |
+
// Stage 1 — Extraction
|
| 153 |
+
// -----------------------------------------------------------------------
|
| 154 |
+
|
| 155 |
+
private void runExtraction() {
|
| 156 |
+
// The source processor writes plain-text files to newDataDir and
|
| 157 |
+
// seed metadata JSON to newMetaDir.
|
| 158 |
+
sourceProcessor.process(newDataDir, newMetaDir);
|
| 159 |
+
log("Extraction complete → " + newDataDir);
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
// -----------------------------------------------------------------------
|
| 163 |
+
// Stage 2 — Sentence splitting
|
| 164 |
+
// -----------------------------------------------------------------------
|
| 165 |
+
|
| 166 |
+
/**
|
| 167 |
+
* Reads each metadata JSON produced by the source processor, then for
|
| 168 |
+
* each document text file counts sentences properly using the OpenNLP
|
| 169 |
+
* splitter and writes the sentence list to a parallel .sentences file
|
| 170 |
+
* (one sentence per line) used by later stages.
|
| 171 |
+
*/
|
| 172 |
+
private void runSentenceSplitting(BulgarianSentenceSplitter splitter) {
|
| 173 |
+
try {
|
| 174 |
+
FileHandler fh = new FileHandler();
|
| 175 |
+
int docs = 0;
|
| 176 |
+
|
| 177 |
+
for (File txtFile : fh.getFileListing(new File(newDataDir))) {
|
| 178 |
+
if (!txtFile.isFile() || !txtFile.getName().endsWith(".txt")) continue;
|
| 179 |
+
|
| 180 |
+
// Read document text
|
| 181 |
+
StringBuilder sb = new StringBuilder();
|
| 182 |
+
try (Scanner sc = new Scanner(txtFile, StandardCharsets.UTF_8)) {
|
| 183 |
+
while (sc.hasNextLine()) sb.append(sc.nextLine()).append('\n');
|
| 184 |
+
}
|
| 185 |
+
String text = sb.toString().trim();
|
| 186 |
+
|
| 187 |
+
// Split into sentences and persist to .sentences sidecar file
|
| 188 |
+
String[] sentences = splitter.split(text);
|
| 189 |
+
File sentFile = new File(newDataDir, txtFile.getName()
|
| 190 |
+
.replace(".txt", ".sentences"));
|
| 191 |
+
|
| 192 |
+
try (Writer w = new OutputStreamWriter(
|
| 193 |
+
new FileOutputStream(sentFile), StandardCharsets.UTF_8)) {
|
| 194 |
+
for (String sent : sentences) {
|
| 195 |
+
if (!sent.isBlank()) {
|
| 196 |
+
w.write(sent.trim());
|
| 197 |
+
w.write('\n');
|
| 198 |
+
}
|
| 199 |
+
}
|
| 200 |
+
}
|
| 201 |
+
docs++;
|
| 202 |
+
}
|
| 203 |
+
log("Sentence splitting complete. Documents: " + docs);
|
| 204 |
+
} catch (Exception e) {
|
| 205 |
+
e.printStackTrace();
|
| 206 |
+
}
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
// -----------------------------------------------------------------------
|
| 210 |
+
// Stage 3 — Boilerplate cleaning
|
| 211 |
+
// -----------------------------------------------------------------------
|
| 212 |
+
|
| 213 |
+
private void runCleaning() {
|
| 214 |
+
FileCleanProcessor fcp = new FileCleanProcessor(boilerplateThreshold);
|
| 215 |
+
|
| 216 |
+
// Learn from sample
|
| 217 |
+
fcp.learnFromSample(sampleDir);
|
| 218 |
+
fcp.printTopCommonLines(20);
|
| 219 |
+
|
| 220 |
+
// Save blocklist for audit / reproducibility
|
| 221 |
+
if (blocklistFile != null && !blocklistFile.isBlank()) {
|
| 222 |
+
fcp.saveBlocklist(blocklistFile);
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
// Clean the new data directory
|
| 226 |
+
fcp.cleanDirectory(newDataDir, keepBackups);
|
| 227 |
+
log("Boilerplate cleaning complete → " + newDataDir);
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
// -----------------------------------------------------------------------
|
| 231 |
+
// Stage 4 — Deduplication
|
| 232 |
+
// -----------------------------------------------------------------------
|
| 233 |
+
|
| 234 |
+
private void runDeduplication() {
|
| 235 |
+
DeduplicationProcessor dp = new DeduplicationProcessor(
|
| 236 |
+
dedupThreshold, dedupShingleSize, dedupNumHashes);
|
| 237 |
+
|
| 238 |
+
// Index the full existing corpus
|
| 239 |
+
log("Indexing full corpus for deduplication…");
|
| 240 |
+
dp.indexCorpus(FULL_DATA_DIR);
|
| 241 |
+
log("Corpus indexed. Sentences: " + dp.getCorpusSize());
|
| 242 |
+
|
| 243 |
+
// Detect near-duplicates in new data
|
| 244 |
+
String report = dedupReport != null
|
| 245 |
+
? dedupReport
|
| 246 |
+
: newMetaDir + "dedup_report.tsv";
|
| 247 |
+
dp.detectDuplicates(newDataDir, report);
|
| 248 |
+
|
| 249 |
+
if (removeDuplicates) {
|
| 250 |
+
dp.removeDuplicatesFromNewFolder(newDataDir, keepBackups);
|
| 251 |
+
}
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
// -----------------------------------------------------------------------
|
| 255 |
+
// Stages 5-7 — PII, Bias annotation + final counts
|
| 256 |
+
// -----------------------------------------------------------------------
|
| 257 |
+
|
| 258 |
+
/**
|
| 259 |
+
* For each document:
|
| 260 |
+
* a) reads the (cleaned, deduplicated) .sentences sidecar file,
|
| 261 |
+
* b) runs PII and/or Bias scoring per sentence,
|
| 262 |
+
* c) recomputes word/sentence/token counts on the surviving text,
|
| 263 |
+
* d) merges all computed values into a DocumentMetadata and writes
|
| 264 |
+
* the final metadata JSON to newMetaDir.
|
| 265 |
+
*/
|
| 266 |
+
private void runAnnotationAndCounts(BulgarianSentenceSplitter splitter,
|
| 267 |
+
PIIDetector piiDetector,
|
| 268 |
+
BiasAnalyser biasAnalyser) {
|
| 269 |
+
try {
|
| 270 |
+
FileHandler fh = new FileHandler();
|
| 271 |
+
JSONProcessor jp = new JSONProcessor();
|
| 272 |
+
int docs = 0, errors = 0;
|
| 273 |
+
|
| 274 |
+
for (File sentFile : fh.getFileListing(new File(newDataDir))) {
|
| 275 |
+
if (!sentFile.isFile()
|
| 276 |
+
|| !sentFile.getName().endsWith(".sentences")) continue;
|
| 277 |
+
|
| 278 |
+
String stem = sentFile.getName().replace(".sentences", "");
|
| 279 |
+
|
| 280 |
+
// --- Load sentences ---
|
| 281 |
+
List<String> sentences = new ArrayList<>();
|
| 282 |
+
try (Scanner sc = new Scanner(sentFile, StandardCharsets.UTF_8)) {
|
| 283 |
+
while (sc.hasNextLine()) {
|
| 284 |
+
String s = sc.nextLine().trim();
|
| 285 |
+
if (!s.isBlank()) sentences.add(s);
|
| 286 |
+
}
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
if (sentences.isEmpty()) {
|
| 290 |
+
log("[WARN] No sentences for: " + stem);
|
| 291 |
+
errors++;
|
| 292 |
+
continue;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
// --- Load or create DocumentMetadata ---
|
| 296 |
+
DocumentMetadata meta = loadOrCreateMetadata(jp, stem);
|
| 297 |
+
|
| 298 |
+
// --- PII per sentence ---
|
| 299 |
+
List<Double> piiVec = new ArrayList<>();
|
| 300 |
+
if (piiDetector != null) {
|
| 301 |
+
int sentIdx = 0;
|
| 302 |
+
for (String sent : sentences) {
|
| 303 |
+
PIIDetector.SentencePIIScore score =
|
| 304 |
+
piiDetector.analyseSentence(sent, stem + "-" + sentIdx++);
|
| 305 |
+
piiVec.add(score.getPiiCoverage());
|
| 306 |
+
}
|
| 307 |
+
}
|
| 308 |
+
meta.setPiiVector(piiVec);
|
| 309 |
+
|
| 310 |
+
// --- Bias per sentence ---
|
| 311 |
+
List<Double> biasVec = new ArrayList<>();
|
| 312 |
+
if (biasAnalyser != null) {
|
| 313 |
+
for (String sent : sentences) {
|
| 314 |
+
SentenceBiasScore score = biasAnalyser.analyseSentence(sent);
|
| 315 |
+
biasVec.add(score.totalCoverage());
|
| 316 |
+
}
|
| 317 |
+
}
|
| 318 |
+
meta.setBiasVector(biasVec);
|
| 319 |
+
|
| 320 |
+
// --- Recompute counts from surviving sentences ---
|
| 321 |
+
int nSentences = sentences.size();
|
| 322 |
+
int nWords = 0;
|
| 323 |
+
int nTokens = 0;
|
| 324 |
+
|
| 325 |
+
for (String sent : sentences) {
|
| 326 |
+
String[] toks = sent.split("\\s+");
|
| 327 |
+
nWords += toks.length;
|
| 328 |
+
// estimate tokens: words + punctuation characters
|
| 329 |
+
nTokens += toks.length + sent.length()
|
| 330 |
+
- sent.replaceAll("[.,;:!?()\\-]", "").length();
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
// Paragraphs: count blank-line groups in the original text file
|
| 334 |
+
int nParagraphs = countParagraphs(new File(newDataDir, stem + ".txt"));
|
| 335 |
+
|
| 336 |
+
meta.setNumberSentences(nSentences)
|
| 337 |
+
.setNumberWords(nWords)
|
| 338 |
+
.setNumberTokens(nTokens)
|
| 339 |
+
.setNumberParagraphs(nParagraphs);
|
| 340 |
+
|
| 341 |
+
// --- Persist metadata JSON ---
|
| 342 |
+
writeMetadata(meta, newMetaDir + stem + "_meta.json");
|
| 343 |
+
docs++;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
log("Annotation & counts complete. Documents: " + docs
|
| 347 |
+
+ " Errors: " + errors);
|
| 348 |
+
|
| 349 |
+
} catch (Exception e) {
|
| 350 |
+
e.printStackTrace();
|
| 351 |
+
}
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
// -----------------------------------------------------------------------
|
| 355 |
+
// Stage 8 — Persist to full corpus
|
| 356 |
+
// -----------------------------------------------------------------------
|
| 357 |
+
|
| 358 |
+
/**
|
| 359 |
+
*/
|
| 360 |
+
private void runPersist() {
|
| 361 |
+
try {
|
| 362 |
+
FileHandler fh = new FileHandler();
|
| 363 |
+
int dataCopied = 0, metaCopied = 0;
|
| 364 |
+
|
| 365 |
+
// Copy text files
|
| 366 |
+
for (File f : fh.getFileListing(new File(newDataDir))) {
|
| 367 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 368 |
+
File dest = new File(FULL_DATA_DIR, f.getName());
|
| 369 |
+
if (!dest.exists()) {
|
| 370 |
+
Files.copy(f.toPath(), dest.toPath(),
|
| 371 |
+
StandardCopyOption.REPLACE_EXISTING);
|
| 372 |
+
dataCopied++;
|
| 373 |
+
}
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
// Copy metadata JSON files
|
| 377 |
+
for (File f : fh.getFileListing(new File(newMetaDir))) {
|
| 378 |
+
if (!f.isFile() || !f.getName().endsWith("_meta.json")) continue;
|
| 379 |
+
File dest = new File(FULL_META_DIR, f.getName());
|
| 380 |
+
if (!dest.exists()) {
|
| 381 |
+
Files.copy(f.toPath(), dest.toPath(),
|
| 382 |
+
StandardCopyOption.REPLACE_EXISTING);
|
| 383 |
+
metaCopied++;
|
| 384 |
+
}
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
log("Persist complete. Text files copied: " + dataCopied
|
| 388 |
+
+ " Metadata files copied: " + metaCopied);
|
| 389 |
+
log("FULL_DATA_DIR : " + FULL_DATA_DIR);
|
| 390 |
+
log("FULL_META_DIR : " + FULL_META_DIR);
|
| 391 |
+
|
| 392 |
+
} catch (Exception e) {
|
| 393 |
+
e.printStackTrace();
|
| 394 |
+
}
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
// -----------------------------------------------------------------------
|
| 398 |
+
// Helpers
|
| 399 |
+
// -----------------------------------------------------------------------
|
| 400 |
+
|
| 401 |
+
private DocumentMetadata loadOrCreateMetadata(JSONProcessor jp, String stem) {
|
| 402 |
+
// Try to find a seed metadata JSON written by the source processor
|
| 403 |
+
// Filename conventions: stem + ".json" or stem + "_meta.json"
|
| 404 |
+
String[] candidates = {
|
| 405 |
+
newMetaDir + stem + "_meta.json",
|
| 406 |
+
newMetaDir + stem + ".json"
|
| 407 |
+
};
|
| 408 |
+
for (String path : candidates) {
|
| 409 |
+
File f = new File(path);
|
| 410 |
+
if (f.exists()) {
|
| 411 |
+
try {
|
| 412 |
+
JSONObject raw = jp.readJSON(f);
|
| 413 |
+
// First try full schema, then legacy format
|
| 414 |
+
if (raw.containsKey("Identifier")) {
|
| 415 |
+
return DocumentMetadata.fromJson(raw);
|
| 416 |
+
} else {
|
| 417 |
+
DocumentMetadata m = new DocumentMetadata(stem);
|
| 418 |
+
m.mergeLegacy(raw);
|
| 419 |
+
return m;
|
| 420 |
+
}
|
| 421 |
+
} catch (Exception e) {
|
| 422 |
+
log("[WARN] Could not parse metadata JSON for " + stem + ": " + e.getMessage());
|
| 423 |
+
}
|
| 424 |
+
}
|
| 425 |
+
}
|
| 426 |
+
// Fall back to empty skeleton
|
| 427 |
+
return new DocumentMetadata(stem);
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
private void writeMetadata(DocumentMetadata meta, String outPath) throws Exception {
|
| 431 |
+
JSONObject json = meta.toJson();
|
| 432 |
+
try (Writer w = new OutputStreamWriter(
|
| 433 |
+
new FileOutputStream(outPath), StandardCharsets.UTF_8)) {
|
| 434 |
+
json.writeJSONString(w);
|
| 435 |
+
}
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
private int countParagraphs(File txtFile) {
|
| 439 |
+
if (!txtFile.exists()) return 0;
|
| 440 |
+
int count = 0;
|
| 441 |
+
boolean inPara = false;
|
| 442 |
+
try (Scanner sc = new Scanner(txtFile, StandardCharsets.UTF_8)) {
|
| 443 |
+
while (sc.hasNextLine()) {
|
| 444 |
+
String line = sc.nextLine();
|
| 445 |
+
if (line.isBlank()) {
|
| 446 |
+
inPara = false;
|
| 447 |
+
} else {
|
| 448 |
+
if (!inPara) { count++; inPara = true; }
|
| 449 |
+
}
|
| 450 |
+
}
|
| 451 |
+
} catch (Exception e) { /* ignored */ }
|
| 452 |
+
return Math.max(count, 1);
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
private BiasAnalyser buildBiasAnalyser(BulgarianSentenceSplitter splitter) {
|
| 456 |
+
if (biasDictPath == null || biasDictPath.isBlank()) {
|
| 457 |
+
log("[WARN] No bias dictionary path set — bias scoring disabled.");
|
| 458 |
+
return null;
|
| 459 |
+
}
|
| 460 |
+
BiasLexicon lexicon = new BiasLexicon(biasDictPath);
|
| 461 |
+
return new BiasAnalyser(lexicon, splitter);
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
private void validateConfig() {
|
| 465 |
+
List<String> missing = new ArrayList<>();
|
| 466 |
+
if (sourceProcessor == null) missing.add("sourceProcessor");
|
| 467 |
+
if (newDataDir == null || newDataDir.isBlank()) missing.add("newDataDir");
|
| 468 |
+
if (sampleDir == null || sampleDir.isBlank()) missing.add("sampleDir");
|
| 469 |
+
if (newMetaDir == null || newMetaDir.isBlank()) missing.add("newMetaDir");
|
| 470 |
+
if (!missing.isEmpty())
|
| 471 |
+
throw new IllegalStateException(
|
| 472 |
+
"Pipeline configuration missing: " + missing);
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
private void ensureDirs(String... paths) {
|
| 476 |
+
for (String p : paths) {
|
| 477 |
+
if (p != null) new File(p).mkdirs();
|
| 478 |
+
}
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
private void banner(String msg) {
|
| 482 |
+
System.out.println("\n" + "=".repeat(60));
|
| 483 |
+
System.out.println(" " + msg);
|
| 484 |
+
System.out.println("=".repeat(60));
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
private void log(String msg) {
|
| 488 |
+
System.out.println("[Pipeline] " + msg);
|
| 489 |
+
}
|
| 490 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/MarcellProcessor.java
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
import java.io.File;
|
| 4 |
+
import java.io.FileOutputStream;
|
| 5 |
+
import java.io.OutputStreamWriter;
|
| 6 |
+
import java.io.Writer;
|
| 7 |
+
import java.util.Scanner;
|
| 8 |
+
|
| 9 |
+
import org.json.simple.JSONArray;
|
| 10 |
+
import org.json.simple.JSONObject;
|
| 11 |
+
|
| 12 |
+
import bg.bas.dcl.general.FileHandler;
|
| 13 |
+
|
| 14 |
+
/**
|
| 15 |
+
* Processes the MARCELL Bulgarian annotated corpus.
|
| 16 |
+
*
|
| 17 |
+
* Licence: CC0-1.0 (fixed for all MARCELL documents).
|
| 18 |
+
* Domain: "Държавно управление" (State governance).
|
| 19 |
+
* Style: "Административен".
|
| 20 |
+
*/
|
| 21 |
+
public class MarcellProcessor extends BaseSourceProcessor {
|
| 22 |
+
|
| 23 |
+
private static final String LICENCE = "CC0-1.0";
|
| 24 |
+
private static final String LICENCE_LINK =
|
| 25 |
+
"https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf";
|
| 26 |
+
private static final String DOMAIN = "Държавно управление";
|
| 27 |
+
private static final String STYLE = "Административен";
|
| 28 |
+
private static final String PREFIX = "bg_MARCELL_";
|
| 29 |
+
private static final String EXT = ".conllup";
|
| 30 |
+
|
| 31 |
+
@Override
|
| 32 |
+
public void process(String indir, String outdir) {
|
| 33 |
+
try {
|
| 34 |
+
FileHandler fh = new FileHandler();
|
| 35 |
+
JSONObject json = new JSONObject();
|
| 36 |
+
JSONArray descrArray = new JSONArray();
|
| 37 |
+
|
| 38 |
+
for (File f : fh.getFileListing(new File(indir))) {
|
| 39 |
+
if (!f.isFile()) continue;
|
| 40 |
+
|
| 41 |
+
System.out.println("Processing: " + f.getAbsolutePath());
|
| 42 |
+
|
| 43 |
+
String tfname = PREFIX + f.getName().replace(EXT, "");
|
| 44 |
+
|
| 45 |
+
JSONObject fdescr = newBaseDescriptor(tfname);
|
| 46 |
+
fdescr.put("Licence", LICENCE);
|
| 47 |
+
fdescr.put("LicenceLink", LICENCE_LINK);
|
| 48 |
+
fdescr.put("Domain", DOMAIN);
|
| 49 |
+
fdescr.put("Style", STYLE);
|
| 50 |
+
|
| 51 |
+
Writer out = new OutputStreamWriter(
|
| 52 |
+
new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
|
| 53 |
+
|
| 54 |
+
Scanner s = new Scanner(f, "UTF-8");
|
| 55 |
+
int nw = 0, ns = 0, np = 0, nt = 0;
|
| 56 |
+
|
| 57 |
+
while (s.hasNextLine()) {
|
| 58 |
+
String line = s.nextLine();
|
| 59 |
+
|
| 60 |
+
// --- Metadata extraction ---
|
| 61 |
+
if (line.startsWith("# date =")) {
|
| 62 |
+
fdescr.put("PublicationDate", line.replace("# date =", "").trim());
|
| 63 |
+
} else if (line.startsWith("# title =")) {
|
| 64 |
+
fdescr.put("DocumentTitle", line.replace("# title =", "").trim());
|
| 65 |
+
} else if (line.startsWith("# issuer =")) {
|
| 66 |
+
fdescr.put("Author", line.replace("# issuer =", "").trim());
|
| 67 |
+
} else if (line.startsWith("# type =")) {
|
| 68 |
+
fdescr.put("Type", line.replace("# type =", "").trim());
|
| 69 |
+
} else if (line.startsWith("# url =")) {
|
| 70 |
+
fdescr.put("Url", line.replace("# url =", "").trim());
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
// --- Structure counting ---
|
| 74 |
+
else if (line.startsWith("# sent_id =")) {
|
| 75 |
+
ns++;
|
| 76 |
+
} else if (line.startsWith("# newpar id =")) {
|
| 77 |
+
np++;
|
| 78 |
+
out.write("\n");
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
// --- Text output ---
|
| 82 |
+
else if (line.startsWith("# text =")) {
|
| 83 |
+
out.write(line.replace("# text =", "").trim() + "\n");
|
| 84 |
+
out.flush();
|
| 85 |
+
} else {
|
| 86 |
+
// CoNLL-UP token line: count words and tokens
|
| 87 |
+
String[] cols = line.split("\t");
|
| 88 |
+
if (cols.length > 5) {
|
| 89 |
+
nt++;
|
| 90 |
+
if (!cols[3].equals("PUNCT")) nw++;
|
| 91 |
+
}
|
| 92 |
+
}
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
s.close();
|
| 96 |
+
out.flush();
|
| 97 |
+
out.close();
|
| 98 |
+
|
| 99 |
+
fdescr.put("NumberWords", nw);
|
| 100 |
+
fdescr.put("NumberSentences", ns);
|
| 101 |
+
fdescr.put("NumberParagraphs", np);
|
| 102 |
+
fdescr.put("NumberTokens", nt);
|
| 103 |
+
|
| 104 |
+
descrArray.add(fdescr);
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
json.put("metadata", descrArray);
|
| 108 |
+
writeMetadata(json, outdir, "metadata.json");
|
| 109 |
+
|
| 110 |
+
} catch (Exception e) {
|
| 111 |
+
e.printStackTrace();
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// -----------------------------------------------------------------------
|
| 116 |
+
|
| 117 |
+
@SuppressWarnings("unchecked")
|
| 118 |
+
private void writeMetadata(JSONObject json, String outdir, String filename)
|
| 119 |
+
throws Exception {
|
| 120 |
+
String outMetaPath = outdir + filename;
|
| 121 |
+
Writer outMeta = new OutputStreamWriter(
|
| 122 |
+
new FileOutputStream(outMetaPath), "UTF-8");
|
| 123 |
+
json.writeJSONString(outMeta);
|
| 124 |
+
outMeta.flush();
|
| 125 |
+
outMeta.close();
|
| 126 |
+
|
| 127 |
+
convertJsonToCSV(json, outMetaPath + "_CSV.csv");
|
| 128 |
+
System.out.println("Metadata written to: " + outMetaPath);
|
| 129 |
+
}
|
| 130 |
+
}
|
java/bg/bas/dcl/LLMs/IfGPTDataset/SourceProcessor.java
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs.IfGPTDataset;
|
| 2 |
+
|
| 3 |
+
/**
 * A corpus-specific extraction step that can be plugged into
 * IfGPTPipeline via setSourceProcessor(SourceProcessor).
 */
|
| 5 |
+
public interface SourceProcessor {
|
| 6 |
+
|
| 7 |
+
/**
 * Runs the extraction for one source.
 *
 * NOTE(review): the two callers/implementers visible alongside this
 * interface disagree on the meaning of the arguments. MarcellProcessor
 * reads source files from indir and writes text files plus metadata.json
 * into outdir, while IfGPTPipeline.runExtraction() passes
 * (newDataDir, newMetaDir) and expects text in the first and metadata in
 * the second. Confirm the intended contract.
 *
 * @param indir  first directory argument (input directory in MarcellProcessor)
 * @param outdir second directory argument (output directory in MarcellProcessor)
 */
|
| 9 |
+
void process(String indir, String outdir);
|
| 10 |
+
}
|
java/bg/bas/dcl/LLMs/PIIDetector.java
ADDED
|
@@ -0,0 +1,447 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs;
|
| 2 |
+
|
| 3 |
+
import java.io.BufferedWriter;
|
| 4 |
+
import java.io.File;
|
| 5 |
+
import java.io.FileOutputStream;
|
| 6 |
+
import java.io.OutputStreamWriter;
|
| 7 |
+
import java.nio.charset.StandardCharsets;
|
| 8 |
+
import java.util.ArrayList;
|
| 9 |
+
import java.util.Collections;
|
| 10 |
+
import java.util.LinkedHashMap;
|
| 11 |
+
import java.util.List;
|
| 12 |
+
import java.util.Map;
|
| 13 |
+
import java.util.Properties;
|
| 14 |
+
import java.util.Scanner;
|
| 15 |
+
|
| 16 |
+
import ai.philterd.phileas.model.configuration.PhileasConfiguration;
|
| 17 |
+
import ai.philterd.phileas.model.policy.Policy;
|
| 18 |
+
import ai.philterd.phileas.model.responses.FilterResponse;
|
| 19 |
+
import ai.philterd.phileas.model.responses.Span;
|
| 20 |
+
import ai.philterd.phileas.services.PlainTextFilterService;
|
| 21 |
+
|
| 22 |
+
import bg.bas.dcl.general.FileHandler;
|
| 23 |
+
|
| 24 |
+
/**
|
| 25 |
+
* PIIDetector
|
| 26 |
+
*
|
| 27 |
+
* Detects Personally Identifiable Information (PII) in Bulgarian text at
|
| 28 |
+
* sentence level using the <b>Phileas</b> library (ai.philterd:phileas).
|
| 29 |
+
*
|
| 30 |
+
* -----------------------------------------------------------------------
|
| 31 |
+
* NOTE ON "PIISA"
|
| 32 |
+
* PIISA (https://piisa.org) is a Python-only PII framework with no Java
|
| 33 |
+
* bindings. The closest Java-native equivalent with a compatible
|
| 34 |
+
* detection scope is Phileas (Apache 2.0, Maven Central, actively
|
| 35 |
+
* maintained as of 2025). This component uses Phileas and documents
|
| 36 |
+
* all places where a future PIISA Java binding could be substituted.
|
| 37 |
+
* -----------------------------------------------------------------------
|
| 38 |
+
*
|
| 39 |
+
* MAVEN DEPENDENCY (pom.xml):
|
| 40 |
+
* <pre>
|
| 41 |
+
* <dependency>
|
| 42 |
+
* <groupId>ai.philterd</groupId>
|
| 43 |
+
* <artifactId>phileas</artifactId>
|
| 44 |
+
* <version>3.1.0</version>
|
| 45 |
+
* </dependency>
|
| 46 |
+
* </pre>
|
| 47 |
+
*
|
| 48 |
+
* -----------------------------------------------------------------------
|
| 49 |
+
* PII TYPES DETECTED (Phileas built-in, language-agnostic unless noted):
|
| 50 |
+
*
|
| 51 |
+
* Person names (NER + census dictionary) | Ages | Email addresses
|
| 52 |
+
* Phone numbers | IP addresses (v4 + v6) | URLs | Credit card numbers
|
| 53 |
+
* SSN / TIN | IBAN codes | Bank account numbers | Dates | Zip codes
|
| 54 |
+
* MAC addresses | Bitcoin addresses | VINs | Passport numbers
|
| 55 |
+
* Driver licence numbers | Medical conditions
|
| 56 |
+
*
|
| 57 |
+
* Language note: NER-based person-name detection uses English models by
|
| 58 |
+
* default. For Bulgarian names, supply a custom dictionary filter
|
| 59 |
+
* (see {@link #buildPolicy()}) or integrate a Bulgarian NER model.
|
| 60 |
+
* Regex-based filters (emails, phones, IPs, etc.) are language-independent
|
| 61 |
+
* and work directly on Bulgarian text.
|
| 62 |
+
*
|
| 63 |
+
* -----------------------------------------------------------------------
|
| 64 |
+
* ALGORITHM (per sentence):
|
| 65 |
+
*
|
| 66 |
+
* 1. Phileas scans the sentence and returns a list of PII *spans*, each
|
| 67 |
+
* carrying a character start/end offset and a PII type label.
|
| 68 |
+
* 2. We map spans back to word tokens by checking which token positions
|
| 69 |
+
* overlap any detected span.
|
| 70 |
+
* 3. piiCoverage = |tokens overlapping PII spans| / |total word tokens|
|
| 71 |
+
*
|
| 72 |
+
* -----------------------------------------------------------------------
|
| 73 |
+
* USAGE
|
| 74 |
+
*
|
| 75 |
+
* BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
|
| 76 |
+
* PIIDetector detector = new PIIDetector(splitter);
|
| 77 |
+
*
|
| 78 |
+
* List<SentencePIIScore> scores = detector.analyseText("Иван Петров живее на ул. Роза 5.");
|
| 79 |
+
* for (SentencePIIScore s : scores) {
|
| 80 |
+
* System.out.printf("%.1f%% PII — %s%n", s.getPiiCoveragePercent(), s.getSentence());
|
| 81 |
+
* }
|
| 82 |
+
*
|
| 83 |
+
* // Corpus-level processing with TSV output
|
| 84 |
+
* detector.analyseDirectory("/path/to/corpus/", "/path/to/pii_report.tsv");
|
| 85 |
+
*/
|
| 86 |
+
public class PIIDetector {
|
| 87 |
+
|
| 88 |
+
// -----------------------------------------------------------------------
|
| 89 |
+
// Constants
|
| 90 |
+
// -----------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
/** Context string passed to Phileas (arbitrary; used for logging/caching). */
|
| 93 |
+
private static final String CONTEXT = "bg-corpus";
|
| 94 |
+
|
| 95 |
+
/** Document ID prefix; a counter suffix is appended per sentence. */
|
| 96 |
+
private static final String DOC_ID = "sent-";
|
| 97 |
+
|
| 98 |
+
/** Minimum word count for a sentence to be analysed. */
|
| 99 |
+
private static final int MIN_WORDS = 3;
|
| 100 |
+
|
| 101 |
+
// -----------------------------------------------------------------------
|
| 102 |
+
// Dependencies
|
| 103 |
+
// -----------------------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
private final BulgarianSentenceSplitter splitter;
|
| 106 |
+
private final PlainTextFilterService filterService;
|
| 107 |
+
private final List<Policy> policies;
|
| 108 |
+
|
| 109 |
+
// -----------------------------------------------------------------------
|
| 110 |
+
// Constructors
|
| 111 |
+
// -----------------------------------------------------------------------
|
| 112 |
+
|
| 113 |
+
/**
|
| 114 |
+
* Creates a PIIDetector with the default policy (all built-in Phileas
|
| 115 |
+
* filters active, REDACT strategy so spans are easy to count).
|
| 116 |
+
*
|
| 117 |
+
* @param splitter an initialised {@link BulgarianSentenceSplitter}
|
| 118 |
+
*/
|
| 119 |
+
public PIIDetector(BulgarianSentenceSplitter splitter) {
|
| 120 |
+
this(splitter, null);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
/**
|
| 124 |
+
* Creates a PIIDetector with a custom Phileas {@link Policy}.
|
| 125 |
+
* Pass {@code null} to use the built-in all-PII policy.
|
| 126 |
+
*
|
| 127 |
+
* @param splitter an initialised {@link BulgarianSentenceSplitter}
|
| 128 |
+
* @param customPolicy a pre-built Phileas Policy, or null for default
|
| 129 |
+
*/
|
| 130 |
+
public PIIDetector(BulgarianSentenceSplitter splitter, Policy customPolicy) {
|
| 131 |
+
if (splitter == null)
|
| 132 |
+
throw new IllegalArgumentException("splitter must not be null");
|
| 133 |
+
|
| 134 |
+
this.splitter = splitter;
|
| 135 |
+
|
| 136 |
+
try {
|
| 137 |
+
Properties props = new Properties();
|
| 138 |
+
PhileasConfiguration config = new PhileasConfiguration(props);
|
| 139 |
+
this.filterService = new PlainTextFilterService(config);
|
| 140 |
+
this.policies = List.of(customPolicy != null ? customPolicy : buildPolicy());
|
| 141 |
+
System.out.println("[PIIDetector] Phileas filter service initialised.");
|
| 142 |
+
} catch (Exception e) {
|
| 143 |
+
throw new RuntimeException("Failed to initialise Phileas filter service", e);
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
// -----------------------------------------------------------------------
|
| 148 |
+
// Public API
|
| 149 |
+
// -----------------------------------------------------------------------
|
| 150 |
+
|
| 151 |
+
/**
|
| 152 |
+
* Splits {@code text} into sentences and returns a {@link SentencePIIScore}
|
| 153 |
+
* for each sentence.
|
| 154 |
+
*
|
| 155 |
+
* Sentences shorter than {@link #MIN_WORDS} words receive a zero score
|
| 156 |
+
* without calling Phileas (to avoid spurious detections on fragments).
|
| 157 |
+
*
|
| 158 |
+
* @param text any Bulgarian plain text (may span multiple paragraphs)
|
| 159 |
+
* @return one score per detected sentence, in order; never null
|
| 160 |
+
*/
|
| 161 |
+
public List<SentencePIIScore> analyseText(String text) {
|
| 162 |
+
List<SentencePIIScore> results = new ArrayList<>();
|
| 163 |
+
if (text == null || text.isBlank()) return results;
|
| 164 |
+
|
| 165 |
+
int docCounter = 0;
|
| 166 |
+
for (String sentence : splitter.split(text)) {
|
| 167 |
+
results.add(analyseSentence(sentence, DOC_ID + (docCounter++)));
|
| 168 |
+
}
|
| 169 |
+
return results;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
/**
|
| 173 |
+
* Analyses a single pre-split sentence.
|
| 174 |
+
*
|
| 175 |
+
* @param sentence the sentence string (not null)
|
| 176 |
+
* @param docId a document/sentence identifier string for Phileas context
|
| 177 |
+
* @return a fully populated {@link SentencePIIScore}
|
| 178 |
+
*/
|
| 179 |
+
public SentencePIIScore analyseSentence(String sentence, String docId) {
|
| 180 |
+
|
| 181 |
+
// --- Tokenise ---
|
| 182 |
+
String[] rawTokens = sentence.trim().split("\\s+");
|
| 183 |
+
List<String> tokens = new ArrayList<>();
|
| 184 |
+
for (String t : rawTokens) {
|
| 185 |
+
String clean = t.replaceAll("[^\\p{L}\\p{N}@._+\\-]", "");
|
| 186 |
+
if (!clean.isEmpty()) tokens.add(clean);
|
| 187 |
+
}
|
| 188 |
+
int totalWords = tokens.size();
|
| 189 |
+
|
| 190 |
+
if (totalWords < MIN_WORDS) {
|
| 191 |
+
return SentencePIIScore.empty(sentence, totalWords);
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
// --- Run Phileas ---
|
| 195 |
+
List<Span> spans;
|
| 196 |
+
try {
|
| 197 |
+
FilterResponse response = filterService.filter(
|
| 198 |
+
policies, CONTEXT, docId, sentence, null);
|
| 199 |
+
spans = response.getSpans() != null ? response.getSpans() : List.of();
|
| 200 |
+
} catch (Exception e) {
|
| 201 |
+
System.err.println("[PIIDetector] Phileas error on sentence: " + e.getMessage());
|
| 202 |
+
return SentencePIIScore.error(sentence, totalWords, e.getMessage());
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
// --- Map character-level spans back to token positions ---
|
| 206 |
+
// Build token character offsets from the original sentence string
|
| 207 |
+
int[] tokenStart = new int[tokens.size()];
|
| 208 |
+
int[] tokenEnd = new int[tokens.size()];
|
| 209 |
+
int cursor = 0;
|
| 210 |
+
for (int ti = 0; ti < tokens.size(); ti++) {
|
| 211 |
+
String tok = tokens.get(ti);
|
| 212 |
+
int idx = sentence.indexOf(tok, cursor);
|
| 213 |
+
if (idx < 0) {
|
| 214 |
+
// Fallback: token not found at expected position (normalisation artefact)
|
| 215 |
+
tokenStart[ti] = cursor;
|
| 216 |
+
tokenEnd[ti] = cursor + tok.length();
|
| 217 |
+
} else {
|
| 218 |
+
tokenStart[ti] = idx;
|
| 219 |
+
tokenEnd[ti] = idx + tok.length();
|
| 220 |
+
cursor = idx + tok.length();
|
| 221 |
+
}
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
// Count distinct PII tokens and collect type labels per token
|
| 225 |
+
Map<Integer, String> piiTokenType = new LinkedHashMap<>(); // tokenIndex → PII type
|
| 226 |
+
for (Span span : spans) {
|
| 227 |
+
int spanStart = span.getStart();
|
| 228 |
+
int spanEnd = span.getEnd();
|
| 229 |
+
String type = span.getFilterType() != null
|
| 230 |
+
? span.getFilterType().name()
|
| 231 |
+
: "UNKNOWN";
|
| 232 |
+
|
| 233 |
+
for (int ti = 0; ti < tokens.size(); ti++) {
|
| 234 |
+
// Overlap: token and span share at least one character
|
| 235 |
+
if (tokenStart[ti] < spanEnd && tokenEnd[ti] > spanStart) {
|
| 236 |
+
piiTokenType.put(ti, type);
|
| 237 |
+
}
|
| 238 |
+
}
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
// --- Build type frequency map ---
|
| 242 |
+
Map<String, Integer> typeCounts = new LinkedHashMap<>();
|
| 243 |
+
for (String type : piiTokenType.values()) {
|
| 244 |
+
typeCounts.merge(type, 1, Integer::sum);
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
int piiTokenCount = piiTokenType.size();
|
| 248 |
+
double coverage = totalWords > 0
|
| 249 |
+
? (double) piiTokenCount / totalWords
|
| 250 |
+
: 0.0;
|
| 251 |
+
|
| 252 |
+
return new SentencePIIScore(
|
| 253 |
+
sentence, totalWords, piiTokenCount, coverage,
|
| 254 |
+
new ArrayList<>(piiTokenType.values()),
|
| 255 |
+
typeCounts, spans, null);
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
// -----------------------------------------------------------------------
|
| 259 |
+
// Corpus-level processing
|
| 260 |
+
// -----------------------------------------------------------------------
|
| 261 |
+
|
| 262 |
+
/**
|
| 263 |
+
* Analyses all .txt files in {@code corpusDir} sentence by sentence and
|
| 264 |
+
* writes results to a TSV file at {@code reportPath}.
|
| 265 |
+
*
|
| 266 |
+
* Only sentences with at least one PII token are written to the report.
|
| 267 |
+
*
|
| 268 |
+
* @param corpusDir directory of plain-text .txt files
|
| 269 |
+
* @param reportPath destination TSV report file path
|
| 270 |
+
*/
|
| 271 |
+
public void analyseDirectory(String corpusDir, String reportPath) {
|
| 272 |
+
try {
|
| 273 |
+
FileHandler fh = new FileHandler();
|
| 274 |
+
int filesProcessed = 0, sentencesWritten = 0;
|
| 275 |
+
|
| 276 |
+
try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
|
| 277 |
+
new FileOutputStream(reportPath, false), StandardCharsets.UTF_8))) {
|
| 278 |
+
|
| 279 |
+
bw.write("file\t" + SentencePIIScore.tsvHeader());
|
| 280 |
+
bw.newLine();
|
| 281 |
+
|
| 282 |
+
for (File f : fh.getFileListing(new File(corpusDir))) {
|
| 283 |
+
if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
|
| 284 |
+
|
| 285 |
+
System.out.println("[PIIDetector] Processing: " + f.getName());
|
| 286 |
+
|
| 287 |
+
StringBuilder text = new StringBuilder();
|
| 288 |
+
try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
|
| 289 |
+
while (sc.hasNextLine()) text.append(sc.nextLine()).append(' ');
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
int docCounter = 0;
|
| 293 |
+
for (SentencePIIScore score : analyseText(text.toString())) {
|
| 294 |
+
if (score.hasPII()) {
|
| 295 |
+
bw.write(f.getName() + "\t" + score.toTsv());
|
| 296 |
+
bw.newLine();
|
| 297 |
+
sentencesWritten++;
|
| 298 |
+
}
|
| 299 |
+
docCounter++;
|
| 300 |
+
}
|
| 301 |
+
filesProcessed++;
|
| 302 |
+
}
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
System.out.printf("[PIIDetector] Done. Files: %d Sentences with PII written: %d%n",
|
| 306 |
+
filesProcessed, sentencesWritten);
|
| 307 |
+
|
| 308 |
+
} catch (Exception e) {
|
| 309 |
+
e.printStackTrace();
|
| 310 |
+
}
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
// -----------------------------------------------------------------------
|
| 314 |
+
// Policy builder
|
| 315 |
+
// -----------------------------------------------------------------------
|
| 316 |
+
|
| 317 |
+
/**
|
| 318 |
+
* Builds the default Phileas {@link Policy} that activates all
|
| 319 |
+
* language-agnostic PII filters with a REDACT strategy (so that
|
| 320 |
+
* span positions remain stable for overlap calculation).
|
| 321 |
+
*
|
| 322 |
+
* To customise, edit the JSON string below or deserialise your own
|
| 323 |
+
* policy from a .json file with:
|
| 324 |
+
* Policy policy = Policy.fromJson(new String(Files.readAllBytes(path)));
|
| 325 |
+
*
|
| 326 |
+
* To add a Bulgarian names dictionary, add an "identifiers.dictionary"
|
| 327 |
+
* block pointing to a file of Bulgarian given names and surnames.
|
| 328 |
+
*/
|
| 329 |
+
private Policy buildPolicy() throws Exception {
|
| 330 |
+
String policyJson = "{"
|
| 331 |
+
+ "\"name\": \"pii-all\","
|
| 332 |
+
+ "\"identifiers\": {"
|
| 333 |
+
+ "\"emailAddress\": {\"emailAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 334 |
+
+ "\"phoneNumber\": {\"phoneNumberFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 335 |
+
+ "\"ipAddress\": {\"ipAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 336 |
+
+ "\"url\": {\"urlFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 337 |
+
+ "\"creditCard\": {\"creditCardFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 338 |
+
+ "\"ssn\": {\"ssnFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 339 |
+
+ "\"ibanCode\": {\"ibanCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 340 |
+
+ "\"bankAccountNumber\":{\"bankAccountNumberFilterStrategies\":[{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 341 |
+
+ "\"date\": {\"dateFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 342 |
+
+ "\"age\": {\"ageFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 343 |
+
+ "\"macAddress\": {\"macAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 344 |
+
+ "\"bitcoinAddress\": {\"bitcoinAddressFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 345 |
+
+ "\"vin\": {\"vinFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 346 |
+
+ "\"zipCode\": {\"zipCodeFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]},"
|
| 347 |
+
+ "\"person\": {\"personFilterStrategies\": [{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}"
|
| 348 |
+
+ "}"
|
| 349 |
+
+ "}";
|
| 350 |
+
return Policy.fromJson(policyJson);
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
// -----------------------------------------------------------------------
|
| 354 |
+
// Inner result class
|
| 355 |
+
// -----------------------------------------------------------------------
|
| 356 |
+
|
| 357 |
+
/**
|
| 358 |
+
* Immutable result object for one sentence's PII analysis.
|
| 359 |
+
*/
|
| 360 |
+
public static class SentencePIIScore {
|
| 361 |
+
|
| 362 |
+
private final String sentence;
|
| 363 |
+
private final int totalWords;
|
| 364 |
+
private final int piiTokenCount;
|
| 365 |
+
/** PII coverage: piiTokenCount / totalWords in [0, 1]. */
|
| 366 |
+
private final double piiCoverage;
|
| 367 |
+
/** Ordered list of PII type labels for each PII token found. */
|
| 368 |
+
private final List<String> piiTypes;
|
| 369 |
+
/** Frequency of each PII type in this sentence. */
|
| 370 |
+
private final Map<String, Integer> typeFrequency;
|
| 371 |
+
/** Raw Phileas spans (character-level). */
|
| 372 |
+
private final List<Span> spans;
|
| 373 |
+
/** Non-null if Phileas threw an exception for this sentence. */
|
| 374 |
+
private final String errorMessage;
|
| 375 |
+
|
| 376 |
+
SentencePIIScore(String sentence, int totalWords, int piiTokenCount,
|
| 377 |
+
double piiCoverage, List<String> piiTypes,
|
| 378 |
+
Map<String, Integer> typeFrequency,
|
| 379 |
+
List<Span> spans, String errorMessage) {
|
| 380 |
+
this.sentence = sentence;
|
| 381 |
+
this.totalWords = totalWords;
|
| 382 |
+
this.piiTokenCount = piiTokenCount;
|
| 383 |
+
this.piiCoverage = piiCoverage;
|
| 384 |
+
this.piiTypes = Collections.unmodifiableList(piiTypes);
|
| 385 |
+
this.typeFrequency = Collections.unmodifiableMap(typeFrequency);
|
| 386 |
+
this.spans = spans != null
|
| 387 |
+
? Collections.unmodifiableList(spans)
|
| 388 |
+
: List.of();
|
| 389 |
+
this.errorMessage = errorMessage;
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
static SentencePIIScore empty(String sentence, int totalWords) {
|
| 393 |
+
return new SentencePIIScore(sentence, totalWords, 0, 0.0,
|
| 394 |
+
List.of(), Map.of(), List.of(), null);
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
static SentencePIIScore error(String sentence, int totalWords, String msg) {
|
| 398 |
+
return new SentencePIIScore(sentence, totalWords, 0, 0.0,
|
| 399 |
+
List.of(), Map.of(), List.of(), msg);
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
// --- Accessors ---
|
| 403 |
+
|
| 404 |
+
public String getSentence() { return sentence; }
|
| 405 |
+
public int getTotalWords() { return totalWords; }
|
| 406 |
+
public int getPiiTokenCount() { return piiTokenCount; }
|
| 407 |
+
/** PII coverage ratio in [0, 1]. */
|
| 408 |
+
public double getPiiCoverage() { return piiCoverage; }
|
| 409 |
+
/** PII coverage expressed as a percentage [0, 100]. */
|
| 410 |
+
public double getPiiCoveragePercent() { return piiCoverage * 100.0; }
|
| 411 |
+
public List<String> getPiiTypes() { return piiTypes; }
|
| 412 |
+
public Map<String, Integer> getTypeFrequency() { return typeFrequency; }
|
| 413 |
+
public List<Span> getSpans() { return spans; }
|
| 414 |
+
public boolean hasPII() { return piiTokenCount > 0; }
|
| 415 |
+
public boolean hasError() { return errorMessage != null; }
|
| 416 |
+
public String getErrorMessage() { return errorMessage; }
|
| 417 |
+
|
| 418 |
+
/** Number of distinct PII categories detected in this sentence. */
|
| 419 |
+
public int distinctPiiTypes() { return typeFrequency.size(); }
|
| 420 |
+
|
| 421 |
+
// --- TSV export ---
|
| 422 |
+
|
| 423 |
+
/**
|
| 424 |
+
* TSV row: sentence | totalWords | piiTokens | coverage% | distinctTypes | typeFrequency
|
| 425 |
+
*/
|
| 426 |
+
public String toTsv() {
|
| 427 |
+
return String.format("%s\t%d\t%d\t%.4f\t%.2f\t%d\t%s",
|
| 428 |
+
sentence.replace('\t', ' '),
|
| 429 |
+
totalWords,
|
| 430 |
+
piiTokenCount,
|
| 431 |
+
piiCoverage,
|
| 432 |
+
getPiiCoveragePercent(),
|
| 433 |
+
distinctPiiTypes(),
|
| 434 |
+
typeFrequency.toString());
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
public static String tsvHeader() {
|
| 438 |
+
return "sentence\ttotalWords\tpiiTokens\tpiiCoverage\tpiiCoverage%\tdistinctPiiTypes\ttypeFrequency";
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
@Override
|
| 442 |
+
public String toString() {
|
| 443 |
+
return String.format("SentencePIIScore{words=%d, piiTokens=%d, coverage=%.1f%%, types=%s}",
|
| 444 |
+
totalWords, piiTokenCount, getPiiCoveragePercent(), typeFrequency.keySet());
|
| 445 |
+
}
|
| 446 |
+
}
|
| 447 |
+
}
|
java/bg/bas/dcl/LLMs/SentenceBiasScore.java
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package bg.bas.dcl.LLMs;
|
| 2 |
+
|
| 3 |
+
import java.util.Collections;
|
| 4 |
+
import java.util.List;
|
| 5 |
+
import java.util.Map;
|
| 6 |
+
|
| 7 |
+
/**
 * SentenceBiasScore
 *
 * Immutable holder for the bias-analysis figures of one sentence: per-type
 * coverage values and counts, the matched lemmas, and a few totals. It also
 * renders itself as a TSV row whose columns match {@link #tsvHeader()}.
 */
public class SentenceBiasScore {

    /** Bias-type keys, in the column order used by the TSV export. */
    public static final String[] BIAS_TYPES = {
        "gender", "race_ethnicity", "religion", "disability", "appearance"
    };

    /** The analysed sentence text. */
    private final String sentence;

    /** Word count of the sentence as supplied by the creator of this object. */
    private final int totalWords;

    /**
     * Coverage value per bias type. {@link #getPairCoverage(String)} lower-cases
     * the requested type before the lookup, so keys are expected in lower case.
     */
    private final Map<String, Double> pairCoverage;

    /** Signal count per bias-type key. */
    private final Map<String, Integer> signalCount;

    /** Evaluator count per bias-type key. */
    private final Map<String, Integer> evaluatorCount;

    /** All dictionary entries matched in this sentence (lemma strings). */
    private final List<String> matchedLemmas;

    /** Total matched bias words (evaluative, non-neutral). */
    private final int totalBiasWords;

    /** Count of matched derogatory terms. */
    private final int totalDerogatory;

    /** Count of matched colloquial terms. */
    private final int totalColloquial;

    /** Multi-type flag as supplied by the creator; exported as 1/0 in the TSV row. */
    private final boolean multiType;

    /**
     * Stores the given values. The maps and the list are wrapped in
     * unmodifiable views; they are not copied.
     */
    SentenceBiasScore(String sentence,
                      int totalWords,
                      Map<String, Double> pairCoverage,
                      Map<String, Integer> signalCount,
                      Map<String, Integer> evaluatorCount,
                      List<String> matchedLemmas,
                      int totalBiasWords,
                      int totalDerogatory,
                      int totalColloquial,
                      boolean multiType) {
        this.sentence        = sentence;
        this.totalWords      = totalWords;
        this.pairCoverage    = Collections.unmodifiableMap(pairCoverage);
        this.signalCount     = Collections.unmodifiableMap(signalCount);
        this.evaluatorCount  = Collections.unmodifiableMap(evaluatorCount);
        this.matchedLemmas   = Collections.unmodifiableList(matchedLemmas);
        this.totalBiasWords  = totalBiasWords;
        this.totalDerogatory = totalDerogatory;
        this.totalColloquial = totalColloquial;
        this.multiType       = multiType;
    }

    /**
     * Returns the coverage value for one bias type.
     *
     * @param biasType bias-type key, matched case-insensitively; null or
     *                 blank selects the sum over all types
     * @return the stored coverage, 0.0 for an unknown type, or
     *         {@link #totalCoverage()} when {@code biasType} is null/blank
     */
    public double getPairCoverage(String biasType) {
        if (biasType == null || biasType.isBlank()) return totalCoverage();
        // Locale.ROOT keeps the key stable: under a Turkish default locale
        // "RELIGION" would otherwise lower-case to a dotless-i spelling.
        return pairCoverage.getOrDefault(biasType.toLowerCase(java.util.Locale.ROOT), 0.0);
    }

    /** Sum of the coverage values of all bias types. */
    public double totalCoverage() {
        double sum = 0;
        for (double v : pairCoverage.values()) sum += v;
        return sum;
    }

    /** Coverage values in {@link #BIAS_TYPES} order (0.0 for missing types). */
    public double[] coverageArray() {
        double[] arr = new double[BIAS_TYPES.length];
        for (int i = 0; i < BIAS_TYPES.length; i++)
            arr[i] = getPairCoverage(BIAS_TYPES[i]);
        return arr;
    }

    /** True if any bias type has a non-zero pair-coverage score. */
    public boolean isBiased() {
        for (double v : pairCoverage.values())
            if (v > 0) return true;
        return false;
    }

    public String getSentence() { return sentence; }
    public int getTotalWords() { return totalWords; }
    /** Signal count stored under exactly {@code type}, or 0 if absent. */
    public int getSignalCount(String type) { return signalCount.getOrDefault(type, 0); }
    /** Evaluator count stored under exactly {@code type}, or 0 if absent. */
    public int getEvaluatorCount(String type) { return evaluatorCount.getOrDefault(type, 0); }
    public List<String> getMatchedLemmas() { return matchedLemmas; }
    public int getTotalBiasWords() { return totalBiasWords; }
    public int getTotalDerogatory() { return totalDerogatory; }
    public int getTotalColloquial() { return totalColloquial; }
    public boolean isMultiType() { return multiType; }

    /**
     * Renders this score as one TSV row with the columns of
     * {@link #tsvHeader()}.
     *
     * <p>Tabs and line breaks inside the sentence are replaced by spaces so
     * the sentence always occupies exactly one cell. Coverage values are
     * formatted with {@code Locale.ROOT}, i.e. always with a {@code '.'}
     * decimal separator regardless of the JVM default locale.
     */
    public String toTsv() {
        // String.valueOf keeps the previous rendering of a null sentence ("null").
        String cell = String.valueOf(sentence)
                .replace('\t', ' ')
                .replace('\n', ' ')
                .replace('\r', ' ');

        StringBuilder sb = new StringBuilder();
        sb.append(cell).append('\t');
        sb.append(totalWords).append('\t');
        sb.append(matchedLemmas).append('\t');

        for (String type : BIAS_TYPES) {
            sb.append(signalCount.getOrDefault(type, 0)).append('\t');
            sb.append(evaluatorCount.getOrDefault(type, 0)).append('\t');
            sb.append(String.format(java.util.Locale.ROOT, "%.4f", getPairCoverage(type))).append('\t');
        }

        sb.append(totalBiasWords).append('\t');
        sb.append(totalDerogatory).append('\t');
        sb.append(totalColloquial).append('\t');
        sb.append(multiType ? 1 : 0).append('\t');
        sb.append(String.format(java.util.Locale.ROOT, "%.4f", totalCoverage()));

        return sb.toString();
    }

    /** Tab-separated column names for {@link #toTsv()}. */
    public static String tsvHeader() {
        StringBuilder sb = new StringBuilder();
        sb.append("sentence\ttotalWords\tmatchedLemmas\t");
        for (String type : BIAS_TYPES)
            sb.append(type).append("_signals\t")
              .append(type).append("_evaluators\t")
              .append(type).append("_coverage\t");
        sb.append("totalBiasWords\ttotalDerogatory\ttotalColloquial\t")
          .append("multiType\ttotalCoverage");
        return sb.toString();
    }

    @Override
    public String toString() {
        return String.format("SentenceBiasScore{words=%d, coverage=%.3f, biased=%b, sentence='%s'}",
            totalWords, totalCoverage(), isBiased(),
            sentence.length() > 80 ? sentence.substring(0, 80) + "…" : sentence);
    }
}
|
resources/bulgarian_bias_dictionary_v4.tsv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
resources/metadata_schema.json
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
| 3 |
+
"$id": "https://dcl.bas.bg/ifgpt/metadata-schema/v1.0",
|
| 4 |
+
"title": "IfGPT Document Metadata Schema",
|
| 5 |
+
"description": "Metadata schema for textual documents in the IfGPT Bulgarian language dataset. 15 mandatory fields + 8 optional fields.",
|
| 6 |
+
"type": "object",
|
| 7 |
+
|
| 8 |
+
"required": [
|
| 9 |
+
"Identifier",
|
| 10 |
+
"Licence",
|
| 11 |
+
"PublicationDate",
|
| 12 |
+
"DocumentTitle",
|
| 13 |
+
"Source",
|
| 14 |
+
"Medium",
|
| 15 |
+
"Url",
|
| 16 |
+
"Domain",
|
| 17 |
+
"Keywords",
|
| 18 |
+
"NumberWords",
|
| 19 |
+
"NumberSentences",
|
| 20 |
+
"NumberParagraphs",
|
| 21 |
+
"NumberTokens",
|
| 22 |
+
"PersonallyIdentifiableInformation",
|
| 23 |
+
"BiasedInformation"
|
| 24 |
+
],
|
| 25 |
+
|
| 26 |
+
"properties": {
|
| 27 |
+
|
| 28 |
+
"Identifier": {
|
| 29 |
+
"type": "string",
|
| 30 |
+
"description": "Unique document identifier with the language prefix 'bg'.",
|
| 31 |
+
"pattern": "^bg_",
|
| 32 |
+
"examples": ["bg_bnc_12345", "bg_MARCELL_doc001", "bg_CURLICAT_xyz"]
|
| 33 |
+
},
|
| 34 |
+
|
| 35 |
+
"Licence": {
|
| 36 |
+
"type": "string",
|
| 37 |
+
"description": "Licence name with classification by type (open, restricted, etc.).",
|
| 38 |
+
"enum": [
|
| 39 |
+
"CC0",
|
| 40 |
+
"CC0-1.0",
|
| 41 |
+
"CC-BY-4.0",
|
| 42 |
+
"CC-BY-SA-4.0",
|
| 43 |
+
"CC-BY-NC-4.0",
|
| 44 |
+
"CC-BY-NC-SA-4.0",
|
| 45 |
+
"Restricted",
|
| 46 |
+
"Proprietary",
|
| 47 |
+
"Unknown"
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
|
| 51 |
+
"PublicationDate": {
|
| 52 |
+
"type": "string",
|
| 53 |
+
"description": "Date of publication of the text (yyyy-mm-dd), or an empty string.",
|
| 54 |
+
"pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
|
| 55 |
+
"examples": ["2023-04-15", "2019-01-01", ""]
|
| 56 |
+
},
|
| 57 |
+
|
| 58 |
+
"DocumentTitle": {
|
| 59 |
+
"type": "string",
|
| 60 |
+
"description": "Title of the document.",
|
| 61 |
+
"examples": ["Закон за защита на данните", "Статия за климатичните промени"]
|
| 62 |
+
},
|
| 63 |
+
|
| 64 |
+
"Source": {
|
| 65 |
+
"type": "string",
|
| 66 |
+
"description": "Publishing organisation, media outlet or institutional originator.",
|
| 67 |
+
"examples": ["Министерски съвет", "БНР", "Сега"]
|
| 68 |
+
},
|
| 69 |
+
|
| 70 |
+
"Medium": {
|
| 71 |
+
"type": "string",
|
| 72 |
+
"description": "Modality of the resource.",
|
| 73 |
+
"enum": ["textual", "multimodal"]
|
| 74 |
+
},
|
| 75 |
+
|
| 76 |
+
"Url": {
|
| 77 |
+
"type": "string",
|
| 78 |
+
"description": "Original web address of the document.",
|
| 79 |
+
"format": "uri",
|
| 80 |
+
"examples": ["https://www.lex.bg/laws/ldoc/123", ""]
|
| 81 |
+
},
|
| 82 |
+
|
| 83 |
+
"Domain": {
|
| 84 |
+
"type": "array",
|
| 85 |
+
"description": "Up to six subject areas from a controlled vocabulary.",
|
| 86 |
+
"maxItems": 6,
|
| 87 |
+
"items": {
|
| 88 |
+
"type": "string",
|
| 89 |
+
"enum": [
|
| 90 |
+
"Държавно управление",
|
| 91 |
+
"Право и законодателство",
|
| 92 |
+
"Икономика и финанси",
|
| 93 |
+
"Образование",
|
| 94 |
+
"Наука и технологии",
|
| 95 |
+
"Здравеопазване",
|
| 96 |
+
"Култура и изкуство",
|
| 97 |
+
"Спорт",
|
| 98 |
+
"Медии и журналистика",
|
| 99 |
+
"Общество и политика",
|
| 100 |
+
"Околна среда",
|
| 101 |
+
"Религия",
|
| 102 |
+
"История",
|
| 103 |
+
"Литература и художествена проза",
|
| 104 |
+
"Неформална комуникация",
|
| 105 |
+
"Друго"
|
| 106 |
+
]
|
| 107 |
+
},
|
| 108 |
+
"examples": [["Държавно управление"], ["Медии и журналистика", "Общество и политика"]]
|
| 109 |
+
},
|
| 110 |
+
|
| 111 |
+
"Keywords": {
|
| 112 |
+
"type": "array",
|
| 113 |
+
"description": "Up to six free-text keywords characterising the content.",
|
| 114 |
+
"maxItems": 6,
|
| 115 |
+
"items": { "type": "string" },
|
| 116 |
+
"examples": [["климат", "законодателство", "ЕС"]]
|
| 117 |
+
},
|
| 118 |
+
|
| 119 |
+
"NumberWords": {
|
| 120 |
+
"type": "integer",
|
| 121 |
+
"description": "Total number of words (non-punctuation tokens).",
|
| 122 |
+
"minimum": 0
|
| 123 |
+
},
|
| 124 |
+
|
| 125 |
+
"NumberSentences": {
|
| 126 |
+
"type": "integer",
|
| 127 |
+
"description": "Total number of sentences.",
|
| 128 |
+
"minimum": 0
|
| 129 |
+
},
|
| 130 |
+
|
| 131 |
+
"NumberParagraphs": {
|
| 132 |
+
"type": "integer",
|
| 133 |
+
"description": "Total number of paragraphs.",
|
| 134 |
+
"minimum": 0
|
| 135 |
+
},
|
| 136 |
+
|
| 137 |
+
"NumberTokens": {
|
| 138 |
+
"type": "integer",
|
| 139 |
+
"description": "Total number of tokens (words + punctuation).",
|
| 140 |
+
"minimum": 0
|
| 141 |
+
},
|
| 142 |
+
|
| 143 |
+
"PersonallyIdentifiableInformation": {
|
| 144 |
+
"type": "array",
|
| 145 |
+
"description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as personally identifiable information, in [0,1]. Length equals NumberSentences.",
|
| 146 |
+
"items": {
|
| 147 |
+
"type": "number",
|
| 148 |
+
"minimum": 0.0,
|
| 149 |
+
"maximum": 1.0
|
| 150 |
+
},
|
| 151 |
+
"examples": [[0.0, 0.0, 0.15, 0.0, 0.05]]
|
| 152 |
+
},
|
| 153 |
+
|
| 154 |
+
"BiasedInformation": {
|
| 155 |
+
"type": "array",
|
| 156 |
+
"description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as potentially biased (signal-evaluator pair coverage), in [0,1]. Length equals NumberSentences.",
|
| 157 |
+
"items": {
|
| 158 |
+
"type": "number",
|
| 159 |
+
"minimum": 0.0,
|
| 160 |
+
"maximum": 1.0
|
| 161 |
+
},
|
| 162 |
+
"examples": [[0.0, 0.0, 0.0, 0.10, 0.0]]
|
| 163 |
+
},
|
| 164 |
+
|
| 165 |
+
"Author": {
|
| 166 |
+
"type": "array",
|
| 167 |
+
"description": "[Optional] Name(s) of the author(s).",
|
| 168 |
+
"items": { "type": "string" },
|
| 169 |
+
"examples": [["Иван Иванов"], ["Агенция БТА"]]
|
| 170 |
+
},
|
| 171 |
+
|
| 172 |
+
"Style": {
|
| 173 |
+
"type": "string",
|
| 174 |
+
"description": "[Optional] Stylistic register of the document.",
|
| 175 |
+
"enum": [
|
| 176 |
+
"Административен",
|
| 177 |
+
"Журналистически",
|
| 178 |
+
"Научен",
|
| 179 |
+
"Художествен",
|
| 180 |
+
"Разговорен",
|
| 181 |
+
"Правен",
|
| 182 |
+
"Технически",
|
| 183 |
+
"Неформален",
|
| 184 |
+
""
|
| 185 |
+
]
|
| 186 |
+
},
|
| 187 |
+
|
| 188 |
+
"Type": {
|
| 189 |
+
"type": "string",
|
| 190 |
+
"description": "[Optional] Document genre.",
|
| 191 |
+
"enum": [
|
| 192 |
+
"Закон",
|
| 193 |
+
"Наредба",
|
| 194 |
+
"Решение",
|
| 195 |
+
"Статия",
|
| 196 |
+
"Книга",
|
| 197 |
+
"Доклад",
|
| 198 |
+
"Интервю",
|
| 199 |
+
"Коментар",
|
| 200 |
+
"Форум",
|
| 201 |
+
"Блог",
|
| 202 |
+
"Уикипедия",
|
| 203 |
+
"Друго",
|
| 204 |
+
""
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
|
| 208 |
+
"Subdomain": {
|
| 209 |
+
"type": "array",
|
| 210 |
+
"description": "[Optional] Narrower thematic classification, hierarchically linked to Domain.",
|
| 211 |
+
"maxItems": 6,
|
| 212 |
+
"items": { "type": "string" },
|
| 213 |
+
"examples": [["Европейско законодателство"], ["Климатична политика"]]
|
| 214 |
+
},
|
| 215 |
+
|
| 216 |
+
"TranslatedDocument": {
|
| 217 |
+
"type": ["boolean", "string"],
|
| 218 |
+
"description": "[Optional] true = translation into Bulgarian; false = original Bulgarian text. A string value (e.g. the empty string) is also accepted.",
|
| 219 |
+
"examples": [false, true, ""]
|
| 220 |
+
},
|
| 221 |
+
|
| 222 |
+
"CollectionDate": {
|
| 223 |
+
"type": "string",
|
| 224 |
+
"description": "[Optional] Date of acquisition into the collection (yyyy-mm-dd), or an empty string.",
|
| 225 |
+
"pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
|
| 226 |
+
"examples": ["2024-03-10", ""]
|
| 227 |
+
},
|
| 228 |
+
|
| 229 |
+
"LicenceLink": {
|
| 230 |
+
"type": "string",
|
| 231 |
+
"description": "[Optional] URL of the licence text.",
|
| 232 |
+
"format": "uri",
|
| 233 |
+
"examples": [
|
| 234 |
+
"https://creativecommons.org/public-domain/cc0/",
|
| 235 |
+
"https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf",
|
| 236 |
+
""
|
| 237 |
+
]
|
| 238 |
+
},
|
| 239 |
+
|
| 240 |
+
"TaskCategories": {
|
| 241 |
+
"type": "array",
|
| 242 |
+
"description": "[Optional] Anticipated NLP applications from a predefined list.",
|
| 243 |
+
"items": {
|
| 244 |
+
"type": "string",
|
| 245 |
+
"enum": [
|
| 246 |
+
"Language Modelling",
|
| 247 |
+
"Text Classification",
|
| 248 |
+
"Named Entity Recognition",
|
| 249 |
+
"Machine Translation",
|
| 250 |
+
"Summarisation",
|
| 251 |
+
"Question Answering",
|
| 252 |
+
"Sentiment Analysis",
|
| 253 |
+
"Bias Detection",
|
| 254 |
+
"PII Detection",
|
| 255 |
+
"Information Extraction",
|
| 256 |
+
"Coreference Resolution",
|
| 257 |
+
"Dependency Parsing",
|
| 258 |
+
"Other"
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
"examples": [["Language Modelling", "Named Entity Recognition"]]
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
},
|
| 265 |
+
|
| 266 |
+
"additionalProperties": false
|
| 267 |
+
}
|