File size: 9,626 Bytes
package bg.bas.dcl.LLMs;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
 * BiasLexicon
 *
 * <p>Loads the Bulgarian bias dictionary (bulgarian_bias_dictionary_v4.tsv) and
 * provides hash-map based, form-level lookup for use by the bias detector.
 *
 * <p>TSV format (tab-separated, first row is a header and is skipped):
 *
 * <pre>
 * Col 0   word            canonical lemma
 * Col 1   POS             N | A | V | …
 * Col 2   signal          true | false
 * Col 3   biasType        gender | race_ethnicity | religion | disability | appearance | ""
 * Col 4   biasValue       positive | negative | neutral | ""
 * Col 5   derogatory      true | false
 * Col 6   colloquial      true | false
 * Col 7   forms           boolean flag (ignored; inflected forms are in col 10)
 * Col 8   positivity      double [0,1]
 * Col 9   negativity      double [0,1]
 * Col 10  inflectedForms  pipe-separated surface forms, or empty (column is optional)
 * </pre>
 *
 * <p>All index keys are normalised by lowercasing with {@link Locale#ROOT} and
 * trimming, both when the dictionary is loaded and when it is queried, so lookups
 * do not depend on the JVM's default locale.
 */
public class BiasLexicon {

    /** Minimum number of tab-separated columns a data row must have (cols 0–9). */
    private static final int MIN_COLUMNS = 10;

    /** Index of the optional column holding the pipe-separated inflected forms. */
    private static final int FORMS_COLUMN = 10;

    // -----------------------------------------------------------------------
    // Indexes
    // -----------------------------------------------------------------------

    /**
     * Primary form index: normalised surface form → BiasEntry.
     * A single form maps to exactly one entry; when several rows share a form,
     * the first row loaded wins and the clash is counted in {@link #formConflicts}.
     */
    private final Map<String, BiasEntry> formIndex = new HashMap<>();

    /**
     * Canonical word index: normalised lemma → BiasEntry.
     * Follows the same first-row-wins rule as {@link #formIndex}.
     */
    private final Map<String, BiasEntry> wordIndex = new HashMap<>();

    /** All entries in load order. */
    private final List<BiasEntry> entries = new ArrayList<>();

    // -----------------------------------------------------------------------
    // Loading statistics
    // -----------------------------------------------------------------------

    private int loadedEntries = 0;
    private int skippedLines = 0;
    private int formConflicts = 0;

    // -----------------------------------------------------------------------
    // Constructor
    // -----------------------------------------------------------------------

    /**
     * Loads the bias dictionary from a TSV file and prints a one-line load
     * summary to standard output.
     *
     * @param tsvPath path to the TSV file (read as UTF-8)
     * @throws RuntimeException if the file cannot be opened or read; the
     *                          underlying exception is attached as the cause
     */
    public BiasLexicon(String tsvPath) {
        load(tsvPath);
        System.out.printf("[BiasLexicon] Loaded %d entries, %d form keys, "
                        + "%d skipped lines, %d form conflicts.%n",
                loadedEntries, formIndex.size(), skippedLines, formConflicts);
    }

    // -----------------------------------------------------------------------
    // Lookup API
    // -----------------------------------------------------------------------

    /**
     * Looks up a surface token (case-insensitive, surrounding whitespace ignored).
     *
     * @param token any surface form (inflected or base)
     * @return the matching {@link BiasEntry}, or {@code null} if the token is
     *         {@code null}, blank, or not in the lexicon
     */
    public BiasEntry lookup(String token) {
        if (token == null || token.isBlank()) {
            return null;
        }
        return formIndex.get(normalize(token));
    }

    /**
     * Returns true if the token (any form) is present in the lexicon.
     *
     * @param token surface form to check
     */
    public boolean contains(String token) {
        return lookup(token) != null;
    }

    /**
     * Looks up a canonical lemma directly (case-insensitive, surrounding
     * whitespace ignored).
     *
     * @param lemma the base/dictionary form
     * @return the matching {@link BiasEntry}, or {@code null} if the lemma is
     *         {@code null}, blank, or not in the lexicon
     */
    public BiasEntry lookupLemma(String lemma) {
        if (lemma == null || lemma.isBlank()) {
            return null;
        }
        return wordIndex.get(normalize(lemma));
    }

    // -----------------------------------------------------------------------
    // Filtered views
    // -----------------------------------------------------------------------

    /**
     * Returns all entries whose {@code biasType} matches the given category
     * (case-insensitive), plus all general entries (empty biasType).
     * A {@code null} argument is treated as the empty category, so only the
     * general entries are returned.
     *
     * @param biasType e.g. "gender", "disability"
     * @return a new list, in load order
     */
    public List<BiasEntry> getByType(String biasType) {
        // equalsIgnoreCase already ignores case, so the target only needs trimming.
        String target = biasType == null ? "" : biasType.trim();
        List<BiasEntry> result = new ArrayList<>();
        for (BiasEntry e : entries) {
            String type = e.getBiasType();
            if (type.isEmpty() || type.equalsIgnoreCase(target)) {
                result.add(e);
            }
        }
        return result;
    }

    /**
     * Returns all entries that are marked as signals (signal=true) for the
     * given bias category, or all signal entries if biasType is null/blank.
     * Signal entries with an empty biasType are always included.
     *
     * @param biasType category to filter by (compared case-insensitively), or null/blank for all
     * @return a new list, in load order
     */
    public List<BiasEntry> getSignals(String biasType) {
        boolean anyType = biasType == null || biasType.isBlank();
        List<BiasEntry> result = new ArrayList<>();
        for (BiasEntry e : entries) {
            if (!e.isSignal()) {
                continue;
            }
            if (anyType
                    || e.getBiasType().isEmpty()
                    || e.getBiasType().equalsIgnoreCase(biasType)) {
                result.add(e);
            }
        }
        return result;
    }

    /** Returns an unmodifiable view of all loaded entries. */
    public Collection<BiasEntry> getAll() {
        return Collections.unmodifiableList(entries);
    }

    /** Number of loaded dictionary entries. */
    public int size() {
        return entries.size();
    }

    // -----------------------------------------------------------------------
    // Internal loading
    // -----------------------------------------------------------------------

    /**
     * Reads the TSV file row by row and fills the indexes and statistics.
     * Blank rows, rows with fewer than {@link #MIN_COLUMNS} columns, rows with an
     * empty lemma and rows that fail to parse are counted in {@code skippedLines};
     * all but blank rows are also reported on standard error.
     *
     * @param tsvPath path to the TSV file
     * @throws RuntimeException if the file cannot be opened or read
     */
    private void load(String tsvPath) {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(tsvPath),
                        StandardCharsets.UTF_8))) {
            String headerLine = br.readLine(); // skip header
            if (headerLine == null) {
                System.err.println("[BiasLexicon] Empty file: " + tsvPath);
                return;
            }
            String line;
            int lineNum = 1; // already read header as line 1
            while ((line = br.readLine()) != null) {
                lineNum++;
                if (line.isBlank()) {
                    skippedLines++;
                    continue;
                }
                // Limit -1 keeps trailing empty columns so column positions stay stable.
                String[] cols = line.split("\t", -1);
                if (cols.length < MIN_COLUMNS) {
                    System.err.printf("[BiasLexicon] Line %d: only %d columns, skipping.%n",
                            lineNum, cols.length);
                    skippedLines++;
                    continue;
                }
                try {
                    String word = cols[0].trim();
                    if (word.isEmpty()) {
                        // A row without a lemma would be indexed under the empty
                        // string, which no lookup can ever reach.
                        System.err.printf("[BiasLexicon] Line %d: empty lemma, skipping.%n",
                                lineNum);
                        skippedLines++;
                        continue;
                    }
                    String pos = cols[1].trim();
                    boolean signal = "true".equalsIgnoreCase(cols[2].trim());
                    String biasType = cols[3].trim();
                    String biasValue = cols[4].trim();
                    boolean derog = "true".equalsIgnoreCase(cols[5].trim());
                    boolean coll = "true".equalsIgnoreCase(cols[6].trim());
                    // cols[7] is a boolean forms-flag (ignored)
                    double positivity = parseDouble(cols[8], lineNum);
                    double negativity = parseDouble(cols[9], lineNum);

                    String lemmaKey = normalize(word);
                    Set<String> formsSet = parseForms(cols, lemmaKey);

                    BiasEntry entry = new BiasEntry(word, pos, signal,
                            biasType, biasValue, derog, coll,
                            positivity, negativity, formsSet);
                    entries.add(entry);

                    // First row wins, consistent with the form index below.
                    wordIndex.putIfAbsent(lemmaKey, entry);
                    for (String form : formsSet) {
                        if (formIndex.putIfAbsent(form, entry) != null) {
                            formConflicts++; // keep first entry — do not overwrite
                        }
                    }
                    loadedEntries++;
                } catch (Exception e) {
                    System.err.printf("[BiasLexicon] Line %d: parse error — %s%n",
                            lineNum, e.getMessage());
                    skippedLines++;
                }
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to load bias lexicon from: " + tsvPath, e);
        }
    }

    /**
     * Collects the normalised surface forms of one row: the lemma itself plus
     * every non-empty, pipe-separated form from the optional forms column.
     *
     * @param cols     the row's columns
     * @param lemmaKey the already-normalised lemma (always included in the result)
     * @return a new mutable set of normalised forms
     */
    private static Set<String> parseForms(String[] cols, String lemmaKey) {
        Set<String> forms = new HashSet<>();
        forms.add(lemmaKey);
        if (cols.length > FORMS_COLUMN && !cols[FORMS_COLUMN].isBlank()) {
            for (String raw : cols[FORMS_COLUMN].split("\\|")) {
                String form = normalize(raw);
                if (!form.isEmpty()) {
                    forms.add(form);
                }
            }
        }
        return forms;
    }

    /**
     * Produces the key form used by both indexes: lowercased with
     * {@link Locale#ROOT} (so the result does not vary with the JVM default
     * locale) and trimmed.
     */
    private static String normalize(String s) {
        return s.toLowerCase(Locale.ROOT).trim();
    }

    /**
     * Parses a double, falling back to 0.0 (with a warning on standard error)
     * when the text is not a valid number.
     *
     * @param s       raw column text
     * @param lineNum 1-based line number, used only in the warning
     */
    private double parseDouble(String s, int lineNum) {
        try {
            return Double.parseDouble(s.trim());
        } catch (NumberFormatException e) {
            System.err.printf("[BiasLexicon] Line %d: cannot parse double '%s', using 0.0%n",
                    lineNum, s);
            return 0.0;
        }
    }
}
|