| package bg.bas.dcl.LLMs; |
|
|
| import java.io.BufferedReader; |
| import java.io.FileInputStream; |
| import java.io.InputStreamReader; |
| import java.nio.charset.StandardCharsets; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public class BiasLexicon { |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| private final Map<String, BiasEntry> formIndex = new HashMap<>(); |
|
|
| |
| |
| |
| |
| private final Map<String, BiasEntry> wordIndex = new HashMap<>(); |
|
|
| |
| private final List<BiasEntry> entries = new ArrayList<>(); |
|
|
| |
| |
| |
|
|
| private int loadedEntries = 0; |
| private int skippedLines = 0; |
| private int formConflicts = 0; |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| public BiasLexicon(String tsvPath) { |
| load(tsvPath); |
| System.out.printf("[BiasLexicon] Loaded %d entries, %d form keys, " |
| + "%d skipped lines, %d form conflicts.%n", |
| loadedEntries, formIndex.size(), skippedLines, formConflicts); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| public BiasEntry lookup(String token) { |
| if (token == null || token.isBlank()) return null; |
| return formIndex.get(token.toLowerCase().trim()); |
| } |
|
|
| |
| |
| |
| |
| |
| public boolean contains(String token) { |
| return lookup(token) != null; |
| } |
|
|
| |
| |
| |
| |
| |
| public BiasEntry lookupLemma(String lemma) { |
| if (lemma == null || lemma.isBlank()) return null; |
| return wordIndex.get(lemma.toLowerCase().trim()); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| public List<BiasEntry> getByType(String biasType) { |
| List<BiasEntry> result = new ArrayList<>(); |
| String target = biasType == null ? "" : biasType.toLowerCase().trim(); |
| for (BiasEntry e : entries) |
| if (e.getBiasType().equalsIgnoreCase(target) || e.getBiasType().isEmpty()) |
| result.add(e); |
| return result; |
| } |
|
|
| |
| |
| |
| |
| public List<BiasEntry> getSignals(String biasType) { |
| List<BiasEntry> result = new ArrayList<>(); |
| for (BiasEntry e : entries) { |
| if (!e.isSignal()) continue; |
| if (biasType == null || biasType.isBlank() |
| || e.getBiasType().isEmpty() |
| || e.getBiasType().equalsIgnoreCase(biasType)) |
| result.add(e); |
| } |
| return result; |
| } |
|
|
| |
| public Collection<BiasEntry> getAll() { |
| return Collections.unmodifiableList(entries); |
| } |
|
|
| |
| public int size() { return entries.size(); } |
|
|
| |
| |
| |
|
|
| private void load(String tsvPath) { |
| try (BufferedReader br = new BufferedReader( |
| new InputStreamReader(new FileInputStream(tsvPath), |
| StandardCharsets.UTF_8))) { |
|
|
| String headerLine = br.readLine(); |
| if (headerLine == null) { |
| System.err.println("[BiasLexicon] Empty file: " + tsvPath); |
| return; |
| } |
|
|
| String line; |
| int lineNum = 1; |
|
|
| while ((line = br.readLine()) != null) { |
| lineNum++; |
| if (line.isBlank()) { skippedLines++; continue; } |
|
|
| String[] cols = line.split("\t", -1); |
|
|
| |
| if (cols.length < 10) { |
| System.err.printf("[BiasLexicon] Line %d: only %d columns, skipping.%n", |
| lineNum, cols.length); |
| skippedLines++; |
| continue; |
| } |
|
|
| try { |
| String word = cols[0].trim(); |
| String pos = cols[1].trim(); |
| boolean signal = "true".equalsIgnoreCase(cols[2].trim()); |
| String biasType = cols[3].trim(); |
| String biasValue = cols[4].trim(); |
| boolean derog = "true".equalsIgnoreCase(cols[5].trim()); |
| boolean coll = "true".equalsIgnoreCase(cols[6].trim()); |
| |
| double positivity = parseDouble(cols[8], lineNum); |
| double negativity = parseDouble(cols[9], lineNum); |
|
|
| |
| Set<String> formsSet = new HashSet<>(); |
| formsSet.add(word.toLowerCase()); |
|
|
| if (cols.length > 10 && !cols[10].isBlank()) { |
| for (String f : cols[10].split("\\|")) { |
| String fc = f.trim().toLowerCase(); |
| if (!fc.isEmpty()) formsSet.add(fc); |
| } |
| } |
|
|
| BiasEntry entry = new BiasEntry(word, pos, signal, |
| biasType, biasValue, derog, coll, |
| positivity, negativity, formsSet); |
|
|
| entries.add(entry); |
| wordIndex.put(word.toLowerCase(), entry); |
|
|
| for (String form : formsSet) { |
| if (formIndex.containsKey(form)) { |
| formConflicts++; |
| |
| } else { |
| formIndex.put(form, entry); |
| } |
| } |
|
|
| loadedEntries++; |
|
|
| } catch (Exception e) { |
| System.err.printf("[BiasLexicon] Line %d: parse error — %s%n", |
| lineNum, e.getMessage()); |
| skippedLines++; |
| } |
| } |
|
|
| } catch (Exception e) { |
| throw new RuntimeException("Failed to load bias lexicon from: " + tsvPath, e); |
| } |
| } |
|
|
| private double parseDouble(String s, int lineNum) { |
| try { |
| return Double.parseDouble(s.trim()); |
| } catch (NumberFormatException e) { |
| System.err.printf("[BiasLexicon] Line %d: cannot parse double '%s', using 0.0%n", |
| lineNum, s); |
| return 0.0; |
| } |
| } |
| } |
|
|