package bg.bas.dcl.LLMs;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
/**
* BulgarianSentenceSplitter
*
* Wraps the Apache OpenNLP sentence detection model for Bulgarian, providing
* a clean, reusable API for all other pipeline components.
*
* -----------------------------------------------------------------------
* MAVEN DEPENDENCIES (add to pom.xml):
*
* <!-- OpenNLP toolkit -->
* <dependency>
* <groupId>org.apache.opennlp</groupId>
* <artifactId>opennlp-tools</artifactId>
* <version>2.4.0</version>
* </dependency>
*
* <!-- Bulgarian sentence-detection model (UD-based, Apache 2.0) -->
* <dependency>
* <groupId>org.apache.opennlp</groupId>
* <artifactId>opennlp-models-sentdetect-bg</artifactId>
* <version>1.2</version>
* </dependency>
*
* The model JAR bundles the binary model at:
* opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin
* You can also supply an external model file via the single-argument
* {@code BulgarianSentenceSplitter(String modelPath)} constructor.
*
* -------------------------------------------------
*/
public class BulgarianSentenceSplitter {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /**
     * Classpath resource name used to locate the bundled sentence-detection
     * model when no external model path is supplied.
     *
     * NOTE(review): "ewt" is normally the identifier of an English UD treebank;
     * confirm this resource name against the actual contents of the model JAR.
     */
    private static final String BUNDLED_MODEL_PATH =
            "opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin";

    /**
     * Inputs whose trimmed length is below this many characters are never
     * passed to the detector; {@link #split(String)} returns them as a single
     * element (or as an empty array when nothing is left after trimming).
     */
    private static final int MIN_TEXT_LENGTH = 5;

    /** Pre-compiled word separator used by {@link #splitAndFilter(String, int)}. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    // -----------------------------------------------------------------------
    // State
    // -----------------------------------------------------------------------

    /**
     * The wrapped OpenNLP detector. It keeps per-call state (see
     * {@link #getSentenceProbabilities()}).
     *
     * NOTE(review): OpenNLP documents SentenceDetectorME as not thread-safe;
     * confirm before sharing one splitter instance across threads.
     */
    private final SentenceDetectorME detector;

    // -----------------------------------------------------------------------
    // Constructors
    // -----------------------------------------------------------------------

    /**
     * Creates a splitter backed by the model found on the classpath at
     * {@code BUNDLED_MODEL_PATH}.
     *
     * @throws RuntimeException if the model cannot be found or loaded
     */
    public BulgarianSentenceSplitter() {
        this(null);
    }

    /**
     * Creates a splitter backed either by an external model file or by the
     * bundled classpath model.
     *
     * @param modelPath path to a .bin OpenNLP sentence-detection model, or
     *                  {@code null} / blank to load the bundled classpath model
     * @throws RuntimeException if the model cannot be found or loaded; the
     *                          underlying failure is available as the cause
     */
    public BulgarianSentenceSplitter(String modelPath) {
        final boolean useBundled = modelPath == null || modelPath.isBlank();

        // try-with-resources guarantees the stream is closed even when the
        // model fails to parse (the previous code leaked it in that case).
        try (InputStream stream = useBundled ? openBundledModel() : openExternalModel(modelPath)) {
            SentenceModel model = new SentenceModel(stream);
            detector = new SentenceDetectorME(model);
        } catch (Exception e) {
            // Every failure (missing resource, I/O, invalid model) is surfaced
            // uniformly, with the original exception preserved as the cause.
            throw new RuntimeException("Failed to load Bulgarian sentence model", e);
        }

        // Report success only once the model has actually been loaded.
        if (useBundled) {
            System.out.println("[SentenceSplitter] Loaded bundled model: " + BUNDLED_MODEL_PATH);
        } else {
            System.out.println("[SentenceSplitter] Loaded external model: " + modelPath);
        }
    }

    /** Opens the bundled model from the classpath, failing if the resource is absent. */
    private InputStream openBundledModel() {
        InputStream stream = getClass().getClassLoader().getResourceAsStream(BUNDLED_MODEL_PATH);
        if (stream == null) {
            throw new IllegalStateException(
                    "Bulgarian sentence model not found on classpath: " + BUNDLED_MODEL_PATH);
        }
        return stream;
    }

    /** Opens an external model file, failing if the path does not exist. */
    private InputStream openExternalModel(String modelPath) throws java.io.IOException {
        File f = new File(modelPath);
        if (!f.exists()) {
            throw new IllegalArgumentException(
                    "Sentence model file not found: " + modelPath);
        }
        return new FileInputStream(f);
    }

    // -----------------------------------------------------------------------
    // Core API
    // -----------------------------------------------------------------------

    /**
     * Splits text into sentences.
     *
     * @param text the text to split; may be {@code null}
     * @return an empty array for {@code null} or whitespace-only input; a
     *         single-element array holding the trimmed text when it is shorter
     *         than {@code MIN_TEXT_LENGTH} characters; otherwise whatever the
     *         detector returns for the trimmed text
     */
    public String[] split(String text) {
        if (text == null) {
            return new String[0];
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return new String[0];
        }
        if (trimmed.length() < MIN_TEXT_LENGTH) {
            // Too short to be worth running the model on; return as-is.
            return new String[] {trimmed};
        }
        return detector.sentDetect(trimmed);
    }

    /**
     * Same as {@link #split(String)} but returns a new, mutable list.
     *
     * @param text the text to split; may be {@code null}
     * @return a fresh {@link ArrayList} of sentences, possibly empty
     */
    public List<String> splitToList(String text) {
        return new ArrayList<>(Arrays.asList(split(text)));
    }

    /**
     * Splits each paragraph independently and concatenates the sentences in
     * paragraph order. {@code null} and blank paragraphs are skipped.
     *
     * @param paragraphs the paragraphs to split; may be {@code null}
     * @return a new mutable list of all sentences; empty when
     *         {@code paragraphs} is {@code null}
     */
    public List<String> splitParagraphs(String[] paragraphs) {
        List<String> all = new ArrayList<>();
        if (paragraphs == null) {
            return all;
        }
        for (String para : paragraphs) {
            if (para != null && !para.isBlank()) {
                all.addAll(splitToList(para));
            }
        }
        return all;
    }

    /**
     * Returns the probabilities reported by the underlying detector.
     *
     * Note that {@link #split(String)} does not invoke the detector for
     * {@code null}, empty, or very short input, so after such a call the
     * values returned here are whatever the detector held beforehand.
     *
     * @return the array returned by the detector's
     *         {@code getSentenceProbabilities()}
     */
    public double[] getSentenceProbabilities() {
        return detector.getSentenceProbabilities();
    }

    /**
     * Splits text into sentences and keeps only those that contain at least
     * {@code minWords} whitespace-separated tokens.
     *
     * @param text     the text to split; may be {@code null}
     * @param minWords minimum number of whitespace-separated tokens a sentence
     *                 must have to be kept; values of 1 or less keep everything
     * @return a new mutable list of the retained sentences, in original order
     */
    public List<String> splitAndFilter(String text, int minWords) {
        List<String> result = new ArrayList<>();
        for (String sent : split(text)) {
            if (WHITESPACE.split(sent).length >= minWords) {
                result.add(sent);
            }
        }
        return result;
    }
}
|