package bg.bas.dcl.LLMs;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
/**
* BulgarianSentenceSplitter
*
* Wraps the Apache OpenNLP sentence detection model for Bulgarian, providing
* a clean, reusable API for all other pipeline components.
*
* -----------------------------------------------------------------------
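* MAVEN DEPENDENCIES (add to pom.xml):
*
* <pre>{@code
* <dependency>
*   <groupId>org.apache.opennlp</groupId>
*   <artifactId>opennlp-tools</artifactId>
*   <version>2.4.0</version>
* </dependency>
*
* <dependency>
*   <groupId>org.apache.opennlp</groupId>
*   <artifactId>opennlp-models-sentdetect-bg</artifactId>
*   <version>1.2</version>
* </dependency>
* }</pre>
*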
* The model JAR is expected to bundle the binary model at:
* opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin
* To use an external model file instead, pass its path to the single-argument
* constructor {@code BulgarianSentenceSplitter(String modelPath)}.
*
* -------------------------------------------------
*/
public class BulgarianSentenceSplitter {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /**
     * Classpath location from which the bundled Bulgarian sentence-detection
     * model is loaded when no external model path is supplied.
     *
     * NOTE(review): "ewt" is the name of an English UD treebank; confirm that
     * this resource path matches the actual contents of the
     * opennlp-models-sentdetect-bg JAR.
     */
    private static final String BUNDLED_MODEL_PATH =
            "opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin";

    /**
     * Minimum character length (after trimming) for a text to be passed to the
     * detector. Shorter non-empty texts are returned as a single sentence
     * without splitting.
     */
    private static final int MIN_TEXT_LENGTH = 5;

    // -----------------------------------------------------------------------
    // State
    // -----------------------------------------------------------------------

    /**
     * The underlying OpenNLP detector.
     *
     * NOTE(review): OpenNLP documents SentenceDetectorME as not thread-safe;
     * confirm before sharing one splitter instance across threads.
     */
    private final SentenceDetectorME detector;

    // -----------------------------------------------------------------------
    // Constructors
    // -----------------------------------------------------------------------

    /**
     * Loads the Bulgarian sentence-detection model from the classpath
     * (see {@code BUNDLED_MODEL_PATH}).
     * Requires the opennlp-models-sentdetect-bg artifact on the classpath.
     *
     * @throws RuntimeException if the model cannot be loaded
     */
    public BulgarianSentenceSplitter() {
        this(null);
    }

    /**
     * Loads the Bulgarian sentence-detection model.
     *
     * @param modelPath path to a .bin OpenNLP sentence-detection model,
     *                  or {@code null} / blank string to load the bundled
     *                  model from the classpath
     * @throws RuntimeException if the model cannot be found or loaded; the
     *                          underlying failure is preserved as the cause
     */
    public BulgarianSentenceSplitter(String modelPath) {
        final boolean useBundled = modelPath == null || modelPath.isBlank();
        try {
            InputStream raw;
            if (useBundled) {
                // Load from the bundled JAR on the classpath
                raw = getClass().getClassLoader().getResourceAsStream(BUNDLED_MODEL_PATH);
                if (raw == null) {
                    throw new IllegalStateException(
                            "Bulgarian sentence model not found on classpath: "
                                    + BUNDLED_MODEL_PATH);
                }
            } else {
                File f = new File(modelPath);
                if (!f.exists()) {
                    throw new IllegalArgumentException(
                            "Sentence model file not found: " + modelPath);
                }
                raw = new FileInputStream(f);
            }
            // try-with-resources closes the stream even when model parsing fails,
            // which the previous explicit close() on the success path did not.
            try (InputStream stream = raw) {
                detector = new SentenceDetectorME(new SentenceModel(stream));
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to load Bulgarian sentence model", e);
        }
        // Report success only after the model has actually been loaded.
        if (useBundled) {
            System.out.println("[SentenceSplitter] Loaded bundled model: " + BUNDLED_MODEL_PATH);
        } else {
            System.out.println("[SentenceSplitter] Loaded external model: " + modelPath);
        }
    }

    // -----------------------------------------------------------------------
    // Core API
    // -----------------------------------------------------------------------

    /**
     * Splits a text into sentences.
     *
     * The text is trimmed first. A {@code null} or empty (after trimming) text
     * yields an empty array; a non-empty text shorter than
     * {@code MIN_TEXT_LENGTH} characters is returned as a single element
     * without consulting the detector.
     *
     * @param text the text to split; may be {@code null}
     * @return the detected sentences, never {@code null}
     */
    public String[] split(String text) {
        if (text == null) {
            return new String[0];
        }
        String trimmed = text.trim();
        if (trimmed.length() < MIN_TEXT_LENGTH) {
            return trimmed.isEmpty() ? new String[0] : new String[] {trimmed};
        }
        return detector.sentDetect(trimmed);
    }

    /**
     * Same as {@link #split(String)}, but returns a new mutable list.
     *
     * @param text the text to split; may be {@code null}
     * @return a new mutable list of sentences, never {@code null}
     */
    public List<String> splitToList(String text) {
        return new ArrayList<>(Arrays.asList(split(text)));
    }

    /**
     * Splits each paragraph into sentences and concatenates the results in
     * paragraph order. {@code null} and blank paragraphs are skipped.
     *
     * @param paragraphs the paragraphs to split; may be {@code null}
     * @return a new mutable list of all sentences; empty if
     *         {@code paragraphs} is {@code null}
     */
    public List<String> splitParagraphs(String[] paragraphs) {
        List<String> all = new ArrayList<>();
        if (paragraphs == null) {
            return all;
        }
        for (String para : paragraphs) {
            if (para != null && !para.isBlank()) {
                all.addAll(splitToList(para));
            }
        }
        return all;
    }

    /**
     * Returns the sentence probabilities reported by the underlying detector.
     *
     * Note that {@link #split(String)} does not invoke the detector for
     * {@code null} input or for text shorter than {@code MIN_TEXT_LENGTH}, so
     * after such a call the values returned here are whatever the detector
     * reports from its earlier state, not values for that input.
     *
     * @return the array returned by the detector's
     *         {@code getSentenceProbabilities()}
     */
    public double[] getSentenceProbabilities() {
        return detector.getSentenceProbabilities();
    }

    /**
     * Splits a text into sentences and keeps only those with at least
     * {@code minWords} whitespace-separated tokens.
     *
     * @param text     the text to split; may be {@code null}
     * @param minWords minimum number of whitespace-separated tokens a sentence
     *                 must contain to be kept
     * @return a new mutable list of the sentences that pass the filter
     */
    public List<String> splitAndFilter(String text, int minWords) {
        List<String> result = new ArrayList<>();
        for (String sent : split(text)) {
            if (sent.split("\\s+").length >= minWords) {
                result.add(sent);
            }
        }
        return result;
    }
}