File size: 5,657 Bytes
18573e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
package bg.bas.dcl.LLMs;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

/**
 * BulgarianSentenceSplitter
 *
 * Wraps the Apache OpenNLP sentence detection model for Bulgarian, providing
 * a clean, reusable API for all other pipeline components.
 *
 * -----------------------------------------------------------------------
 * MAVEN DEPENDENCIES (add to pom.xml):
 *
 *   <!-- OpenNLP toolkit -->
 *   <dependency>
 *     <groupId>org.apache.opennlp</groupId>
 *     <artifactId>opennlp-tools</artifactId>
 *     <version>2.4.0</version>
 *   </dependency>
 *
 *   <!-- Bulgarian sentence-detection model (UD-based, Apache 2.0) -->
 *   <dependency>
 *     <groupId>org.apache.opennlp</groupId>
 *     <artifactId>opennlp-models-sentdetect-bg</artifactId>
 *     <version>1.2</version>
 *   </dependency>
 *
 * The model JAR bundles the binary model at:
 *   opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin
 * You can also supply an external model file via the single-argument (String modelPath) constructor.
 *
 * ------------------------------------------------- 
 */
public class BulgarianSentenceSplitter {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /**
     * Classpath location from which the bundled Bulgarian sentence-detection
     * model is loaded when no external model path is given.
     *
     * NOTE(review): "ewt" is normally the name of an English treebank; verify
     * this resource path against the actual contents of the
     * opennlp-models-sentdetect-bg JAR.
     */
    private static final String BUNDLED_MODEL_PATH =
            "opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin";

    /**
     * Minimum character length (after trimming) for a text to be passed to the
     * sentence detector. Shorter texts are returned as a single sentence
     * without splitting.
     */
    private static final int MIN_TEXT_LENGTH = 5;

    // -----------------------------------------------------------------------
    // State
    // -----------------------------------------------------------------------

    /**
     * The wrapped OpenNLP detector.
     *
     * NOTE(review): the OpenNLP documentation describes SentenceDetectorME as
     * not thread-safe — confirm before sharing one splitter across threads.
     */
    private final SentenceDetectorME detector;

    // -----------------------------------------------------------------------
    // Constructors
    // -----------------------------------------------------------------------

    /**
     * Loads the Bulgarian sentence-detection model from the classpath
     * (see {@link #BUNDLED_MODEL_PATH}).
     *
     * @throws RuntimeException if the model cannot be loaded
     */
    public BulgarianSentenceSplitter() {
        this(null);
    }

    /**
     * Loads the Bulgarian sentence-detection model.
     *
     * @param modelPath path to a .bin OpenNLP sentence-detection model file,
     *                  or {@code null} / blank string to load the bundled
     *                  model from the classpath
     * @throws RuntimeException if the model cannot be found or loaded; the
     *                          underlying failure is preserved as the cause
     */
    public BulgarianSentenceSplitter(String modelPath) {
        final boolean useBundled = modelPath == null || modelPath.isBlank();

        // try-with-resources guarantees the stream is closed even when the
        // model parser throws part-way through reading it.
        try (InputStream stream = useBundled
                ? openBundledModel()
                : new FileInputStream(requireExistingFile(modelPath))) {
            detector = new SentenceDetectorME(new SentenceModel(stream));
        } catch (Exception e) {
            throw new RuntimeException("Failed to load Bulgarian sentence model", e);
        }

        // Report success only once the model has actually been loaded.
        System.out.println(useBundled
                ? "[SentenceSplitter] Loaded bundled model: " + BUNDLED_MODEL_PATH
                : "[SentenceSplitter] Loaded external model: " + modelPath);
    }

    /**
     * Opens the bundled model as a classpath resource.
     *
     * @return an open stream over the bundled model; the caller must close it
     * @throws IllegalStateException if the resource is not on the classpath
     */
    private InputStream openBundledModel() {
        InputStream stream = getClass().getClassLoader()
                .getResourceAsStream(BUNDLED_MODEL_PATH);
        if (stream == null) {
            throw new IllegalStateException(
                    "Bulgarian sentence model not found on classpath: " + BUNDLED_MODEL_PATH);
        }
        return stream;
    }

    /**
     * Checks that an external model file exists.
     *
     * @param modelPath path of the external model file
     * @return the corresponding {@link File}
     * @throws IllegalArgumentException if nothing exists at that path
     */
    private static File requireExistingFile(String modelPath) {
        File file = new File(modelPath);
        if (!file.exists()) {
            throw new IllegalArgumentException(
                    "Sentence model file not found: " + modelPath);
        }
        return file;
    }

    // -----------------------------------------------------------------------
    // Core API
    // -----------------------------------------------------------------------

    /**
     * Splits a text into sentences.
     *
     * @param text the text to split; may be {@code null}
     * @return an empty array for {@code null} or blank input; a one-element
     *         array holding the trimmed text when it is shorter than
     *         {@link #MIN_TEXT_LENGTH} characters; otherwise the sentences
     *         returned by the detector for the trimmed text
     */
    public String[] split(String text) {
        // isBlank() also rejects Unicode-whitespace-only input, which trim()
        // alone would let through as a bogus one-element result.
        if (text == null || text.isBlank()) {
            return new String[0];
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return new String[0];
        }
        if (trimmed.length() < MIN_TEXT_LENGTH) {
            // Too short to be worth running through the detector.
            return new String[]{trimmed};
        }
        return detector.sentDetect(trimmed);
    }

    /**
     * Splits a text into sentences and returns them as a list.
     *
     * @param text the text to split; may be {@code null}
     * @return a new, mutable list with the same contents as {@link #split(String)}
     */
    public List<String> splitToList(String text) {
        return new ArrayList<>(Arrays.asList(split(text)));
    }

    /**
     * Splits every paragraph into sentences and concatenates the results in
     * paragraph order.
     *
     * @param paragraphs the paragraphs to split; may be {@code null}, and
     *                   {@code null} or blank entries are skipped
     * @return a new, mutable list of all sentences; empty if
     *         {@code paragraphs} is {@code null}
     */
    public List<String> splitParagraphs(String[] paragraphs) {
        List<String> all = new ArrayList<>();
        if (paragraphs == null) {
            return all;
        }
        for (String para : paragraphs) {
            if (para != null && !para.isBlank()) {
                all.addAll(Arrays.asList(split(para)));
            }
        }
        return all;
    }

    /**
     * Returns the sentence probabilities reported by the underlying detector.
     *
     * NOTE(review): per the OpenNLP API documentation these belong to the most
     * recent {@code sentDetect} call. {@link #split(String)} bypasses the
     * detector for blank or very short input, so after such a call the values
     * presumably still describe an earlier text — confirm before relying on it.
     *
     * @return the probabilities as returned by the detector
     */
    public double[] getSentenceProbabilities() {
        return detector.getSentenceProbabilities();
    }

    /**
     * Splits a text into sentences and keeps only those that contain at least
     * {@code minWords} whitespace-separated words.
     *
     * @param text     the text to split; may be {@code null}
     * @param minWords minimum number of words a sentence must have to be kept
     * @return a new, mutable list of the sentences that pass the filter
     */
    public List<String> splitAndFilter(String text, int minWords) {
        List<String> result = new ArrayList<>();
        for (String sent : split(text)) {
            if (countWords(sent) >= minWords) {
                result.add(sent);
            }
        }
        return result;
    }

    /**
     * Counts whitespace-separated words in a sentence.
     * The sentence is trimmed first so that leading whitespace does not
     * produce an empty leading token, and an empty sentence counts as zero.
     */
    private static int countWords(String sentence) {
        String trimmed = sentence.trim();
        return trimmed.isEmpty() ? 0 : trimmed.split("\\s+").length;
    }
}