File size: 8,908 Bytes
2b60cf4
b052258
462c128
 
 
 
b052258
 
 
462c128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b052258
 
 
 
f5a4e79
 
462c128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5a4e79
 
 
462c128
f5a4e79
462c128
b052258
2b60cf4
b052258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b60cf4
b052258
 
 
 
 
 
 
 
2b60cf4
b052258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b60cf4
b052258
 
 
 
 
2b60cf4
 
 
b052258
2b60cf4
 
 
b052258
2b60cf4
 
 
b052258
2b60cf4
b052258
2b60cf4
b052258
 
 
2b60cf4
b052258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b60cf4
b052258
 
 
 
 
 
 
 
 
 
 
 
 
 
2b60cf4
b052258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b60cf4
b052258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b60cf4
b052258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# ONE OF THE CORE PROGRAMS OF THE PROJECT. REFERENCED BY feature_extractor and feature_extractor_web. 
import re
import subprocess
import sys
import tempfile
import urllib.request
from typing import Any, Dict, List, Tuple

import calamancy
import spacy


# Remote location of the tl_calamancy_md model wheel. The upstream filename is
# not accepted by pip as-is, so the wheel is saved locally under
# MODEL_WHEEL_LOCAL_NAME before installation.
MODEL_WHEEL_URL = (
    "https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main/"
    "tl_calamancy_md-any-py3-none-any.whl"
)
# Local filename given to the downloaded wheel so that pip can install it.
MODEL_WHEEL_LOCAL_NAME = "tl_calamancy_md-0.2.0-py3-none-any.whl"


def _install_model_wheel_workaround() -> None:
    """Download the model wheel and pip-install it from a renamed local copy.

    The wheel is fetched from ``MODEL_WHEEL_URL`` into a temporary directory
    under the name ``MODEL_WHEEL_LOCAL_NAME`` and installed with
    ``pip install --no-deps`` using the current interpreter. The temporary
    directory is removed when the function returns.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        wheel_path = "{}/{}".format(scratch_dir, MODEL_WHEEL_LOCAL_NAME)
        urllib.request.urlretrieve(MODEL_WHEEL_URL, wheel_path)

        # Run pip through the running interpreter so the wheel lands in the
        # same environment this process is using.
        pip_command = [sys.executable, "-m", "pip", "install", "--no-deps", wheel_path]
        subprocess.check_call(pip_command)


def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
    """Load the CalamanCy model once for either training or web inference.

    Three strategies are tried in order; the first one that succeeds wins:

    1. ``spacy.load("tl_calamancy_md")`` on an already-installed package.
    2. Download + install the wheel via ``_install_model_wheel_workaround``,
       then ``spacy.load`` again.
    3. ``calamancy.load`` on ``model_name`` and two built-in fallback names.

    Args:
        model_name: Model identifier tried first in the ``calamancy.load``
            fallback stage.

    Returns:
        The loaded pipeline object returned by ``spacy.load`` or
        ``calamancy.load``.

    Raises:
        RuntimeError: If every strategy fails; the message lists each
            attempt's error separated by ``" | "``.
    """
    import importlib

    errors = []

    # Best path: load the installed spaCy package directly and avoid calamancy's installer.
    try:
        return spacy.load("tl_calamancy_md")
    except Exception as exc:
        errors.append(f"spacy.load(tl_calamancy_md): {exc}")

    # Workaround invalid upstream wheel filename by downloading and renaming locally.
    try:
        _install_model_wheel_workaround()
        # The package was installed while this process is running; make sure
        # the import system's finders notice it before loading.
        importlib.invalidate_caches()
        return spacy.load("tl_calamancy_md")
    except Exception as exc:
        errors.append(f"manual wheel install: {exc}")

    # Last-resort fallback if calamancy fixes model installation behavior.
    # dict.fromkeys removes duplicates while keeping order, so the default
    # model_name is not attempted (and reported) twice.
    candidates = dict.fromkeys([model_name, "tl_calamancy_md-0.2.0", "tl_calamancy_md"])
    for candidate in candidates:
        try:
            return calamancy.load(candidate)
        except Exception as exc:
            errors.append(f"calamancy.load({candidate}): {exc}")

    raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))

# Merges sentences that contain dashes. Without this function, the model would split a sentence at every dash it encounters, which is counterproductive.
def merge_dash_sentences(doc) -> List:
    """Re-join sentence spans that were split around a "-" token.

    Walks the sentences of ``doc`` in order. For each sentence after the
    first, the tokens from two positions before the end of the previously
    kept span up to (but not including) two positions after the start of the
    current sentence are inspected. If any of them is a "-" token, the
    current sentence is folded into the previous span; otherwise it is kept
    as a separate entry.

    Args:
        doc: Parsed document exposing ``.sents``, ``len()``, integer indexing
            (tokens with ``.text``) and slicing (spans with ``.start``/``.end``).

    Returns:
        List of sentence spans after merging; empty if ``doc`` has no sentences.
    """
    dash_marks = {"-"}
    sentences = list(doc.sents)
    if not sentences:
        return []

    token_total = len(doc)
    result = sentences[:1]
    for current in sentences[1:]:
        previous = result[-1]

        # Window straddling the boundary between the two sentences.
        lo = max(0, previous.end - 2)
        hi = min(token_total, current.start + 2)

        for position in range(lo, hi):
            if doc[position].text in dash_marks:
                # Extend the previous span so it also covers the current one.
                result[-1] = doc[previous.start : current.end]
                break
        else:
            result.append(current)
    return result

# Cleans the text so that simple sentences are not misidentified as compound/complex.
def simple_clean(text: str) -> str:
    """Normalize raw text before it is handed to the NLP pipeline.

    The text is lower-cased, every character that is not a word character,
    whitespace, hyphen or sentence-ending punctuation (``.``, ``!``, ``?``)
    is removed, whitespace runs are collapsed to a single space, and the
    result is stripped.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, or ``""`` if ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s\-.!?]", "", text)  # keep sentence-ending punctuation
    # Collapse whitespace only after stripping symbols; otherwise removing a
    # stand-alone symbol (e.g. "a , b") would leave a double space behind.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Gets the word count, the sentence count, and the list of word tokens.
def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
    """Count words and sentences in a parsed document.

    Args:
        doc: Iterable of tokens exposing ``is_punct`` and ``is_space``; also
            passed to ``merge_dash_sentences`` for sentence counting.
        original_text: Text used for the punctuation-based sentence fallback.

    Returns:
        ``(num_words, num_sentences, tokens)`` where ``tokens`` is the list
        of tokens that are neither punctuation nor whitespace.
    """
    words = [tok for tok in doc if not (tok.is_punct or tok.is_space)]

    # Best effort: any failure in sentence merging degrades to one sentence.
    try:
        sentence_total = len(merge_dash_sentences(doc)) or 1
    except Exception:
        sentence_total = 1

    # Fallback for edge cases where sentence splitting fails: if only one
    # sentence was found, count non-empty chunks between ., ! and ? instead.
    if sentence_total == 1 and original_text:
        chunks = [piece for piece in re.split(r"[.!?]+", original_text) if piece.strip()]
        sentence_total = max(sentence_total, len(chunks))

    return len(words), sentence_total, words


def mean_lengths(tokens, num_words: int, num_sentences: int):
    """Return average word length and average sentence length.

    Args:
        tokens: Tokens exposing ``.text``; their character lengths are summed.
        num_words: Divisor for the mean word length and numerator for the
            mean sentence length.
        num_sentences: Divisor for the mean sentence length.

    Returns:
        ``(mean_word_length, mean_sentence_length)``, each rounded to four
        decimals; a value is 0 when its divisor is 0.
    """
    if num_words:
        total_chars = sum(len(tok.text) for tok in tokens)
        avg_word = total_chars / num_words
    else:
        avg_word = 0

    if num_sentences:
        avg_sentence = num_words / num_sentences
    else:
        avg_sentence = 0

    return round(avg_word, 4), round(avg_sentence, 4)

# TTR (type-token ratio): measures lexical diversity in a sample, i.e. how rich the vocabulary is.
def type_token_ratio(tokens, num_words: int):
    """Return the ratio of distinct lower-cased token texts to ``num_words``.

    Args:
        tokens: Tokens exposing ``.text``.
        num_words: Divisor for the ratio.

    Returns:
        The ratio rounded to four decimals, or 0 when ``num_words`` is 0.
    """
    distinct_forms = {tok.text.lower() for tok in tokens}
    ratio = len(distinct_forms) / num_words if num_words else 0
    return round(ratio, 4)

def count_filipino_syllables(word: str) -> int:
    """Approximate Filipino syllable count by counting vowel nuclei.

    The word is lower-cased and Unicode-decomposed (NFD) so that accented
    vowels such as "ò" keep their base letter, then everything except
    ``a-z`` and ``-`` is dropped. Each hyphen-separated part contributes one
    syllable per vowel (a, e, i, o, u).

    Args:
        word: The word to analyse.

    Returns:
        0 if ``word`` is not a string or nothing remains after filtering;
        otherwise the vowel count, with a minimum of 1.
    """
    import unicodedata

    if not isinstance(word, str):
        return 0

    # NFD splits an accented vowel into base letter + combining mark; the
    # filter below then removes only the mark instead of the whole vowel.
    decomposed = unicodedata.normalize("NFD", word.lower())
    word = re.sub(r"[^a-z-]", "", decomposed)
    if not word:
        return 0

    syllables = sum(
        len(re.findall(r"[aeiou]", part)) for part in word.split("-") if part
    )

    # Vowel-less remnants (e.g. "ng") still count as one syllable.
    return max(syllables, 1)

# Counts tokens that contain 3 or more syllables.
def polysyllabic_count(tokens) -> int:
    """Return how many tokens have at least three syllables.

    Syllables are estimated with ``count_filipino_syllables`` on each
    token's ``.text``.
    """
    long_words = [
        tok for tok in tokens if count_filipino_syllables(tok.text) >= 3
    ]
    return len(long_words)

# Computes lexical density and part-of-speech ratios for the token list.
def lexical_density_and_pos(tokens, num_words: int):
    """Compute lexical density and per-tag ratios from token POS tags.

    Args:
        tokens: Tokens whose ``pos_`` attribute (if present and non-empty)
            is tallied.
        num_words: Divisor for every ratio.

    Returns:
        ``(lexical_density, pos_ratios)``. ``lexical_density`` is the share
        of NOUN/VERB/ADJ/ADV tokens; ``pos_ratios`` maps ``noun_ratio``,
        ``verb_ratio``, ``adj_ratio``, ``adv_ratio`` and ``pron_ratio`` to
        their shares. All values are rounded to four decimals and are 0 when
        ``num_words`` is 0.
    """
    tag_totals: Dict[str, int] = {}
    for tok in tokens:
        tag = getattr(tok, "pos_", None)
        if not tag:
            # Tokens without a usable tag are ignored entirely.
            continue
        tag_totals[tag] = tag_totals.get(tag, 0) + 1

    def share(count: int):
        return round(count / num_words if num_words else 0, 4)

    # Content words are the open-class tags.
    content_total = sum(
        tag_totals.get(tag, 0) for tag in ("NOUN", "VERB", "ADJ", "ADV")
    )

    labelled_tags = (
        ("noun_ratio", "NOUN"),
        ("verb_ratio", "VERB"),
        ("adj_ratio", "ADJ"),
        ("adv_ratio", "ADV"),
        ("pron_ratio", "PRON"),
    )
    pos_ratios = {key: share(tag_totals.get(tag, 0)) for key, tag in labelled_tags}

    return share(content_total), pos_ratios

# Identifies foreign words by looking for English letter combinations and letters foreign to the Filipino alphabet, then computes their density.
def foreign_word_density(tokens):
    """Return the share of tokens that look like foreign (non-native) words.

    A token is flagged when its lower-cased text is longer than two
    characters and contains one of the listed English digraphs or one of the
    listed letters. The denominator is the total number of tokens, including
    the short ones that are never flagged.

    Args:
        tokens: Sized collection of tokens exposing ``.text``.

    Returns:
        The ratio rounded to four decimals, or 0 when ``tokens`` is empty.
    """
    markers = (
        "th", "ph", "sh", "ch", "wh", "ck", "qu",
        "f", "v", "z", "x", "q", "j", "c",
    )

    def looks_foreign(text: str) -> bool:
        lowered = text.lower()
        # Very short words are skipped.
        if len(lowered) <= 2:
            return False
        return any(marker in lowered for marker in markers)

    flagged = sum(1 for tok in tokens if looks_foreign(tok.text))
    return round(flagged / len(tokens) if tokens else 0, 4)

# Checks whether the first sentence is Subject-Verb-Object (SVO) or Verb-Subject-Object (VSO).
def detect_svo_vso(doc):
    """Classify the word order of the first sentence as "SVO" or "VSO".

    Only the first entry returned by ``merge_dash_sentences(doc)`` is
    examined. The first NOUN, PRON or VERB token decides the label: a
    leading VERB with no "ay" token in the sentence gives "VSO"; a leading
    NOUN/PRON, or the presence of "ay", gives "SVO".

    Returns:
        "SVO", "VSO", or "Unknown" when there are no sentences or no
        NOUN/PRON/VERB token in the first sentence.
    """
    merged = merge_dash_sentences(doc)
    if not merged:
        return "Unknown"

    words = [tok for tok in merged[0] if not (tok.is_punct or tok.is_space)]
    ay_present = any(tok.text.lower() == "ay" for tok in words)

    anchor = next(
        (tok for tok in words if tok.pos_ in {"NOUN", "PRON", "VERB"}),
        None,
    )
    if not anchor:
        return "Unknown"

    # The anchor is a NOUN, PRON or VERB, so everything that is not
    # "VERB without ay" falls into the SVO case.
    if anchor.pos_ == "VERB" and not ay_present:
        return "VSO"
    return "SVO"

# Detects keywords that identify subordinate and coordinate clauses, then classifies the sentence based on which kinds of clauses it has.
def detect_sentence_type(doc):
    """Label the text as Simple, Compound, Complex or Compound-Complex.

    Punctuation and whitespace tokens are ignored. A coordinating marker is
    a listed keyword tagged CCONJ; a subordinating marker is a listed
    keyword tagged SCONJ.

    Returns:
        "Compound-Complex" if both kinds of marker occur, "Complex" if only
        a subordinating one does, "Compound" if only a coordinating one
        does, otherwise "Simple".
    """
    coordinators = {"at", "pero", "o", "maging", "saka", "subalit", "kaya"}
    subordinators = {"dahil", "kapag", "upang", "kung", "sapagkat"}

    found_coord = False
    found_subord = False
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        lowered = tok.text.lower()
        # Both the keyword and the tagger's POS must agree.
        if tok.pos_ == "CCONJ" and lowered in coordinators:
            found_coord = True
        elif tok.pos_ == "SCONJ" and lowered in subordinators:
            found_subord = True

    labels = {
        (True, True): "Compound-Complex",
        (False, True): "Complex",
        (True, False): "Compound",
        (False, False): "Simple",
    }
    return labels[(found_coord, found_subord)]

# main func
def extract_features(text: str, nlp) -> Dict[str, Any]:
    """Run the full feature pipeline on one piece of text.

    The text is cleaned with ``simple_clean``, parsed with ``nlp``, and the
    individual feature functions of this module are applied to the result.

    Args:
        text: Raw input text.
        nlp: Callable that turns the cleaned text into a parsed document.

    Returns:
        A dict of features (counts, mean lengths, lexical measures, sentence
        labels and POS ratios), or ``{}`` if ``text`` is empty or not a
        string.
    """
    if not (text and isinstance(text, str)):
        return {}

    normalized = simple_clean(text)
    parsed = nlp(normalized)

    word_total, sentence_total, words = basic_counts(parsed, normalized)
    avg_word_len, avg_sentence_len = mean_lengths(words, word_total, sentence_total)
    diversity = type_token_ratio(words, word_total)
    long_word_total = polysyllabic_count(words)
    density, ratios = lexical_density_and_pos(words, word_total)
    foreign_share = foreign_word_density(words)
    word_order = detect_svo_vso(parsed)
    clause_label = detect_sentence_type(parsed)

    # Insertion order is kept so the keys come out in the same sequence.
    features: Dict[str, Any] = {}
    features["num_words"] = word_total
    features["num_sentences"] = sentence_total
    features["mean_word_length"] = avg_word_len
    features["mean_sentence_length"] = avg_sentence_len
    features["polysyllabic_words"] = long_word_total
    features["lexical_density"] = density
    features["type_token_ratio"] = diversity
    features["foreign_word_density"] = foreign_share
    features["sentence_construction_type"] = word_order
    features["sentence_type"] = clause_label
    features.update(ratios)
    return features