File size: 8,908 Bytes
2b60cf4 b052258 462c128 b052258 462c128 b052258 f5a4e79 462c128 f5a4e79 462c128 f5a4e79 462c128 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 2b60cf4 b052258 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | # ONE OF THE CORE PROGRAMS OF THE PROJECT. REFERENCED BY feature_extractor and feature_extractor_web.
import re
import subprocess
import sys
import tempfile
import urllib.request
from typing import Any, Dict, List, Tuple
import calamancy
import spacy
# Download URL of the CalamanCy model wheel. The file is fetched by
# _install_model_wheel_workaround and saved under MODEL_WHEEL_LOCAL_NAME.
MODEL_WHEEL_URL = (
"https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main/"
"tl_calamancy_md-any-py3-none-any.whl"
)
# Filename the downloaded wheel is stored under before it is handed to pip;
# the upstream filename is treated as invalid by the loader code in this file.
MODEL_WHEEL_LOCAL_NAME = "tl_calamancy_md-0.2.0-py3-none-any.whl"
def _install_model_wheel_workaround() -> None:
    """Download the model wheel and pip-install it from a temporary directory.

    The wheel is saved as ``MODEL_WHEEL_LOCAL_NAME`` instead of its remote
    filename so that pip is given a filename it accepts. The temporary
    directory (and the downloaded wheel) is removed when the ``with`` block
    exits.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    # Use the running interpreter's pip so the package is installed for the
    # same Python that is executing this module.
    pip_install = [sys.executable, "-m", "pip", "install", "--no-deps"]
    with tempfile.TemporaryDirectory() as workdir:
        wheel_path = f"{workdir}/{MODEL_WHEEL_LOCAL_NAME}"
        urllib.request.urlretrieve(MODEL_WHEEL_URL, wheel_path)
        subprocess.check_call(pip_install + [wheel_path])
def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
    """Load the CalamanCy model once for either training or web inference.

    Three strategies are tried in order; the first that succeeds wins:

    1. ``spacy.load("tl_calamancy_md")`` for an already-installed package.
    2. Install the wheel via ``_install_model_wheel_workaround`` and retry
       ``spacy.load``.
    3. ``calamancy.load`` with each distinct candidate name.

    Args:
        model_name: Name tried first in the ``calamancy.load`` fallback. It is
            not used by the two ``spacy.load`` attempts.

    Returns:
        The pipeline object returned by ``spacy.load`` or ``calamancy.load``.

    Raises:
        RuntimeError: If every strategy fails; the message joins the error of
            each attempt with ``" | "``.
    """
    errors = []

    # The broad ``except Exception`` clauses below are deliberate: this is a
    # best-effort chain and any failure should fall through to the next
    # strategy, with the reason recorded for the final error message.

    # Best path: load the installed spaCy package directly and avoid
    # calamancy's installer.
    try:
        return spacy.load("tl_calamancy_md")
    except Exception as exc:
        errors.append(f"spacy.load(tl_calamancy_md): {exc}")

    # Workaround invalid upstream wheel filename by downloading and renaming
    # locally.
    try:
        _install_model_wheel_workaround()
        return spacy.load("tl_calamancy_md")
    except Exception as exc:
        errors.append(f"manual wheel install: {exc}")

    # Last-resort fallback if calamancy fixes model installation behavior.
    # dict.fromkeys de-duplicates while keeping order, so the default
    # ``model_name`` is not attempted (and reported) twice.
    candidates = dict.fromkeys(
        [model_name, "tl_calamancy_md-0.2.0", "tl_calamancy_md"]
    )
    for candidate in candidates:
        try:
            return calamancy.load(candidate)
        except Exception as exc:
            errors.append(f"calamancy.load({candidate}): {exc}")

    raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))
# Merges sentences that contain dashes. Without this function, the model would split the sentence on every dash it encounters, which is counterproductive.
def merge_dash_sentences(doc) -> List:
    """Re-join sentences that were split around a dash token.

    Walks ``doc.sents`` in order. For each sentence after the first, the
    tokens in a small window around the boundary (two positions before the
    end of the previously kept sentence through one position past the start
    of the current one, clamped to the document) are checked for a ``"-"``
    token. If one is found, the current sentence is folded into the previous
    one; otherwise it is kept as its own sentence.

    Args:
        doc: Object exposing ``sents``, ``len()`` and integer/slice indexing
            (presumably a spaCy ``Doc`` - confirm against callers).

    Returns:
        A list of sentence spans; empty when ``doc`` has no sentences.
    """
    dash_tokens = {"-"}
    sentences = list(doc.sents)
    if not sentences:
        return []

    doc_len = len(doc)
    merged = sentences[:1]
    for current in sentences[1:]:
        previous = merged[-1]
        window = range(
            max(0, previous.end - 2),
            min(doc_len, current.start + 2),
        )
        dash_near_boundary = False
        for idx in window:
            if doc[idx].text in dash_tokens:
                dash_near_boundary = True
                break
        if not dash_near_boundary:
            merged.append(current)
            continue
        # Replace the last kept sentence with one span covering both.
        merged[-1] = doc[previous.start : current.end]
    return merged
# cleans the sentence, avoids misidentifying simple sentences as compound/complex
def simple_clean(text: str) -> str:
    """Normalize raw text before it is passed to the NLP pipeline.

    The text is lower-cased, every character that is not a word character,
    whitespace, hyphen or sentence-ending punctuation (``.``, ``!``, ``?``)
    is removed, runs of whitespace are collapsed to one space, and the
    result is stripped.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s\-.!?]", "", text)  # keep sentence-ending punctuation
    # Collapse whitespace AFTER stripping punctuation: removing a character
    # that sat between two spaces (e.g. "a , b") would otherwise leave a
    # double space in the output.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
# gets the word count, the sentence count, and the list of word tokens (punctuation and whitespace excluded)
def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
    """Count words and sentences in a parsed document.

    Args:
        doc: Iterable of tokens exposing ``is_punct`` and ``is_space``; it is
            also passed to ``merge_dash_sentences`` for sentence splitting.
        original_text: Text used for the punctuation-based sentence fallback.

    Returns:
        ``(num_words, num_sentences, tokens)`` where ``tokens`` is the list of
        tokens that are neither punctuation nor whitespace.
    """
    words = [tok for tok in doc if not (tok.is_punct or tok.is_space)]

    # Any failure in sentence splitting is treated as "one sentence".
    try:
        sentence_total = len(merge_dash_sentences(doc)) or 1
    except Exception:
        sentence_total = 1

    # Fallback for edge cases where sentence splitting fails: when only one
    # sentence was found, count the non-empty pieces between runs of
    # sentence-ending punctuation and use that if it is larger.
    if sentence_total == 1 and original_text:
        fragments = [
            piece
            for piece in re.split(r"[.!?]+", original_text)
            if piece.strip()
        ]
        sentence_total = max(sentence_total, len(fragments))

    return len(words), sentence_total, words
def mean_lengths(tokens, num_words: int, num_sentences: int):
    """Return the mean word length and mean sentence length.

    Args:
        tokens: Word tokens exposing ``text``.
        num_words: Number of words; used as the divisor for word length.
        num_sentences: Number of sentences; used as the divisor for sentence
            length.

    Returns:
        ``(mean_word_length, mean_sentence_length)``, each rounded to four
        decimal places. A value is ``0`` when its divisor is zero.
    """
    if num_words:
        total_chars = 0
        for tok in tokens:
            total_chars += len(tok.text)
        avg_word = total_chars / num_words
    else:
        avg_word = 0

    if num_sentences:
        avg_sentence = num_words / num_sentences
    else:
        avg_sentence = 0

    return round(avg_word, 4), round(avg_sentence, 4)
# TTR. measures lexical diversity in a sample. Checks whether the vocabulary is rich or not.
def type_token_ratio(tokens, num_words: int):
    """Return the type-token ratio (distinct words / total words).

    Words are compared case-insensitively via ``text.lower()``.

    Args:
        tokens: Word tokens exposing ``text``.
        num_words: Total word count used as the divisor.

    Returns:
        The ratio rounded to four decimal places, or ``0`` when ``num_words``
        is zero.
    """
    distinct = {tok.text.lower() for tok in tokens}
    if num_words:
        ratio = len(distinct) / num_words
    else:
        ratio = 0
    return round(ratio, 4)
def count_filipino_syllables(word: str) -> int:
    """Approximate Filipino syllable count by counting vowel nuclei.

    The word is lower-cased and NFD-decomposed so that an accented vowel
    (e.g. ``"ì"``) becomes its base vowel plus a combining mark. Everything
    except ``a-z`` and ``-`` is then dropped, which removes the mark but keeps
    the vowel. Each remaining ``a``/``e``/``i``/``o``/``u`` counts as one
    syllable.

    Args:
        word: The word to measure.

    Returns:
        ``0`` when ``word`` is not a string or has no ``a-z``/``-`` characters
        left after filtering; otherwise the vowel count, with a minimum of 1.
    """
    # Local import keeps this block self-contained.
    import unicodedata

    if not isinstance(word, str):
        return 0
    # Decompose first; otherwise the a-z filter below deletes accented vowels
    # outright and the word is under-counted.
    decomposed = unicodedata.normalize("NFD", word.lower())
    letters = re.sub(r"[^a-z-]", "", decomposed)
    if not letters:
        return 0
    # Hyphens are never vowels, so counting over the whole string equals
    # summing the counts of the hyphen-separated parts.
    vowel_total = len(re.findall(r"[aeiou]", letters))
    return max(vowel_total, 1)
# counts tokens that contain 3 or more syllables
def polysyllabic_count(tokens) -> int:
    """Count the tokens whose text has three or more syllables.

    Syllables are measured with ``count_filipino_syllables``.

    Args:
        tokens: Word tokens exposing ``text``.

    Returns:
        The number of tokens with a syllable count of at least 3.
    """
    total = 0
    for tok in tokens:
        if count_filipino_syllables(tok.text) >= 3:
            total += 1
    return total
# Computes lexical density and part-of-speech ratios for the token list.
def lexical_density_and_pos(tokens, num_words: int):
    """Compute lexical density and per-tag ratios for a token list.

    Lexical density is the share of tokens tagged ``NOUN``, ``VERB``, ``ADJ``
    or ``ADV``. Tokens without a ``pos_`` attribute (or with an empty one) are
    not counted under any tag.

    Args:
        tokens: Word tokens, normally exposing ``pos_``.
        num_words: Total word count used as the divisor for every ratio.

    Returns:
        ``(lexical_density, pos_ratios)`` where ``pos_ratios`` maps
        ``noun_ratio``, ``verb_ratio``, ``adj_ratio``, ``adv_ratio`` and
        ``pron_ratio`` to values rounded to four decimal places. Every value
        is ``0`` when ``num_words`` is zero.
    """
    content_tags = {"NOUN", "VERB", "ADJ", "ADV"}
    tag_totals: Dict[str, int] = {}
    content_total = 0
    for tok in tokens:
        tag = getattr(tok, "pos_", None)
        if not tag:
            continue
        tag_totals[tag] = tag_totals.get(tag, 0) + 1
        if tag in content_tags:
            content_total += 1

    def share(tag: str):
        # Fraction of all words carrying ``tag``; 0 when there are no words.
        return round(tag_totals.get(tag, 0) / num_words if num_words else 0, 4)

    ratio_keys = (
        ("noun_ratio", "NOUN"),
        ("verb_ratio", "VERB"),
        ("adj_ratio", "ADJ"),
        ("adv_ratio", "ADV"),
        ("pron_ratio", "PRON"),
    )
    ratios = {key: share(tag) for key, tag in ratio_keys}

    density = content_total / num_words if num_words else 0
    return round(density, 4), ratios
# identifies foreign words by looking for letters foreign to the Filipino alphabet (and common English letter pairs) and computes their density.
def foreign_word_density(tokens):
    """Return the share of tokens that look like foreign words.

    A token is flagged when its lower-cased text is longer than two
    characters and contains any of the listed letter pairs or single
    letters. The divisor is the total number of tokens, including the short
    ones that are never flagged.

    Args:
        tokens: List of word tokens exposing ``text``.

    Returns:
        The flagged fraction rounded to four decimal places, or ``0`` when
        ``tokens`` is empty.
    """
    # Letter pairs first, then single letters; a match on any one suffices.
    markers = (
        "th", "ph", "sh", "ch", "wh", "ck", "qu",
        "f", "v", "z", "x", "q", "j", "c",
    )

    def looks_foreign(text) -> bool:
        lowered = text.lower()
        return len(lowered) > 2 and any(m in lowered for m in markers)

    hits = sum(1 for tok in tokens if looks_foreign(tok.text))
    return round(hits / len(tokens) if tokens else 0, 4)
# checks whether a sentence is a Subject-Verb-Object, or a Verb-Subject-Object
def detect_svo_vso(doc):
    """Label the first sentence of ``doc`` as ``"SVO"`` or ``"VSO"``.

    Only the first sentence returned by ``merge_dash_sentences`` is
    inspected. The first token tagged ``NOUN``, ``PRON`` or ``VERB`` decides
    the label: a leading verb with no ``"ay"`` token in the sentence gives
    ``"VSO"``; a leading noun/pronoun, or any sentence containing ``"ay"``,
    gives ``"SVO"``.

    Args:
        doc: Parsed document passed to ``merge_dash_sentences``.

    Returns:
        ``"VSO"``, ``"SVO"``, or ``"Unknown"`` when there are no sentences or
        no noun/pronoun/verb token in the first sentence.
    """
    sentences = merge_dash_sentences(doc)
    if not sentences:
        return "Unknown"

    words = [tok for tok in sentences[0] if not (tok.is_punct or tok.is_space)]
    anchor = next(
        (tok for tok in words if tok.pos_ in {"NOUN", "PRON", "VERB"}),
        None,
    )
    if not anchor:
        return "Unknown"

    has_ay = any(tok.text.lower() == "ay" for tok in words)
    if anchor.pos_ == "VERB" and not has_ay:
        return "VSO"
    # Everything else is SVO: the anchor is a noun/pronoun, or it is a verb
    # in a sentence that contains "ay". No other case can reach this point.
    return "SVO"
# detects keywords that identify subordinate and coordinate clauses. Classifies the sentence based on which kinds of clause it has.
def detect_sentence_type(doc):
    """Classify ``doc`` by the conjunctions it contains.

    A coordinating conjunction is a non-punctuation, non-space token whose
    lower-cased text is in the coordinator list and whose ``pos_`` is
    ``"CCONJ"``. A subordinating conjunction is the same with the
    subordinator list and ``"SCONJ"``.

    Args:
        doc: Iterable of tokens exposing ``text``, ``pos_``, ``is_punct`` and
            ``is_space``.

    Returns:
        ``"Compound-Complex"`` when both kinds are present, ``"Complex"`` for
        subordinating only, ``"Compound"`` for coordinating only, otherwise
        ``"Simple"``.
    """
    coordinators = {"at", "pero", "o", "maging", "saka", "subalit", "kaya"}
    subordinators = {"dahil", "kapag", "upang", "kung", "sapagkat"}

    found_coord = False
    found_subord = False
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        lowered = tok.text.lower()
        if lowered in coordinators and tok.pos_ == "CCONJ":
            found_coord = True
        elif lowered in subordinators and tok.pos_ == "SCONJ":
            found_subord = True

    labels = {
        (True, True): "Compound-Complex",
        (False, True): "Complex",
        (True, False): "Compound",
        (False, False): "Simple",
    }
    return labels[(found_coord, found_subord)]
# main func
def extract_features(text: str, nlp) -> Dict[str, Any]:
    """Extract the full feature dictionary for one text.

    The text is cleaned with ``simple_clean``, parsed with ``nlp``, and the
    individual feature helpers in this module are run on the result.

    Args:
        text: Raw input text.
        nlp: Callable that turns the cleaned string into a parsed document
            (presumably the pipeline from ``load_nlp_model`` - confirm
            against callers).

    Returns:
        A dict with the count, length, diversity, density and sentence-label
        features followed by the part-of-speech ratios; ``{}`` when ``text``
        is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return {}

    cleaned = simple_clean(text)
    doc = nlp(cleaned)

    num_words, num_sentences, tokens = basic_counts(doc, cleaned)
    mean_word, mean_sentence = mean_lengths(tokens, num_words, num_sentences)
    lex_density, pos_ratios = lexical_density_and_pos(tokens, num_words)

    features: Dict[str, Any] = {
        "num_words": num_words,
        "num_sentences": num_sentences,
        "mean_word_length": mean_word,
        "mean_sentence_length": mean_sentence,
        "polysyllabic_words": polysyllabic_count(tokens),
        "lexical_density": lex_density,
        "type_token_ratio": type_token_ratio(tokens, num_words),
        "foreign_word_density": foreign_word_density(tokens),
        "sentence_construction_type": detect_svo_vso(doc),
        "sentence_type": detect_sentence_type(doc),
    }
    # POS ratios go last so the key order matches a trailing ``**`` unpack.
    features.update(pos_ratios)
    return features
|