Alalay / feature_core.py
Jandayl's picture
added comments
2b60cf4
# ONE OF THE CORE PROGRAMS OF THE PROJECT. REFERENCED BY feature_extractor and feature_extractor_web.
import re
import subprocess
import sys
import tempfile
import urllib.request
from typing import Any, Dict, List, Tuple
import calamancy
import spacy
# Download location of the tl_calamancy_md model wheel on the Hugging Face Hub.
MODEL_WHEEL_URL = (
"https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main/"
"tl_calamancy_md-any-py3-none-any.whl"
)
# File name given to the downloaded copy before it is handed to pip; the
# workaround installer below saves the wheel under this name instead of the
# upstream one.
MODEL_WHEEL_LOCAL_NAME = "tl_calamancy_md-0.2.0-py3-none-any.whl"
def _install_model_wheel_workaround() -> None:
    """Download the model wheel under a pip-acceptable name and install it.

    The wheel at ``MODEL_WHEEL_URL`` is saved into a temporary directory as
    ``MODEL_WHEEL_LOCAL_NAME`` and installed with ``pip install --no-deps``
    run through the current interpreter. The temporary directory is removed
    when the ``with`` block exits.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the download
        fails, or ``subprocess.CalledProcessError`` when pip exits non-zero.
    """
    pip_install = [sys.executable, "-m", "pip", "install", "--no-deps"]
    with tempfile.TemporaryDirectory() as scratch_dir:
        # The saved file name, not the URL's file name, is what pip sees.
        wheel_path = "/".join((scratch_dir, MODEL_WHEEL_LOCAL_NAME))
        urllib.request.urlretrieve(MODEL_WHEEL_URL, wheel_path)
        subprocess.check_call(pip_install + [wheel_path])
def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
    """Return a loaded CalamanCy pipeline, trying several strategies in order.

    Strategies, stopping at the first one that succeeds:

    1. ``spacy.load("tl_calamancy_md")`` - the model is already installed.
    2. Install the wheel through ``_install_model_wheel_workaround()`` and
       load it with spaCy again.
    3. ``calamancy.load`` on ``model_name``, then ``"tl_calamancy_md-0.2.0"``,
       then ``"tl_calamancy_md"``.

    Note that ``model_name`` is only consulted in step 3; steps 1 and 2
    always load ``"tl_calamancy_md"``. This function does not cache its
    result.

    Args:
        model_name: First name handed to ``calamancy.load`` in step 3.

    Returns:
        The pipeline object returned by the first successful loader.

    Raises:
        RuntimeError: Every strategy failed; the message joins the individual
            error texts with ``" | "``.
    """

    def _load_after_manual_install():
        # Workaround for the upstream wheel filename: download, rename, install.
        _install_model_wheel_workaround()
        return spacy.load("tl_calamancy_md")

    # (label used in the error report, zero-argument loader), in priority order.
    attempts = [
        # Best path: use the installed spaCy package and skip calamancy's installer.
        ("spacy.load(tl_calamancy_md)", lambda: spacy.load("tl_calamancy_md")),
        ("manual wheel install", _load_after_manual_install),
    ]
    # Last resort, in case calamancy's own installation path works.
    attempts.extend(
        (f"calamancy.load({name})", lambda name=name: calamancy.load(name))
        for name in (model_name, "tl_calamancy_md-0.2.0", "tl_calamancy_md")
    )

    failures = []
    for label, loader in attempts:
        try:
            return loader()
        except Exception as exc:
            failures.append(f"{label}: {exc}")

    raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(failures))
# Merges sentences that were split around dash tokens. Without this function, the model would split a sentence at every dash it encounters, which is counterproductive.
def merge_dash_sentences(doc) -> List:
    """Return the sentences of ``doc`` with dash-induced splits undone.

    Each sentence after the first is compared with the last entry kept so
    far. If a ``"-"`` token lies in the window running from two tokens before
    the end of that entry up to (but excluding) two tokens past the start of
    the current sentence, the two are fused into ``doc[prev.start:sent.end]``;
    otherwise the sentence is kept as its own entry.

    Args:
        doc: Object exposing ``sents``, ``len()`` and integer/slice indexing,
            whose sentences have ``start``/``end`` and whose tokens have
            ``text``.

    Returns:
        A list of sentence spans; empty when ``doc.sents`` yields nothing.
    """
    hyphen_texts = {"-"}
    sentences = list(doc.sents)

    merged = []
    for sent in sentences:
        if merged:
            prev = merged[-1]
            # Inspect a few tokens on either side of the sentence boundary.
            lo = max(0, prev.end - 2)
            hi = min(len(doc), sent.start + 2)
            if any(doc[i].text in hyphen_texts for i in range(lo, hi)):
                merged[-1] = doc[prev.start : sent.end]
                continue
        merged.append(sent)
    return merged
# cleans the sentence, avoids misidentifying simple sentences as compound/complex
def simple_clean(text: str) -> str:
    """Normalise raw text before it is passed to the NLP pipeline.

    The text is lower-cased, every run of whitespace is collapsed to a single
    space, and every character that is not a word character, whitespace,
    ``-``, ``.``, ``!`` or ``?`` is dropped. Leading and trailing whitespace
    is stripped last.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        collapsed = re.sub(r"\s+", " ", text.lower())
        # keep sentence-ending punctuation
        return re.sub(r"[^\w\s\-.!?]", "", collapsed).strip()
    return ""
# gets the sentence, word, and token count
def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
    """Count words and sentences in ``doc``.

    Words are the tokens that are neither punctuation nor whitespace.
    Sentences come from ``merge_dash_sentences``; if that call fails or
    returns nothing, the count is taken as 1. When the count is 1 and
    ``original_text`` is non-empty, the text is also split on runs of
    ``.``, ``!`` and ``?`` and the number of non-blank pieces is used
    instead if it is larger.

    Args:
        doc: Iterable of tokens exposing ``is_punct`` and ``is_space``.
        original_text: Text used for the punctuation-based fallback.

    Returns:
        ``(num_words, num_sentences, tokens)`` where ``tokens`` is the list
        of word tokens that were counted.
    """
    words = [tok for tok in doc if not (tok.is_punct or tok.is_space)]

    try:
        sentence_total = len(merge_dash_sentences(doc)) or 1
    except Exception:
        sentence_total = 1

    # Fallback for edge cases where sentence splitting fails.
    if sentence_total == 1 and original_text:
        pieces = [
            piece for piece in re.split(r"[.!?]+", original_text) if piece.strip()
        ]
        sentence_total = max(sentence_total, len(pieces))

    return len(words), sentence_total, words
def mean_lengths(tokens, num_words: int, num_sentences: int):
    """Return the mean word length and mean sentence length.

    Args:
        tokens: Word tokens exposing ``text``.
        num_words: Number of words; divisor for the mean word length.
        num_sentences: Number of sentences; divisor for the mean sentence
            length.

    Returns:
        ``(mean_word_length, mean_sentence_length)``, each rounded to four
        decimals. A mean whose divisor is zero is reported as 0.
    """
    if num_words:
        avg_word_len = sum(len(tok.text) for tok in tokens) / num_words
    else:
        avg_word_len = 0

    if num_sentences:
        avg_sentence_len = num_words / num_sentences
    else:
        avg_sentence_len = 0

    return round(avg_word_len, 4), round(avg_sentence_len, 4)
# TTR. measures lexical diversity in a sample. Checks whether the vocabulary is rich or not.
def type_token_ratio(tokens, num_words: int):
    """Return the type-token ratio of ``tokens``.

    The ratio is the number of distinct lower-cased token texts divided by
    ``num_words``, rounded to four decimals; 0 when ``num_words`` is zero.
    """
    distinct = {tok.text.lower() for tok in tokens}
    ratio = len(distinct) / num_words if num_words else 0
    return round(ratio, 4)
def count_filipino_syllables(word: str) -> int:
    """Approximate the syllable count of a Filipino word.

    The word is lower-cased and reduced to the characters ``a``-``z`` and
    ``-``; each remaining ``a``, ``e``, ``i``, ``o`` or ``u`` counts as one
    syllable nucleus.

    Args:
        word: Word to measure.

    Returns:
        0 when ``word`` is not a string or nothing is left after filtering;
        otherwise the vowel count, with a minimum of 1.
    """
    if not isinstance(word, str):
        return 0

    letters = re.sub(r"[^a-z-]", "", word.lower())
    if not letters:
        return 0

    # Hyphens never match a vowel, so hyphenated words need no special casing.
    nuclei = sum(1 for ch in letters if ch in "aeiou")
    return max(nuclei, 1)
# counts tokens that contain 3 or more syllables
def polysyllabic_count(tokens) -> int:
    """Return how many tokens have three or more approximate syllables.

    Syllables are estimated with ``count_filipino_syllables`` on each
    token's ``text``.
    """
    long_words = [
        tok for tok in tokens if count_filipino_syllables(tok.text) >= 3
    ]
    return len(long_words)
# Computes lexical density and part-of-speech ratios for the token list.
def lexical_density_and_pos(tokens, num_words: int):
    """Return lexical density and per-tag ratios for ``tokens``.

    Tokens are tallied by their ``pos_`` attribute; tokens without one (or
    with an empty one) are ignored. Lexical density is the share of tokens
    tagged ``NOUN``, ``VERB``, ``ADJ`` or ``ADV``.

    Args:
        tokens: Word tokens, normally exposing ``pos_``.
        num_words: Divisor for every ratio.

    Returns:
        ``(lexical_density, pos_ratios)``. ``pos_ratios`` maps
        ``noun_ratio``, ``verb_ratio``, ``adj_ratio``, ``adv_ratio`` and
        ``pron_ratio`` to their share of ``num_words``. All values are
        rounded to four decimals and are 0 when ``num_words`` is zero.
    """
    content_tags = ("NOUN", "VERB", "ADJ", "ADV")

    tag_totals: Dict[str, int] = {}
    for tok in tokens:
        tag = getattr(tok, "pos_", None)
        if not tag:
            continue
        tag_totals[tag] = tag_totals.get(tag, 0) + 1

    def share(count: int):
        # Guarded division shared by the density and every ratio.
        return round(count / num_words if num_words else 0, 4)

    content_total = sum(tag_totals.get(tag, 0) for tag in content_tags)
    reported = (
        ("noun_ratio", "NOUN"),
        ("verb_ratio", "VERB"),
        ("adj_ratio", "ADJ"),
        ("adv_ratio", "ADV"),
        ("pron_ratio", "PRON"),
    )
    pos_ratios = {key: share(tag_totals.get(tag, 0)) for key, tag in reported}
    return share(content_total), pos_ratios
# identifies likely foreign words by looking for letters and letter pairs that are foreign to the Filipino alphabet, then computes their density.
def foreign_word_density(tokens):
    """Return the share of tokens that look like foreign (non-Filipino) words.

    A token is flagged when its lower-cased text is longer than two
    characters and contains any of the listed letter pairs or single
    letters. The flagged count is divided by the total number of tokens,
    including the short ones that were skipped, and rounded to four
    decimals; 0 when ``tokens`` is empty.
    """
    # Letter pairs first, then single letters; any one hit flags the word.
    markers = (
        "th", "ph", "sh", "ch", "wh", "ck", "qu",
        "f", "v", "z", "x", "q", "j", "c",
    )

    flagged = 0
    for tok in tokens:
        lowered = tok.text.lower()
        if len(lowered) > 2 and any(marker in lowered for marker in markers):
            flagged += 1

    return round(flagged / len(tokens) if tokens else 0, 4)
# checks whether the first sentence is Subject-Verb-Object (SVO) or Verb-Subject-Object (VSO)
def detect_svo_vso(doc):
    """Classify the first sentence of ``doc`` as ``"SVO"`` or ``"VSO"``.

    Only the first sentence returned by ``merge_dash_sentences`` is examined.
    The first token tagged ``NOUN``, ``PRON`` or ``VERB`` decides the result:
    a verb gives ``"VSO"`` unless the sentence contains the word "ay";
    everything else gives ``"SVO"``.

    Returns:
        ``"VSO"``, ``"SVO"``, or ``"Unknown"`` when there is no sentence or
        no noun/pronoun/verb token in it.
    """
    sentences = merge_dash_sentences(doc)
    if not sentences:
        return "Unknown"

    words = [
        tok for tok in sentences[0] if not (tok.is_punct or tok.is_space)
    ]
    has_ay = any(tok.text.lower() == "ay" for tok in words)

    anchor_tags = {"NOUN", "PRON", "VERB"}
    first_content = next((tok for tok in words if tok.pos_ in anchor_tags), None)
    if not first_content:
        return "Unknown"

    # Verb-initial only counts as VSO when no "ay" is present in the sentence.
    verb_initial = first_content.pos_ == "VERB" and not has_ay
    return "VSO" if verb_initial else "SVO"
# detects keywords that identify subordinate and coordinate clauses. Classifies the text based on which kinds of clause it has.
def detect_sentence_type(doc):
    """Classify ``doc`` by the conjunctions it contains.

    Punctuation and whitespace tokens are ignored. A coordinating marker is
    a token tagged ``CCONJ`` whose lower-cased text is in the coordinator
    list; a subordinating marker is a token tagged ``SCONJ`` whose
    lower-cased text is in the subordinator list.

    Returns:
        ``"Compound-Complex"`` when both kinds are present, ``"Complex"``
        for subordinating only, ``"Compound"`` for coordinating only, and
        ``"Simple"`` otherwise.
    """
    coordinators = {"at", "pero", "o", "maging", "saka", "subalit", "kaya"}
    subordinators = {"dahil", "kapag", "upang", "kung", "sapagkat"}

    found_coord = False
    found_subord = False
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        lowered = tok.text.lower()
        if lowered in coordinators and tok.pos_ == "CCONJ":
            found_coord = True
        if lowered in subordinators and tok.pos_ == "SCONJ":
            found_subord = True

    labels = {
        (True, True): "Compound-Complex",
        (False, True): "Complex",
        (True, False): "Compound",
        (False, False): "Simple",
    }
    return labels[(found_coord, found_subord)]
# main func
def extract_features(text: str, nlp) -> Dict[str, Any]:
    """Compute the full feature dictionary for one piece of text.

    The text is cleaned with ``simple_clean``, parsed with ``nlp``, and then
    passed through each feature helper in this module.

    Args:
        text: Raw input text.
        nlp: Callable that turns a string into a parsed document (for
            example the pipeline returned by ``load_nlp_model``).

    Returns:
        An empty dict when ``text`` is not a non-empty string. Otherwise a
        dict with the keys ``num_words``, ``num_sentences``,
        ``mean_word_length``, ``mean_sentence_length``,
        ``polysyllabic_words``, ``lexical_density``, ``type_token_ratio``,
        ``foreign_word_density``, ``sentence_construction_type`` and
        ``sentence_type``, followed by the part-of-speech ratio keys from
        ``lexical_density_and_pos``.
    """
    # Check the type before the truth value: truth-testing an arbitrary
    # object can raise (pandas.NA and multi-element numpy arrays do), and
    # such values should be rejected like any other non-string input.
    if not isinstance(text, str) or not text:
        return {}

    cleaned = simple_clean(text)
    doc = nlp(cleaned)

    num_words, num_sentences, tokens = basic_counts(doc, cleaned)
    mean_word, mean_sentence = mean_lengths(tokens, num_words, num_sentences)
    ttr = type_token_ratio(tokens, num_words)
    poly = polysyllabic_count(tokens)
    lex_density, pos_ratios = lexical_density_and_pos(tokens, num_words)
    foreign_density = foreign_word_density(tokens)
    construction = detect_svo_vso(doc)
    sentence_type = detect_sentence_type(doc)

    # Key order is kept stable; pos_ratios entries always come last.
    return {
        "num_words": num_words,
        "num_sentences": num_sentences,
        "mean_word_length": mean_word,
        "mean_sentence_length": mean_sentence,
        "polysyllabic_words": poly,
        "lexical_density": lex_density,
        "type_token_ratio": ttr,
        "foreign_word_density": foreign_density,
        "sentence_construction_type": construction,
        "sentence_type": sentence_type,
        **pos_ratios,
    }