# ONE OF THE CORE PROGRAMS OF THE PROJECT. REFERENCED BY feature_extractor and feature_extractor_web.
import importlib
import re
import subprocess
import sys
import tempfile
import urllib.request
from typing import Any, Dict, List, Tuple

import calamancy
import spacy

MODEL_WHEEL_URL = (
    "https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main/"
    "tl_calamancy_md-any-py3-none-any.whl"
)
MODEL_WHEEL_LOCAL_NAME = "tl_calamancy_md-0.2.0-py3-none-any.whl"


def _install_model_wheel_workaround() -> None:
    """Install the model wheel via a valid local filename to satisfy pip.

    The wheel at MODEL_WHEEL_URL is downloaded into a temporary directory
    under the name MODEL_WHEEL_LOCAL_NAME and installed with
    ``pip install --no-deps`` using the running interpreter.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed download,
        or ``subprocess.CalledProcessError`` when pip exits non-zero.
    """
    # NOTE(review): the wheel is installed without any hash verification;
    # confirm that trusting the remote host is acceptable for this project.
    with tempfile.TemporaryDirectory() as tmpdir:
        local_wheel = f"{tmpdir}/{MODEL_WHEEL_LOCAL_NAME}"
        urllib.request.urlretrieve(MODEL_WHEEL_URL, local_wheel)
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--no-deps", local_wheel]
        )
    # The package was installed while this interpreter was already running;
    # refresh the import finders so the retry in the caller can see it.
    importlib.invalidate_caches()


def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
    """Load the CalamanCy model for either training or web inference.

    Three strategies are tried in order, and the first one that succeeds
    wins:

    1. ``spacy.load("tl_calamancy_md")`` on an already installed package.
    2. Download and pip-install the wheel under a valid filename, then
       ``spacy.load`` again.
    3. ``calamancy.load`` on ``model_name`` and the known model identifiers.

    Args:
        model_name: Identifier tried first in the ``calamancy.load`` fallback.

    Returns:
        The loaded pipeline object returned by ``spacy.load`` or
        ``calamancy.load``.

    Raises:
        RuntimeError: If every strategy fails; the message joins the error of
            each attempt with ``" | "``.
    """
    errors = []

    # Best path: load the installed spaCy package directly and avoid
    # calamancy's installer.
    try:
        return spacy.load("tl_calamancy_md")
    except Exception as exc:  # deliberate best-effort: try the next strategy
        errors.append(f"spacy.load(tl_calamancy_md): {exc}")

    # Workaround invalid upstream wheel filename by downloading and renaming
    # locally.
    try:
        _install_model_wheel_workaround()
        return spacy.load("tl_calamancy_md")
    except Exception as exc:  # deliberate best-effort: try the next strategy
        errors.append(f"manual wheel install: {exc}")

    # Last-resort fallback if calamancy fixes model installation behavior.
    # dict.fromkeys drops repeated names (the default model_name equals the
    # second entry) while keeping the original order, so no identifier is
    # attempted twice.
    candidates = dict.fromkeys(
        [model_name, "tl_calamancy_md-0.2.0", "tl_calamancy_md"]
    )
    for candidate in candidates:
        try:
            return calamancy.load(candidate)
        except Exception as exc:  # deliberate best-effort: try the next one
            errors.append(f"calamancy.load({candidate}): {exc}")

    raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))


# Merges sentences that contain dashes. Without this function, the model would split the sentence on every dash it encounters, which is counterproductive.
def merge_dash_sentences(doc) -> List:
    """Merge sentences split by dash tokens (from hyphenated words).

    Walks ``doc.sents`` in order. For each sentence after the first, the
    tokens from two positions before the end of the previously kept span up
    to two positions past the start of the current sentence are inspected;
    if any of them is exactly ``"-"`` the current sentence is fused into the
    previous span, otherwise it is kept as a new entry.

    Args:
        doc: Parsed document exposing ``sents``, ``len()``, integer indexing
            and slicing (presumably a spaCy ``Doc`` - verify against caller).

    Returns:
        List of sentence spans after merging; empty when ``doc`` has no
        sentences.
    """
    dash_tokens = {"-"}
    raw_sents = list(doc.sents)
    if not raw_sents:
        return []

    merged = [raw_sents[0]]
    for sent in raw_sents[1:]:
        prev = merged[-1]
        # Look at a small window straddling the sentence boundary.
        window_start = max(0, prev.end - 2)
        window_end = min(len(doc), sent.start + 2)
        dash_near_boundary = any(
            doc[i].text in dash_tokens for i in range(window_start, window_end)
        )
        if dash_near_boundary:
            # Extend the previous span so it also covers this sentence.
            merged[-1] = doc[prev.start : sent.end]
        else:
            merged.append(sent)
    return merged


# cleans the sentence, avoids misidentifying simple sentences as compound/complex
def simple_clean(text: str) -> str:
    """Lower-case ``text`` and strip everything except words and basic marks.

    Word characters, whitespace, hyphens and the sentence-ending marks
    ``.``, ``!`` and ``?`` are kept; every other character is removed.
    Whitespace is collapsed *after* the removal so that deleting a
    free-standing symbol (``"a , b"``, ``"a — b"``) cannot leave a run of
    spaces behind.

    Args:
        text: Raw input text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s\-.!?]", "", text)  # keep sentence-ending punctuation
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# gets the word count, sentence count, and the word tokens
def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
    """Count words and sentences in ``doc``.

    Args:
        doc: Iterable of tokens exposing ``is_punct`` and ``is_space``.
        original_text: Text used for the punctuation-based sentence fallback.

    Returns:
        ``(num_words, num_sentences, tokens)`` where ``tokens`` are the
        tokens that are neither punctuation nor whitespace.
    """
    tokens = [t for t in doc if not t.is_punct and not t.is_space]
    num_words = len(tokens)

    try:
        num_sentences = len(merge_dash_sentences(doc)) or 1
    except Exception:
        # Deliberate best-effort: treat the text as one sentence when
        # sentence segmentation is unavailable or fails.
        num_sentences = 1

    # Fallback for edge cases where sentence splitting fails: count the
    # non-empty chunks between runs of sentence-ending punctuation.
    if num_sentences == 1 and original_text:
        chunks = re.split(r"[.!?]+", original_text)
        punct_count = sum(1 for chunk in chunks if chunk.strip())
        if punct_count > num_sentences:
            num_sentences = punct_count

    return num_words, num_sentences, tokens


def mean_lengths(tokens, num_words: int, num_sentences: int) -> Tuple[float, float]:
    """Return ``(mean word length, mean sentence length)`` rounded to 4 places.

    Word length is measured in characters of ``token.text``; sentence length
    in words. Either value is 0 when its divisor is 0.
    """
    if num_words:
        mean_word_length = sum(len(t.text) for t in tokens) / num_words
    else:
        mean_word_length = 0
    if num_sentences:
        mean_sentence_length = num_words / num_sentences
    else:
        mean_sentence_length = 0
    return round(mean_word_length, 4), round(mean_sentence_length, 4)


# TTR. measures lexical diversity in a sample. Checks whether the vocabulary is rich or not.
def type_token_ratio(tokens, num_words: int) -> float:
    """Return distinct lower-cased token texts divided by ``num_words``.

    The result is rounded to 4 places and is 0 when ``num_words`` is 0.
    """
    distinct_words = {t.text.lower() for t in tokens}
    return round(len(distinct_words) / num_words if num_words else 0, 4)


def count_filipino_syllables(word: str) -> int:
    """Approximate Filipino syllable count by counting vowel nuclei.

    The word is lower-cased and Unicode-decomposed so that accented letters
    (``à``, ``î``, ``ñ`` ...) contribute their base letter instead of being
    discarded; everything other than ``a-z`` and ``-`` is then dropped and
    the vowels ``a e i o u`` are counted.

    Args:
        word: Word to analyse.

    Returns:
        0 for non-string input or when nothing is left after filtering;
        otherwise the vowel count, with a minimum of 1.
    """
    import unicodedata  # local import: keeps this block self-contained

    if not isinstance(word, str):
        return 0
    # NFD splits "à" into "a" + combining accent; the filter below then
    # removes only the accent, not the vowel.
    decomposed = unicodedata.normalize("NFD", word.lower())
    letters = re.sub(r"[^a-z-]", "", decomposed)
    if not letters:
        return 0
    # Hyphens are not vowels, so counting across the whole string equals the
    # sum over its hyphen-separated parts.
    syllables = len(re.findall(r"[aeiou]", letters))
    return max(syllables, 1)


# counts tokens that contain three or more syllables
def polysyllabic_count(tokens) -> int:
    """Return how many tokens have at least three (approximate) syllables."""
    return sum(1 for t in tokens if count_filipino_syllables(t.text) >= 3)


# Computes lexical density and part-of-speech ratios for the token list.
def lexical_density_and_pos(tokens, num_words: int):
    """Compute lexical density and selected part-of-speech ratios.

    Args:
        tokens: Tokens to inspect; ``pos_`` is read when present.
        num_words: Divisor for every ratio.

    Returns:
        ``(lexical_density, pos_ratios)``. ``lexical_density`` is the share
        of tokens tagged NOUN, VERB, ADJ or ADV. ``pos_ratios`` maps
        ``noun_ratio``, ``verb_ratio``, ``adj_ratio``, ``adv_ratio`` and
        ``pron_ratio`` to their share. All values are rounded to 4 places
        and are 0 when ``num_words`` is 0.
    """
    content_pos = {"NOUN", "VERB", "ADJ", "ADV"}
    ratio_tags = {
        "noun_ratio": "NOUN",
        "verb_ratio": "VERB",
        "adj_ratio": "ADJ",
        "adv_ratio": "ADV",
        "pron_ratio": "PRON",
    }

    content_words = 0
    pos_counts: Dict[str, int] = {}
    for t in tokens:
        pos = getattr(t, "pos_", None)
        if not pos:
            continue  # untagged token: counts toward neither tally
        pos_counts[pos] = pos_counts.get(pos, 0) + 1
        if pos in content_pos:
            content_words += 1

    lexical_density = content_words / num_words if num_words else 0
    pos_ratios = {
        key: round(pos_counts.get(tag, 0) / num_words if num_words else 0, 4)
        for key, tag in ratio_tags.items()
    }
    return round(lexical_density, 4), pos_ratios


# identifies foreign words by looking for letters foreign to the Filipino alphabet and computes its density.
def foreign_word_density(tokens):
    """Return the share of tokens that look like foreign (non-Filipino) words.

    A token longer than two characters counts as foreign when its
    lower-cased text contains one of a fixed list of English letter pairs or
    one of a fixed list of letters treated as foreign here. Tokens of two
    characters or fewer are never counted but still contribute to the
    divisor.

    Returns:
        ``count / len(tokens)`` rounded to 4 places, or 0 for no tokens.
    """
    english_ngrams = ("th", "ph", "sh", "ch", "wh", "ck", "qu")
    foreign_letters = ("f", "v", "z", "x", "q", "j", "c")
    markers = english_ngrams + foreign_letters

    count = 0
    for t in tokens:
        word = t.text.lower()
        if len(word) <= 2:
            continue  # too short to judge
        if any(marker in word for marker in markers):
            count += 1
    return round(count / len(tokens) if tokens else 0, 4)


# checks whether a sentence is a Subject-Verb-Object, or a Verb-Subject-Object
def detect_svo_vso(doc):
    """Classify the first sentence of ``doc`` as ``"SVO"`` or ``"VSO"``.

    Only the first (dash-merged) sentence is examined. The first token
    tagged NOUN, PRON or VERB decides the result: a leading VERB without any
    ``ay`` token in the sentence yields ``"VSO"``; anything else yields
    ``"SVO"``.

    If sentence segmentation raises, the whole ``doc`` is analysed as a
    single sentence. This mirrors ``basic_counts``, which counts one
    sentence in the same situation, so a segmentation failure no longer
    aborts feature extraction here.

    Returns:
        ``"VSO"``, ``"SVO"``, or ``"Unknown"`` when there is no sentence or
        no NOUN/PRON/VERB token.
    """
    try:
        sentences = merge_dash_sentences(doc)
    except Exception:
        # Deliberate best-effort, consistent with basic_counts.
        sentences = [doc]
    if not sentences:
        return "Unknown"

    words = [t for t in sentences[0] if not t.is_punct and not t.is_space]
    has_ay = any(t.text.lower() == "ay" for t in words)

    first_content = next(
        (t for t in words if t.pos_ in {"NOUN", "PRON", "VERB"}), None
    )
    if not first_content:
        return "Unknown"

    # first_content is NOUN, PRON or VERB here, so every case other than
    # "verb first and no 'ay'" is reported as SVO.
    if first_content.pos_ == "VERB" and not has_ay:
        return "VSO"
    return "SVO"


# detects keywords that identify subordinate and coordinate clauses. Classifies the sentence based on whichever clause it has.
def detect_sentence_type(doc):
    """Classify ``doc`` by the conjunctions it contains.

    A coordinating keyword only counts when tagged ``CCONJ`` and a
    subordinating keyword only when tagged ``SCONJ``. Punctuation and
    whitespace tokens are ignored.

    Returns:
        ``"Compound-Complex"`` when both kinds are present, ``"Complex"``
        for subordinating only, ``"Compound"`` for coordinating only, and
        ``"Simple"`` otherwise.
    """
    coord = {"at", "pero", "o", "maging", "saka", "subalit", "kaya"}
    subord = {"dahil", "kapag", "upang", "kung", "sapagkat"}
    labels = {
        (True, True): "Compound-Complex",
        (False, True): "Complex",
        (True, False): "Compound",
        (False, False): "Simple",
    }

    words = [tok for tok in doc if not (tok.is_punct or tok.is_space)]
    has_coord = any(
        tok.text.lower() in coord and tok.pos_ == "CCONJ" for tok in words
    )
    has_subord = any(
        tok.text.lower() in subord and tok.pos_ == "SCONJ" for tok in words
    )
    return labels[(has_coord, has_subord)]


# main func
def extract_features(text: str, nlp) -> Dict[str, Any]:
    """Run the full feature pipeline on ``text``.

    The text is cleaned with ``simple_clean``, parsed with ``nlp`` and passed
    through each feature helper in this module.

    Args:
        text: Raw input text.
        nlp: Callable that turns the cleaned text into a parsed document.

    Returns:
        ``{}`` when ``text`` is empty or not a string. Otherwise a dict with
        the count, length, density and classification features followed by
        the part-of-speech ratios.
    """
    if not (text and isinstance(text, str)):
        return {}

    cleaned = simple_clean(text)
    doc = nlp(cleaned)

    num_words, num_sentences, tokens = basic_counts(doc, cleaned)
    mean_word, mean_sentence = mean_lengths(tokens, num_words, num_sentences)
    ttr = type_token_ratio(tokens, num_words)
    poly = polysyllabic_count(tokens)
    lex_density, pos_ratios = lexical_density_and_pos(tokens, num_words)
    foreign_density = foreign_word_density(tokens)
    construction = detect_svo_vso(doc)
    sentence_type = detect_sentence_type(doc)

    features: Dict[str, Any] = {
        "num_words": num_words,
        "num_sentences": num_sentences,
        "mean_word_length": mean_word,
        "mean_sentence_length": mean_sentence,
        "polysyllabic_words": poly,
        "lexical_density": lex_density,
        "type_token_ratio": ttr,
        "foreign_word_density": foreign_density,
        "sentence_construction_type": construction,
        "sentence_type": sentence_type,
    }
    # Part-of-speech ratios go last, as in the original key order.
    features.update(pos_ratios)
    return features