| |
import re
import subprocess
import sys
import tempfile
import unicodedata
import urllib.request
from typing import Any, Dict, List, Tuple

import calamancy
import spacy
|
|
|
|
# Filename the downloaded wheel is stored under locally before it is handed to
# pip; unlike the remote filename it carries a concrete version ("0.2.0").
MODEL_WHEEL_LOCAL_NAME = "tl_calamancy_md-0.2.0-py3-none-any.whl"

# Remote location of the Tagalog CalamanCy model wheel. The remote filename has
# "any" in the position where the local name above has a version number.
MODEL_WHEEL_URL = (
    "https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main/"
    "tl_calamancy_md-any-py3-none-any.whl"
)
|
|
|
|
def _install_model_wheel_workaround() -> None:
    """Download the model wheel and pip-install it from a temporary folder.

    The wheel at ``MODEL_WHEEL_URL`` is saved as ``MODEL_WHEEL_LOCAL_NAME``
    (a filename pip accepts) and installed with ``pip install --no-deps``
    through the currently running interpreter. The temporary folder is
    removed when the function exits.

    Raises:
        subprocess.CalledProcessError: If the pip process exits non-zero.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        wheel_path = f"{scratch_dir}/{MODEL_WHEEL_LOCAL_NAME}"
        urllib.request.urlretrieve(MODEL_WHEEL_URL, wheel_path)
        # pip must run while the temporary folder (and the wheel) still exist.
        pip_command = [sys.executable, "-m", "pip", "install", "--no-deps", wheel_path]
        subprocess.check_call(pip_command)
|
|
|
|
def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
    """Return a loaded CalamanCy pipeline for training or web inference.

    Loading strategies are attempted in this order, and the first success wins:

    1. ``spacy.load("tl_calamancy_md")`` (model already installed).
    2. Install the wheel via ``_install_model_wheel_workaround()`` and retry
       ``spacy.load("tl_calamancy_md")``.
    3. ``calamancy.load`` on ``model_name`` followed by the built-in fallback
       names, each distinct name tried once.

    This function does not cache the result; callers that want to load the
    model only once should keep a reference to the returned object.

    Args:
        model_name: Preferred model identifier for the ``calamancy.load``
            stage. It is not used by the two ``spacy.load`` attempts.

    Returns:
        The pipeline object returned by ``spacy.load`` or ``calamancy.load``.

    Raises:
        RuntimeError: If every strategy fails. The message lists each failed
            attempt and its error, separated by ``" | "``.
    """
    errors = []

    # Stage 1: the model package may already be installed.
    try:
        return spacy.load("tl_calamancy_md")
    except Exception as exc:  # best-effort: record and fall through
        errors.append(f"spacy.load(tl_calamancy_md): {exc}")

    # Stage 2: install the wheel manually, then retry the plain spaCy load.
    try:
        _install_model_wheel_workaround()
        return spacy.load("tl_calamancy_md")
    except Exception as exc:  # best-effort: record and fall through
        errors.append(f"manual wheel install: {exc}")

    # Stage 3: let calamancy resolve the model. The default ``model_name``
    # equals one of the fallbacks, so de-duplicate (keeping order) to avoid
    # repeating a failing load and reporting the same error twice.
    candidates = list(
        dict.fromkeys([model_name, "tl_calamancy_md-0.2.0", "tl_calamancy_md"])
    )
    for candidate in candidates:
        try:
            return calamancy.load(candidate)
        except Exception as exc:  # best-effort: record and try the next name
            errors.append(f"calamancy.load({candidate}): {exc}")

    raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))
|
|
| |
def merge_dash_sentences(doc) -> List:
    """Rejoin neighbouring sentences when a "-" token sits near their boundary.

    For each sentence after the first, the tokens from two positions before
    the end of the previously kept sentence up to (not including) two
    positions after the start of the current sentence are inspected. If any
    of them is exactly "-", the current sentence is absorbed into the
    previous one; otherwise it is kept as a separate entry.

    Args:
        doc: Parsed document exposing ``sents``, ``len()``, integer indexing
            to tokens with ``.text`` and slicing to spans with
            ``.start`` / ``.end``.

    Returns:
        A list of spans; empty when the document has no sentences.
    """
    sentences = list(doc.sents)
    if len(sentences) == 0:
        return []

    hyphens = {"-"}
    result = [sentences[0]]
    for current in sentences[1:]:
        previous = result[-1]
        window_start = max(0, previous.end - 2)
        window_stop = min(len(doc), current.start + 2)

        dash_nearby = False
        for index in range(window_start, window_stop):
            if doc[index].text in hyphens:
                dash_nearby = True
                break

        if not dash_nearby:
            result.append(current)
            continue
        # Replace the last kept span with one covering both sentences.
        result[-1] = doc[previous.start : current.end]
    return result
|
|
| |
def simple_clean(text: str) -> str:
    """Lower-case and normalise raw text before it is passed to the pipeline.

    Characters other than word characters, whitespace, hyphens and the
    sentence punctuation ``.``, ``!``, ``?`` are removed, and every run of
    whitespace is collapsed to a single space.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text with no leading or trailing whitespace, or ``""``
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # Remove disallowed symbols first: a symbol surrounded by spaces
    # ("a , b") would otherwise leave a double space after removal.
    kept = re.sub(r"[^\w\s\-.!?]", "", lowered)
    return re.sub(r"\s+", " ", kept).strip()
|
|
| |
def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
    """Count words and sentences and return the word tokens.

    Words are the tokens of ``doc`` that are neither punctuation nor
    whitespace. The sentence count comes from ``merge_dash_sentences``; if
    that fails or finds nothing, it is 1. When the count is 1 and
    ``original_text`` is non-empty, the number of non-blank fragments
    obtained by splitting on ``.``, ``!``, ``?`` is used if it is larger.

    Args:
        doc: Iterable of tokens exposing ``is_punct`` and ``is_space``.
        original_text: Text used for the punctuation-based fallback.

    Returns:
        ``(num_words, num_sentences, word_tokens)``.
    """
    words = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        words.append(token)
    word_total = len(words)

    # Best-effort: any failure during sentence segmentation means "one sentence".
    try:
        sentence_total = max(len(merge_dash_sentences(doc)), 1)
    except Exception:
        sentence_total = 1

    # Fallback for when only a single sentence was detected.
    if sentence_total == 1 and original_text:
        fragments = [
            piece for piece in re.split(r"[.!?]+", original_text) if piece.strip()
        ]
        sentence_total = max(sentence_total, len(fragments))

    return word_total, sentence_total, words
|
|
|
|
def mean_lengths(tokens, num_words: int, num_sentences: int):
    """Compute the average word length and average sentence length.

    Args:
        tokens: Word tokens exposing ``.text``.
        num_words: Divisor for the word-length mean and numerator for the
            sentence-length mean.
        num_sentences: Divisor for the sentence-length mean.

    Returns:
        ``(mean_word_length, mean_sentence_length)``, each rounded to four
        decimals; a mean is 0 when its divisor is zero.
    """
    if num_words:
        total_chars = sum(len(token.text) for token in tokens)
        avg_word = total_chars / num_words
    else:
        avg_word = 0

    if num_sentences:
        avg_sentence = num_words / num_sentences
    else:
        avg_sentence = 0

    return round(avg_word, 4), round(avg_sentence, 4)
|
|
| |
def type_token_ratio(tokens, num_words: int):
    """Return distinct lower-cased token texts divided by ``num_words``.

    Args:
        tokens: Word tokens exposing ``.text``.
        num_words: Divisor; the result is 0 when it is zero.

    Returns:
        The ratio rounded to four decimals.
    """
    distinct = {token.text.lower() for token in tokens}
    if not num_words:
        return round(0, 4)
    return round(len(distinct) / num_words, 4)
|
|
def count_filipino_syllables(word: str) -> int:
    """Approximate the syllable count of a Filipino word by counting vowels.

    The word is lower-cased and accented letters are decomposed so their base
    letter survives; everything except ``a``-``z`` and ``-`` is then dropped
    and each remaining ``a``, ``e``, ``i``, ``o``, ``u`` counts as one
    syllable.

    Args:
        word: Word to measure.

    Returns:
        0 when ``word`` is not a string or nothing is left after filtering;
        otherwise the vowel count, with a minimum of 1.
    """
    if not isinstance(word, str):
        return 0

    # NFKD splits accented letters (e.g. "à") into the base letter plus a
    # combining mark. Without this step the ASCII filter below would discard
    # the whole accented vowel and under-count words written with diacritics.
    decomposed = unicodedata.normalize("NFKD", word.lower())
    letters = re.sub(r"[^a-z-]", "", decomposed)
    if not letters:
        return 0

    # Hyphens contain no vowels, so counting over the whole string equals
    # summing the counts of the hyphen-separated parts.
    vowel_total = len(re.findall(r"[aeiou]", letters))
    return max(vowel_total, 1)
|
|
| |
def polysyllabic_count(tokens) -> int:
    """Count tokens whose text has at least three approximate syllables.

    Args:
        tokens: Iterable of tokens exposing ``.text``.

    Returns:
        Number of tokens for which ``count_filipino_syllables`` is >= 3.
    """
    total = 0
    for token in tokens:
        if count_filipino_syllables(token.text) >= 3:
            total += 1
    return total
|
|
| |
def lexical_density_and_pos(tokens, num_words: int):
    """Compute lexical density and per-tag ratios from part-of-speech tags.

    Lexical density is the share of tokens tagged NOUN, VERB, ADJ or ADV.
    Tokens with a missing or empty ``pos_`` are ignored when tallying.

    Args:
        tokens: Word tokens, normally exposing ``.pos_``.
        num_words: Divisor for every ratio; all ratios are 0 when it is zero.

    Returns:
        ``(lexical_density, pos_ratios)`` where ``pos_ratios`` maps
        ``noun_ratio``, ``verb_ratio``, ``adj_ratio``, ``adv_ratio`` and
        ``pron_ratio`` to values rounded to four decimals.
    """
    tag_totals: Dict[str, int] = {}
    for token in tokens:
        tag = getattr(token, "pos_", None)
        if not tag:
            continue
        tag_totals[tag] = tag_totals.get(tag, 0) + 1

    def share(tag):
        """Rounded fraction of ``num_words`` carrying ``tag``."""
        return round(tag_totals.get(tag, 0) / num_words if num_words else 0, 4)

    content_total = sum(
        tag_totals.get(tag, 0) for tag in ("NOUN", "VERB", "ADJ", "ADV")
    )
    density = content_total / num_words if num_words else 0

    ratios = {}
    for key, tag in (
        ("noun_ratio", "NOUN"),
        ("verb_ratio", "VERB"),
        ("adj_ratio", "ADJ"),
        ("adv_ratio", "ADV"),
        ("pron_ratio", "PRON"),
    ):
        ratios[key] = share(tag)

    return round(density, 4), ratios
|
|
| |
def foreign_word_density(tokens):
    """Estimate the share of tokens that look like foreign (English) words.

    A token is flagged when its lower-cased text is longer than two
    characters and contains one of the listed letter pairs or single letters.

    Args:
        tokens: Word tokens exposing ``.text``.

    Returns:
        Flagged tokens divided by ``len(tokens)``, rounded to four decimals;
        0 when ``tokens`` is empty.
    """
    # Letter pairs first, then single letters; matching any one flags the word.
    markers = (
        "th", "ph", "sh", "ch", "wh", "ck", "qu",
        "f", "v", "z", "x", "q", "j", "c",
    )

    hits = 0
    for token in tokens:
        lowered = token.text.lower()
        if len(lowered) > 2 and any(marker in lowered for marker in markers):
            hits += 1

    return round(hits / len(tokens) if tokens else 0, 4)
|
|
| |
def detect_svo_vso(doc):
    """Classify the first sentence of ``doc`` as "SVO", "VSO" or "Unknown".

    Only the first sentence returned by ``merge_dash_sentences`` is examined,
    ignoring punctuation and whitespace tokens. The first token tagged NOUN,
    PRON or VERB decides the label:

    * a leading VERB with no "ay" token in the sentence gives "VSO";
    * a leading NOUN/PRON, or any sentence containing "ay", gives "SVO".

    Args:
        doc: Parsed document accepted by ``merge_dash_sentences``.

    Returns:
        "SVO", "VSO", or "Unknown" when there is no sentence, no
        NOUN/PRON/VERB token, or sentence boundaries are unavailable.
    """
    try:
        sentences = merge_dash_sentences(doc)
    except ValueError:
        # spaCy's Doc.sents raises ValueError when sentence boundaries are not
        # set. Report "Unknown" rather than crash, in line with basic_counts,
        # which also tolerates failed sentence segmentation.
        return "Unknown"
    if not sentences:
        return "Unknown"

    words = [t for t in sentences[0] if not t.is_punct and not t.is_space]

    lead = next((t for t in words if t.pos_ in {"NOUN", "PRON", "VERB"}), None)
    # Compare against None explicitly: a token object's truthiness may depend
    # on its length rather than on whether a token was found.
    if lead is None:
        return "Unknown"

    has_ay = any(t.text.lower() == "ay" for t in words)
    if lead.pos_ == "VERB" and not has_ay:
        return "VSO"
    # Every remaining case is a leading NOUN/PRON or a sentence with "ay",
    # so no further fallback is reachable.
    return "SVO"
|
|
| |
def detect_sentence_type(doc):
    """Label ``doc`` as Simple, Compound, Complex or Compound-Complex.

    The label depends on whether the non-punctuation, non-whitespace tokens
    include a listed coordinating conjunction tagged CCONJ and/or a listed
    subordinating conjunction tagged SCONJ.

    Args:
        doc: Iterable of tokens exposing ``text``, ``pos_``, ``is_punct`` and
            ``is_space``.

    Returns:
        One of "Simple", "Compound", "Complex", "Compound-Complex".
    """
    coordinators = {"at", "pero", "o", "maging", "saka", "subalit", "kaya"}
    subordinators = {"dahil", "kapag", "upang", "kung", "sapagkat"}

    found_coord = False
    found_subord = False
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        lowered = token.text.lower()
        if lowered in coordinators and token.pos_ == "CCONJ":
            found_coord = True
        if lowered in subordinators and token.pos_ == "SCONJ":
            found_subord = True

    if found_subord:
        return "Compound-Complex" if found_coord else "Complex"
    return "Compound" if found_coord else "Simple"
|
|
| |
def extract_features(text: str, nlp) -> Dict[str, Any]:
    """Build the feature dictionary for a piece of text.

    The text is cleaned with ``simple_clean``, parsed with ``nlp``, and the
    helper functions in this module are applied to the resulting document.

    Args:
        text: Raw input text.
        nlp: Callable pipeline that turns a string into a parsed document.

    Returns:
        An empty dict when ``text`` is empty or not a string; otherwise a dict
        of counts, means, densities, the two categorical labels, and the
        part-of-speech ratios (added last).
    """
    if not text or not isinstance(text, str):
        return {}

    cleaned_text = simple_clean(text)
    parsed = nlp(cleaned_text)

    word_total, sentence_total, word_tokens = basic_counts(parsed, cleaned_text)
    avg_word_len, avg_sentence_len = mean_lengths(
        word_tokens, word_total, sentence_total
    )
    ttr_value = type_token_ratio(word_tokens, word_total)
    poly_total = polysyllabic_count(word_tokens)
    density, pos_ratios = lexical_density_and_pos(word_tokens, word_total)
    foreign_share = foreign_word_density(word_tokens)
    construction_label = detect_svo_vso(parsed)
    type_label = detect_sentence_type(parsed)

    features: Dict[str, Any] = {
        "num_words": word_total,
        "num_sentences": sentence_total,
        "mean_word_length": avg_word_len,
        "mean_sentence_length": avg_sentence_len,
        "polysyllabic_words": poly_total,
        "lexical_density": density,
        "type_token_ratio": ttr_value,
        "foreign_word_density": foreign_share,
        "sentence_construction_type": construction_label,
        "sentence_type": type_label,
    }
    # The POS ratios go last, matching the original key order.
    features.update(pos_ratios)
    return features
|
|