Spaces:

Jandayl
/

Alalay

Sleeping

File size: 8,908 Bytes

# ONE OF THE CORE PROGRAMS OF THE PROJECT. REFERENCED BY feature_extractor and feature_extractor_web. 
import re
import subprocess
import sys
import tempfile
import unicodedata
import urllib.request
from typing import Any, Dict, List, Tuple

import calamancy
import spacy


# Remote location of the model wheel. The filename published upstream is not
# accepted by pip, so the wheel is saved locally under MODEL_WHEEL_LOCAL_NAME
# before it is installed.
MODEL_WHEEL_URL = "/".join(
    [
        "https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main",
        "tl_calamancy_md-any-py3-none-any.whl",
    ]
)

# Filename given to the downloaded wheel on disk.
MODEL_WHEEL_LOCAL_NAME = "tl_calamancy_md-0.2.0-py3-none-any.whl"


def _install_model_wheel_workaround() -> None:
    """Download the model wheel under a pip-friendly filename and install it.

    The wheel is fetched from ``MODEL_WHEEL_URL`` into a temporary directory,
    stored as ``MODEL_WHEEL_LOCAL_NAME``, and installed with
    ``pip install --no-deps`` using the current interpreter. The temporary
    directory is removed when the function returns.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
        Exceptions from ``urllib.request.urlretrieve`` propagate unchanged.
    """
    pip_install = [sys.executable, "-m", "pip", "install", "--no-deps"]
    with tempfile.TemporaryDirectory() as scratch_dir:
        wheel_path = "/".join((scratch_dir, MODEL_WHEEL_LOCAL_NAME))
        urllib.request.urlretrieve(MODEL_WHEEL_URL, wheel_path)
        subprocess.check_call([*pip_install, wheel_path])


def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
    """Load the CalamanCy model once for either training or web inference.

    Three strategies are tried in order; the first one that succeeds wins:

    1. ``spacy.load("tl_calamancy_md")`` on an already-installed package.
    2. Download and install the wheel under a valid local filename, then
       ``spacy.load("tl_calamancy_md")`` again.
    3. ``calamancy.load`` with ``model_name`` and the known model names.

    Args:
        model_name: Name handed to ``calamancy.load`` first in step 3.

    Returns:
        The loaded pipeline object returned by ``spacy.load`` or
        ``calamancy.load``.

    Raises:
        RuntimeError: if every strategy fails. The message lists each
            attempt and its error, separated by ``" | "``.
    """
    errors = []

    # Best path: load the installed spaCy package directly and avoid calamancy's installer.
    try:
        return spacy.load("tl_calamancy_md")
    except Exception as exc:
        errors.append(f"spacy.load(tl_calamancy_md): {exc}")

    # Workaround invalid upstream wheel filename by downloading and renaming locally.
    try:
        _install_model_wheel_workaround()
        return spacy.load("tl_calamancy_md")
    except Exception as exc:
        errors.append(f"manual wheel install: {exc}")

    # Last-resort fallback if calamancy fixes model installation behavior.
    # De-duplicate while keeping order: with the default ``model_name`` the
    # raw list would retry the same name twice and report the same failure twice.
    candidates = []
    for name in (model_name, "tl_calamancy_md-0.2.0", "tl_calamancy_md"):
        if name not in candidates:
            candidates.append(name)

    for candidate in candidates:
        try:
            return calamancy.load(candidate)
        except Exception as exc:
            errors.append(f"calamancy.load({candidate}): {exc}")

    raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))

# Merges sentences that were split at a dash. Without this function, the model would split the sentence at every dash it encounters, which is counterproductive.
def merge_dash_sentences(doc) -> List:
    """Re-join neighbouring sentences that were separated around a dash.

    Args:
        doc: Parsed document. It must expose ``.sents``, support ``len()``,
            integer indexing to tokens with a ``.text`` attribute, and
            slicing to a span.

    Returns:
        A list of sentence spans. A sentence is folded into the one before
        it when a ``"-"`` token appears among the last two tokens of the
        previous sentence or the first two tokens of the current one.
        Returns an empty list when the document has no sentences.
    """
    sentences = list(doc.sents)
    if not sentences:
        return []

    hyphen_marks = {"-"}
    result = sentences[:1]

    for current in sentences[1:]:
        previous = result[-1]

        # Inspect a small window straddling the sentence boundary.
        window_lo = max(0, previous.end - 2)
        window_hi = min(len(doc), current.start + 2)
        window = range(window_lo, window_hi)

        if any(doc[idx].text in hyphen_marks for idx in window):
            # Replace the previous span with one covering both sentences.
            result[-1] = doc[previous.start : current.end]
            continue

        result.append(current)

    return result

# cleans the sentence, avoids misidentifying simple sentences as compound/complex
def simple_clean(text: str) -> str:
    """Normalize raw text before it is passed to the NLP pipeline.

    The text is lower-cased, stripped of every character that is not a word
    character, whitespace, ``-``, ``.``, ``!`` or ``?``, and then has each
    run of whitespace collapsed to a single space.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text with no leading or trailing whitespace, or ``""``
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s\-.!?]", "", text)  # keep sentence-ending punctuation
    # Collapse whitespace only after punctuation is removed; otherwise a
    # stripped character that sat between two spaces leaves a double space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Gets the word count, the sentence count, and the list of word tokens.
def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
    """Count words and sentences in a parsed document.

    Args:
        doc: Iterable of tokens exposing ``is_punct`` and ``is_space``.
        original_text: The text ``doc`` was built from; used only by the
            punctuation-based fallback.

    Returns:
        ``(num_words, num_sentences, tokens)`` where ``tokens`` holds every
        token that is neither punctuation nor whitespace.
    """
    words = [tok for tok in doc if not (tok.is_punct or tok.is_space)]

    # Any failure while merging sentences is treated as "one sentence".
    try:
        sentence_total = len(merge_dash_sentences(doc)) or 1
    except Exception:
        sentence_total = 1

    # Fallback for edge cases where sentence splitting fails: count the
    # non-empty pieces between runs of sentence-ending punctuation.
    if sentence_total == 1 and original_text:
        fragments = [
            piece for piece in re.split(r"[.!?]+", original_text) if piece.strip()
        ]
        sentence_total = max(sentence_total, len(fragments))

    return len(words), sentence_total, words


def mean_lengths(tokens, num_words: int, num_sentences: int):
    """Return the average word length and average sentence length.

    Args:
        tokens: Tokens exposing a ``.text`` attribute.
        num_words: Number of words; divisor for the word-length mean.
        num_sentences: Number of sentences; divisor for the sentence-length mean.

    Returns:
        ``(mean_word_length, mean_sentence_length)``, each rounded to four
        decimals. A mean is ``0`` when its divisor is zero.
    """
    if num_words:
        total_chars = sum(len(tok.text) for tok in tokens)
        avg_word = total_chars / num_words
    else:
        avg_word = 0

    if num_sentences:
        avg_sentence = num_words / num_sentences
    else:
        avg_sentence = 0

    return round(avg_word, 4), round(avg_sentence, 4)

# TTR. measures lexical diversity in a sample. Checks whether the vocabulary is rich or not. 
def type_token_ratio(tokens, num_words: int):
    """Return the ratio of distinct lower-cased word forms to ``num_words``.

    Args:
        tokens: Tokens exposing a ``.text`` attribute.
        num_words: Divisor for the ratio.

    Returns:
        The ratio rounded to four decimals, or ``0`` when ``num_words`` is zero.
    """
    distinct_forms = {tok.text.lower() for tok in tokens}
    if not num_words:
        return round(0, 4)
    return round(len(distinct_forms) / num_words, 4)

def count_filipino_syllables(word: str) -> int:
    """Approximate Filipino syllable count by counting vowel nuclei.

    Each of ``a e i o u`` counts as one syllable. Diacritics are folded
    first (``à`` -> ``a``) so accented vowels are counted instead of being
    dropped by the ASCII-only filter.

    Args:
        word: The word to measure.

    Returns:
        ``0`` when ``word`` is not a string or contains no ASCII letters or
        hyphens after folding; otherwise the vowel count, with a minimum of 1.
    """
    if not isinstance(word, str):
        return 0

    # Decompose accented characters and drop the combining marks so that
    # the base letter survives the ASCII filter below.
    decomposed = unicodedata.normalize("NFD", word.lower())
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    letters = re.sub(r"[^a-z-]", "", folded)
    if not letters:
        return 0

    # Hyphens are not vowels, so counting over the whole string equals
    # summing the counts of each hyphen-separated part.
    syllables = len(re.findall(r"[aeiou]", letters))

    return max(syllables, 1)

# Counts the tokens that contain 3 or more syllables.
def polysyllabic_count(tokens) -> int:
    """Return how many tokens have three or more syllables.

    Syllables are measured with ``count_filipino_syllables`` on each
    token's ``.text``.
    """
    total = 0
    for tok in tokens:
        if count_filipino_syllables(tok.text) >= 3:
            total += 1
    return total

# Computes lexical density and part-of-speech ratios for the token list.
def lexical_density_and_pos(tokens, num_words: int):
    """Compute lexical density and per-category part-of-speech ratios.

    Args:
        tokens: Tokens whose ``pos_`` attribute, when present and non-empty,
            holds the part-of-speech tag.
        num_words: Divisor for every ratio.

    Returns:
        ``(lexical_density, pos_ratios)``. ``lexical_density`` is the share
        of NOUN/VERB/ADJ/ADV tokens. ``pos_ratios`` maps ``noun_ratio``,
        ``verb_ratio``, ``adj_ratio``, ``adv_ratio`` and ``pron_ratio`` to
        their shares. All values are rounded to four decimals and are ``0``
        when ``num_words`` is zero.
    """
    lexical_tags = {"NOUN", "VERB", "ADJ", "ADV"}
    tag_totals: Dict[str, int] = {}
    lexical_total = 0

    for tok in tokens:
        tag = getattr(tok, "pos_", None)
        if not tag:
            # Tokens without a tag contribute to nothing.
            continue
        tag_totals[tag] = tag_totals.get(tag, 0) + 1
        if tag in lexical_tags:
            lexical_total += 1

    def share(tag: str):
        return round(tag_totals.get(tag, 0) / num_words if num_words else 0, 4)

    reported = (
        ("noun_ratio", "NOUN"),
        ("verb_ratio", "VERB"),
        ("adj_ratio", "ADJ"),
        ("adv_ratio", "ADV"),
        ("pron_ratio", "PRON"),
    )
    ratios = {key: share(tag) for key, tag in reported}

    density = lexical_total / num_words if num_words else 0
    return round(density, 4), ratios

# identifies foreign words by looking for letters foreign to the Filipino alphabet and computes its density.
def foreign_word_density(tokens):
    """Return the share of tokens that look like foreign (non-Filipino) words.

    A token is flagged when its lower-cased text is longer than two
    characters and contains one of the listed letter pairs or single letters.

    Args:
        tokens: Sized collection of tokens exposing ``.text``.

    Returns:
        Flagged tokens divided by the total number of tokens, rounded to
        four decimals; ``0`` when ``tokens`` is empty.
    """
    markers = (
        # Letter pairs treated as English spelling patterns.
        "th", "ph", "sh", "ch", "wh", "ck", "qu",
        # Single letters treated as foreign.
        "f", "v", "z", "x", "q", "j", "c",
    )

    def looks_foreign(word: str) -> bool:
        # Very short words are never flagged.
        return len(word) > 2 and any(marker in word for marker in markers)

    hits = sum(1 for tok in tokens if looks_foreign(tok.text.lower()))

    return round(hits / len(tokens) if tokens else 0, 4)

# checks whether a sentence is a Subject-Verb-Object, or a Verb-Subject-Object
def detect_svo_vso(doc):
    """Classify the first sentence of ``doc`` as ``"SVO"`` or ``"VSO"``.

    Only the first sentence returned by ``merge_dash_sentences`` is examined.
    The decision uses the first NOUN, PRON or VERB token and whether the
    word "ay" appears in the sentence.

    Returns:
        ``"VSO"`` when the first such token is a VERB and "ay" is absent;
        ``"SVO"`` when it is a NOUN or PRON, or when "ay" is present;
        ``"Unknown"`` when there are no sentences or no NOUN/PRON/VERB token.
    """
    merged_sentences = merge_dash_sentences(doc)
    if not merged_sentences:
        return "Unknown"

    words = [
        tok for tok in merged_sentences[0] if not (tok.is_punct or tok.is_space)
    ]

    ay_present = any(tok.text.lower() == "ay" for tok in words)

    anchor = next(
        (tok for tok in words if tok.pos_ in {"NOUN", "PRON", "VERB"}),
        None,
    )
    if not anchor:
        return "Unknown"

    if anchor.pos_ == "VERB" and not ay_present:
        return "VSO"

    # Every remaining case is either a NOUN/PRON anchor or a VERB anchor
    # with "ay" present, and both map to SVO.
    return "SVO"

# Detects keywords that mark subordinate and coordinate clauses, and classifies the sentence by which kinds of clause it contains.
def detect_sentence_type(doc):
    """Classify ``doc`` by the conjunctions it contains.

    A coordinating marker is a listed word tagged ``CCONJ``; a subordinating
    marker is a listed word tagged ``SCONJ``. Punctuation and whitespace
    tokens are ignored.

    Returns:
        ``"Compound-Complex"`` when both kinds are present, ``"Complex"``
        for subordinating only, ``"Compound"`` for coordinating only, and
        ``"Simple"`` otherwise.
    """
    coordinators = {"at", "pero", "o", "maging", "saka", "subalit", "kaya"}
    subordinators = {"dahil", "kapag", "upang", "kung", "sapagkat"}

    words = [tok for tok in doc if not (tok.is_punct or tok.is_space)]

    def has_marker(vocabulary, tag) -> bool:
        # The word must be in the list AND carry the expected tag.
        return any(
            tok.text.lower() in vocabulary and tok.pos_ == tag for tok in words
        )

    coordinating = has_marker(coordinators, "CCONJ")
    subordinating = has_marker(subordinators, "SCONJ")

    labels = {
        (True, True): "Compound-Complex",
        (False, True): "Complex",
        (True, False): "Compound",
        (False, False): "Simple",
    }
    return labels[(coordinating, subordinating)]

# main func
def extract_features(text: str, nlp) -> Dict[str, Any]:
    """Compute the full feature dictionary for one piece of text.

    The text is cleaned with ``simple_clean``, parsed with ``nlp``, and then
    passed through each feature helper in this module.

    Args:
        text: Raw input text.
        nlp: Callable that turns the cleaned text into a parsed document.

    Returns:
        A dict with the count, length, density, construction-type and
        sentence-type features followed by the part-of-speech ratios, or an
        empty dict when ``text`` is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return {}

    normalized = simple_clean(text)
    parsed = nlp(normalized)

    word_total, sentence_total, word_tokens = basic_counts(parsed, normalized)
    avg_word_len, avg_sentence_len = mean_lengths(
        word_tokens, word_total, sentence_total
    )
    diversity = type_token_ratio(word_tokens, word_total)
    long_words = polysyllabic_count(word_tokens)
    density, pos_ratios = lexical_density_and_pos(word_tokens, word_total)
    foreign_share = foreign_word_density(word_tokens)
    word_order = detect_svo_vso(parsed)
    clause_kind = detect_sentence_type(parsed)

    features: Dict[str, Any] = {
        "num_words": word_total,
        "num_sentences": sentence_total,
        "mean_word_length": avg_word_len,
        "mean_sentence_length": avg_sentence_len,
        "polysyllabic_words": long_words,
        "lexical_density": density,
        "type_token_ratio": diversity,
        "foreign_word_density": foreign_share,
        "sentence_construction_type": word_order,
        "sentence_type": clause_kind,
    }
    # Part-of-speech ratios go last, matching the key order callers see.
    features.update(pos_ratios)
    return features