Spaces:

Jandayl
/

Alalay

Sleeping

App Files Files Community

Alalay / feature_core.py

Jandayl

added comments

2b60cf4 about 2 months ago

raw

history blame contribute delete

8.91 kB

	# ONE OF THE CORE PROGRAMS OF THE PROJECT. REFERENCED BY feature_extractor and feature_extractor_web.
	import re
	import subprocess
	import sys
	import tempfile
	import urllib.request
	from typing import Any, Dict, List, Tuple

	import calamancy
	import spacy


	# Remote location of the model wheel. Its upstream file name is not accepted
	# by pip as-is, which is why a separate local name is defined below.
	MODEL_WHEEL_URL = "/".join(
	    (
	        "https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main",
	        "tl_calamancy_md-any-py3-none-any.whl",
	    )
	)
	# File name given to the downloaded copy so that pip can install it.
	MODEL_WHEEL_LOCAL_NAME = "tl_calamancy_md-0.2.0-py3-none-any.whl"


	def _install_model_wheel_workaround() -> None:
	    """Download the model wheel under a pip-compatible name and install it.

	    The wheel at ``MODEL_WHEEL_URL`` is saved into a temporary directory as
	    ``MODEL_WHEEL_LOCAL_NAME`` and installed with ``pip install --no-deps``
	    through the currently running interpreter. The temporary directory is
	    removed when the function returns.

	    Raises:
	        subprocess.CalledProcessError: If pip exits with a non-zero status.
	    """
	    with tempfile.TemporaryDirectory() as scratch_dir:
	        wheel_path = f"{scratch_dir}/{MODEL_WHEEL_LOCAL_NAME}"
	        urllib.request.urlretrieve(MODEL_WHEEL_URL, wheel_path)
	        pip_command = [
	            sys.executable,
	            "-m",
	            "pip",
	            "install",
	            "--no-deps",
	            wheel_path,
	        ]
	        subprocess.check_call(pip_command)


	def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
	    """Load the CalamanCy model once for either training or web inference.

	    Strategies are tried in order and the first one that succeeds wins:

	    1. ``spacy.load("tl_calamancy_md")`` on an already installed package.
	    2. Install the wheel through ``_install_model_wheel_workaround`` and
	       call ``spacy.load`` again.
	    3. ``calamancy.load`` with ``model_name`` followed by the known names.

	    Args:
	        model_name: Name tried first in the ``calamancy.load`` fallback. The
	            ``spacy.load`` attempts always use ``"tl_calamancy_md"``.

	    Returns:
	        Whatever ``spacy.load`` or ``calamancy.load`` returns.

	    Raises:
	        RuntimeError: If every strategy fails; the message lists each
	            individual failure separated by ``" | "``.
	    """
	    errors = []

	    # Best path: load the installed spaCy package directly and avoid calamancy's installer.
	    try:
	        return spacy.load("tl_calamancy_md")
	    except Exception as exc:
	        errors.append(f"spacy.load(tl_calamancy_md): {exc}")

	    # Workaround invalid upstream wheel filename by downloading and renaming locally.
	    try:
	        _install_model_wheel_workaround()
	        return spacy.load("tl_calamancy_md")
	    except Exception as exc:
	        errors.append(f"manual wheel install: {exc}")

	    # Last-resort fallback if calamancy fixes model installation behavior.
	    # dict.fromkeys removes duplicates while preserving order, so the default
	    # model_name is not attempted (and reported) twice.
	    candidates = dict.fromkeys([model_name, "tl_calamancy_md-0.2.0", "tl_calamancy_md"])
	    for candidate in candidates:
	        try:
	            return calamancy.load(candidate)
	        except Exception as exc:
	            errors.append(f"calamancy.load({candidate}): {exc}")

	    # A plain "|" separator: "\|" is an invalid escape sequence in a normal
	    # string literal and would also leak a backslash into the message.
	    raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))

	# Merges sentences that were split at dashes. Without this function, the model would split a sentence at every dash it encounters, which is counterproductive.
	def merge_dash_sentences(doc) -> List:
	    """Rejoin sentences that were split around a dash token.

	    Sentences from ``doc.sents`` are visited in order. If a ``"-"`` token
	    lies in the window that starts two tokens before the end of the most
	    recently kept span and stops before the second token after the start of
	    the next sentence, that sentence is folded into the kept span.

	    Args:
	        doc: Parsed document providing ``sents``, ``len()`` and integer/slice
	            indexing.

	    Returns:
	        The list of (possibly merged) sentence spans; empty when the
	        document has no sentences.
	    """
	    dashes = {"-"}
	    combined: List = []

	    for sentence in list(doc.sents):
	        if not combined:
	            # The first sentence has nothing before it to merge with.
	            combined.append(sentence)
	            continue

	        previous = combined[-1]
	        window = range(
	            max(0, previous.end - 2),
	            min(len(doc), sentence.start + 2),
	        )
	        if any(doc[idx].text in dashes for idx in window):
	            # Replace the kept span with one that covers both sentences.
	            combined[-1] = doc[previous.start : sentence.end]
	        else:
	            combined.append(sentence)

	    return combined

	# cleans the sentence, avoids misidentifying simple sentences as compound/complex
	def simple_clean(text: str) -> str:
	    """Normalize raw text before it is passed to the NLP pipeline.

	    The text is lower-cased, every character other than word characters,
	    whitespace, ``-``, ``.``, ``!`` and ``?`` is removed, and runs of
	    whitespace are collapsed into a single space.

	    Args:
	        text: Raw input text.

	    Returns:
	        The cleaned, stripped text, or ``""`` when ``text`` is not a string.
	    """
	    if not isinstance(text, str):
	        return ""
	    text = text.lower()
	    text = re.sub(r"[^\w\s\-.!?]", "", text)  # keep sentence-ending punctuation
	    # Collapse whitespace only after stripping symbols: removing a symbol that
	    # stood between two spaces would otherwise leave a double space behind.
	    text = re.sub(r"\s+", " ", text)
	    return text.strip()

	# gets the sentence, word, and token count
	def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
	    """Count words and sentences and collect the word tokens.

	    Args:
	        doc: Iterable of tokens exposing ``is_punct`` and ``is_space``.
	        original_text: Text used for the punctuation-based sentence fallback.

	    Returns:
	        A ``(num_words, num_sentences, tokens)`` tuple, where ``tokens`` are
	        the tokens that are neither punctuation nor whitespace.
	    """
	    words = [tok for tok in doc if not (tok.is_punct or tok.is_space)]

	    # Any failure of the sentence splitter is treated as "one sentence".
	    try:
	        sentence_total = len(merge_dash_sentences(doc)) or 1
	    except Exception:
	        sentence_total = 1

	    # Fallback for edge cases where sentence splitting fails: count the
	    # non-empty pieces between runs of sentence-ending punctuation.
	    if sentence_total == 1 and original_text:
	        pieces = re.split(r"[.!?]+", original_text)
	        non_empty = sum(1 for piece in pieces if piece.strip())
	        sentence_total = max(sentence_total, non_empty)

	    return len(words), sentence_total, words


	def mean_lengths(tokens, num_words: int, num_sentences: int):
	    """Return the mean word length and mean sentence length.

	    Args:
	        tokens: Word tokens exposing ``text``.
	        num_words: Number of words; divisor for the mean word length.
	        num_sentences: Number of sentences; divisor for the mean sentence
	            length.

	    Returns:
	        ``(mean_word_length, mean_sentence_length)``, each rounded to four
	        decimals. A mean whose divisor is zero is reported as ``0``.
	    """
	    if num_words:
	        total_chars = sum(len(tok.text) for tok in tokens)
	        avg_word = total_chars / num_words
	    else:
	        avg_word = 0

	    if num_sentences:
	        avg_sentence = num_words / num_sentences
	    else:
	        avg_sentence = 0

	    return round(avg_word, 4), round(avg_sentence, 4)

	# TTR. measures lexical diversity in a sample. Checks whether the vocabulary is rich or not.
	def type_token_ratio(tokens, num_words: int):
	    """Return the type-token ratio of the tokens.

	    Args:
	        tokens: Word tokens exposing ``text``; compared case-insensitively.
	        num_words: Total number of words, used as the divisor.

	    Returns:
	        Distinct lower-cased word forms divided by ``num_words``, rounded to
	        four decimals, or ``0`` when ``num_words`` is zero.
	    """
	    distinct_forms = {tok.text.lower() for tok in tokens}
	    if not num_words:
	        return 0
	    return round(len(distinct_forms) / num_words, 4)

	def count_filipino_syllables(word: str) -> int:
	    """Approximate the syllable count of a word by counting its vowels.

	    The word is lower-cased and reduced to the characters ``a-z`` and ``-``
	    before the vowels ``a``, ``e``, ``i``, ``o``, ``u`` are counted in each
	    dash-separated part.

	    Args:
	        word: Word to analyse.

	    Returns:
	        ``0`` for non-string input or when nothing is left after filtering;
	        otherwise the vowel count, with a minimum of ``1``.
	    """
	    if not isinstance(word, str):
	        return 0

	    letters = re.sub(r"[^a-z-]", "", word.lower())
	    if not letters:
	        return 0

	    vowel_total = sum(
	        len(re.findall(r"[aeiou]", chunk))
	        for chunk in letters.split("-")
	        if chunk
	    )
	    # A non-empty word is credited with at least one syllable.
	    return max(vowel_total, 1)

	# counts tokens that have 3 or more syllables
	def polysyllabic_count(tokens) -> int:
	    """Count the tokens whose text has three or more syllables.

	    Syllables are estimated with ``count_filipino_syllables``.

	    Args:
	        tokens: Word tokens exposing ``text``.

	    Returns:
	        The number of tokens with an estimated syllable count of at least 3.
	    """
	    total = 0
	    for tok in tokens:
	        if count_filipino_syllables(tok.text) >= 3:
	            total += 1
	    return total

	# Computes lexical density and part-of-speech ratios for the token list.
	def lexical_density_and_pos(tokens, num_words: int):
	    """Compute lexical density and per-tag ratios for the tokens.

	    Args:
	        tokens: Word tokens; the ``pos_`` attribute is read when present and
	            tokens with a missing or empty tag are ignored.
	        num_words: Total number of words, used as the divisor everywhere.

	    Returns:
	        ``(lexical_density, pos_ratios)``. ``lexical_density`` is the share
	        of NOUN, VERB, ADJ and ADV tokens. ``pos_ratios`` maps
	        ``noun_ratio``, ``verb_ratio``, ``adj_ratio``, ``adv_ratio`` and
	        ``pron_ratio`` to their shares. All values are rounded to four
	        decimals and are ``0`` when ``num_words`` is zero.
	    """
	    content_tags = ("NOUN", "VERB", "ADJ", "ADV")

	    tag_totals: Dict[str, int] = {}
	    for tok in tokens:
	        tag = getattr(tok, "pos_", None)
	        if not tag:
	            continue
	        tag_totals[tag] = tag_totals.get(tag, 0) + 1

	    def share(count: int):
	        # Guard against division by zero for empty input.
	        return round(count / num_words if num_words else 0, 4)

	    content_total = sum(tag_totals.get(tag, 0) for tag in content_tags)

	    pos_ratios = {
	        "noun_ratio": share(tag_totals.get("NOUN", 0)),
	        "verb_ratio": share(tag_totals.get("VERB", 0)),
	        "adj_ratio": share(tag_totals.get("ADJ", 0)),
	        "adv_ratio": share(tag_totals.get("ADV", 0)),
	        "pron_ratio": share(tag_totals.get("PRON", 0)),
	    }

	    return share(content_total), pos_ratios

	# identifies foreign words by looking for letters foreign to the Filipino alphabet and computes its density.
	def foreign_word_density(tokens):
	    """Return the share of tokens that look like foreign words.

	    A token counts as foreign when its lower-cased text is longer than two
	    characters and contains one of the listed letter pairs or letters.

	    Args:
	        tokens: Sequence of word tokens exposing ``text``.

	    Returns:
	        Foreign-looking tokens divided by ``len(tokens)``, rounded to four
	        decimals, or ``0`` when ``tokens`` is empty.
	    """
	    english_ngrams = ("th", "ph", "sh", "ch", "wh", "ck", "qu")
	    foreign_letters = ("f", "v", "z", "x", "q", "j", "c")
	    markers = english_ngrams + foreign_letters

	    def looks_foreign(word: str) -> bool:
	        # Words of one or two characters are never counted.
	        return len(word) > 2 and any(marker in word for marker in markers)

	    hits = sum(looks_foreign(tok.text.lower()) for tok in tokens)
	    return round(hits / len(tokens) if tokens else 0, 4)

	# checks whether a sentence is a Subject-Verb-Object, or a Verb-Subject-Object
	def detect_svo_vso(doc):
	    """Classify the first sentence of the document as ``"SVO"`` or ``"VSO"``.

	    Only the first sentence returned by ``merge_dash_sentences`` is examined,
	    ignoring punctuation and whitespace tokens.

	    Args:
	        doc: Parsed document whose tokens expose ``text``, ``pos_``,
	            ``is_punct`` and ``is_space``.

	    Returns:
	        ``"VSO"`` when the first NOUN/PRON/VERB token is a verb and the
	        sentence has no ``ay`` token; ``"SVO"`` when that token is a noun or
	        pronoun, or the sentence contains ``ay``; ``"Unknown"`` when there
	        is no sentence or no NOUN/PRON/VERB token.
	    """
	    sentences = merge_dash_sentences(doc)
	    if not sentences:
	        return "Unknown"

	    words = [tok for tok in sentences[0] if not (tok.is_punct or tok.is_space)]
	    contains_ay = any(tok.text.lower() == "ay" for tok in words)

	    lead = next(
	        (tok for tok in words if tok.pos_ in {"NOUN", "PRON", "VERB"}),
	        None,
	    )
	    if not lead:
	        return "Unknown"

	    if lead.pos_ == "VERB" and not contains_ay:
	        return "VSO"
	    # Every remaining case is a leading noun/pronoun or a sentence with "ay".
	    return "SVO"

	# Detects keywords that mark subordinate and coordinate clauses, and classifies the sentence by which kinds of clause it contains.
	def detect_sentence_type(doc):
	    """Classify the text by the conjunctions it contains.

	    A coordinating marker is a token tagged ``CCONJ`` whose lower-cased text
	    is in the coordinator list; a subordinating marker is a token tagged
	    ``SCONJ`` whose lower-cased text is in the subordinator list.
	    Punctuation and whitespace tokens are ignored.

	    Args:
	        doc: Iterable of tokens exposing ``text``, ``pos_``, ``is_punct`` and
	            ``is_space``.

	    Returns:
	        ``"Compound-Complex"`` when both kinds of marker occur, ``"Complex"``
	        for subordinating only, ``"Compound"`` for coordinating only, and
	        ``"Simple"`` otherwise.
	    """
	    coordinators = {"at", "pero", "o", "maging", "saka", "subalit", "kaya"}
	    subordinators = {"dahil", "kapag", "upang", "kung", "sapagkat"}

	    found_coord = False
	    found_subord = False
	    for tok in doc:
	        if tok.is_punct or tok.is_space:
	            continue
	        lowered = tok.text.lower()
	        if tok.pos_ == "CCONJ" and lowered in coordinators:
	            found_coord = True
	        elif tok.pos_ == "SCONJ" and lowered in subordinators:
	            found_subord = True

	    labels = {
	        (True, True): "Compound-Complex",
	        (False, True): "Complex",
	        (True, False): "Compound",
	        (False, False): "Simple",
	    }
	    return labels[(found_coord, found_subord)]

	# main func
	def extract_features(text: str, nlp) -> Dict[str, Any]:
	    """Extract the full feature dictionary for one piece of text.

	    The text is cleaned with ``simple_clean``, parsed with ``nlp`` and then
	    passed through the individual feature functions of this module.

	    Args:
	        text: Raw input text.
	        nlp: Callable that turns the cleaned text into a parsed document.

	    Returns:
	        A dictionary of counts, mean lengths, lexical measures, sentence
	        labels and part-of-speech ratios, or an empty dictionary when
	        ``text`` is empty or not a string.
	    """
	    if not text or not isinstance(text, str):
	        return {}

	    cleaned_text = simple_clean(text)
	    parsed = nlp(cleaned_text)

	    word_total, sentence_total, words = basic_counts(parsed, cleaned_text)
	    avg_word_len, avg_sentence_len = mean_lengths(words, word_total, sentence_total)
	    density, pos_ratios = lexical_density_and_pos(words, word_total)

	    features: Dict[str, Any] = {
	        "num_words": word_total,
	        "num_sentences": sentence_total,
	        "mean_word_length": avg_word_len,
	        "mean_sentence_length": avg_sentence_len,
	        "polysyllabic_words": polysyllabic_count(words),
	        "lexical_density": density,
	        "type_token_ratio": type_token_ratio(words, word_total),
	        "foreign_word_density": foreign_word_density(words),
	        "sentence_construction_type": detect_svo_vso(parsed),
	        "sentence_type": detect_sentence_type(parsed),
	    }
	    # The part-of-speech ratios are appended after the fixed keys.
	    features.update(pos_ratios)
	    return features