# Notulen_Otomatis/src/nlp_utils.py
"""
Advanced NLP utilities: NER + dependency parsing wrapper with graceful fallbacks.
Provides a small abstraction `AdvancedNLPExtractor` that will use spaCy if available
(or fallback regex/heuristic extractors) to extract structured action items and
decisions from sentence-level metadata.
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional
try:
import spacy
from spacy.language import Language
_HAS_SPACY = True
except Exception:
_HAS_SPACY = False
try:
from langdetect import detect as _detect_lang
_HAS_LANGDETECT = True
except Exception:
_HAS_LANGDETECT = False
logger = logging.getLogger("AdvancedNLP")
class AdvancedNLPExtractor:
    """Wrapper providing NER and dependency-based extraction.

    Uses spaCy when a model can be loaded, otherwise falls back to
    regex/heuristic extraction.

    Usage:
        extractor = AdvancedNLPExtractor()
        items = extractor.extract_actions_from_sentences(sent_meta)

    ``sent_meta`` is a list of dicts produced by
    ``BERTSummarizer._get_sentences_with_meta`` where each dict contains at
    least ``text``, ``speaker_id``, ``start``, ``end``.
    """

    def __init__(self, lang: Optional[str] = None):
        """Create the extractor, eagerly loading a spaCy model when possible.

        Args:
            lang: ISO language code ("id", "en", ...) or None to defer
                model selection until text is actually seen.
        """
        self.lang = lang
        self._nlp: Optional[Language] = None
        if _HAS_SPACY:
            try:
                model = self._choose_model(lang)
                if model is not None:
                    self._nlp = spacy.load(model)
                    logger.info("Loaded spaCy model: %s", model)
            except Exception as e:
                # Model not installed / incompatible: degrade to heuristics.
                logger.warning("spaCy model load failed: %s", e)
                self._nlp = None
        else:
            logger.debug("spaCy not available; using heuristic fallbacks")

    def _choose_model(self, lang: Optional[str]) -> Optional[str]:
        """Map a language code to a spaCy model name.

        Returns None when the language is unknown and langdetect is
        available, so the caller can detect the language from text first.
        """
        # Prefer language-specific small models if available.
        if lang is None and _HAS_LANGDETECT:
            return None  # leave None to let caller decide based on text
        if lang == "id":
            return "id_core_news_sm"
        if lang == "en":
            return "en_core_web_sm"
        # Fall back to the cross-lingual entity model if present.
        return "xx_ent_wiki_sm"

    def _detect_lang(self, text: str) -> Optional[str]:
        """Best-effort language detection; None when unavailable or failed."""
        if not _HAS_LANGDETECT:
            return None
        try:
            return _detect_lang(text)
        except Exception:
            return None

    def _get_doc(self, text: str):
        """Return a spaCy ``Doc`` for *text*, or None when parsing is unavailable.

        Lazily loads a model based on the detected language on first use.
        """
        if self._nlp is None:
            # Try to lazily pick a model based on the text's language.
            if _HAS_SPACY:
                lang = self._detect_lang(text)
                model = self._choose_model(lang)
                if model:
                    try:
                        self._nlp = spacy.load(model)
                        logger.info("Lazy-loaded spaCy model: %s", model)
                    except Exception:
                        self._nlp = None
            # Bug fix: previously this path returned None even after a
            # successful lazy load; now we fall through and use the
            # freshly loaded pipeline.
            if self._nlp is None:
                return None
        try:
            return self._nlp(text)
        except Exception:
            return None

    def extract_persons(self, text: str) -> List[str]:
        """Return unique person names found in *text*, in order of appearance."""
        doc = self._get_doc(text)
        if doc is None:
            # Fallback regex: runs of capitalized words (crude name heuristic).
            names = re.findall(r"\b([A-Z][a-z]{1,20}(?:\s+[A-Z][a-z]{1,20})*)\b", text)
            return list(dict.fromkeys(names))
        persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
        # Preserve order, drop duplicates.
        return list(dict.fromkeys(persons))

    def extract_actions_from_sentences(
        self, sent_meta: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Return candidate action items extracted from sentence metadata.

        Each returned dict contains: {owner, task, sentence_idx, confidence}.
        Owner falls back to the sentence's ``speaker_id`` when no named
        entity or pronoun can be attributed.
        """
        results: List[Dict[str, Any]] = []
        for i, s in enumerate(sent_meta):
            text = s.get("text", "").strip()
            if not text:
                continue
            # Quick keyword filter (Indonesian, then English): skip sentences
            # containing no action-like words at all.
            if not re.search(
                r"\b(akan|harus|perlu|tolong|mohon|harap|deadline|target|tugas|follow up|tindak lanjut|siapkan|buat|bikin|saya|aku|kami|kita)\b",
                text,
                flags=re.IGNORECASE,
            ) and not re.search(
                r"\b(will|shall|must|please|assign|task|deadline|action item|follow up|todo)\b",
                text,
                flags=re.IGNORECASE,
            ):
                continue
            doc = self._get_doc(text)
            owner: Optional[str] = None
            task: Optional[str] = None
            confidence = 0.5
            # First, try to find PERSON entities in the sentence.
            if doc is not None:
                persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
                if persons:
                    owner = persons[0]
                    confidence = 0.8
                # Dependency-parse based task extraction.
                try:
                    # Find the ROOT verb of the sentence.
                    root = None
                    for token in doc:
                        if token.dep_ == "ROOT" and token.pos_ in ("VERB", "AUX"):
                            root = token
                            break
                    if root is not None:
                        # Look for direct objects / open or clausal complements.
                        objs = [t for t in doc if t.dep_ in ("dobj", "obj", "xcomp", "ccomp")]
                        if objs:
                            task = " ".join(tok.text for tok in objs[0].subtree)
                            confidence = max(confidence, 0.7)
                        else:
                            # Fallback: use the root's subtree as the task.
                            task = " ".join(tok.text for tok in root.subtree)
                            confidence = max(confidence, 0.6)
                    # If no owner found, fall back to a personal pronoun.
                    if owner is None:
                        pron = [t for t in doc if t.pos_ == "PRON"]
                        if pron:
                            owner = pron[0].text
                            confidence = 0.6
                except Exception:
                    # Parse features are best-effort; heuristics below still run.
                    pass
            # Regex fallback to capture "Name akan <action>" style phrases.
            if owner is None:
                m = re.search(
                    r"\b([A-Z][a-z]{1,20})\b\s+(akan|will|harus|must|to)\s+(?P<task>.+)",
                    text,
                    flags=re.IGNORECASE,
                )
                if m:
                    owner = m.group(1)
                    task = m.group("task").strip(" .,:;-")
                    confidence = 0.7
            # Otherwise, attribute first-person commitments to the speaker.
            if owner is None and re.search(r"\b(saya|aku|kami|kita)\b", text, flags=re.IGNORECASE):
                owner = s.get("speaker_id")
                # Try to extract the phrase after 'akan' or a commitment verb.
                m2 = re.search(
                    r"\b(?:akan|saya akan|aku akan|saya akan membuat|aku akan membuat|tolong|siapkan|buat|bikin)\b\s*(?P<task>.+)$",
                    text,
                    flags=re.IGNORECASE,
                )
                if m2:
                    task = m2.group("task").strip(" .,:;-")
                    confidence = 0.7
            # Final fallback: use the whole sentence (minus leading fillers).
            if task is None:
                t = re.sub(r"^(oke|ya|nah|baik)\b[:,-]*", "", text, flags=re.IGNORECASE).strip()
                task = t[:300]
            # Basic length filter: drop tasks shorter than three words.
            if task and len(task.split()) < 3:
                continue
            results.append(
                {
                    "owner": owner or s.get("speaker_id"),
                    "task": task,
                    "sentence_idx": i,
                    "confidence": confidence,
                }
            )
        return results
def extract_decisions_from_sentences(sent_meta: List[Dict[str, Any]]) -> List[str]:
    """Simple decision extraction: look for decision keywords and return cleaned contexts.

    Args:
        sent_meta: sentence metadata dicts; only the ``text`` key is read.

    Returns:
        Sentences containing a decision keyword (Indonesian or English),
        with bracketed annotations such as "[inaudible]" removed.
    """
    results: List[str] = []
    decision_kw = re.compile(
        r"\b(diputuskan|disepakati|kesimpulan|keputusan|sepakat|setuju|disetujui|putus|decided|decision)\b",
        flags=re.IGNORECASE,
    )
    for s in sent_meta:
        text = s.get("text", "").strip()
        if not text:
            continue
        if decision_kw.search(text):
            # Drop bracketed transcription annotations, e.g. "[00:01]".
            cleaned = re.sub(r"\[.*?\]", "", text)
            results.append(cleaned.strip())
    return results