"""
Advanced NLP utilities: NER + dependency parsing wrapper with graceful fallbacks.
Provides a small abstraction `AdvancedNLPExtractor` that will use spaCy if available
(or fallback regex/heuristic extractors) to extract structured action items and
decisions from sentence-level metadata.
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional
try:
import spacy
from spacy.language import Language
_HAS_SPACY = True
except Exception:
_HAS_SPACY = False
try:
from langdetect import detect as _detect_lang
_HAS_LANGDETECT = True
except Exception:
_HAS_LANGDETECT = False
logger = logging.getLogger("AdvancedNLP")
class AdvancedNLPExtractor:
    """Wrapper providing NER and dependency-based extraction of action items.

    Uses spaCy when available and falls back to regex/heuristic extraction
    otherwise (see module-level availability flags).

    Usage:
        extractor = AdvancedNLPExtractor()
        items = extractor.extract_actions_from_sentences(sent_meta)

    `sent_meta` is a list of dicts produced by
    `BERTSummarizer._get_sentences_with_meta`, where each dict contains at
    least `text`, `speaker_id`, `start`, `end`.
    """
def __init__(self, lang: Optional[str] = None):
self.lang = lang
self._nlp: Optional[Language] = None
if _HAS_SPACY:
try:
model = self._choose_model(lang)
if model is not None:
self._nlp = spacy.load(model)
logger.info(f"Loaded spaCy model: {model}")
except Exception as e:
logger.warning(f"spaCy model load failed: {e}")
self._nlp = None
else:
logger.debug("spaCy not available; using heuristic fallbacks")
def _choose_model(self, lang: Optional[str]) -> Optional[str]:
# Prefer language-specific small models if available
if lang is None and _HAS_LANGDETECT:
return None # leave None to let caller decide based on text
if lang == "id":
return "id_core_news_sm"
if lang == "en":
return "en_core_web_sm"
# Fall back to cross-lingual entity model if present
return "xx_ent_wiki_sm"
def _detect_lang(self, text: str) -> Optional[str]:
if not _HAS_LANGDETECT:
return None
try:
return _detect_lang(text)
except Exception:
return None
def _get_doc(self, text: str):
# If spaCy is loaded, use it. Otherwise return None.
if self._nlp is None:
# try to lazily pick a model based on language
if _HAS_SPACY:
lang = self._detect_lang(text)
model = self._choose_model(lang)
if model:
try:
self._nlp = spacy.load(model)
logger.info(f"Lazy-loaded spaCy model: {model}")
except Exception:
self._nlp = None
return None
try:
return self._nlp(text)
except Exception:
return None
def extract_persons(self, text: str) -> List[str]:
doc = self._get_doc(text)
if doc is None:
# simple regex: capitalized words sequences
names = re.findall(r"\b([A-Z][a-z]{1,20}(?:\s+[A-Z][a-z]{1,20})*)\b", text)
return list(dict.fromkeys(names))
persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
# preserve order, unique
return list(dict.fromkeys(persons))
def extract_actions_from_sentences(
self, sent_meta: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Return candidate action items extracted from sentence metadata.
Each returned dict contains: {owner, task, sentence_idx, confidence}
"""
results: List[Dict[str, Any]] = []
texts = [s["text"] for s in sent_meta]
full = " ".join(texts[: max(1, min(10, len(texts)))])
lang = self._detect_lang(full) if _HAS_LANGDETECT else None
for i, s in enumerate(sent_meta):
text = s.get("text", "").strip()
if not text:
continue
# Quick keyword filter (language-agnostic): if no action words, skip
if not re.search(
r"\b(akan|harus|perlu|tolong|mohon|harap|deadline|target|tugas|follow up|tindak lanjut|siapkan|buat|bikin|saya|aku|kami|kita)\b",
text,
flags=re.IGNORECASE,
):
# also check for English keywords
if not re.search(
r"\b(will|shall|must|please|assign|task|deadline|action item|follow up|todo)\b",
text,
flags=re.IGNORECASE,
):
continue
doc = self._get_doc(text)
owner: Optional[str] = None
task: Optional[str] = None
confidence = 0.5
# First, try to find PERSON entities in the sentence
if doc is not None:
persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
if persons:
owner = persons[0]
confidence = 0.8
# dependency parse-based task extraction
try:
# find ROOT verb
root = None
for token in doc:
if token.dep_ == "ROOT" and token.pos_ in ("VERB", "AUX"):
root = token
break
if root is not None:
# look for direct objects / xcomp / ccomp
objs = [t for t in doc if t.dep_ in ("dobj", "obj", "xcomp", "ccomp")]
if objs:
task = " ".join([tok.text for tok in objs[0].subtree])
confidence = max(confidence, 0.7)
else:
# fallback: use root subtree as task
task = " ".join([tok.text for tok in root.subtree])
confidence = max(confidence, 0.6)
# If no owner found, search preceding tokens for personal pronouns
if owner is None:
pron = [t for t in doc if t.pos_ == "PRON"]
if pron:
owner = pron[0].text
confidence = 0.6
except Exception:
pass
# Regex fallback to capture "Name akan <action>" in many languages
if owner is None:
m = re.search(
r"\b([A-Z][a-z]{1,20})\b\s+(akan|will|harus|must|to)\s+(?P<task>.+)",
text,
flags=re.IGNORECASE,
)
if m:
owner = m.group(1)
task = m.group("task").strip(" .,:;-")
confidence = 0.7
# Otherwise, check for "Saya akan"/"Aku akan" and attribute to speaker
if owner is None and re.search(r"\b(saya|aku|kami|kita)\b", text, flags=re.IGNORECASE):
owner = s.get("speaker_id")
# try extract phrase after 'akan' or commit verb
m2 = re.search(
r"\b(?:akan|saya akan|aku akan|saya akan membuat|aku akan membuat|tolong|siapkan|buat|bikin)\b\s*(?P<task>.+)$",
text,
flags=re.IGNORECASE,
)
if m2:
task = m2.group("task").strip(" .,:;-")
confidence = 0.7
# final fallback: if sentence contains action keywords, use whole sentence
if task is None:
# trim connectors and filler
t = re.sub(r"^(oke|ya|nah|baik)\b[:,-]*", "", text, flags=re.IGNORECASE).strip()
task = t[:300]
# Basic length filter
if task and len(task.split()) < 3:
continue
results.append(
{
"owner": owner or s.get("speaker_id"),
"task": task,
"sentence_idx": i,
"confidence": confidence,
}
)
return results
def extract_decisions_from_sentences(sent_meta: List[Dict[str, Any]]) -> List[str]:
    """Simple decision extraction: look for decision keywords and return cleaned contexts.

    Args:
        sent_meta: list of sentence dicts with at least a `text` key.

    Returns:
        Sentences containing a decision keyword (Indonesian or English),
        with bracketed annotations like "[noise]" stripped.

    NOTE(review): takes no `self`, so treated as a module-level helper —
    confirm it was not meant to be a method of `AdvancedNLPExtractor`.
    """
    decision_kw = re.compile(
        r"\b(diputuskan|disepakati|kesimpulan|keputusan|sepakat|setuju|disetujui|putus|decided|decision)\b",
        flags=re.IGNORECASE,
    )
    results: List[str] = []
    # The original enumerate index was unused; iterate the dicts directly.
    for s in sent_meta:
        text = s.get("text", "").strip()
        if not text:
            continue
        if decision_kw.search(text):
            # Strip inline bracketed annotations (e.g. "[laughter]").
            cleaned = re.sub(r"\[.*?\]", "", text)
            results.append(cleaned.strip())
    return results