"""
Advanced NLP utilities: NER + dependency parsing wrapper with graceful fallbacks.
Provides a small abstraction `AdvancedNLPExtractor` that will use spaCy if available
(or fallback regex/heuristic extractors) to extract structured action items and
decisions from sentence-level metadata.
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional
try:
import spacy
from spacy.language import Language
_HAS_SPACY = True
except Exception:
_HAS_SPACY = False
try:
from langdetect import detect as _detect_lang
_HAS_LANGDETECT = True
except Exception:
_HAS_LANGDETECT = False
logger = logging.getLogger("AdvancedNLP")
class AdvancedNLPExtractor:
    """Wrapper providing NER and dependency-based extraction of action items.

    Uses spaCy when available and falls back to regex/heuristic extraction
    otherwise (see module-level availability flags).

    Usage:
        extractor = AdvancedNLPExtractor()
        items = extractor.extract_actions_from_sentences(sent_meta)

    `sent_meta` is a list of dicts produced by
    `BERTSummarizer._get_sentences_with_meta`, where each dict contains at
    least `text`, `speaker_id`, `start`, `end`.
    """
def __init__(self, lang: Optional[str] = None):
self.lang = lang
self._nlp: Optional[Language] = None
if _HAS_SPACY:
try:
model = self._choose_model(lang)
if model is not None:
self._nlp = spacy.load(model)
logger.info(f"Loaded spaCy model: {model}")
except Exception as e:
logger.warning(f"spaCy model load failed: {e}")
self._nlp = None
else:
logger.debug("spaCy not available; using heuristic fallbacks")
def _choose_model(self, lang: Optional[str]) -> Optional[str]:
# Prefer language-specific small models if available
if lang is None and _HAS_LANGDETECT:
return None # leave None to let caller decide based on text
if lang == "id":
return "id_core_news_sm"
if lang == "en":
return "en_core_web_sm"
# Fall back to cross-lingual entity model if present
return "xx_ent_wiki_sm"
def _detect_lang(self, text: str) -> Optional[str]:
if not _HAS_LANGDETECT:
return None
try:
return _detect_lang(text)
except Exception:
return None
def _get_doc(self, text: str):
# If spaCy is loaded, use it. Otherwise return None.
if self._nlp is None:
# try to lazily pick a model based on language
if _HAS_SPACY:
lang = self._detect_lang(text)
model = self._choose_model(lang)
if model:
try:
self._nlp = spacy.load(model)
logger.info(f"Lazy-loaded spaCy model: {model}")
except Exception:
self._nlp = None
return None
try:
return self._nlp(text)
except Exception:
return None
def extract_persons(self, text: str) -> List[str]:
doc = self._get_doc(text)
if doc is None:
# simple regex: capitalized words sequences
names = re.findall(r"\b([A-Z][a-z]{1,20}(?:\s+[A-Z][a-z]{1,20})*)\b", text)
return list(dict.fromkeys(names))
persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
# preserve order, unique
return list(dict.fromkeys(persons))
def extract_actions_from_sentences(
self, sent_meta: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Return candidate action items extracted from sentence metadata.
Each returned dict contains: {owner, task, sentence_idx, confidence}
"""
results: List[Dict[str, Any]] = []
texts = [s["text"] for s in sent_meta]
full = " ".join(texts[: max(1, min(10, len(texts)))])
lang = self._detect_lang(full) if _HAS_LANGDETECT else None
for i, s in enumerate(sent_meta):
text = s.get("text", "").strip()
if not text:
continue
# Quick keyword filter (language-agnostic): if no action words, skip
if not re.search(
r"\b(akan|harus|perlu|tolong|mohon|harap|deadline|target|tugas|follow up|tindak lanjut|siapkan|buat|bikin|saya|aku|kami|kita)\b",
text,
flags=re.IGNORECASE,
):
# also check for English keywords
if not re.search(
r"\b(will|shall|must|please|assign|task|deadline|action item|follow up|todo)\b",
text,
flags=re.IGNORECASE,
):
continue
doc = self._get_doc(text)
owner: Optional[str] = None
task: Optional[str] = None
confidence = 0.5
# First, try to find PERSON entities in the sentence
if doc is not None:
persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
if persons:
owner = persons[0]
confidence = 0.8
# dependency parse-based task extraction
try:
# find ROOT verb
root = None
for token in doc:
if token.dep_ == "ROOT" and token.pos_ in ("VERB", "AUX"):
root = token
break
if root is not None:
# look for direct objects / xcomp / ccomp
objs = [t for t in doc if t.dep_ in ("dobj", "obj", "xcomp", "ccomp")]
if objs:
task = " ".join([tok.text for tok in objs[0].subtree])
confidence = max(confidence, 0.7)
else:
# fallback: use root subtree as task
task = " ".join([tok.text for tok in root.subtree])
confidence = max(confidence, 0.6)
# If no owner found, search preceding tokens for personal pronouns
if owner is None:
pron = [t for t in doc if t.pos_ == "PRON"]
if pron:
owner = pron[0].text
confidence = 0.6
except Exception:
pass
# Regex fallback to capture "Name akan <action>" in many languages
if owner is None:
m = re.search(
r"\b([A-Z][a-z]{1,20})\b\s+(akan|will|harus|must|to)\s+(?P<task>.+)",
text,
flags=re.IGNORECASE,
)
if m:
owner = m.group(1)
task = m.group("task").strip(" .,:;-")
confidence = 0.7
# Otherwise, check for "Saya akan"/"Aku akan" and attribute to speaker
if owner is None and re.search(r"\b(saya|aku|kami|kita)\b", text, flags=re.IGNORECASE):
owner = s.get("speaker_id")
# try extract phrase after 'akan' or commit verb
m2 = re.search(
r"\b(?:akan|saya akan|aku akan|saya akan membuat|aku akan membuat|tolong|siapkan|buat|bikin)\b\s*(?P<task>.+)$",
text,
flags=re.IGNORECASE,
)
if m2:
task = m2.group("task").strip(" .,:;-")
confidence = 0.7
# final fallback: if sentence contains action keywords, use whole sentence
if task is None:
# trim connectors and filler
t = re.sub(r"^(oke|ya|nah|baik)\b[:,-]*", "", text, flags=re.IGNORECASE).strip()
task = t[:300]
# Basic length filter
if task and len(task.split()) < 3:
continue
results.append(
{
"owner": owner or s.get("speaker_id"),
"task": task,
"sentence_idx": i,
"confidence": confidence,
}
)
return results
def extract_decisions_from_sentences(sent_meta: List[Dict[str, Any]]) -> List[str]:
    """Simple decision extraction: look for decision keywords and return cleaned contexts.

    Args:
        sent_meta: list of sentence dicts with at least a `text` key.

    Returns:
        Sentences containing a decision keyword (Indonesian or English),
        with bracketed annotations like "[noise]" stripped.

    NOTE(review): takes no `self`, so treated as a module-level helper —
    confirm it was not meant to be a method of `AdvancedNLPExtractor`.
    """
    decision_kw = re.compile(
        r"\b(diputuskan|disepakati|kesimpulan|keputusan|sepakat|setuju|disetujui|putus|decided|decision)\b",
        flags=re.IGNORECASE,
    )
    results: List[str] = []
    # The original enumerate index was unused; iterate the dicts directly.
    for s in sent_meta:
        text = s.get("text", "").strip()
        if not text:
            continue
        if decision_kw.search(text):
            # Strip inline bracketed annotations (e.g. "[laughter]").
            cleaned = re.sub(r"\[.*?\]", "", text)
            results.append(cleaned.strip())
    return results