SocioCausaNet / modeling_joint_causal.py

adding neural only mode for relation extraction

cf13840 verified 10 months ago

34.7 kB

	from __future__ import annotations
	from typing import Dict, List, Optional
	import torch
	import torch.nn as nn
	from transformers import AutoModel, PreTrainedModel
	from dataclasses import dataclass
	try:
	from .config import id2label_bio, id2label_rel, id2label_cls
	except ImportError:
	from config import id2label_bio, id2label_rel, id2label_cls

	try:
	from .configuration_joint_causal import JointCausalConfig
	except ImportError:
	from configuration_joint_causal import JointCausalConfig

	# ---------------------------------------------------------------------------
	# Type aliases & label maps
	# ---------------------------------------------------------------------------
	# Inverse lookups (label name -> integer id) derived from the id -> label
	# tables imported from the project config.
	label2id_bio = {name: idx for idx, name in id2label_bio.items()}
	label2id_rel = {name: idx for idx, name in id2label_rel.items()}
	label2id_cls = {name: idx for idx, name in id2label_cls.items()}

	# ---------------------------------------------------------------------------
	# Main module
	# ---------------------------------------------------------------------------
	"""Joint Causal Extraction Model (softmax)
	============================================================================

	A PyTorch module for joint causal extraction using softmax decoding for BIO tagging.
	The model supports class weights for handling imbalanced data.

	```python
	>>> model = JointCausalModel(config)  # softmax-based model
	```
	"""


	# ---------------------------------------------------------------------------
	# Span dataclass
	# ---------------------------------------------------------------------------
	@dataclass
	class Span:
	    """A labelled token span produced by BIO decoding.

	    ``start_tok`` and ``end_tok`` are inclusive indices into the token
	    sequence the span was decoded from; ``text`` is the cleaned surface
	    string of the span.
	    """

	    # Role suffix of the BIO tag; the visible code uses "C", "E" and "CE".
	    role: str
	    # Index of the first token of the span (inclusive).
	    start_tok: int
	    # Index of the last token of the span (inclusive).
	    end_tok: int
	    # Human-readable span text after clean-up.
	    text: str
	    # NOTE(review): never set to True in the visible code - confirm intended use.
	    is_virtual: bool = False


	# ---------------------------------------------------------------------------
	# Main module
	# ---------------------------------------------------------------------------

	class JointCausalModel(PreTrainedModel):

	"""Transformer encoder with three task heads for joint causal extraction.

	The encoder output (after dropout and layer normalisation) feeds:
	1. Classification (cls_head): predicts a sentence-level label from the
	hidden state of the first token.
	2. BIO tagging (bio_head): produces per-token emissions over the BIO
	label set. No CRF layer is defined in this class; `predict` decodes the
	emissions with a plain argmax.
	3. Relation extraction (rel_head): scores a (cause, effect) pair from the
	concatenation of the two mean-pooled span representations.
	"""
	# Link the model to its config class, as shown in the tutorial.
	config_class = JointCausalConfig

	# ------------------------------------------------------------------
	# constructor
	# -----------------------------------------------------------
	def __init__(self, config: JointCausalConfig):
	    """Build the encoder and the three task heads from ``config``.

	    Args:
	        config: Model configuration. The fields read here are
	            ``encoder_name`` (checkpoint passed to
	            ``AutoModel.from_pretrained``), ``dropout``,
	            ``num_cls_labels``, ``num_bio_labels`` and ``num_rel_labels``.
	    """
	    super().__init__(config)
	    self.config = config

	    self.enc = AutoModel.from_pretrained(config.encoder_name)
	    self.hidden_size = self.enc.config.hidden_size
	    self.dropout = nn.Dropout(config.dropout)
	    self.layer_norm = nn.LayerNorm(self.hidden_size)

	    width = self.hidden_size
	    half = width // 2
	    p_drop = config.dropout

	    def hidden_block(n_in: int, n_out: int) -> list:
	        # One Linear -> ReLU -> Dropout stage of a head.
	        return [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(p_drop)]

	    # Sentence-level classifier: hidden -> hidden/2 -> labels.
	    self.cls_head = nn.Sequential(
	        *hidden_block(width, half),
	        nn.Linear(half, config.num_cls_labels),
	    )
	    # Token tagger: hidden -> hidden -> hidden/2 -> labels.
	    self.bio_head = nn.Sequential(
	        *hidden_block(width, width),
	        *hidden_block(width, half),
	        nn.Linear(half, config.num_bio_labels),
	    )
	    # Pair scorer: takes two concatenated span vectors (2 * hidden).
	    self.rel_head = nn.Sequential(
	        *hidden_block(width * 2, width),
	        *hidden_block(width, half),
	        nn.Linear(half, config.num_rel_labels),
	    )
	    self._init_new_layer_weights()

	def get_config_dict(self) -> Dict:
	"""Returns the model's configuration as a dictionary."""
	return {
	"encoder_name": self.encoder_name,
	"num_cls_labels": self.num_cls_labels,
	"num_bio_labels": self.num_bio_labels,
	"num_rel_labels": self.num_rel_labels,
	"dropout": self.dropout_rate,
	}

	@classmethod
	def from_config_dict(cls, config: Dict) -> "JointCausalModel":
	    """Create a model from a dictionary of hyper-parameters.

	    ``__init__`` accepts a single ``JointCausalConfig``, so the dictionary
	    is wrapped in one first; unpacking it straight into the constructor
	    raises ``TypeError``.

	    Args:
	        config: Mapping with the keys produced by ``get_config_dict``.
	            An already-built ``JointCausalConfig`` is passed through
	            unchanged.

	    Returns:
	        A new ``JointCausalModel``.
	    """
	    if isinstance(config, JointCausalConfig):
	        return cls(config)
	    # NOTE(review): assumes JointCausalConfig accepts these keys as keyword
	    # arguments - confirm against configuration_joint_causal.py.
	    return cls(JointCausalConfig(**config))

	def _init_new_layer_weights(self):
	"""Initializes the weights of the newly added linear layers.

	Uses Xavier uniform initialization for weights and zeros for biases.
	"""
	for mod in [self.cls_head, self.bio_head, self.rel_head]:
	for sub_module in mod.modules():
	if isinstance(sub_module, nn.Linear):
	nn.init.xavier_uniform_(sub_module.weight)
	if sub_module.bias is not None:
	nn.init.zeros_(sub_module.bias)

	def encode(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
	"""Encodes the input using the transformer model.

	Args:
	input_ids: Tensor of input token IDs.
	attention_mask: Tensor indicating which tokens to attend to.

	Returns:
	Tensor of hidden states from the encoder, passed through dropout
	and layer normalization.
	"""
	hidden_states = self.enc(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
	return self.layer_norm(self.dropout(hidden_states))

	def forward(
	self,
	input_ids: torch.Tensor,
	attention_mask: torch.Tensor,
	*,
	bio_labels: torch.Tensor \| None = None,
	pair_batch: torch.Tensor \| None = None,
	cause_starts: torch.Tensor \| None = None,
	cause_ends: torch.Tensor \| None = None,
	effect_starts: torch.Tensor \| None = None,
	effect_ends: torch.Tensor \| None = None,
	) -> Dict[str, torch.Tensor \| None]:
	"""Performs a forward pass through the model.

	Args:
	input_ids: Tensor of input token IDs.
	attention_mask: Tensor indicating which tokens to attend to.
	bio_labels: Optional tensor of BIO labels for training.
	pair_batch: Optional tensor indicating which hidden states to use
	for relation extraction.
	cause_starts: Optional tensor of start indices for cause spans.
	cause_ends: Optional tensor of end indices for cause spans.
	effect_starts: Optional tensor of start indices for effect spans.
	effect_ends: Optional tensor of end indices for effect spans.

	Returns:
	A dictionary containing:
	- "cls_logits": Logits for the classification task.
	- "bio_emissions": Emissions from the BIO tagging head.
	- "tag_loss": Loss for the BIO tagging task (if bio_labels provided).
	- "rel_logits": Logits for the relation extraction task (if
	relation extraction inputs provided).
	"""
	# Encode input
	hidden = self.encode(input_ids, attention_mask)

	# Classification head
	cls_logits = self.cls_head(hidden[:, 0]) # Use [CLS] token representation

	# BIO tagging head
	emissions = self.bio_head(hidden)
	tag_loss: Optional[torch.Tensor] = None

	# Calculate BIO tagging loss if labels are provided
	if bio_labels is not None:
	# Softmax loss (typically handled by the training loop's loss function, e.g., CrossEntropyLoss)
	# Here, we initialize it to 0.0 as a placeholder.
	# The actual loss calculation for softmax would compare emissions with bio_labels.
	tag_loss = torch.tensor(0.0, device=emissions.device)

	# Relation extraction head
	rel_logits: torch.Tensor \| None = None
	if pair_batch is not None and cause_starts is not None and cause_ends is not None \
	and effect_starts is not None and effect_ends is not None:
	# Select hidden states corresponding to the pairs for relation extraction
	bio_states_for_rel = hidden[pair_batch]
	seq_len_rel = bio_states_for_rel.size(1)
	pos_rel = torch.arange(seq_len_rel, device=bio_states_for_rel.device).unsqueeze(0)

	# Create masks for cause and effect spans
	c_mask = ((cause_starts.unsqueeze(1) <= pos_rel) & (pos_rel <= cause_ends.unsqueeze(1))).unsqueeze(2)
	e_mask = ((effect_starts.unsqueeze(1) <= pos_rel) & (pos_rel <= effect_ends.unsqueeze(1))).unsqueeze(2)

	# Compute mean-pooled representations for cause and effect spans
	c_vec = (bio_states_for_rel * c_mask).sum(1) / c_mask.sum(1).clamp(min=1) # Average pooling, clamp to avoid div by zero
	e_vec = (bio_states_for_rel * e_mask).sum(1) / e_mask.sum(1).clamp(min=1) # Average pooling, clamp to avoid div by zero

	# Concatenate cause and effect vectors and pass through relation head
	rel_logits = self.rel_head(torch.cat([c_vec, e_vec], dim=1))

	return {
	"cls_logits": cls_logits,
	"bio_emissions": emissions,
	"tag_loss": tag_loss,
	"rel_logits": rel_logits,
	}

	# ---------------------------------------------------------------------------
	# Refactored prediction & post‑processing utilities for JointCausalModel
	# ---------------------------------------------------------------------------

	def predict(
	self,
	sents: List[str],
	tokenizer=None,
	*,
	rel_mode: str = "neural_only",
	rel_threshold: float = 0.8,
	cause_decision: str = "cls+span",
	) -> List[dict]:
	"""End‑to‑end inference for causal sentence extraction (batched).

	Args:
	sents: List of input sentences for causal extraction.
	tokenizer: Tokenizer instance for encoding sentences. If None, a default tokenizer is initialized.
	rel_mode: Strategy for relation extraction. "auto" mode simplifies relations when spans are limited.
	rel_threshold: Probability threshold for relation head to reduce spurious pairs.
	cause_decision: Strategy for determining causality ('cls_only', 'span_only', or 'cls+span').

	Returns:
	List of dictionaries containing:
	- "text": Original sentence.
	- "causal": Boolean indicating if the sentence is causal.
	- "relations": List of extracted causal relations.
	"""
	# ------------------------------------------------------------------
	# 0. Tokeniser & device
	# ------------------------------------------------------------------
	if tokenizer is None:
	from transformers import AutoTokenizer
	tokenizer = AutoTokenizer.from_pretrained(self.encoder_name, use_fast=True)

	device = next(self.parameters()).device
	to_dev = lambda d: {k: v.to(device) for k, v in d.items()} # Move tensors to the model's device

	outputs: List[dict] = []

	# ------------------------------------------------------------------
	# 1. Batch tokenize all sentences
	# ------------------------------------------------------------------
	enc = tokenizer(sents, return_tensors="pt", truncation=True, max_length=512, padding=True)
	enc = to_dev(enc) # Ensure tensors are on the correct device

	with torch.no_grad():
	base = self(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])

	cls_logits_batch = base["cls_logits"] # Sentence-level classification logits
	bio_emissions_batch = base["bio_emissions"] # BIO tagging emissions
	input_ids_batch = enc["input_ids"] # Token IDs for each sentence
	attention_mask_batch = enc["attention_mask"] # Attention mask for each sentence

	batch_size = input_ids_batch.size(0)

	for i in range(batch_size):
	seq_len = attention_mask_batch[i].sum().item() # Determine the actual sequence length
	input_ids = input_ids_batch[i][:seq_len] # Trim padding tokens
	bio_emissions = bio_emissions_batch[i][:seq_len] # Trim emissions to sequence length
	tokens = tokenizer.convert_ids_to_tokens(input_ids) # Convert token IDs to actual tokens
	bio_ids = bio_emissions.argmax(-1).tolist() # Get predicted BIO label indices
	bio_labels = [id2label_bio[j] for j in bio_ids] # Map indices to label names

	# Apply BIO rules to clean up predictions
	fixed_labels = self._apply_bio_rules(tokens, bio_labels)
	spans = self._merge_spans(tokens, fixed_labels, tokenizer) # Merge spans based on cleaned labels

	# Determine if the sentence is causal based on classification logits and spans
	is_causal = self._decide_causal(cls_logits_batch[i], spans, cause_decision)

	# ------------------------------------------------------------------
	# 2. Relation extraction (per sentence, as before)
	# ------------------------------------------------------------------
	rels: List[dict] = []
	pure_cause_spans = [s for s in spans if s.role == "C"] # Extract pure cause spans
	pure_effect_spans = [s for s in spans if s.role == "E"] # Extract pure effect spans
	ce_spans = [s for s in spans if s.role == "CE"] # Extract combined cause-effect spans
	cause_spans = pure_cause_spans + ce_spans
	effect_spans = pure_effect_spans + ce_spans

	if cause_spans and effect_spans:
	# Check for presence of pure causes/effects and combined spans
	has_pure_causes = len(pure_cause_spans) > 0
	has_pure_effects = len(pure_effect_spans) > 0
	has_ce_spans = len(ce_spans) > 0

	if has_ce_spans and not (has_pure_causes or has_pure_effects):
	pass # Skip relation extraction if only combined spans exist
	elif rel_mode == "auto" and (len(cause_spans) == 1 or len(effect_spans) == 1):
	# Simplified relation extraction for single spans
	if len(cause_spans) == 1:
	for e in effect_spans:
	if (cause_spans[0].text.lower() != e.text.lower() or
	(cause_spans[0].role == "CE" and e.role != "CE")):
	rels.append({"cause": cause_spans[0].text, "effect": e.text, "type": "Rel_CE"})
	else:
	for c in cause_spans:
	if (c.text.lower() != effect_spans[0].text.lower() or
	(c.role == "CE" and effect_spans[0].role != "CE")):
	rels.append({"cause": c.text, "effect": effect_spans[0].text, "type": "Rel_CE"})
	elif rel_mode == "neural_only":
	# Always use the relation head for all valid pairs
	pair_meta = []
	for c in cause_spans:
	for e in effect_spans:
	if (not (c.start_tok == e.start_tok and c.end_tok == e.end_tok) or
	(c.role == "CE" and e.role in {"C", "E"}) or
	(c.role in {"C", "E"} and e.role == "CE")):
	pair_meta.append((c, e))
	if pair_meta:
	# Prepare tensors for this sentence only
	pair_batch = torch.zeros(len(pair_meta), dtype=torch.long, device=device)
	cause_starts = torch.tensor([c.start_tok for c, _ in pair_meta], device=device)
	cause_ends = torch.tensor([c.end_tok for c, _ in pair_meta], device=device)
	effect_starts = torch.tensor([e.start_tok for _, e in pair_meta], device=device)
	effect_ends = torch.tensor([e.end_tok for _, e in pair_meta], device=device)
	rel_logits = self(
	input_ids=input_ids.unsqueeze(0),
	attention_mask=attention_mask_batch[i][:seq_len].unsqueeze(0),
	pair_batch=pair_batch,
	cause_starts=cause_starts,
	cause_ends=cause_ends,
	effect_starts=effect_starts,
	effect_ends=effect_ends,
	)["rel_logits"]
	probs = torch.softmax(rel_logits, dim=-1)[:, 1].tolist() # Extract probabilities for relation type
	for (c, e), p in zip(pair_meta, probs):
	if p >= rel_threshold and c.text.lower() != e.text.lower():
	rels.append({"cause": c.text, "effect": e.text, "type": "Rel_CE"})
	else:
	# Full relation extraction for multiple spans
	pair_meta = []
	for c in cause_spans:
	for e in effect_spans:
	if (not (c.start_tok == e.start_tok and c.end_tok == e.end_tok) or
	(c.role == "CE" and e.role in {"C", "E"}) or
	(c.role in {"C", "E"} and e.role == "CE")):
	pair_meta.append((c, e))
	if pair_meta:
	# Prepare tensors for this sentence only
	pair_batch = torch.zeros(len(pair_meta), dtype=torch.long, device=device)
	cause_starts = torch.tensor([c.start_tok for c, _ in pair_meta], device=device)
	cause_ends = torch.tensor([c.end_tok for c, _ in pair_meta], device=device)
	effect_starts = torch.tensor([e.start_tok for _, e in pair_meta], device=device)
	effect_ends = torch.tensor([e.end_tok for _, e in pair_meta], device=device)
	rel_logits = self(
	input_ids=input_ids.unsqueeze(0),
	attention_mask=attention_mask_batch[i][:seq_len].unsqueeze(0),
	pair_batch=pair_batch,
	cause_starts=cause_starts,
	cause_ends=cause_ends,
	effect_starts=effect_starts,
	effect_ends=effect_ends,
	)["rel_logits"]
	probs = torch.softmax(rel_logits, dim=-1)[:, 1].tolist() # Extract probabilities for relation type
	for (c, e), p in zip(pair_meta, probs):
	if p >= rel_threshold and c.text.lower() != e.text.lower():
	rels.append({"cause": c.text, "effect": e.text, "type": "Rel_CE"})
	# Remove duplicate relations
	seen = set()
	uniq = []
	for r in rels:
	key = (r["cause"].lower(), r["effect"].lower())
	if key not in seen:
	seen.add(key)
	uniq.append(r)
	rels = uniq

	# If the sentence is predicted as non-causal, ensure no spans or relations are returned
	if not is_causal:
	outputs.append({
	"text": sents[i],
	"causal": is_causal,
	"relations": [], # Empty relations
	"spans": [], # Empty spans
	})
	else:
	outputs.append({
	"text": sents[i],
	"causal": is_causal,
	"relations": rels,
	})

	return outputs

	# ------------------------------------------------------------------
	# BIO utilities
	# ------------------------------------------------------------------
	@staticmethod
	def _apply_bio_rules(tok: List[str], lab: List[str]) -> List[str]:
	"""Light‑touch BIO sanitiser that fixes intra‑span role clashes and
	common WordPiece artefacts while deferring to model probabilities.

	Added rule (R‑6)
	----------------
	When a contiguous non‑O block mixes C and E roles (e.g.
	``B‑C I‑C I‑E I‑C``) we collapse the entire block to the majority
	role (ties prefer C). Only the first token keeps the ``B‑`` prefix.
	"""
	n = len(tok)
	out = lab.copy()

	# R‑1 propagate to ## -------------------------------------------------
	for i in range(1, n):
	if tok[i].startswith("##") and out[i] == "O" and out[i-1] != "O":
	role = out[i-1].split("-")[-1]
	out[i] = f"I-{role}"

	# R‑2 stray I‑tags → B ----------------------------------------------
	for i in range(n):
	if out[i].startswith("I-") and (i == 0 or out[i-1] == "O"):
	out[i] = out[i].replace("I-", "B-", 1)

	# R‑3 merge adjacent B blocks of same role ---------------------------
	for i in range(1, n):
	if out[i].startswith("B-") and out[i-1] != "O":
	role_prev = out[i-1].split("-")[-1]
	role_curr = out[i].split("-")[-1]
	if role_prev == role_curr:
	out[i] = out[i].replace("B-", "I-", 1)

	# R‑4 (removed): We no longer force punctuation tokens to O
	# This keeps apostrophes/hyphens inside spans when the model labels them. # R‑5 CE disambiguation - only convert CE if no other roles present
	roles_present = {tag.split("-")[-1] for tag in out if tag != "O"}
	if "CE" in roles_present and "C" not in roles_present and "E" not in roles_present:
	# Only CE tags present - convert all to C (arbitrary choice)
	for i, tag in enumerate(out):
	if tag.endswith("CE"):
	out[i] = tag[:-2] + "C"

	# R‑6 intra‑span role clash fix - preserve CE spans when meaningful
	i = 0
	while i < n:
	if out[i] == "O":
	i += 1
	continue
	start = i
	role_counts = {"C": 0, "E": 0, "CE": 0}
	has_mixed_roles = False

	# Count roles in this span and check for mixing
	while i < n and out[i] != "O" and not (i > start and out[i].startswith("B-")):
	role = out[i].split("-")[-1]
	role_counts[role] += 1
	i += 1

	# Check if span has mixed C/E roles (not including CE)
	non_ce_roles = set()
	j = start
	while j < i:
	role = out[j].split("-")[-1]
	if role in {"C", "E"}:
	non_ce_roles.add(role)
	j += 1

	if len(non_ce_roles) > 1:
	# Mixed C and E tags - resolve to majority
	maj = "C" if role_counts["C"] >= role_counts["E"] else "E"
	j = start
	first = True
	while j < i:
	out[j] = ("B-" if first else "I-") + maj
	first = False
	j += 1
	elif role_counts["CE"] > 0 and len(non_ce_roles) == 0:
	# Pure CE span - keep as CE
	j = start
	first = True
	while j < i:
	out[j] = ("B-" if first else "I-") + "CE"
	first = False
	j += 1
	elif role_counts["CE"] > 0 and len(non_ce_roles) == 1:
	# CE mixed with single pure role - check if CE is meaningful
	# If we have other pure spans of different types, keep CE
	other_roles = {tag.split("-")[-1] for tag in out if tag != "O"}
	pure_role = list(non_ce_roles)[0]

	if (pure_role == "C" and "E" in other_roles) or (pure_role == "E" and "C" in other_roles):
	# CE is meaningful - keep it
	j = start
	first = True
	while j < i:
	out[j] = ("B-" if first else "I-") + "CE"
	first = False
	j += 1
	else:
	# CE not meaningful - convert to pure role
	j = start
	first = True
	while j < i:
	out[j] = ("B-" if first else "I-") + pure_role
	first = False
	j += 1

	# R‑7 connector & punctuation bridge ----------------------------------
	CONNECT = {"of", "to", "with", "for", "and", "or", "but", "in"}
	for k in range(1, n - 1):
	left_role = out[k - 1].split("-")[-1] if out[k - 1] != "O" else None
	right_role = out[k + 1].split("-")[-1] if out[k + 1] != "O" else None
	if not left_role or left_role != right_role:
	continue
	# 7a: connector word originally tagged O
	if out[k] == "O" and tok[k].lower() in CONNECT:
	out[k] = "I-" + left_role
	# 7b: single‑char punctuation / hyphen / apostrophe bridge
	elif out[k] == "O" and len(tok[k]) == 1 and not tok[k].isalnum():
	out[k] = "I-" + left_role # 7c: mis‑role single token sandwiched by same role
	elif out[k].startswith("I-") and out[k].split("-")[-1] != left_role:
	out[k] = "I-" + left_role

	# R‑8 gap‑tolerant B‑tag merging ------------------------------------
	# Merge B- tags of the same type separated by small gaps (≤1 O tokens)
	# This reduces span fragmentation like "B-E O B-E" -> "B-E I-E I-E"
	b_positions = {}
	for i, label in enumerate(out):
	if label.startswith("B-"):
	role = label.split("-")[1]
	if role not in b_positions:
	b_positions[role] = []
	b_positions[role].append(i)

	for role, positions in b_positions.items():
	if len(positions) < 2:
	continue

	# Group positions that are close together (gap ≤ 1)
	groups = []
	current_group = [positions[0]]

	for i in range(1, len(positions)):
	prev_pos = positions[i-1]
	curr_pos = positions[i]
	gap_size = curr_pos - prev_pos - 1

	if gap_size <= 1: # Allow gaps of 0 or 1 O tokens
	gap_labels = out[prev_pos + 1:curr_pos]
	if all(label == "O" for label in gap_labels):
	current_group.append(curr_pos)
	else:
	groups.append(current_group)
	current_group = [curr_pos]
	else:
	groups.append(current_group)
	current_group = [curr_pos]

	groups.append(current_group)

	# Merge groups with multiple B- tags
	for group in groups:
	if len(group) > 1:
	first_pos = group[0]
	last_pos = group[-1]

	for pos in range(first_pos + 1, last_pos + 1):
	if pos in group[1:]: # B- tag to convert
	out[pos] = f"I-{role}"
	elif out[pos] == "O": # Fill gap
	out[pos] = f"I-{role}"

	return out

	# ------------------------------------------------------------------
	@staticmethod
	def _merge_spans(tok: List[str], lab: List[str], tokenizer) -> List["Span"]:
	"""Turn cleaned BIO labels into Span objects.

	Policy:
	1. First pass – assemble raw spans, letting them bridge a single
	connector (of, to, with, for, and, or, but, in).
	2. Trim leading/trailing connectors & punctuation.
	3. Normalise hyphen spacing & strip quotes.
	4. Role‑wise pruning – if a role has ≥1 span with ≥2 words, drop
	all its 1‑word spans. This removes stray nouns like "choices"
	while preserving them when they are the only cause/effect.
	"""
	CONNECT = {"of", "to", "with", "for", "and", "or", "but", "in"}

	spans: List[Span] = []
	i, n = 0, len(tok)
	while i < n:
	if lab[i] == "O":
	i += 1; continue
	role = lab[i].split("-")[-1]
	s = i
	i += 1
	while i < n:
	if lab[i].startswith("I-"):
	i += 1; continue
	if tok[i].lower() in CONNECT and lab[i] == "O" and i+1 < n and lab[i+1].startswith("I-"):
	i += 1; continue
	break
	e = i - 1
	text = tokenizer.convert_tokens_to_string(tok[s:e+1])
	# basic cleanup
	text = text.replace(" - ", "-").replace(" -", "-").replace("- ", "-")
	text = text.strip("\"'”’““”")
	words = text.split()
	while words and words[0].lower() in CONNECT:
	words.pop(0)
	while words and words[-1].lower() in CONNECT:
	words.pop()
	if not words:
	continue
	clean_text = " ".join(words)
	spans.append(Span(role, s, e, clean_text)) # role‑wise pruning --------------------------------------------------
	from collections import defaultdict, OrderedDict
	import re
	by_role = defaultdict(list)
	for sp in spans:
	by_role[sp.role].append(sp)
	final: List[Span] = []
	for role, group in by_role.items():
	has_multi = any((g.end_tok - g.start_tok) >= 1 for g in group)
	for sp in group:
	single_tok = (sp.end_tok - sp.start_tok) == 0
	# Only remove single-token spans if they look like artifacts
	# Keep all meaningful single-token spans like "depression", "cancer", etc.
	if single_tok:
	# Check if the span text looks like a meaningful entity
	is_meaningful = (
	len(sp.text) > 2 and # Longer than 2 characters
	sp.text.isalpha() and # Only alphabetic characters
	not sp.text.lower() in {"this", "that", "it", "they", "them", "he", "she", "we", "i", "you"} # Not pronouns
	)
	if not is_meaningful and has_multi:
	# Only skip single-token spans that seem like artifacts when multi-token spans exist
	if role == "C" or role == "E":
	continue
	final.append(sp)
	final.sort(key=lambda s: s.start_tok)
	# second pass: merge over pure punctuation gaps only -----------------
	merged: List[Span] = []
	def is_punct(tok):
	return len(tok) == 1 and not tok.isalnum()
	for sp in final:
	if merged and sp.role == merged[-1].role:
	gap_tokens = tok[merged[-1].end_tok + 1 : sp.start_tok]
	if gap_tokens and all(is_punct(t) for t in gap_tokens):
	# safe to merge across punctuation (e.g., apostrophe or hyphen)
	combined_text = tokenizer.convert_tokens_to_string(tok[merged[-1].start_tok: sp.end_tok + 1]).strip("\"'”’““”")
	merged[-1] = Span(sp.role, merged[-1].start_tok, sp.end_tok, combined_text)
	continue
	merged.append(sp)
	return merged

	def _decide_causal(self, cls_logits, spans, cause_decision):
	"""Determine if a sentence is causal based on classification logits and spans.

	Args:
	cls_logits: Tensor of classification logits
	spans: List of extracted spans
	cause_decision: Strategy for determining causality ('cls_only', 'span_only', or 'cls+span')

	Returns:
	bool: True if the sentence is determined to be causal
	"""
	prob_causal = torch.softmax(cls_logits, dim=-1)[1].item()

	# Check for presence of both cause and effect spans (CE spans count as both)
	has_cause_spans = any(x.role in ("C", "CE") for x in spans)
	has_effect_spans = any(x.role in ("E", "CE") for x in spans)
	has_both_spans = has_cause_spans and has_effect_spans

	if cause_decision == "cls_only":
	return prob_causal >= 0.5
	elif cause_decision == "span_only":
	return has_both_spans
	else: # "cls+span" - default behavior
	return prob_causal >= 0.5 and has_both_spans