Spaces:

rudaoshi
/

lang2logic

Sleeping

App Files Files Community

rudaoshi committed on Dec 8, 2025

Commit

2d45476

1 Parent(s): 6dd951d

implement app

Browse files

Files changed (17) hide show

README.md +46 -7
app.py +528 -132
inference.py +745 -0
lingua/__init__.py +0 -0
lingua/concept/standard.py +59 -0
lingua/learn/__init__.py +0 -0
lingua/learn/wordgraph/__init__.py +0 -0
lingua/learn/wordgraph/modeler/__init__.py +0 -0
lingua/learn/wordgraph/modeler/word2gp.py +290 -0
lingua/structure/__init__.py +1 -0
lingua/structure/basegraph.py +887 -0
lingua/structure/gpgraph.py +1683 -0
lingua/structure/utils.py +116 -0
lingua/utils/__init__.py +0 -0
lingua/utils/topology_sorter.py +140 -0
model.py +340 -0
requirements.txt +8 -6

README.md CHANGED Viewed

@@ -1,13 +1,52 @@
 ---
-title: Lang2logic
-emoji: 🖼
-colorFrom: purple
-colorTo: red
 sdk: gradio
-sdk_version: 5.44.0
 app_file: app.py
 pinned: false
-license: gpl
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Word-Lingua Graph Parser
+emoji: 📊
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.0.0
 app_file: app.py
 pinned: false
+license: mit
 ---
+# Word-Lingua Graph Parser
+Parse sentences into linguistic structure graphs using deep learning.
+## Model
+This Space uses the [rudaoshi/lingua](https://huggingface.co/rudaoshi/lingua) model, which is a BERT-based parser with biaffine attention for word-lingua graph prediction.
+## Features
+- **Sentence Parsing**: Input any English sentence to parse it into a linguistic structure graph
+- **Graph Visualization**: Visualize the parsed graph with nodes and edges
+- **Constrained Decoding**: Use pattern-based constrained decoding for better graph structure (enabled by default)
+## Usage
+1. Enter a sentence in the text box
+2. Constrained decoding is applied automatically whenever pattern statistics are available (there is no toggle in the interface)
+3. Click "Parse Sentence" to generate the graph visualization
+## Example Sentences
+- "The cat sat on the mat."
+- "John loves Mary."
+- "I want to go to the store."
+## Graph Structure
+The parser generates word-lingua graphs that represent:
+- Predicate-argument relations (pred.arg.1, pred.arg.2, etc.)
+- Modification relations (@modification, etc.)
+- Discourse markers
+- And more linguistic structures
+## Technical Details
+- **Model Architecture**: BERT + Biaffine Attention
+- **Decoding**: Greedy pattern-based constrained decoding
+- **Output Format**: GPGraph (linguistic graph structure)

app.py CHANGED Viewed

@@ -1,154 +1,550 @@
-import gradio as gr
-import numpy as np
-import random
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
-import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
-    return image, seed
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 640px;
-}
 """
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
-            )
-            run_button = gr.Button("Run", scale=0, variant="primary")
-        result = gr.Image(label="Result", show_label=False)
-        with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
             )
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
             )
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
-                )
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=50,
-                    step=1,
-                    value=2,  # Replace with defaults that work for your model
-                )
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
         ],
-        outputs=[result, seed],
     )
 if __name__ == "__main__":
     demo.launch()

+"""
+Gradio app for Word-Lingua Graph Parser
+This app loads the model from HuggingFace Hub and provides an interactive interface
+to parse sentences and visualize the resulting graph.
+Designed for HuggingFace Space deployment.
 """
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+import sys
+import json
+import torch
+import numpy as np
+import tempfile
+from typing import Dict, List, Tuple, Optional
+from collections import Counter, defaultdict
+from transformers import AutoTokenizer
+from huggingface_hub import hf_hub_download
+# Add lingua_space directory to path (lingua is now a package)
+sys.path.insert(0, os.path.dirname(__file__))
+# Import model and graph classes
+from model import WordLinguaParserV2, ID2NODE_TYPE
+from lingua.structure.gpgraph import GPGraph, GPGPhraseNode, GPGEdge, GPGraphVisualizer
+from lingua.learn.wordgraph.modeler.word2gp import wordlingua2lingua
+# Import constrained decoding classes from inference.py
+# For HuggingFace Space, inference.py should be in the same directory
+from inference import EdgePatternStats, GreedyPatternDecoder
+import gradio as gr
+# Model ID for HuggingFace Hub
+MODEL_ID = "rudaoshi/lingua"
+# ============================================================================
+# Model Loading
+# ============================================================================
+class ModelLoader:
+    """Process-wide singleton that loads and caches the parser and its assets.
+    Every ``ModelLoader()`` call returns the same instance. ``load_model`` is a
+    no-op once a model has been loaded successfully; loaded state is published
+    on the instance only after every step succeeded, so a failed load can be
+    retried instead of leaving a half-initialised model behind.
+    """
+    _instance = None
+    _model = None
+    _tokenizer = None
+    _label2id = None
+    _id2label = None
+    _device = None
+    _constrained_decoder = None
+    _stats = None
+    def __new__(cls):
+        # Create the single shared instance lazily on first use.
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    @staticmethod
+    def _select_device():
+        """Return the preferred torch device: CUDA, then Apple MPS, then CPU."""
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        # torch.backends.mps is missing on older torch builds, so probe defensively.
+        mps_backend = getattr(torch.backends, "mps", None)
+        if mps_backend is not None and mps_backend.is_available():
+            return torch.device("mps")
+        return torch.device("cpu")
+    @staticmethod
+    def _resolve_file(model_name_or_path: str, filename: str, is_hub_model: bool) -> str:
+        """Return a local path for a required model file.
+        Downloads ``filename`` from the Hub repo when ``is_hub_model`` is true,
+        otherwise looks inside the local directory.
+        Raises:
+            FileNotFoundError: if the file is missing from a local directory.
+        """
+        if is_hub_model:
+            print(f"Downloading {filename} from HuggingFace Hub...")
+            return hf_hub_download(repo_id=model_name_or_path, filename=filename)
+        local_path = os.path.join(model_name_or_path, filename)
+        if not os.path.exists(local_path):
+            raise FileNotFoundError(f"{filename} not found in {model_name_or_path}")
+        return local_path
+    def load_model(self, model_name_or_path: str,
+                   arc_hidden_size: int = 512,
+                   rel_hidden_size: int = 256,
+                   node_hidden_size: int = 256,
+                   word_pooling: str = "mean"):
+        """Load the model from a local directory or a HuggingFace Hub repo.
+        Args:
+            model_name_or_path: local directory, or a Hub repo id. It is treated
+                as a Hub id when it contains '/' and no such local path exists.
+            arc_hidden_size, rel_hidden_size, node_hidden_size, word_pooling:
+                forwarded unchanged to ``WordLinguaParserV2``.
+        Raises:
+            FileNotFoundError: if ``label2id.json`` or ``pytorch_model.bin`` is
+                missing from a local directory.
+        """
+        if self._model is not None:
+            return  # Already loaded
+        print(f"Loading model from {model_name_or_path}...")
+        device = self._select_device()
+        print(f"Using device: {device}")
+        # Check if it's a HuggingFace Hub model ID (contains '/' and not a local path)
+        is_hub_model = '/' in model_name_or_path and not os.path.exists(model_name_or_path)
+        # Label vocabulary and its inverse (id -> label).
+        label2id_path = self._resolve_file(model_name_or_path, "label2id.json", is_hub_model)
+        with open(label2id_path, 'r', encoding="utf-8") as f:
+            label2id = json.load(f)
+        id2label = {v: k for k, v in label2id.items()}
+        print("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        print("Creating model...")
+        model = WordLinguaParserV2(
+            bert_model_name=model_name_or_path,
+            label_num=len(label2id),
+            arc_hidden_size=arc_hidden_size,
+            rel_hidden_size=rel_hidden_size,
+            node_hidden_size=node_hidden_size,
+            dropout=0.0,
+            word_pooling=word_pooling
+        )
+        model_path = self._resolve_file(model_name_or_path, "pytorch_model.bin", is_hub_model)
+        print("Loading model weights...")
+        state_dict = torch.load(model_path, map_location=device)
+        # Some checkpoints wrap the weights in a {'model_state_dict': ...} dict.
+        if isinstance(state_dict, dict) and 'model_state_dict' in state_dict:
+            state_dict = state_dict['model_state_dict']
+        model.load_state_dict(state_dict)
+        model.to(device)
+        model.eval()
+        # Publish only now: if anything above raised, self._model is still None
+        # and the next call retries instead of serving an untrained model.
+        self._device = device
+        self._label2id = label2id
+        self._id2label = id2label
+        self._tokenizer = tokenizer
+        self._model = model
+        print("Model loaded successfully!")
+        # Load constrained decoding statistics if available
+        self._load_constrained_decoding_stats(model_name_or_path, is_hub_model)
+    def _load_constrained_decoding_stats(self, model_name_or_path: str, is_hub_model: bool):
+        """Load edge pattern statistics and build the constrained decoder.
+        This step is best-effort: any failure is reported on stdout and leaves
+        ``constrained_decoder`` as ``None`` so callers fall back to simple decoding.
+        """
+        try:
+            if is_hub_model:
+                print("Downloading edge_pattern_stats.json from HuggingFace Hub...")
+                stats_path = hf_hub_download(
+                    repo_id=model_name_or_path,
+                    filename="edge_pattern_stats.json"
+                )
+            else:
+                stats_path = os.path.join(model_name_or_path, "edge_pattern_stats.json")
+                if not os.path.exists(stats_path):
+                    print("edge_pattern_stats.json not found, constrained decoding will be disabled.")
+                    return
+            print("Loading edge pattern statistics...")
+            self._stats = EdgePatternStats()
+            self._stats.load(stats_path)
+            # Create constrained decoder
+            self._constrained_decoder = GreedyPatternDecoder(
+                stats=self._stats,
+                id2label=self._id2label,
+                label2id=self._label2id,
+                arc_threshold=0.5
+            )
+            print("Constrained decoding ready!")
+        except Exception as e:
+            print(f"Warning: Could not load constrained decoding stats: {e}")
+            print("Constrained decoding will be disabled.")
+    @property
+    def constrained_decoder(self):
+        """The GreedyPatternDecoder, or None when statistics could not be loaded."""
+        return self._constrained_decoder
+    @property
+    def model(self):
+        """The loaded model, or None before a successful ``load_model``."""
+        return self._model
+    @property
+    def tokenizer(self):
+        """The tokenizer loaded alongside the model."""
+        return self._tokenizer
+    @property
+    def id2label(self):
+        """Mapping from relation label id to label string."""
+        return self._id2label
+    @property
+    def device(self):
+        """The torch device the model was placed on."""
+        return self._device
+# ============================================================================
+# Graph Reconstruction
+# ============================================================================
+def predictions_to_word_lingua_graph(
+    words: List[str],
+    arc_preds: np.ndarray,  # [num_words, num_words] binary
+    rel_preds: np.ndarray,  # [num_words, num_words] label ids
+    node_type_preds: np.ndarray,  # [num_words] node type ids
+    is_root_preds: np.ndarray,  # [num_words] binary
+    child_of_whether_preds: np.ndarray,  # [num_words] binary
+    id2label: Dict[int, str],
+) -> GPGraph:
+    """Build a word-level GPGraph from already-decoded model predictions.
+    One node is created per word (ID is the word index as a string, span is the
+    single position). An edge parent -> child is added wherever
+    ``arc_preds[parent, child]`` is truthy, labelled through ``id2label`` with
+    "UNK" for unknown ids. ``is_root_preds`` is accepted but not used here.
+    """
+    graph = GPGraph()
+    graph.words = words
+    graph.sentence = " ".join(words)  # the visualizer reads the sentence from the graph
+    # One node per word, kept in word order so list position == word index.
+    nodes = []
+    for position in range(len(words)):
+        word_node = GPGPhraseNode(
+            ID=str(position),
+            spans=[(position, position)],
+            pos=ID2NODE_TYPE.get(node_type_preds[position], "NominalConstant")
+        )
+        if child_of_whether_preds[position]:
+            word_node.child_of_whether = True
+        graph.add_node(word_node)
+        nodes.append(word_node)
+    # Row index is the parent word, column index is the child word.
+    for parent_idx, parent_node in enumerate(nodes):
+        for child_idx, child_node in enumerate(nodes):
+            if not arc_preds[parent_idx, child_idx]:
+                continue
+            relation = id2label.get(rel_preds[parent_idx, child_idx], "UNK")
+            graph.add_edge(parent_node, child_node, GPGEdge(label=relation))
+    return graph
+# ============================================================================
+# Inference Function
+# ============================================================================
+def prepare_input(sentence: str, tokenizer, max_length: int = 512) -> Dict:
+    """Tokenize a sentence and align whitespace-separated words to subwords.
+    Args:
+        sentence: raw input text. Words are maximal runs of non-whitespace
+            characters, so tabs and newlines separate words just like spaces.
+        tokenizer: callable tokenizer that supports ``return_offsets_mapping``.
+        max_length: truncation length passed to the tokenizer.
+    Returns:
+        Dict with ``input_ids``, ``attention_mask``, ``word_to_subword`` (one
+        list of subword indices per word), ``num_words`` and ``words``.
+    """
+    import re  # local import: only this function needs it
+    encoding = tokenizer(
+        sentence,
+        max_length=max_length,
+        truncation=True,
+        return_offsets_mapping=True,
+        add_special_tokens=True,
+    )
+    # Character span [start, end) of every word in the sentence.
+    word_spans = [(m.start(), m.end()) for m in re.finditer(r"\S+", sentence)]
+    words = [sentence[start:end] for start, end in word_spans]
+    # Subwords with an empty offset span are special tokens; drop them once here.
+    real_subwords = [
+        (subword_idx, start, end)
+        for subword_idx, (start, end) in enumerate(encoding.get("offset_mapping", []))
+        if start != end
+    ]
+    word_to_subword = []
+    for word_start, word_end in word_spans:
+        # Subwords whose character span overlaps the word's span.
+        subword_indices = [
+            subword_idx
+            for subword_idx, start, end in real_subwords
+            if start < word_end and end > word_start
+        ]
+        if not subword_indices:
+            # Fallback (e.g. the word was truncated away): first subword that
+            # starts at or after the word, else index 0.
+            following = next(
+                (subword_idx for subword_idx, start, _ in real_subwords if start >= word_start),
+                None,
+            )
+            subword_indices = [0] if following is None else [following]
+        word_to_subword.append(subword_indices)
+    return {
+        "input_ids": encoding["input_ids"],
+        "attention_mask": encoding["attention_mask"],
+        "word_to_subword": word_to_subword,
+        "num_words": len(words),
+        "words": words,
+    }
+def parse_sentence(sentence: str, model_loader: ModelLoader, use_constrained: bool = True) -> Tuple[GPGraph, str]:
+    """Parse a sentence and return ``(graph, message)``.
+    Possible results:
+    - ``(None, "Please enter a sentence.")`` for blank input.
+    - ``(linguagraph, None)`` on full success.
+    - ``(word_lingua_graph, warning)`` when ``wordlingua2lingua`` raises; the
+      unconverted graph is returned together with a warning string.
+    - ``(None, error_with_traceback)`` when anything else raises.
+    Constrained decoding is used only when ``use_constrained`` is true and the
+    loader actually has a decoder; otherwise arcs are thresholded directly.
+    """
+    if not sentence.strip():
+        return None, "Please enter a sentence."
+    try:
+        # Prepare input
+        tokenizer = model_loader.tokenizer
+        input_data = prepare_input(sentence, tokenizer)
+        # Convert to tensors
+        # Batch of one: wrap each sequence in an outer list.
+        input_ids = torch.tensor([input_data["input_ids"]], dtype=torch.long).to(model_loader.device)
+        attention_mask = torch.tensor([input_data["attention_mask"]], dtype=torch.bool).to(model_loader.device)
+        max_words = input_data["num_words"]
+        max_subwords = max(len(subwords) for subwords in input_data["word_to_subword"])
+        # Word -> subword index table, filled with -1 where a word has fewer
+        # than max_subwords pieces.
+        word_to_subword = torch.full((1, max_words, max_subwords), -1, dtype=torch.long).to(model_loader.device)
+        word_mask = torch.ones(1, max_words, dtype=torch.bool).to(model_loader.device)
+        for w_idx, subword_indices in enumerate(input_data["word_to_subword"]):
+            for s_idx, subword_idx in enumerate(subword_indices):
+                word_to_subword[0, w_idx, s_idx] = subword_idx
+        # Run inference
+        with torch.no_grad():
+            outputs = model_loader.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                word_to_subword=word_to_subword,
+                word_mask=word_mask
+            )
+        # Get predictions
+        # [0] selects the single batch element; shapes below are as annotated
+        # by the original author.
+        arc_logits = outputs["arc_logits"][0].cpu().numpy()  # [num_words, num_words]
+        rel_logits = outputs["rel_logits"][0].cpu().numpy()  # [num_words, num_words, num_labels]
+        child_of_whether_logits = outputs["child_of_whether_logits"][0].cpu().numpy()  # [num_words, 2]
+        is_root_logits = outputs["is_root_logits"][0].cpu().numpy()  # [num_words, 2]
+        node_type_logits = outputs["node_type_logits"][0].cpu().numpy()  # [num_words, num_types]
+        # Decode predictions
+        num_words = len(input_data["words"])
+        # Node predictions
+        child_of_whether_preds = np.argmax(child_of_whether_logits, axis=-1)  # [num_words]
+        node_type_preds = np.argmax(node_type_logits, axis=-1)  # [num_words]
+        # Use constrained decoding if available and requested
+        if use_constrained and model_loader.constrained_decoder is not None:
+            # decoding_info is returned by the decoder but not used further here.
+            word_lingua_graph, decoding_info = model_loader.constrained_decoder.decode(
+                words=input_data["words"],
+                arc_logits=arc_logits,
+                rel_logits=rel_logits,
+                node_type_preds=node_type_preds,
+                is_root_logits=is_root_logits,
+                child_of_whether_preds=child_of_whether_preds,
+            )
+            graph = word_lingua_graph
+        else:
+            # Simple thresholding decoding
+            # An arc is kept when its raw logit is positive.
+            arc_preds = (arc_logits > 0).astype(int)
+            rel_preds = np.argmax(rel_logits, axis=-1)  # [num_words, num_words]
+            # Ensure there's exactly one root node
+            # Take the word with the highest softmax probability for class 1.
+            root_probs = torch.softmax(torch.tensor(is_root_logits), dim=-1)[:, 1].numpy()
+            root_idx = int(np.argmax(root_probs))
+            is_root_preds = np.zeros(num_words, dtype=int)
+            is_root_preds[root_idx] = 1
+            # Reconstruct word-lingua-graph
+            word_lingua_graph = predictions_to_word_lingua_graph(
+                words=input_data["words"],
+                arc_preds=arc_preds,
+                rel_preds=rel_preds,
+                node_type_preds=node_type_preds,
+                is_root_preds=is_root_preds,
+                child_of_whether_preds=child_of_whether_preds,
+                id2label=model_loader.id2label
             )
+            graph = word_lingua_graph
+        # Convert word-lingua-graph to linguagraph
+        try:
+            linguagraph, _ = wordlingua2lingua(graph)
+            graph = linguagraph
+        except Exception as e:
+            # If conversion fails, return the word-lingua-graph with a warning
+            return graph, f"Warning: Failed to convert to linguagraph: {str(e)}. Returning word-lingua-graph."
+        return graph, None
+    except Exception as e:
+        # Any failure is turned into a message (including the traceback)
+        # rather than propagated to the caller.
+        import traceback
+        error_msg = f"Error parsing sentence: {str(e)}\n{traceback.format_exc()}"
+        return None, error_msg
+def visualize_graph(graph: GPGraph) -> Optional[str]:
+    """Render ``graph`` to a temporary PNG and return the file path.
+    Returns ``None`` when ``graph`` is ``None`` or when rendering fails. On
+    failure the temporary file that was reserved is removed again so failed
+    requests do not accumulate empty files in the temp directory.
+    """
+    if graph is None:
+        return None
+    temp_file = None
+    try:
+        # Reserve a unique file name; only the path is needed, so close the fd.
+        temp_fd, temp_file = tempfile.mkstemp(suffix=".png")
+        os.close(temp_fd)
+        # Visualize
+        visualizer = GPGraphVisualizer()
+        visualizer.visualize(graph, file_name=temp_file, format="png")
+        return temp_file
+    except Exception as e:
+        print(f"Error visualizing graph: {e}")
+        if temp_file is not None:
+            try:
+                os.remove(temp_file)
+            except OSError:
+                # Best-effort cleanup: the file may already be gone.
+                pass
+        return None
+# ============================================================================
+# Gradio Interface
+# ============================================================================
+def process_sentence(sentence: str) -> Tuple[Optional[str], str]:
+    """Parse ``sentence`` and return ``(image_path, status_text)`` for the UI.
+    The model is loaded lazily on first use. A message from ``parse_sentence``
+    is fatal only when no graph came back; when a graph is returned together
+    with a warning (conversion to linguagraph failed), the graph is still
+    rendered and the warning is appended to the status text.
+    """
+    try:
+        # Load model if not already loaded
+        model_loader = ModelLoader()
+        if model_loader.model is None:
+            model_loader.load_model(MODEL_ID)
+        # Parse sentence (always use constrained decoding if available)
+        use_constrained = True
+        graph, message = parse_sentence(sentence, model_loader, use_constrained=use_constrained)
+        if graph is None and message:
+            return None, message
+        # Visualize
+        img_path = visualize_graph(graph)
+        decoding_mode = "constrained" if (use_constrained and model_loader.constrained_decoder) else "simple"
+        if img_path:
+            status = f"Graph generated successfully! (Decoding: {decoding_mode})"
+        else:
+            img_path = None
+            status = "Failed to generate visualization."
+        if message:
+            # Surface the non-fatal parser warning next to the result.
+            status = f"{status}\n{message}"
+        return img_path, status
+    except Exception as e:
+        import traceback
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        return None, error_msg
+def load_model_on_startup():
+    """Ensure the shared model is loaded and return a status string for the UI.
+    Never raises: a loading failure is printed and returned as the status text.
+    """
+    try:
+        loader = ModelLoader()
+        # Guard clause: nothing to do when a previous call already loaded it.
+        if loader.model is not None:
+            return "Model already loaded."
+        print(f"Loading model {MODEL_ID}...")
+        loader.load_model(MODEL_ID)
+        success_msg = "Model loaded successfully!"
+        print(success_msg)
+        return success_msg
+    except Exception as exc:
+        failure_msg = f"Error loading model: {str(exc)}"
+        print(failure_msg)
+        return failure_msg
+# Create Gradio interface
+with gr.Blocks(title="Lingua Graph Parser") as demo:
+    # Page header.
+    gr.Markdown("""
+    # Lingua Graph Parser
+    Parse sentences into linguistic structure graphs using deep learning.
+    Enter a sentence below to visualize its linguistic structure as a graph.
+    """)
+    with gr.Column():
+        # Input row: sentence box next to the parse button.
+        with gr.Row():
+            sentence_input = gr.Textbox(
+                label="Input Sentence",
+                placeholder="Enter a sentence here...",
+                lines=3,
+                info="Type any English sentence to parse",
+                scale=4
+            )
+            parse_btn = gr.Button("Parse Sentence", variant="primary", size="lg", scale=1)
+        # Read-only status box: receives the load status and parse messages.
+        output_text = gr.Textbox(
+            label="Status",
+            lines=3,
+            interactive=False
+        )
+        # The rendered graph is passed to this component as a file path.
+        output_image = gr.Image(
+            label="Graph Visualization",
+            type="filepath",
+            height=600
+        )
+    # Load model on startup
+    # load_model_on_startup returns a status string, shown in the status box;
+    # it returns "Model already loaded." when the model is already cached.
+    demo.load(
+        fn=load_model_on_startup,
+        outputs=output_text
+    )
+    # Parse button click handler
+    # process_sentence returns (image_path, status_text), matching the outputs order.
+    parse_btn.click(
+        fn=process_sentence,
+        inputs=[sentence_input],
+        outputs=[output_image, output_text]
+    )
+    # Example sentences
+    # The examples are written with a space before the final period.
+    gr.Markdown("### Example Sentences")
+    gr.Examples(
+        examples=[
+            "The cat sat on the mat .",
+            "John loves Mary .",
+            "I want to go to the store .",
+            "The quick brown fox jumps over the lazy dog .",
+            "She gave him a book yesterday .",
         ],
+        inputs=sentence_input
     )
+    # Static "About" footer.
+    gr.Markdown("""
+    ### About
+    This parser uses a BERT-based model with biaffine attention to parse sentences into
+    word-lingua graphs, which represent linguistic structures including:
+    - Predicate-argument relations
+    - Modification relations
+    - Discourse markers
+    - And more...
+    **Model**: [rudaoshi/lingua](https://huggingface.co/rudaoshi/lingua)
+    """)
 if __name__ == "__main__":
     demo.launch()
+# For HuggingFace Space, the demo is already created above
+# No need for main() function

inference.py ADDED Viewed

	@@ -0,0 +1,745 @@

+"""
+Constrained decoding classes for Word-Lingua Graph Parser
+This module contains only the classes needed for constrained decoding in the Space:
+- EdgePatternStats: Statistics for edge patterns
+- GreedyPatternDecoder: Greedy decoder with pattern constraints
+"""
+import os
+import json
+import logging
+from collections import Counter, defaultdict
+from typing import List, Dict, Tuple, Optional, Any
+import numpy as np
+# Import from model.py
+from model import ID2NODE_TYPE
+# Add lingua_space directory to path (lingua is now a package)
+import sys
+sys.path.insert(0, os.path.dirname(__file__))
+# Import graph classes
+from lingua.structure.gpgraph import GPGraph, GPGPhraseNode, GPGEdge
+# Initialize logger if not already initialized
+if not logging.getLogger().handlers:
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+logger = logging.getLogger(__name__)
+# ============================================================================
+# Edge Pattern Statistics
+# ============================================================================
+class EdgePatternStats:
+    """
+    Loads and uses edge PATTERN statistics for constrained decoding.
+    Statistics are loaded from edge_pattern_stats.json file which contains:
+    - P(outgoing_edge_pattern | incoming_edge, node_type)
+    - P(outgoing_edge_pattern | node_type) - fallback
+    - Root node type distribution
+    This captures structural constraints like:
+    - ModificationalFunctor must have exactly one "variable" and one "body" edge
+    - FactualPredicator typically has pred.arg.1, pred.arg.2, etc.
+    An edge pattern is a sorted tuple of outgoing edge labels.
+    """
+    def __init__(self):
+        # P(outgoing_edge_pattern | incoming_edge, node_type)
+        # Key: (incoming_edge, node_type)
+        # Value: Counter of edge patterns (sorted tuple of edge labels)
+        self.edge_pattern_counts = defaultdict(Counter)
+        # P(outgoing_edge_pattern | node_type) - fallback
+        self.node_type_pattern_counts = defaultdict(Counter)
+        # Root node type distribution
+        self.root_node_type_counts = Counter()
+        # For debugging: track pattern frequencies
+        self.all_patterns = Counter()
+        self.is_fitted = False
+    @staticmethod
+    def _ranked_patterns(counts) -> List[Tuple[Tuple[str, ...], float]]:
+        """Normalise a pattern Counter into (pattern, probability), best first.
+        Returns an empty list when ``counts`` is missing, empty or sums to zero,
+        so callers can fall through to the next fallback level.
+        """
+        if not counts:
+            return []
+        total = sum(counts.values())
+        if total <= 0:
+            return []
+        # sorted() is stable, so equally likely patterns keep insertion order.
+        return sorted(
+            ((pattern, count / total) for pattern, count in counts.items()),
+            key=lambda item: -item[1],
+        )
+    def get_valid_patterns(self, incoming_edge: str, node_type: str) -> List[Tuple[Tuple[str, ...], float]]:
+        """
+        Get valid outgoing edge patterns with their probabilities.
+        Lookup order: statistics for (incoming_edge, node_type), then the
+        node_type-only statistics, then the empty pattern (leaf node).
+        Returns:
+            List of (pattern, probability) tuples, sorted by probability descending
+        """
+        # .get() does not trigger the defaultdict factory, so lookups never
+        # insert empty entries.
+        ranked = self._ranked_patterns(self.edge_pattern_counts.get((incoming_edge, node_type)))
+        if ranked:
+            return ranked
+        # Fallback: use node_type's general pattern distribution
+        ranked = self._ranked_patterns(self.node_type_pattern_counts.get(node_type))
+        if ranked:
+            return ranked
+        # Last fallback: empty pattern (leaf node)
+        return [((), 1.0)]
+    def get_pattern_probability(self, incoming_edge: str, node_type: str,
+                                  pattern: Tuple[str, ...]) -> float:
+        """Get probability of a specific pattern.
+        Uses the (incoming_edge, node_type) statistics when they have any
+        observations, otherwise the node_type-only statistics, otherwise 0.0.
+        """
+        for counts in (
+            self.edge_pattern_counts.get((incoming_edge, node_type)),
+            self.node_type_pattern_counts.get(node_type),
+        ):
+            if counts is None:
+                continue
+            total = sum(counts.values())
+            if total > 0:
+                return counts.get(pattern, 0) / total
+        return 0.0
+    def load(self, path: str):
+        """Load statistics from file.
+        Tuple keys are stored in the JSON file as their ``repr`` strings. They
+        are parsed with ``ast.literal_eval``, which accepts only Python
+        literals, so a tampered statistics file cannot execute code.
+        Raises:
+            ValueError or SyntaxError: if a key is not a valid Python literal.
+            KeyError: if a required top-level section is missing.
+        """
+        import ast  # local import: only needed while loading
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        parse_key = ast.literal_eval
+        # Build into locals first so a malformed file cannot leave this object
+        # partially updated.
+        edge_pattern_counts = defaultdict(Counter)
+        for k, v in data["edge_pattern_counts"].items():
+            edge_pattern_counts[parse_key(k)] = Counter({parse_key(p): c for p, c in v.items()})
+        node_type_pattern_counts = defaultdict(Counter)
+        for k, v in data["node_type_pattern_counts"].items():
+            node_type_pattern_counts[k] = Counter({parse_key(p): c for p, c in v.items()})
+        root_node_type_counts = Counter(data["root_node_type_counts"])
+        all_patterns = Counter({parse_key(k): v for k, v in data["all_patterns"].items()})
+        self.edge_pattern_counts = edge_pattern_counts
+        self.node_type_pattern_counts = node_type_pattern_counts
+        self.root_node_type_counts = root_node_type_counts
+        self.all_patterns = all_patterns
+        self.is_fitted = True
+        logger.info(f"Loaded edge pattern statistics from {path}")
+# ============================================================================
+# Greedy Pattern Decoder
+# ============================================================================
+class GreedyPatternDecoder:
+    """
+    Bottom-up greedy decoder that constructs the graph edge-by-edge.
+    Strategy:
+    1. Select edges with p(edge) > threshold
+    2. For each candidate edge, compute score = p(edge) * p(label) * p(pattern)
+    3. Greedily select highest-scoring edge that doesn't violate pattern constraints
+    4. After selecting an edge, update pattern constraints for affected nodes
+    5. Connect disconnected components to the root component
+    This avoids cascading errors from top-down approaches.
+    """
+    def __init__(self, stats: EdgePatternStats, id2label: Dict[int, str],
+                 label2id: Dict[str, int], arc_threshold: float = 0.5):
+        self.stats = stats
+        self.id2label = id2label
+        self.label2id = label2id
+        self.arc_threshold = arc_threshold
+    def decode(
+        self,
+        words: List[str],
+        arc_logits: np.ndarray,
+        rel_logits: np.ndarray,
+        node_type_preds: np.ndarray,
+        is_root_logits: np.ndarray,
+        child_of_whether_preds: np.ndarray,
+    ) -> Tuple[GPGraph, Dict[str, Any]]:
+        """
+        Decode predictions using bottom-up greedy search with pattern constraints.
+        The method runs in four phases:
+        1. Collect every (parent, child, label) candidate whose arc probability
+           exceeds ``self.arc_threshold`` and whose label probability exceeds 0.01.
+        2. Walk the candidates by descending ``arc_prob * rel_prob`` and keep those
+           that pass the cycle, next_word and pattern-count checks.
+        3. Attach nodes not reachable from the root, still requiring
+           ``arc_prob >= self.arc_threshold``.
+        4. Attach whatever is still unreachable, ignoring the arc threshold and,
+           as a last resort, the label constraints as well.
+        Args:
+            words: Tokens of the sentence; one graph node is created per word.
+            arc_logits: Arc scores indexed as ``[parent, child]``; passed through a sigmoid.
+            rel_logits: Label scores indexed as ``[parent, child, label_id]``;
+                softmax is taken over the last axis.
+            node_type_preds: Per-word node-type ids, looked up in ``ID2NODE_TYPE``
+                (unknown ids fall back to "NominalConstant").
+            is_root_logits: Per-word root scores; softmax is taken over the last axis
+                and column 1 is used as the root probability.
+            child_of_whether_preds: Per-word flags; a truthy value sets
+                ``child_of_whether`` on the word's node.
+        Returns:
+            Tuple of (graph, decoding_info) where decoding_info contains:
+            - errors: list of error messages
+            - warnings: list of warning messages (never filled in by this method)
+            - forced_connections: list of forced edge connections (never filled in by this method)
+            - disconnected_nodes: list of nodes that couldn't be connected
+            - phase3_failures: records of Phase 3 giving up on the remaining nodes
+            - phase4_force_connects: edges added by the Phase 4 last-resort branch
+            - has_issues: True when errors, warnings or phase4_force_connects is non-empty
+        """
+        num_words = len(words)
+        # Track decoding issues
+        decoding_info = {
+            "errors": [],
+            "warnings": [],
+            "forced_connections": [],
+            "disconnected_nodes": [],
+            "phase3_failures": [],
+            "phase4_force_connects": [],
+        }
+        # Get node types
+        node_types = [ID2NODE_TYPE.get(node_type_preds[i], "NominalConstant")
+                      for i in range(num_words)]
+        # Convert logits to probabilities
+        arc_probs = self._sigmoid(arc_logits)
+        rel_probs = self._softmax(rel_logits, axis=-1)
+        # Column 1 of the softmax is taken as the "is root" probability.
+        root_probs = self._softmax(is_root_logits, axis=-1)[:, 1]
+        # Select root node
+        root_idx = int(np.argmax(root_probs))
+        # Build graph
+        graph = GPGraph()
+        graph.words = words
+        # Create all nodes
+        word_to_node = {}
+        for i, word in enumerate(words):
+            node = GPGPhraseNode(
+                ID=str(i),
+                spans=[(i, i)],
+                pos=node_types[i]
+            )
+            if child_of_whether_preds[i]:
+                node.child_of_whether = True
+            graph.add_node(node)
+            word_to_node[i] = node
+        # The structures below are shared, mutable bookkeeping: the nested helpers
+        # read them through their closures and every phase updates them whenever
+        # an edge is added.
+        # Track outgoing edges for each node (for pattern constraint checking)
+        node_outgoing_edges = {i: [] for i in range(num_words)}
+        # Track incoming edges for each node (for next_word constraint checking)
+        node_incoming_edges = {i: [] for i in range(num_words)}
+        # Track which nodes are targets of next_word edges
+        next_word_targets = set()
+        # Track graph structure for cycle detection
+        children_of = {i: set() for i in range(num_words)}
+        # Define constant types that require adjacent next_word edges
+        ADJACENT_ONLY_TYPES = {
+            "ModificationalConstant", "DeterminerConstant", "OtherConstant",
+            "NominalConstant", "SymbolConstant"
+        }
+        # Note: PunctuationalConstant is excluded (allows non-adjacent for paired punctuation)
+        # Phase 1: Collect all candidate edges with p(edge) > threshold
+        candidate_edges = []
+        for i in range(num_words):
+            for j in range(num_words):
+                if i == j:
+                    continue
+                arc_prob = arc_probs[i, j]
+                if arc_prob > self.arc_threshold:
+                    # Get label probabilities
+                    for label_id in range(rel_probs.shape[2]):
+                        label = self.id2label.get(label_id, "UNK")
+                        rel_prob = rel_probs[i, j, label_id]
+                        if rel_prob > 0.01:  # Filter very low probability labels
+                            candidate_edges.append({
+                                "parent": i,
+                                "child": j,
+                                "label": label,
+                                "label_id": label_id,
+                                "arc_prob": arc_prob,
+                                "rel_prob": rel_prob,
+                                # Ranking uses arc and label probability only; the
+                                # pattern statistics act as a hard filter in Phase 2.
+                                "score": arc_prob * rel_prob,
+                            })
+        # Sort by score (descending)
+        candidate_edges.sort(key=lambda x: -x["score"])
+        # Helper function to check if adding edge would create a cycle
+        def would_create_cycle(parent: int, child: int) -> bool:
+            """Check if child can reach parent through existing edges (would create cycle)."""
+            visited = set()
+            stack = [child]
+            while stack:
+                node = stack.pop()
+                if node == parent:
+                    return True
+                if node in visited:
+                    continue
+                visited.add(node)
+                stack.extend(children_of[node])
+            return False
+        # Phase 2: Greedily select edges while respecting constraints
+        # NOTE(review): one (parent, child) pair can appear here with several labels,
+        # and nothing below rejects a second label for a pair that already has an
+        # edge - confirm how GPGraph.add_edge treats a repeated pair.
+        selected_edges = []
+        for edge in candidate_edges:
+            parent_idx = edge["parent"]
+            child_idx = edge["child"]
+            label = edge["label"]
+            # Constraint 1: No cycles (DAG constraint)
+            if would_create_cycle(parent_idx, child_idx):
+                continue
+            # Constraint 2: next_word adjacency constraint for *Constant types
+            if label == "next_word":
+                parent_type = node_types[parent_idx]
+                child_type = node_types[child_idx]
+                distance = abs(parent_idx - child_idx)
+                # For specified constant types, next_word must be adjacent
+                if parent_type in ADJACENT_ONLY_TYPES or child_type in ADJACENT_ONLY_TYPES:
+                    if distance != 1:
+                        continue
+            # Constraint 3: next_word target node constraints
+            if label == "next_word":
+                # 3a: If child already has non-next_word incoming edges, reject
+                non_next_word_incoming = [e for e in node_incoming_edges[child_idx] if e != "next_word"]
+                if non_next_word_incoming:
+                    continue
+            else:
+                # 3b: A node that is already a next_word target accepts no
+                # incoming edge with any other label
+                if child_idx in next_word_targets:
+                    continue
+            # Constraint 4: next_word targets can only have next_word outgoing edges
+            if parent_idx in next_word_targets and label != "next_word":
+                continue
+            # Constraint 5: Check if adding this edge violates pattern constraints
+            is_valid = self._is_edge_valid_for_pattern(
+                parent_idx, label, node_types[parent_idx], node_outgoing_edges[parent_idx],
+                is_root=(parent_idx == root_idx)
+            )
+            if not is_valid:
+                continue
+            # Edge is valid - add it
+            selected_edges.append(edge)
+            node_outgoing_edges[parent_idx].append(label)
+            node_incoming_edges[child_idx].append(label)
+            children_of[parent_idx].add(child_idx)
+            # Track next_word targets
+            if label == "next_word":
+                next_word_targets.add(child_idx)
+        # Add selected edges to graph with probability info
+        for edge in selected_edges:
+            parent_node = word_to_node[edge["parent"]]
+            child_node = word_to_node[edge["child"]]
+            gpg_edge = GPGEdge(label=edge["label"])
+            gpg_edge.arc_prob = edge["arc_prob"]
+            gpg_edge.rel_prob = edge["rel_prob"]
+            graph.add_edge(parent_node, child_node, gpg_edge)
+        # Phase 3: Connect disconnected nodes to ensure connected graph
+        # Find nodes reachable from root using BFS
+        def get_reachable_from(start_node: int) -> set:
+            reachable = set()
+            queue = [start_node]
+            reachable.add(start_node)
+            while queue:
+                node = queue.pop(0)
+                for child in children_of[node]:
+                    if child not in reachable:
+                        reachable.add(child)
+                        queue.append(child)
+            return reachable
+        reachable = get_reachable_from(root_idx)
+        # Find unreachable nodes
+        unreachable = [i for i in range(num_words) if i not in reachable]
+        # Helper function for Phase 3 edge validation
+        # (same label checks as Phase 2 constraints 2-5; the cycle check is done by the callers)
+        def is_edge_valid_phase3(p_idx, c_idx, lbl):
+            """Check if edge is valid considering all constraints."""
+            # Constraint: next_word adjacency for *Constant types
+            if lbl == "next_word":
+                p_type = node_types[p_idx]
+                c_type = node_types[c_idx]
+                dist = abs(p_idx - c_idx)
+                if (p_type in ADJACENT_ONLY_TYPES or c_type in ADJACENT_ONLY_TYPES) and dist != 1:
+                    return False
+                # next_word target constraints
+                non_nw_incoming = [e for e in node_incoming_edges[c_idx] if e != "next_word"]
+                if non_nw_incoming:
+                    return False
+            else:
+                # Non-next_word edge cannot target a next_word target
+                if c_idx in next_word_targets:
+                    return False
+            # next_word targets can only have next_word outgoing
+            if p_idx in next_word_targets and lbl != "next_word":
+                return False
+            # Pattern constraint
+            if not self._is_edge_valid_for_pattern(
+                p_idx, lbl, node_types[p_idx], node_outgoing_edges[p_idx],
+                is_root=(p_idx == root_idx)
+            ):
+                return False
+            return True
+        # Connect unreachable nodes to the main graph
+        # Each iteration adds the single best (reachable parent -> unreachable child)
+        # edge, then recomputes the unreachable list.
+        while unreachable:
+            best_edge = None
+            best_score = -1
+            best_arc_prob = 0.0
+            best_rel_prob = 0.0
+            for node_idx in unreachable:
+                # Try connecting from any reachable node to this unreachable node
+                for parent_idx in reachable:
+                    # Check cycle constraint first
+                    if would_create_cycle(parent_idx, node_idx):
+                        continue
+                    arc_prob = arc_probs[parent_idx, node_idx]
+                    # Enforce arc_threshold in Phase 3
+                    # (an arc exactly at the threshold passes here, while Phase 1 requires strictly greater)
+                    if arc_prob < self.arc_threshold:
+                        continue
+                    best_label_id = int(np.argmax(rel_probs[parent_idx, node_idx]))
+                    label = self.id2label.get(best_label_id, "UNK")
+                    rel_prob = rel_probs[parent_idx, node_idx, best_label_id]
+                    # Check pattern constraint
+                    if not is_edge_valid_phase3(parent_idx, node_idx, label):
+                        # Try other labels
+                        found_valid = False
+                        for lid in np.argsort(rel_probs[parent_idx, node_idx])[::-1]:
+                            lbl = self.id2label.get(lid, "UNK")
+                            if is_edge_valid_phase3(parent_idx, node_idx, lbl):
+                                label = lbl
+                                rel_prob = rel_probs[parent_idx, node_idx, lid]
+                                found_valid = True
+                                break
+                        if not found_valid:
+                            continue
+                    score = arc_prob * rel_prob
+                    if score > best_score:
+                        best_score = score
+                        best_edge = (parent_idx, node_idx, label)
+                        best_arc_prob = float(arc_prob)
+                        best_rel_prob = float(rel_prob)
+            if best_edge is not None:
+                parent_idx, child_idx, label = best_edge
+                parent_node = word_to_node[parent_idx]
+                child_node = word_to_node[child_idx]
+                gpg_edge = GPGEdge(label=label)
+                gpg_edge.arc_prob = best_arc_prob
+                gpg_edge.rel_prob = best_rel_prob
+                graph.add_edge(parent_node, child_node, gpg_edge)
+                node_outgoing_edges[parent_idx].append(label)
+                node_incoming_edges[child_idx].append(label)
+                children_of[parent_idx].add(child_idx)
+                # Track next_word targets
+                if label == "next_word":
+                    next_word_targets.add(child_idx)
+                # Update reachable set
+                newly_reachable = get_reachable_from(child_idx)
+                reachable.update(newly_reachable)
+                unreachable = [i for i in range(num_words) if i not in reachable]
+            else:
+                # Try force connect to root
+                # NOTE(review): root_idx is itself in `reachable`, so the search above has
+                # already tried root -> node under the same cycle, threshold and label
+                # checks; this branch appears unable to find an edge that search missed.
+                connected_any = False
+                best_root_edge = None
+                best_root_score = -1
+                best_root_arc_prob = 0.0
+                best_root_rel_prob = 0.0
+                best_root_label = None
+                best_root_node_idx = None
+                for node_idx in list(unreachable):
+                    # Check cycle constraint
+                    if would_create_cycle(root_idx, node_idx):
+                        continue
+                    arc_prob = float(arc_probs[root_idx, node_idx])
+                    # Check arc_threshold for root connections
+                    if arc_prob < self.arc_threshold:
+                        continue
+                    # Find best valid label
+                    found_label = None
+                    found_rel_prob = 0.0
+                    for lid in np.argsort(rel_probs[root_idx, node_idx])[::-1]:
+                        lbl = self.id2label.get(lid, "UNK")
+                        if is_edge_valid_phase3(root_idx, node_idx, lbl):
+                            found_label = lbl
+                            found_rel_prob = float(rel_probs[root_idx, node_idx, lid])
+                            break
+                    if found_label is None:
+                        continue
+                    score = arc_prob * found_rel_prob
+                    if score > best_root_score:
+                        best_root_score = score
+                        best_root_edge = True
+                        best_root_arc_prob = arc_prob
+                        best_root_rel_prob = found_rel_prob
+                        best_root_label = found_label
+                        best_root_node_idx = node_idx
+                if best_root_edge:
+                    parent_node = word_to_node[root_idx]
+                    child_node = word_to_node[best_root_node_idx]
+                    gpg_edge = GPGEdge(label=best_root_label)
+                    gpg_edge.arc_prob = best_root_arc_prob
+                    gpg_edge.rel_prob = best_root_rel_prob
+                    graph.add_edge(parent_node, child_node, gpg_edge)
+                    node_outgoing_edges[root_idx].append(best_root_label)
+                    node_incoming_edges[best_root_node_idx].append(best_root_label)
+                    children_of[root_idx].add(best_root_node_idx)
+                    # Track next_word targets
+                    if best_root_label == "next_word":
+                        next_word_targets.add(best_root_node_idx)
+                    # Update reachable
+                    newly_reachable = get_reachable_from(best_root_node_idx)
+                    reachable.update(newly_reachable)
+                    connected_any = True
+                if not connected_any:
+                    # Record the failure and leave the remaining nodes to Phase 4.
+                    msg = f"Phase 3: Cannot connect remaining {len(unreachable)} nodes with arc_prob >= {self.arc_threshold}"
+                    decoding_info["phase3_failures"].append({
+                        "message": msg,
+                        "unreachable_nodes": list(unreachable),
+                        "unreachable_words": [words[i] for i in unreachable if i < len(words)],
+                    })
+                    break
+                unreachable = [i for i in range(num_words) if i not in reachable]
+        # Phase 4: Force connect disconnected components to root branch
+        unreachable = [i for i in range(num_words) if i not in reachable]
+        if unreachable:
+            # Find connected components among unreachable nodes
+            def find_component(start_idx, nodes_set):
+                """Find all nodes in the same component as start_idx."""
+                component = set()
+                stack = [start_idx]
+                while stack:
+                    node = stack.pop()
+                    if node in component or node not in nodes_set:
+                        continue
+                    component.add(node)
+                    # Follow outgoing edges
+                    for child in children_of[node]:
+                        if child in nodes_set:
+                            stack.append(child)
+                    # Follow incoming edges
+                    for potential_parent in nodes_set:
+                        if node in children_of[potential_parent]:
+                            stack.append(potential_parent)
+                return component
+            def find_local_roots(component):
+                """Find nodes in component that have no incoming edges from within the component."""
+                local_roots = []
+                for node in component:
+                    has_incoming_from_component = False
+                    for potential_parent in component:
+                        if node in children_of[potential_parent]:
+                            has_incoming_from_component = True
+                            break
+                    if not has_incoming_from_component:
+                        local_roots.append(node)
+                # If every node has an incoming edge from the component, all nodes are candidates.
+                return local_roots if local_roots else list(component)
+            # Process disconnected components
+            remaining_unreachable = set(unreachable)
+            while remaining_unreachable:
+                start_node = next(iter(remaining_unreachable))
+                component = find_component(start_node, remaining_unreachable)
+                local_roots = find_local_roots(component)
+                # Try to connect any local root to the root branch
+                # (no arc_threshold here; the label must still pass is_edge_valid_phase3)
+                best_connection = None
+                best_score = -float('inf')
+                for local_root in local_roots:
+                    for parent_idx in reachable:
+                        if would_create_cycle(parent_idx, local_root):
+                            continue
+                        arc_prob = float(arc_probs[parent_idx, local_root])
+                        # Find best valid label
+                        for lid in np.argsort(rel_probs[parent_idx, local_root])[::-1]:
+                            lbl = self.id2label.get(lid, "UNK")
+                            if is_edge_valid_phase3(parent_idx, local_root, lbl):
+                                rel_prob = float(rel_probs[parent_idx, local_root, lid])
+                                score = arc_prob * rel_prob
+                                if score > best_score:
+                                    best_score = score
+                                    best_connection = (parent_idx, local_root, lbl, arc_prob, rel_prob)
+                                break
+                if best_connection:
+                    # NOTE(review): an edge added here can be below arc_threshold but is not
+                    # recorded in decoding_info - confirm that is intended.
+                    parent_idx, child_idx, label, arc_prob, rel_prob = best_connection
+                    parent_node = word_to_node[parent_idx]
+                    child_node = word_to_node[child_idx]
+                    gpg_edge = GPGEdge(label=label)
+                    gpg_edge.arc_prob = arc_prob
+                    gpg_edge.rel_prob = rel_prob
+                    graph.add_edge(parent_node, child_node, gpg_edge)
+                    node_outgoing_edges[parent_idx].append(label)
+                    node_incoming_edges[child_idx].append(label)
+                    children_of[parent_idx].add(child_idx)
+                    if label == "next_word":
+                        next_word_targets.add(child_idx)
+                    # Update reachable set
+                    newly_reachable = get_reachable_from(child_idx)
+                    reachable.update(newly_reachable)
+                else:
+                    # Force connect with any edge as last resort
+                    # (only the cycle check applies; the most probable label is used unchecked)
+                    best_force = None
+                    best_arc = -1
+                    for local_root in local_roots:
+                        for parent_idx in reachable:
+                            if would_create_cycle(parent_idx, local_root):
+                                continue
+                            arc_prob = float(arc_probs[parent_idx, local_root])
+                            if arc_prob > best_arc:
+                                best_arc = arc_prob
+                                best_label_id = int(np.argmax(rel_probs[parent_idx, local_root]))
+                                label = self.id2label.get(best_label_id, "UNK")
+                                rel_prob = float(rel_probs[parent_idx, local_root, best_label_id])
+                                best_force = (parent_idx, local_root, label, arc_prob, rel_prob)
+                    if best_force:
+                        parent_idx, child_idx, label, arc_prob, rel_prob = best_force
+                        parent_node = word_to_node[parent_idx]
+                        child_node = word_to_node[child_idx]
+                        gpg_edge = GPGEdge(label=label)
+                        gpg_edge.arc_prob = arc_prob
+                        gpg_edge.rel_prob = rel_prob
+                        graph.add_edge(parent_node, child_node, gpg_edge)
+                        node_outgoing_edges[parent_idx].append(label)
+                        node_incoming_edges[child_idx].append(label)
+                        children_of[parent_idx].add(child_idx)
+                        if label == "next_word":
+                            next_word_targets.add(child_idx)
+                        decoding_info["phase4_force_connects"].append({
+                            "parent_idx": parent_idx,
+                            "child_idx": child_idx,
+                            "label": label,
+                        })
+                        newly_reachable = get_reachable_from(child_idx)
+                        reachable.update(newly_reachable)
+                    else:
+                        decoding_info["errors"].append(f"Cannot connect component {component}")
+                        decoding_info["disconnected_nodes"].extend(list(component))
+                # The component has been handled (connected or reported), so drop it.
+                remaining_unreachable -= component
+        # Check if there are any decoding issues
+        # (phase3_failures alone does not set the flag)
+        if decoding_info["errors"] or decoding_info["warnings"] or decoding_info["phase4_force_connects"]:
+            decoding_info["has_issues"] = True
+        else:
+            decoding_info["has_issues"] = False
+        return graph, decoding_info
+    def _is_edge_valid_for_pattern(
+        self,
+        parent_idx: int,
+        new_label: str,
+        node_type: str,
+        current_outgoing: List[str],
+        is_root: bool = False
+    ) -> bool:
+        """
+        Check if adding a new edge with the given label is valid for the node's pattern.
+        """
+        max_count = self._get_max_edge_count(node_type, new_label, is_root)
+        current_count = current_outgoing.count(new_label)
+        if current_count >= max_count:
+            return False
+        return True
+    def _get_max_edge_count(self, node_type: str, edge_label: str, is_root: bool = False) -> int:
+        """
+        Get the maximum allowed count for an edge label based on statistics.
+        """
+        patterns = None
+        # Check if we have statistics
+        if hasattr(self.stats, 'edge_pattern_counts') and self.stats.edge_pattern_counts:
+            if is_root:
+                key = ("ROOT", node_type)
+                patterns = self.stats.edge_pattern_counts.get(key, {})
+            if not patterns and hasattr(self.stats, 'node_type_pattern_counts'):
+                patterns = self.stats.node_type_pattern_counts.get(node_type, {})
+        elif hasattr(self.stats, 'node_type_pattern_counts') and self.stats.node_type_pattern_counts:
+            patterns = self.stats.node_type_pattern_counts.get(node_type, {})
+        if patterns:
+            max_count = 0
+            total = sum(patterns.values())
+            if total == 0:
+                return 1
+            cumulative = 0
+            for pattern, count in sorted(patterns.items(), key=lambda x: -x[1]):
+                pattern_counter = Counter(pattern)
+                label_count = pattern_counter.get(edge_label, 0)
+                max_count = max(max_count, label_count)
+                cumulative += count
+                if cumulative >= 0.95 * total:
+                    break
+            if max_count > 0:
+                return max_count
+            return 1
+        # Fallback: use hard-coded rules
+        if edge_label == "func.arg":
+            if node_type == "ConjunctionalFunctor":
+                return 8
+            elif node_type == "ExpressionFunctor":
+                return 3
+            else:
+                return 1
+        return 1
+    def _sigmoid(self, x: np.ndarray) -> np.ndarray:
+        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
+    def _softmax(self, x: np.ndarray, axis: int = -1) -> np.ndarray:
+        x_max = np.max(x, axis=axis, keepdims=True)
+        exp_x = np.exp(x - x_max)
+        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

lingua/__init__.py ADDED Viewed

File without changes

lingua/concept/standard.py ADDED Viewed

	@@ -0,0 +1,59 @@

+LinguaGraphNodePoses = {
+    "AttributePredicator",
+    "AppositionalPredicator",
+    "FactualPredicator",
+    "LogicalPredicator",
+    "ReferentialPredicator",
+    "ConjunctionalFunctor",
+    "ExpressionFunctor",
+    "GeneralFunctor",
+    "ListFunctor",
+    "ModificationalFunctor",
+    "DeterminerConstant",
+    "InterjectionConstant",
+    "ModificationalConstant",
+    "NominalConstant",
+    "OtherConstant",
+    "PunctuationalConstant",
+    "SymbolConstant"
+}
+LinguaGraphAuxNodeLabels = {
+    "Apposition",
+    "Attribute",
+    "Copula",
+    "Discourse",
+    "Expression",
+    "List",
+    "Missing",
+    "Modification",
+    "Parataxis",
+    "Ref",
+    "Vocative",
+    "Whether",
+}
+LinguaGraphEdges = {
+    "body",
+    "variable",
+    "pred.arg.1",
+    "pred.arg.2",
+    "pred.arg.3",
+    "pred.arg.4",
+    "func.arg",
+    "appositive",
+    "attribute",
+    "discourse",
+    "index",
+    "modification",
+    "punctuation",
+    "ref",
+    "repeat",
+    "vocative",
+}
+LinguaGraphEdges.update( "as:" + edge for edge in list(LinguaGraphEdges))

lingua/learn/__init__.py ADDED Viewed

File without changes

lingua/learn/wordgraph/__init__.py ADDED Viewed

File without changes

lingua/learn/wordgraph/modeler/__init__.py ADDED Viewed

File without changes

lingua/learn/wordgraph/modeler/word2gp.py ADDED Viewed

	@@ -0,0 +1,290 @@

+from graphlib import TopologicalSorter
+from lingua.structure.gpgraph import GPGraph, GPGPhraseNode, GPGAuxNode, GPGEdge, GPGNode
+from collections import defaultdict
+import itertools
+import logging
+logger = logging.getLogger(__name__)
+from lingua.utils.topology_sorter import LinguaGraphTopologySorter
+from lingua.structure.utils import positions2spans
+from lingua.structure.gpgraph import GPGraphVisualizer
+visualizer = GPGraphVisualizer()
+def edge2children(lingua_graph: GPGraph, visiting_node: GPGNode):
+    edge2children_map = defaultdict(list)
+    for child, edge in lingua_graph.children(visiting_node):
+        labels = tuple(edge.label.split("+"))
+        if len(labels) > 1 and not any("@" in label for label in labels):
+            raise ValueError(f"Unsupported edge label: {edge.label}")
+        # if len(labels) > 1 and any(label.startswith("as:") for label in labels[1:]):
+        #     raise ValueError(f"Unsupported edge label: {edge.label}")
+        edge2children_map[labels].append(child)
+    for labels, children in edge2children_map.items():
+        if any("@" in label for label in labels):
+            assert len(children) == 1, f"Expected 1 child for edge {labels}, got {len(children)}"
+    return edge2children_map
+symbol_label2edge = {
+        "Apposition": "@appositive",
+        "Attribute": "@attribute",
+        "Copula": "@copula",
+        "Discourse": "@discourse",
+        "Expression": "@expression",
+        "List": "@list",
+        "Missing": "@missing",
+        "Modification": "@modification",
+        "Parataxis": "@parataxis",
+        "Ref": "@ref",
+        "Vocative": "@vocative",
+        # "Whether": "@whether", # Whether is handled by property
+    }
+symbol_edge2label = {edge: label for label, edge in symbol_label2edge.items()}
+symbol_label2pos = {
+    "Apposition": "AppositionalPredicator",
+    "Attribute": "AttributePredicator",
+    "Copula": "FactualPredicator",
+    "Discourse": "ModificationalFunctor",
+    "Expression": "ExpressionFunctor",
+    "List": "ListFunctor",
+    "Missing": "FactualPredicator",
+    "Modification": "ModificationalFunctor",
+    "Parataxis": "ConjunctionalFunctor",
+    "Ref": "ReferentialPredicator",
+    "Vocative": "ModificationalFunctor",
+    "Whether": "GeneralFunctor", # Whether is handled by property
+}
+def recover_symbolic_nodes(lingua_graph: GPGraph, debug=False):
+    ordered_nodes = list(lingua_graph.topological_sort())
+    updated = False
+    for visiting_node in ordered_nodes:
+        if hasattr(visiting_node, "child_of_whether") and visiting_node.child_of_whether:
+            whether_parent = GPGAuxNode(label="Whether", pos="GeneralFunctor")
+            lingua_graph.add_node(whether_parent)
+            for parent, edge in list(lingua_graph.parents(visiting_node)):
+                # if edge.label.startswith("-"):
+                lingua_graph.remove_relation(parent, visiting_node)
+                lingua_graph.add_relation(parent, whether_parent, edge.label)
+            lingua_graph.add_relation(whether_parent, visiting_node, "func.arg")
+        edge2children_map = edge2children(lingua_graph, visiting_node)
+        if not any("@" in label for labels in edge2children_map.keys() for label in labels):
+            continue
+        # print("I'm processing edge2children_map: ", edge2children_map)
+        edges = sorted(edge2children_map.keys(), key=lambda x: x[0].count("@"), reverse=True)
+        # prefixes = ["@@@", "@@", "@"]
+        # for prefix in prefixes:
+        #     for edge_labels in edges:
+        #         if edge_labels[0].startswith(prefix):
+        #             new_edge_label = edge_labels[0][len(prefix):]
+        #             break
+        for edge_labels in edges:
+            # print("I'm processing edge: ", edge_labels)
+            if not any("@" in label for label in edge_labels):
+                continue
+            children = edge2children_map[edge_labels]
+            assert len(children) == 1, f"Expected 1 child for edge {edge_labels}, got {len(children)}"
+            child = children[0]
+            for this_edge_label in edge_labels:
+                if "@" not in this_edge_label:
+                    continue
+                level = this_edge_label.count("@") - 1
+                if this_edge_label.startswith("as:"):
+                    this_edge_label = this_edge_label[3:][level:]
+                    reverse = True
+                else:
+                    this_edge_label = this_edge_label[level:]
+                    reverse = False
+                symbolic_node_label = symbol_edge2label[this_edge_label]
+                symoblic_node_pos = symbol_label2pos[symbolic_node_label]
+                symbolic_node = GPGAuxNode(label=symbolic_node_label, pos=symoblic_node_pos)
+                lingua_graph.add_node(symbolic_node)
+                if symoblic_node_pos == "ModificationalFunctor":
+                    root = visiting_node
+                    for parent, edge in list(lingua_graph.parents(root)):
+                        if reverse and parent == visiting_node:
+                            continue
+                        #if edge.label.startswith("-"):
+                        lingua_graph.remove_relation(parent, root)
+                        lingua_graph.add_relation(parent, symbolic_node, edge.label)
+                    variable, body = (child, visiting_node) if reverse else (visiting_node, child)
+                    lingua_graph.add_relation(symbolic_node, variable, "variable")
+                    lingua_graph.add_relation(symbolic_node, body, "body")
+                    element_label = "@" * level + this_edge_label
+                    edges_between = lingua_graph.get_edge(visiting_node, child).label.split("+")
+                    assert any(element_label in label for label in edges_between), f"Expected {element_label} in {edges_between}"
+                    left_edges_between = [label for label in edges_between if element_label not in label]
+                    if not left_edges_between:
+                        lingua_graph.remove_relation(visiting_node, child)
+                    else:
+                        lingua_graph.add_relation(visiting_node, child, "+".join(left_edges_between))
+                    updated = True
+                elif symoblic_node_pos.endswith("Predicator"):
+                    root = visiting_node
+                    for parent, edge in list(lingua_graph.parents(root)):
+                        if reverse and parent == visiting_node:
+                            continue
+                        #if edge.label.startswith("-"):
+                        lingua_graph.remove_relation(parent, root)
+                        lingua_graph.add_relation(parent, symbolic_node, edge.label)
+                    arg1, arg2 = (child, visiting_node) if reverse else (visiting_node, child)
+                    lingua_graph.add_relation(symbolic_node, arg1, "pred.arg.1")
+                    lingua_graph.add_relation(symbolic_node, arg2, "pred.arg.2")
+                    element_label = "@" * level + this_edge_label
+                    edges_between = lingua_graph.get_edge(visiting_node, child).label.split("+")
+                    assert any(element_label in label for label in edges_between), f"Expected {element_label} in {edges_between}"
+                    left_edges_between = [label for label in edges_between if element_label not in label]
+                    if not left_edges_between:
+                        lingua_graph.remove_relation(visiting_node, child)
+                    else:
+                        lingua_graph.add_relation(visiting_node, child, "+".join(left_edges_between))
+                    updated = True
+                elif symbolic_node_label in {"Parataxis", "List", "Expression"}:
+                    element_label = "@" * level + this_edge_label
+                    root = child if reverse else visiting_node
+                    for parent, edge in list(lingua_graph.parents(root)):
+                        if reverse and parent == visiting_node:
+                            continue
+                        #if edge.label.startswith("-"):
+                        lingua_graph.remove_relation(parent, root)
+                        lingua_graph.add_relation(parent, symbolic_node, edge.label)
+                    elem_chain_node = visiting_node if reverse else child
+                    elements = [child, visiting_node] if reverse else [visiting_node, child]
+                    while True:
+                        this_edge2children_map = edge2children(lingua_graph, elem_chain_node)
+                        next_element = None
+                        for labels in this_edge2children_map.keys():
+                            if any(element_label in label for label in labels):
+                                assert len(this_edge2children_map[labels]) == 1, f"Expected 1 child for edge {labels}, got {len(this_edge2children_map[labels])}"
+                                next_element = this_edge2children_map[labels][0]
+                                break
+                        if not next_element:
+                            break
+                        elements.append(next_element)
+                        elem_chain_node = next_element
+                    for prev, next in itertools.pairwise(elements):
+                        lingua_graph.add_relation(symbolic_node, prev, "func.arg")
+                        lingua_graph.add_relation(symbolic_node, next, "func.arg")
+                        edges_between = lingua_graph.get_edge(prev, next).label.split("+")
+                        assert element_label in edges_between, f"Expected {element_label} in {edges_between}"
+                        left_edges_between = [label for label in edges_between if label != element_label]
+                        if not left_edges_between:
+                            lingua_graph.remove_relation(prev, next)
+                    updated = True
+    return lingua_graph, updated
+def merge_multi_word_nodes(lingua_graph: GPGraph, debug=False):
+    ordered_nodes = list(lingua_graph.topological_sort())
+    updated = False
+    for visiting_node in ordered_nodes:
+        if not lingua_graph.has_node(visiting_node):
+            continue
+        edge2children_map = edge2children(lingua_graph, visiting_node)
+        if not any("next_word" in label for label in edge2children_map.keys()):
+            continue
+        children = edge2children_map[("next_word",)]
+        assert len(children) == 1, f"Expected 1 child for edge next_word, got {len(children)}"
+        child = children[0]
+        elem_chain_node = child
+        elements = [visiting_node, child]
+        while True:
+            this_edge2children_map = edge2children(lingua_graph, elem_chain_node)
+            if ("next_word",) in this_edge2children_map.keys():
+                next_element = this_edge2children_map[("next_word",)][0]
+                elements.append(next_element)
+                elem_chain_node = next_element
+            else:
+                break
+        node_words = []
+        for word_node in elements:
+            words = list(word_node.words(with_aux=False))
+            assert len(words) == 1, f"Expected 1 word for node {word_node}, got {len(words)}"
+            node_words.append(words[0])
+        new_node = GPGPhraseNode(spans=positions2spans(node_words), pos=visiting_node.pos)
+        lingua_graph.replace(visiting_node, new_node)
+        if hasattr(visiting_node, "child_of_whether") and visiting_node.child_of_whether:
+            new_node.child_of_whether = True
+        for n in elements[1:]:
+            lingua_graph.remove_node(n)
+        updated = True
+    return lingua_graph, updated
+def wordlingua2lingua(lingua_graph: GPGraph, debug=False):
+    lingua_graph, updated1 = merge_multi_word_nodes(lingua_graph, debug=debug)
+    lingua_graph, updated2 = recover_symbolic_nodes(lingua_graph, debug=debug)
+    return lingua_graph, updated1 or updated2

lingua/structure/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Structure package

lingua/structure/basegraph.py ADDED Viewed

	@@ -0,0 +1,887 @@

+"""
+Graph data problem designed for graph2graph learning
+"""
+from typing import List
+import networkx as nx
+class Node:
+    """
+    Node
+    """
+    # def __init__(self, ID=None):
+    #     """
+    #
+    #     :param id:
+    #     :param object:
+    #     :param rep:
+    #     :param state:
+    #     :param prob:
+    #     """
+    #
+    #     self.ID = ID
+    @property
+    def ID(self):
+        """
+        :return:
+        :rtype:
+        """
+        pass
+    @ID.setter
+    def ID(self, id):
+        """
+        :param id:
+        :type id:
+        :return:
+        :rtype:
+        """
+        pass
+    def __hash__(self):
+        """
+        :return:
+        """
+        return hash(self.ID)
+    def __eq__(self, another):
+        """
+        :param another:
+        :return:
+        """
+        return self.ID == another.ID
+class Edge(object):
+    """
+    Edge
+    """
+    def __init__(self, start=None, end=None):
+        """
+        :param object:
+        :param rep:
+        :param state:
+        :param prob:
+        """
+        self.start = start
+        self.end = end
+class Graph(object):
+    """
+    Graph
+    """
+    def __init__(self, g=None):
+        """
+        init graph
+        """
+        super().__init__()
+        if g is None:
+            self.g = nx.Graph()
+            self.node_id_base = 0
+        else:
+            import copy
+            assert (isinstance(g, nx.Graph) or isinstance(g, Graph))
+            if isinstance(g, Graph):
+                self.g = copy.deepcopy(g.g)
+                self.node_id_base = g.node_id_base
+            else:
+                self.g = copy.deepcopy(g)
+                self.node_id_base = max(g.nodes, default=0) + 1
+    def nodes(self):
+        """
+        :return:
+        """
+        for node in self.g.nodes:
+            yield self.get_node(node)
+    def edges(self):
+        """
+        :return:
+        """
+        for s, e in self.g.edges():
+            edge = self.g[s][e]["Edge"]
+            s_node = self.get_node(s)
+            e_node = self.get_node(e)
+            yield (s_node, edge, e_node)
+    def has_node(self, node):
+        """
+        :param node:
+        :return:
+        """
+        return node.ID in self.g
+    def has_edge(self, node1, node2):
+        """
+        :param node1:
+        :param node2:
+        :return:
+        """
+        return node2.ID in self.g[node1.ID]
+    def number_of_nodes(self):
+        """
+        :return:
+        """
+        return nx.number_of_nodes(self.g)
+    def number_of_edges(self):
+        """
+        :return:
+        """
+        return nx.number_of_edges(self.g)
+    def get_node(self, node_id):
+        """
+        :param node_id:
+        :return:
+        """
+        return self.g.nodes[node_id]["Node"]
+    def remove_node(self, node):
+        """
+        :param node:
+        :return:
+        """
+        self.g.remove_node(node.ID)
+    def remove_edge(self, edge):
+        """
+        :param node:
+        :return:
+        """
+        self.g.remove_edge(edge.start, edge.end)
+    def remove_edge_between(self, node1, node2):
+        """
+        :param node1:
+        :type node1:
+        :param node2:
+        :type node2:
+        :return:
+        :rtype:
+        """
+        if self.g.has_edge(node1.ID, node2.ID):
+            self.g.remove_edge(node1.ID, node2.ID)
+    def get_edge(self, node1, node2):
+        """
+        :param node1_id:
+        :param node2_id:
+        :return:
+        """
+        if isinstance(node1, Node):
+            node1 = node1.ID
+        if isinstance(node2, Node):
+            node2 = node2.ID
+        """
+        if type(node1) is not int:
+            node1 = node1.ID
+        if type(node2) is not int:
+            node2 = node2.ID
+        """
+        try:
+            edge = self.g[node1][node2]["Edge"]
+        except KeyError as e:
+            raise Exception("There is no edge between node {0} and {1}".format(node1, node2))
+        return edge
+    def add_node(self, n, reuse_id=False):
+        """
+        :param n:
+        :param id:
+        :return:
+        """
+        if reuse_id:
+            node_id = n.ID
+        else:
+            node_id = self.node_id_base
+            self.node_id_base += 1
+        n.ID = node_id
+        self.g.add_node(node_id, Node=n)
+        return n
+    def add_edge(self, ni, nj, e):
+        """
+        :param ni:
+        :param eij:
+        :param nj:
+        :return:
+        """
+        if not isinstance(ni, Node):
+            ni = self.get_node(ni)
+        if not isinstance(nj, Node):
+            nj = self.get_node(nj)
+        e.start = ni.ID
+        e.end = nj.ID
+        self.g.add_edge(ni.ID, nj.ID, Edge=e)
+    def neighbors(self, node):
+        """
+        :param ni:
+        :return:
+        """
+        for nj in self.g[node.ID]:
+            eij = self.g[node.ID][nj]["Edge"]
+            yield eij, self.get_node(nj)
+    def connected_components(self):
+        """
+        :return:
+        """
+        components = nx.algorithms.components.connected_components(self.g)
+        for component in components:
+            yield [self.get_node(x) for x in component]
+    def breadth_first_dag(self, start_node):
+        """
+        :return:
+        """
+        dag = DirectedGraph()
+        for node in self.nodes():
+            dag.add_node(node.copy(), reuse_id=True)
+        edges = nx.bfs_edges(self.g, start_node.ID)
+        orderd_nodes = [start_node.ID] + [v for u, v in edges]
+        for i, u in enumerate(orderd_nodes):
+            for j, v in enumerate(orderd_nodes):
+                if j <= i:
+                    continue
+                node_u = self.get_node(u)
+                node_v = self.get_node(v)
+                if self.has_edge(node_u, node_v):
+                    edge = self.get_edge(node_u, node_v).copy()
+                    dag.add_edge(node_u, node_v, edge)
+        assert self.number_of_nodes() == dag.number_of_nodes()
+        assert self.number_of_edges() == dag.number_of_edges()
+        return dag
+    def breadth_first_tree(self, start_node):
+        """
+        :return:
+        """
+        dag = DirectedGraph()
+        dag.add_node(start_node.copy(), reuse_id=True)
+        edges = nx.bfs_edges(self.g, start_node.ID)
+        def __get_or_copy_node(u):
+            try:
+                node_u = dag.get_node(u)
+            except:
+                node_u = self.get_node(u).copy()
+                dag.add_node(node_u, reuse_id=True)
+            return node_u
+        for u, v in edges:
+            node_u = __get_or_copy_node(u)
+            node_v = __get_or_copy_node(v)
+            edge = self.get_edge(u, v)
+            dag.add_edge(node_u, node_v, edge)
+        return dag
+    def __copy__(self):
+        """
+        :return:
+        """
+        copied = type(self)()
+        copied.g = self.g.copy()
+        copied.node_id_base = self.node_id_base
+        return copied
+    def __deepcopy__(self, memodict={}):
+        """
+        :param memodict:
+        :type memodict:
+        :return:
+        :rtype:
+        """
+        from copy import deepcopy
+#        copied_g = type(self.g)()
+#
+        copied = type(self)()
+        memodict[id(self)] = copied
+        for node in self.nodes():
+            new_node = deepcopy(node)
+            new_node = copied.add_node(new_node, reuse_id=True)
+            assert new_node.ID == node.ID, "Node ID is not copied correctly {0} {1}".format(new_node.ID, node.ID)
+        for (s_node, edge, e_node) in self.edges():
+            copied.add_edge(s_node, e_node, deepcopy(edge))
+        copied.node_id_base = self.node_id_base
+        return copied
+    def offsprings(self, node, filter=None):
+        """
+        :param node:
+        :return:
+        """
+        for node_id in nx.dfs_postorder_nodes(self.g, node.ID):
+            node = self.get_node(node_id)
+            if not filter or filter(node):
+                yield self.get_node(node_id)
+    def subgraph(self, nodes):
+        """
+        :param nodes:
+        :return:
+        """
+        node_ids = [n.ID if isinstance(n, Node) else n for n in nodes]
+        subgraph = self.g.subgraph(node_ids).copy()
+        result = self.__class__()
+        result.g = subgraph
+        return result
+    def has_path(self, node1, node2):
+        """
+        :param node1:
+        :param node2:
+        :return:
+        """
+        return nx.algorithms.shortest_paths.has_path(self.g, node1.ID, node2.ID)
+    def dual(self):
+        """
+        return the dual graph
+        the dual graph is the graph with edges corresponding nodes and
+        nodes corresponding edges
+        """
+        dual = Graph()
+        edge_node_map = dict()
+        for edge in self.edges():
+            node = Node(value=edge.value)
+            dual.add_node(node)
+            edge_node_map[(edge.start, edge.end)] = node
+            edge_node_map[(edge.end, edge.start)] = node
+        for node in self.nodes():
+            edges = list(self.g.edges(node.ID))
+            # since the end node is added
+            assert len(edges) >= 2, "Edge number should larger than 2 " \
+                                    "since the end node is added"
+            for idx1, (edge1_start, edge1_end) in enumerate(edges):
+                for (edge2_start, edge2_end) in edges[idx1 + 1:]:
+                    node1 = edge_node_map[(edge1_start, edge1_end)]
+                    node2 = edge_node_map[(edge2_start, edge2_end)]
+                    dual.add_edge(node1, node2, Edge(value=node.value))
+        return dual
+    #
+    # def visualize(self, file_name=None):
+    #     """
+    #
+    #     :return:
+    #     """
+    #
+    #     visual_g = type(self.g)()
+    #
+    #     for node in self.nodes():
+    #         visual_g.add_node(node.ID, label=self.node_label(node))
+    #
+    #     for node_s, edge, node_e in self.edges():
+    #
+    #         visual_g.add_edge(node_s.ID, node_e.ID, label=self.edge_label(edge))
+    #
+    #     from networkx.drawing.nx_agraph import graphviz_layout, to_agraph
+    #
+    #     A = to_agraph(visual_g)
+    #     if file_name:
+    #         A.draw(file_name, prog="dot")
+    #
+    #     return A.to_string()
+class DirectedGraph(Graph):
+    """
+    Directed Graph
+    """
+    def __init__(self, g=None):
+        """
+        :param edge_identifier:
+        """
+        if g is None:
+            g = nx.DiGraph()
+        super().__init__(g=g)
+    def is_connected(self):
+        """
+        :return:
+        """
+        return nx.algorithms.components.is_weakly_connected(self.g)
+    def connected_components(self):
+        """
+        :return:
+        """
+        components = nx.algorithms.components.weakly_connected_components(self.g)
+        for component in components:
+            yield [self.get_node(x) for x in component]
+    def is_leaf(self, node):
+        if len(list(self.children(node))) == 0:
+            return True
+        return False
+    def children(self, node):
+        """
+        :param node:
+        :return:
+        """
+        for child_id in self.g.successors(node.ID):
+            child = self.get_node(child_id)
+            rel = self.get_edge(node, child)
+            yield child, rel
+    def offsprings(self, node, filter=None):
+        """
+        :param node:
+        :return:
+        """
+        yield node
+        for node_id in nx.descendants(self.g, node.ID):
+            node = self.get_node(node_id)
+            if not filter or filter(node):
+                yield self.get_node(node_id)
+    def ancestors(self, node, filter=None):
+        """
+        :param node:
+        :return:
+        """
+        yield node
+        for node_id in nx.ancestors(self.g, node.ID):
+            node = self.get_node(node_id)
+            if not filter or filter(node):
+                yield self.get_node(node_id)
+    def parents(self, node):
+        """
+        :param node:
+        :return:
+        """
+        for parent_id in self.g.predecessors(node.ID):
+            parent = self.get_node(parent_id)
+            rel = self.get_edge(parent, node)
+            yield parent, rel
+    def topological_sort(self):
+        """
+        :return:
+        """
+        for id in nx.topological_sort(self.g):
+            yield self.get_node(id)
+class LearnableGraph(object):
+    """
+    LearnableGraph
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    def node_types(self, orders: List[Node], node_voc_functor):
+        """
+        :return:
+        :rtype:
+        """
+        import numpy as np
+        return np.narray([node_voc_functor(x) for x in orders])
+    def adjmatrix(self, node_orders: List[Node], edge_voc_functor, empty_id=0):
+        """
+        :return:
+        :rtype:
+        """
+        n_nodes = len(node_orders)
+        node2index = dict((node, idx) for idx, node in enumerate(node_orders))
+        import numpy as np
+        in_a = np.ones([n_nodes, n_nodes], dtype=np.int32) * empty_id
+        out_a = np.ones([n_nodes, n_nodes], dtype=np.int32) * empty_id
+        for u, edge, v in self.edges():
+            u_idx = node2index[u]
+            v_idx = node2index[v]
+            e_idx = edge_voc_functor(edge)  # zero is empty type
+            out_a[u_idx][v_idx] = e_idx
+            in_a[v_idx][u_idx] = e_idx
+            if not nx.is_directed(self.g):
+                in_a[u_idx][v_idx] = e_idx
+                out_a[v_idx][u_idx] = e_idx
+        return (in_a, out_a)
+    def to_tensor(self, node_orders: List[Node], node_voc, edge_voc, end_node=None):
+        """
+        :return:
+        """
+        node_types = self.node_types(node_orders, node_voc)
+        a_in, a_out = self.adjmatrix(node_orders, edge_voc)
+        if end_node:
+            node_num = len(node_types)
+            node_types.resize((node_num + 1,))
+            node_types[-1] = end_node
+            a_in.resize((node_num + 1, node_num + 1))
+            a_out.resize((node_num + 1, node_num + 1))
+        return node_types, a_in, a_out
+def valid_alignment(choices):
+    """
+    :param choices:
+    :return:
+    """
+    def _inner(i):
+        if i == n:
+            yield tuple(result)
+            return
+        for elt in sets[i] - seen:
+            seen.add(elt)
+            result[i] = elt
+            for t in _inner(i + 1):
+                yield t
+            seen.remove(elt)
+    sets = [set(seq) for seq in choices]
+    n = len(sets)
+    seen = set()
+    result = [None] * n
+    for t in _inner(0):
+        yield t
+def is_valid_topology_sort(dag, node_objs):
+    """
+    decide whether the order of nodes in dag2 is a valid topological sort order of dag1
+    :param dag:
+    :param pred_node_objs donot contain the start node:
+    :return:
+    """
+    target_nodes = list(dag.nodes())
+    target_node_objects = [n.object for n in target_nodes]
+    choices = []
+    for i, node_obj in enumerate(node_objs):
+        cur_choice = []
+        for j, target_node in enumerate(target_node_objects):
+            if target_node == node_obj:
+                cur_choice.append(j)
+        if len(cur_choice) == 0:
+            return False
+        choices.append(cur_choice)
+    for align in valid_alignment(choices):
+        if len(set(align)) != len(align):
+            continue
+        node_ids = [target_nodes[i].ID for i in align]
+        bad_align = False
+        for id, node_id in enumerate(node_ids):
+            if nx.descendants(dag.g, node_id).intersection(set(node_ids[:id])):
+                bad_align = True
+                break
+            if nx.ancestors(dag.g, node_id).intersection(set(node_ids[id + 1:])):
+                bad_align = True
+                break
+        if not bad_align:
+            return True
+    return False
+def not_isomorphic(graph_a, graph_b):
+    """
+    :param graph_a:
+    :param graph_b:
+    :return:
+    """
+    return nx.faster_could_be_isomorphic(graph_a.g, graph_b.g)
+def dot2image(dot_string, file_name=None, program="dot", format=None, return_img=False):
+    """
+    @param g:
+    @param file_name:
+    @return:
+    """
+    from PIL import Image
+    import os
+    import tempfile
+    dot_file = tempfile.NamedTemporaryFile(mode='w', suffix=".dot", delete=False)
+    dot_file.write(dot_string)
+    dot_file.close()
+    if not format:
+        format = "svg"
+    if not file_name and return_img:
+        import tempfile
+        fout = tempfile.NamedTemporaryFile(suffix="." + format)
+        file_name = fout.name
+    return_val = os.system(f'{program} -T {format} "{dot_file.name}" -o "{file_name}"')
+    assert return_val == 0
+    if return_img:
+        return Image.open(file_name)
+class GraphVisualizer(object):
+    """
+    BasicGraphVisualizer
+    """
+    def node_label(self, graph, node, *args, **kwargs):
+        """
+        :param node:
+        :type node:
+        :return:
+        :rtype:
+        """
+        return str(node)
+    def node_style(self, graph, node, *args, **kwargs):
+        """
+        @param graph:
+        @param node:
+        @param args:
+        @param kwargs:
+        @return:
+        """
+        return {}
+    def edge_label(self, graph, edge, *args, **kwargs):
+        """
+        :param edge:
+        :type edge:
+        :return:
+        :rtype:
+        """
+        return str(edge)
+    def edge_style(self, graph, edge, *args, **kwargs):
+        """
+        @param graph:
+        @param edge:
+        @param args:
+        @param kwargs:
+        @return:
+        """
+        return {}
+    def visualize(self, graph, file_name=None, return_img=False, format="svg", no_text=False, *args, **kwargs):
+        """
+        @return:
+        @rtype:
+        """
+        import io
+        dot_string = io.StringIO()
+        dot_string.write("strict digraph {\n")
+        node2index = dict()
+        for index, node_id in enumerate(graph.g.nodes()):
+            node = graph.get_node(node_id)
+            node_label = self.node_label(graph, node, no_text=no_text, *args, **kwargs)
+            node_style = self.node_style(graph, node, *args, **kwargs)
+            node2index[node.ID] = index
+            node_attr = ['label="{0}"'.format(node_label)]
+            for k, v in node_style.items():
+                node_attr.append('{0}="{1}"'.format(k, v))
+            vis_node_label = '{0}\t[{1}]; \n'.format(
+                index, ", ".join(node_attr)
+            )
+            dot_string.write(vis_node_label)
+            # if simple:
+            #     g.add_node(id2index[node_id], label=node_text, shape=shape)
+            # else:
+            #     g.add_node(node_id, label=node_text, shape=shape)
+        for s, e in graph.g.edges():
+            edge = graph.g[s][e]["Edge"]
+            edge_label = self.edge_label(graph, edge, *args, **kwargs)
+            edge_style = self.edge_style(graph, edge, *args, **kwargs)
+            edge_attr = ['label="{0}"'.format(edge_label)]
+            for k, v in edge_style.items():
+                edge_attr.append('{0}="{1}"'.format(k, v))
+            s = node2index[s]
+            e = node2index[e]
+            dot_string.write('{0}\t->\t{1}\t[{2}];\n'.format(
+                s, e, ", ".join(edge_attr)
+            ))
+        dot_string.write("}\n")
+        dot_string = dot_string.getvalue()
+        result = dot_string
+        if file_name or return_img:
+            image = dot2image(dot_string, file_name=file_name, return_img=return_img,
+                              format=format)
+        if return_img:
+            result = image
+        return result

lingua/structure/gpgraph.py ADDED Viewed

	@@ -0,0 +1,1683 @@

+"""
+oia graph
+"""
+# import numpy as np
+# logger = logging.getLogger(__name__)
+import copy
+import json
+import re
+from enum import Enum
+from typing import Union
+from .basegraph import Node, Edge, DirectedGraph, LearnableGraph, GraphVisualizer
+from .utils import positions2spans, CompactJSONEncoder
+arg_placeholder_pattern = re.compile(r'{\d+}')
+def sent_spans2list(spans):
+    spans_list = list()
+    for span in spans:
+        if isinstance(span, str):
+            spans_list.append({'label': span})
+        else:
+            assert isinstance(span, tuple) and len(span) == 2
+            spans_list.append({'start': span[0], 'end': span[1]})
+    return spans_list
+def is_arg_placeholder(string):
+    """
+    @param string:
+    @return:
+    """
+    return re.match(arg_placeholder_pattern, string) is not None
+class GPGNode(Node):
+    """
+    GPGNode
+    """
+    def __init__(self, id=None, pos=None, confidence=None, *args, **kwargs):
+        self.id = id
+        self.pos = pos
+        self.confidence = confidence
+    @property
+    def ID(self):
+        """
+        :return:
+        """
+        return self.id
+    @ID.setter
+    def ID(self, id):
+        """
+        setter
+        """
+        self.id = id
+    def __hash__(self):
+        """
+        :return:
+        """
+        return hash(self.ID)
+    def __eq__(self, another):
+        """
+        :param another:
+        :return:
+        """
+        return self.ID == another.ID
+    def copy(self):
+        return copy.deepcopy(self)
+def get_start(x):
+    if isinstance(x, (tuple, list)):
+        return x[0]
+    elif isinstance(x, int):
+        return x
+    else:
+        raise ValueError('unexpected span')
+def standardize_spans(spans):
+    """
+    @param spans:
+    @type spans:
+    @return:
+    @rtype:
+    """
+    standardized = []
+    # deduplicated = []
+    # span_set = set()
+    # for span in spans:
+    #     if isinstance(span, int):
+    #         span = (span, span)
+    #     if tuple(span) not in span_set:
+    #         deduplicated.append(span)
+    #         span_set.add(tuple(span))
+    #     else:
+    #         continue
+    # spans = deduplicated
+    idx = 0
+    while idx < len(spans):
+        span = spans[idx]
+        if isinstance(span, int):
+            standardized.append((span, span))
+        elif isinstance(span, str):
+            standardized.append(span)
+        # elif isinstance(span, tuple) and isinstance(span[0], str):
+        #     standardized.append(span[0])
+        elif isinstance(span, (tuple, list)):
+            assert len(span) == 2
+            standardized.append(tuple(span))
+            # we merge next span if it is continuous
+            # to_merge = idx
+            # break_continuous = False
+            # for j in range(idx + 1, len(spans)):
+            #     if not isinstance(spans[j], (tuple, list)):
+            #         break
+            #     if spans[j][0] == spans[j-1][1] + 1 and not break_continuous:
+            #         to_merge = j
+            #     else:
+            #         break_continuous = True
+            #
+            # merged_span = (span[0], spans[to_merge][1])
+            # idx += to_merge - idx
+            # standardized.append(tuple(merged_span))
+        else:
+            raise ValueError('Invalid span: {}'.format(span))
+        idx += 1
+    return tuple(standardized)
+def readable_spans(spans):
+    """
+    @param spans:
+    @type spans:
+    @return:
+    @rtype:
+    """
+    readable = []
+    for span in spans:
+        if isinstance(span, int):
+            readable.append(span)
+        elif isinstance(span, str):
+            readable.append(span)
+        else:
+            start, end = span
+            if start == end:
+                span = start
+            readable.append(span)
+    return tuple(readable)
+class GPGPhraseNode(GPGNode):
+    """
+    GPGPhraseNode
+    """
+    def __init__(self, spans=None, id=None, pos=None, confidence=None, *args, **kwargs):
+        super().__init__(id=id, pos=pos, confidence=confidence, *args, **kwargs)
+        if spans is not None:
+            self._spans = standardize_spans(spans)
+        else:
+            self._spans = None
+        self.contexts = list()
+    #
+    # @staticmethod
+    # def merge_by_char(spans):
+    #     if any(isinstance(x, str) for x in spans):
+    #         std_span = OIAWordsNode.merge_continuous_spans(spans)
+    #         return std_span
+    #     std_span = list()
+    #     for span in spans:
+    #         for i in range(span[0], span[1] + 1):
+    #             std_span.append(i)
+    #     std_span = list(set(std_span))
+    #     std_span = OIAWordsNode.sort_non_str_spans(std_span)
+    #     std_span = OIAWordsNode.merge_continuous_spans(std_span)
+    #     return std_span
+    #
+    @staticmethod
+    def merge_continuous_spans(spans):
+        new_spans = list()
+        for span in spans:
+            if isinstance(span, int):
+                new_spans.append((span, span))
+            else:
+                new_spans.append(span)
+        spans = new_spans
+        span_list = list()
+        start, end = None, None
+        for idx, span in enumerate(spans):
+            if isinstance(span, str):
+                if start is not None:
+                    span_list.append((start, end))
+                span_list.append(span)
+                start, end = None, None
+            else:
+                s, e = span
+                if len(spans) == 1:
+                    span_list.append(span)
+                    break
+                elif idx == len(spans) - 1:
+                    if start is None:
+                        span_list.append(span)
+                    else:
+                        if end + 1 == s:
+                            span_list.append((start, e))
+                        else:
+                            span_list.append((start, end))
+                            span_list.append(span)
+                else:
+                    if start is None:
+                        start, end = s, e
+                    else:
+                        if end + 1 == s:
+                            end = e
+                        else:
+                            span_list.append((start, end))
+                            start, end = s, e
+        return tuple(span_list)
+    #
+    # @staticmethod
+    # def sort_non_str_spans(spans):
+    #     if not any(isinstance(x, str) for x in spans):
+    #         sorted_spans = sorted(spans, key=lambda x:get_start(x))
+    #         return sorted_spans
+    #     else:
+    #         return spans
+    def has_symbols(self):
+        """
+        @return:
+        @rtype:
+        """
+        for span in self._spans:
+            if isinstance(span, str):
+                return True
+        return False
+    #
+    # def add_span_to_head(self, span):
+    #     """
+    #
+    #     @param span:
+    #     @type span:
+    #     @return:
+    #     @rtype:
+    #     """
+    #     if isinstance(span, str):
+    #         self._spans = list(self._spans)
+    #         self._spans.insert(0, span)
+    #         self._spans = tuple(self._spans)
+    #         return
+    #
+    #     if isinstance(span, int):
+    #         span = [span, span]
+    #     if isinstance(self._spans[0], (list, tuple)) and \
+    #             span[1] == self._spans[0][0] - 1:
+    #         self._spans = tuple([(span[0], self._spans[0][1])] + list(self._spans)[1:])
+    #     else:
+    #         x = list(self._spans)
+    #         x.insert(0, span)
+    #         self._spans = tuple(x)
+    #
+    # def remove_span_from_head(self, span):
+    #     """
+    #
+    #     @param span:
+    #     @type span:
+    #     @return:
+    #     @rtype:
+    #     """
+    #     if isinstance(span, tuple):
+    #         span_list = list(self._spans)
+    #         for s in list(span):
+    #             span_list.remove(s)
+    #             self._spans = standardize_spans(span_list)
+    #
+    # def add_span_to_tail(self, span):
+    #     """
+    #
+    #     @param span:
+    #     @type span:
+    #     @return:
+    #     @rtype:
+    #     """
+    #     if isinstance(span, str):
+    #         self._spans = list(self._spans)
+    #         self._spans.append(span)
+    #         self._spans = tuple(self._spans)
+    #         return
+    #
+    #     if isinstance(span, int):
+    #         span = [span, span]
+    #
+    #     if isinstance(self._spans[-1], (list, tuple)) and \
+    #             span[0] == self._spans[-1][1] + 1:
+    #         self._spans = tuple(list(self._spans)[:-1] + [(self._spans[-1][0], span[1])])
+    #     else:
+    #         x = list(self._spans)
+    #         x.append(span)
+    #         self._spans = tuple(x)
+    #
+    # def add_spans_to_head(self, spans):
+    #     """
+    #
+    #     @param span:
+    #     @type span:
+    #     @return:
+    #     @rtype:
+    #     """
+    #
+    #     for span in reversed(spans):
+    #         self.add_span_to_head(span)
+    #
+    # def add_spans_to_tail(self, spans):
+    #     """
+    #
+    #     @param span:
+    #     @type span:
+    #     @return:
+    #     @rtype:
+    #     """
+    #
+    #     for span in spans:
+    #         self.add_span_to_tail(span)
+    @property
+    def spans(self):
+        """
+        @return:
+        @rtype:
+        """
+        return self._spans
+    @spans.setter
+    def spans(self, spans):
+        """
+        @param spans:
+        @type spans:
+        @return:
+        @rtype:
+        """
+        raise Exception("spans should not be set directly. please use OIAGraph.modify_node_spans")
+        # self._spans = standardize_spans(spans)
+    # def sort_spans(self):
+    #     assert isinstance(self, OIAWordsNode)
+    #     spans = self.spans
+    #     spans = [x for x in spans if isinstance(x, tuple)]
+    #     sorted_spans = sorted(spans, key=lambda x: x[0])
+    #     return sorted_spans
+    #
+    # def merge_span(self):
+    #     if any(isinstance(span, str) for span in self.spans):
+    #         return self.spans
+    #     sorted_spans = self.sort_spans()
+    #     new_span_list = list()
+    #     s = sorted_spans[0][0]
+    #     e = sorted_spans[0][1]
+    #     for span in sorted_spans[1:]:
+    #         if span[0] == e + 1:
+    #             e = span[1]
+    #         else:
+    #             new_span_list.append((s, e))
+    #             s = span[0]
+    #             e = span[1]
+    #     new_span_list.append((s, e))
+    #     self.spans = new_span_list
+    @property
+    def readable_spans(self):
+        """
+        @return:
+        @rtype:
+        """
+        return readable_spans(self._spans)
+    def __str__(self):
+        """
+        :return:
+        """
+        return ",".join(map(str, self.spans))
+    def __contains__(self, word_id):
+        """
+        :param word_id:
+        :return:
+        """
+        for span in self._spans:
+            if isinstance(span, str):
+                if span == word_id:
+                    return True
+                else:
+                    continue
+            else:
+                start, end = span
+                if start <= word_id <= end:
+                    return True
+        return False
+    def words(self, with_aux=True):
+        """
+        :return:
+        """
+        for span in self._spans:
+            if isinstance(span, str):
+                if with_aux:
+                    yield span
+            else:
+                start, end = span
+                for i in range(start, end + 1):
+                    yield i
+    def indexes(self, with_aux=True):
+        """
+        :return:
+        """
+        for span in self._spans:
+            if isinstance(span, str):
+                pass
+            else:
+                start, end = span
+                for i in range(start, end + 1):
+                    yield i
+class GPGAuxNode(GPGNode):
+    """
+    GPGNode
+    """
+    def __init__(self, label=None, id=None, pos=None, confidence=None, *args, **kwargs):
+        super().__init__(id=id, pos=pos, confidence=confidence, *args, **kwargs)
+        self.label = label
+        self.contexts = list()
+    def __str__(self):
+        """
+        :return:
+        """
+        return self.label
+class GPGTextNode(GPGNode):
+    def __init__(self, text, pos, confidence=None):
+        super().__init__()
+        self.text = text
+        self.pos = pos
+        self.confidence = confidence
+    def __str__(self):
+        """
+        :return:
+        """
+        return self.text
+import networkx as nx
+class GPGEdge(Edge):
+    """
+    a set of relations between a pair of nodes for multiple edges.
+    behaves like a single relation (that is, a string)
+    for code compatability
+    """
+    def __init__(self, label=None, mod=False, confidence=None):
+        super().__init__()
+        self.label = label
+        self.confidence = None
+        self.mod = mod
+        self.contexts = []
+        self.confidence = confidence
+    def __str__(self):
+        return self.label
+    def __bool__(self):
+        """
+        :return:
+        """
+        return self.label is not None
+    @property
+    def value(self):
+        """
+        :return:
+        :rtype:
+        """
+        return self.label
+    @value.setter
+    def value(self, value):
+        """
+        :return:
+        :rtype:
+        """
+        self.label = value
+def _empty_hook():
+    """
+    _empty_hook
+    """
+    pass
+class GraphRootMixin:
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.root_ = None
+    @property
+    def root(self):
+        """
+        Return the root
+            * roots = [nodes with zero in-degree].
+            * if only one root found, return it, whether it is virtual or not
+            * else return the node with label == 'Root', that is, the virtual root
+        Contracts:
+            * if no virtual root, the program is running on the training data, and there is only one
+              node with zero in-degree
+        Returns:
+        """
+        if self.root_ is None:
+            topo_roots = [node for node in self.nodes()
+                          if self.g.in_degree[node.ID] == 0]
+            if len(topo_roots) == 1:
+                self.root_ = topo_roots[0]
+            else:
+                virtual_roots = [node for node in self.nodes()
+                                 if isinstance(node, GPGAuxNode) and node.label == 'Root']
+                if len(virtual_roots) == 1:
+                    self.root_ = virtual_roots[0]
+                else:
+                    raise Exception("Bad Graph with multiple roots or no roots. ")
+        return self.root_
+    @root.setter
+    def root(self, value):
+        self.root_ = value
+    def update_root(self):
+        topo_roots = [node for node in self.nodes()
+                          if self.g.in_degree[node.ID] == 0]
+        if len(topo_roots) == 1:
+            self.root_ = topo_roots[0]
+        else:
+            virtual_roots = [node for node in self.nodes()
+                                if isinstance(node, GPGAuxNode) and node.label == 'Root']
+            if len(virtual_roots) == 1:
+                self.root_ = virtual_roots[0]
+            else:
+                raise Exception("Bad Graph with multiple roots or no roots. ")
+class TextGPGraph(DirectedGraph, LearnableGraph, GraphRootMixin):
+    def add_node(self, node: Union[GPGTextNode, GPGAuxNode], reuse_id=False):
+        """
+        Add a node to the graph
+        """
+        assert isinstance(node, (GPGTextNode, GPGAuxNode)), type(node)
+        return super().add_node(node, reuse_id=reuse_id)
+    def add_edge(self, ni, nj, e):
+        """
+        Add an edge to the graph
+        """
+        if isinstance(e, str):
+            e = GPGEdge(label=e)
+        return super().add_edge(ni, nj, e)
+    def node_text(self, node, interval=" "):
+        """
+        Return the text of a node
+        """
+        if isinstance(node, GPGAuxNode):
+            return node.label
+        else:
+            return node.text
+    def add_relation(self, ni, nj, rel):
+        """
+        Add an edge to the graph
+        """
+        if isinstance(rel, str):
+            edge = GPGEdge(rel)
+        elif isinstance(rel, GPGEdge):
+            confidence = rel.confidence
+            edge = GPGEdge(rel.label, confidence=confidence)
+        else:
+            raise Exception("Unknown rel type")
+        return super().add_edge(ni, nj, edge)
+    def remove_relation(self, ni, nj):
+        """
+        Remove an edge from the graph
+        """
+        super().remove_edge_between(ni, nj)
+class GPGraph(DirectedGraph, LearnableGraph, GraphRootMixin):
+    """
+    Dependency graph
+    """
+    def __init__(self, g=None):
+        super().__init__(g=g)
+        self.meta = dict()
+        self.words = []
+        self.context_hook = _empty_hook
+        self.spans2node = dict()
+        if isinstance(g, GPGraph):
+            self.meta = g.meta
+            self.words = g.words
+            self._root = g.root_
+            self.spans2node = g.spans2node
+    def __copy__(self):
+        """
+        :return:
+        """
+        from copy import copy
+        copied = DirectedGraph.__copy__(self)
+        copied.meta = copy(self.meta)
+        copied.words = copy(self.words)
+        copied.context_hook = self.context_hook
+        return copied
+    def __deepcopy__(self, memodict={}):
+        """
+        :param memodict:
+        :type memodict:
+        :return:
+        :rtype:
+        """
+        from copy import deepcopy
+        copied = DirectedGraph.__deepcopy__(self)
+        copied.meta = deepcopy(self.meta)
+        copied.words = deepcopy(self.words)
+        copied.context_hook = deepcopy(self.context_hook)
+        return copied
+    def set_words(self, words):
+        """
+        :param words:
+        :return:
+        """
+        if isinstance(words, list):
+            self.words = words
+        else:
+            raise Exception("words input not list.")
+    #
+    # def modify_node_spans(self, node, spans):
+    #     original_spans = node.spans
+    #     node.spans = spans
+    #     del self.spans2node[original_spans]
+    #     self.spans2node[node.spans] = node
+    def add_words(self, words, pos=None):
+        """
+        :param head:
+        :return:
+        """
+        #
+        # if len(words) == 1 and isinstance(words[0], float):
+        #     words = tuple(words)
+        #     if words not in self.spans2node:
+        #         self.spans2node[words] = self.add_aux("(be)")
+        #
+        #     return self.spans2node[words]
+        #
+        # if any(isinstance(x, float) for x in words):
+        #     raise Exception("float found")
+        spans = positions2spans(words)
+        return self.add_spans(spans, pos)
+    def set_context_hook(self, hook):
+        """
+        Install the callable used to obtain contexts for new nodes and edges.
+        :param hook: zero-argument callable; ``add_node`` and ``add_edge``
+            call it and, when the result is truthy, extend the ``contexts``
+            of the item being added with it
+        :return: None
+        """
+        self.context_hook = hook
+    def clear_context_hook(self):
+        """
+        Restore the default context hook (``_empty_hook``).
+        :return: None
+        """
+        self.context_hook = _empty_hook
+    def add_node(self, node, reuse_id=False):
+        """
+        :param node:
+        :return:
+        """
+        assert isinstance(node, GPGPhraseNode) or isinstance(node, GPGAuxNode), f"Invalid node type: {type(node)}"
+        context = self.context_hook()
+        if context:
+            node.contexts.extend(context)
+        if isinstance(node, GPGPhraseNode):
+            node._spans = standardize_spans(node.spans)
+            if node.spans in self.spans2node:
+                raise Exception("Repeated node found: {}".format(node.spans))
+            self.spans2node[node.spans] = super().add_node(node, reuse_id=reuse_id)
+            return self.spans2node[node.spans]
+        else:
+            node = super().add_node(node, reuse_id=reuse_id)
+            return node
+    def modify_node_spans(self, node, spans):
+        """
+        Modify the spans of a node
+        """
+        assert node.spans in self.spans2node, f"Node spans {node.spans} not found in spans2node: {self.spans2node}"
+        if self.spans2node[node.spans] == node:
+            del self.spans2node[node.spans]
+        node._spans = standardize_spans(spans)
+        self.spans2node[node.spans] = node
+    #
+    # def add_repeated_node(self, node, reuse_id=False):
+    #     """
+    #
+    #     :param node:
+    #     :return:
+    #     """
+    #     assert isinstance(node, OIAWordsNode) or isinstance(node, OIAAuxNode)
+    #
+    #     context = self.context_hook()
+    #     if context:
+    #         node.contexts.extend(context)
+    #
+    #     if isinstance(node, OIAWordsNode):
+    #         node._spans = standardize_spans(node.spans)
+    #
+    #         if node.spans in self.spans2node:
+    #             raise Exception("Repeated node found: {}".format(node.spans))
+    #         self.spans2node[node.spans] = super().add_node(node, reuse_id=reuse_id)
+    #         return self.spans2node[node.spans]
+    #     else:
+    #         node = super().add_node(node, reuse_id=reuse_id)
+    #         return node
+    def remove_node(self, node):
+        if isinstance(node, GPGPhraseNode):
+            # node.spans = standardize_spans(node.spans)
+            if node.spans in self.spans2node:
+                del self.spans2node[node.spans]
+        node = super().remove_node(node)
+        return node
+    def add_spans(self, spans, pos=None):
+        """
+        :param node:
+        :return:
+        """
+        spans = standardize_spans(spans)
+        if spans not in self.spans2node:
+            added_node = GPGPhraseNode(spans)
+            added_node.pos = pos
+            self.spans2node[spans] = super().add_node(added_node)
+        return self.spans2node[spans]
+    def has_node(self, node: GPGNode):
+        """
+        @param node:
+        @type node:
+        @return:
+        @rtype:
+        """
+        if self.g.has_node(node.ID):
+            return True
+        else:
+            return False
+    def has_word(self, words):
+        """
+        :param word:
+        :return:
+        """
+        spans = positions2spans(words)
+        for node in self.nodes():
+            if isinstance(node, GPGPhraseNode) and node.spans == spans:
+                added_node = node
+                return True
+        return False
+    def get_node_by_words(self, positions):
+        """
+        @param positions:
+        @type positions:
+        @return:
+        @rtype:
+        """
+        spans = positions2spans(positions)
+        spans = standardize_spans(spans)
+        if spans in self.spans2node:
+            return self.spans2node[spans]
+        return None
+    def get_node_by_spans(self, spans):
+        spans = standardize_spans(spans)
+        if spans in self.spans2node:
+            return self.spans2node[spans]
+        return None
+    def has_relation(self, node1: GPGNode,
+                     node2: GPGNode,
+                     direct_link=True):
+        """
+        :return:
+        @param node1:
+        @type node1:
+        @param node2:
+        @type node2:
+        """
+        if node1 is None or node2 is None:
+            return False
+        if direct_link and (self.g.has_edge(node1.ID, node2.ID) or self.g.has_edge(node2.ID, node1.ID)):
+            return True
+        elif not direct_link and (
+                nx.algorithms.shortest_paths.generic.has_path(self.g, node1.ID, node2.ID) or
+                nx.algorithms.shortest_paths.generic.has_path(self.g, node2.ID, node1.ID)):
+            return True
+        return False
+    def add_aux(self, label, pos=None):
+        """
+        :param label:
+        :return:
+        """
+        node = GPGAuxNode(label)
+        node.pos = pos
+        self.add_node(node)
+        return node
+    def get_aux(self, label):
+        """
+        :param label:
+        :return:
+        """
+        for node_id in self.g.nodes:
+            node = self.get_node(node_id)
+            if isinstance(node, GPGAuxNode) and node.label == label:
+                yield node
+    def get_edge(self, node1, node2):
+        """
+        :param node1:
+        :param node2:
+        :return:
+        """
+        try:
+            return super().get_edge(node1, node2)
+        except:
+            return None
+    def spans(self):
+        """
+        @return:
+        @rtype:
+        """
+        spans = []
+        for x in self.nodes():
+            if isinstance(x, GPGPhraseNode):
+                for span in x.spans:
+                    if isinstance(span, str):
+                        continue
+                    elif isinstance(span, int):
+                        span = (span, span)
+                        spans.append(span)
+                    else:
+                        spans.append(tuple(span))
+        spans.sort(key=lambda x: x[0])
+        return spans
+    def parents_on_path(self, node, ancestor):
+        """
+        :param node:
+        :param ancestor:
+        :return:
+        """
+        for path in nx.all_simple_paths(self.g, ancestor.ID, node.ID):
+            yield self.get_node(path[-2])
+    def paths(self, node1, node2):
+        """
+        :param node1:
+        :param node2:
+        :return:
+        """
+        for path in nx.all_simple_paths(self.g, node1.ID, node2.ID):
+            yield [self.get_node(x) for x in path]
+    def replace(self, old_node, new_node):
+        """
+        :param old_node:
+        :param new_node:
+        :return:
+        """
+        if new_node.ID == old_node.ID:
+            raise Exception("Bad business logic: cannot replace a node with itself")
+        # if self.g.has_node(new_node.ID):
+        #     raise Exception("Bad business logic: the new node is already in the graph")
+        if not self.has_node(new_node):
+            self.add_node(new_node)
+        for child, rel in self.children(old_node):
+            self.g.add_edge(new_node.ID, child.ID, Edge=rel)
+        for parent, rel in self.parents(old_node):
+            self.g.add_edge(parent.ID, new_node.ID, Edge=rel)
+        self.remove_node(old_node)
+    def add_edge(self, start_node, end_node, edge):
+        """
+        @param start_node:
+        @type start_node:
+        @param end_node:
+        @type end_node:
+        @param edge:
+        @type edge:
+        @return:
+        @rtype:
+        """
+        context = self.context_hook()
+        if context:
+            edge.contexts.extend(context)
+        if start_node.ID not in self.g.nodes():
+            raise ValueError('start node not in graph')
+        if end_node.ID not in self.g.nodes():
+            raise ValueError('end node not in graph')
+        if isinstance(edge, str):
+            edge = GPGEdge(label=edge)
+        # self.add_node(start_node, reuse_id=True)
+        # self.add_node(end_node, reuse_id=True)
+        edge.start = start_node.ID
+        edge.end = end_node.ID
+        self.g.add_edge(start_node.ID, end_node.ID, Edge=edge)
+    # def add_argument(self, pred_node, arg_node, index, mod=False):
+    #     """
+    #     :param node1:
+    #     :param node2:
+    #     :param rel:
+    #     :return:
+    #     """
+    #     # if isinstance(pred_node.ID, int) or isinstance(pred_node.ID, str):
+    #     #     raise Exception("Bad ID")
+    #     # if isinstance(arg_node.ID, int) or isinstance(arg_node.ID, str):
+    #     #     raise Exception("Bad ID")
+    #     if mod:
+    #         if any(self.node_text(arg_node).lower().startswith(x)
+    #                for x in {"what ", "which ", "where ", "who ",
+    #                          "whom ", "when ", "why ", "how "}):
+    #             edge_label = "func.arg"
+    #         else:
+    #             edge_label = "as:pred.arg.{0}".format(index)
+    #         edge = GPGEdge(label=edge_label, mod=mod)
+    #         self.add_edge(arg_node, pred_node, edge)
+    #     else:
+    #         edge_label = "pred.arg.{0}".format(index)
+    #         edge = GPGEdge(label=edge_label, mod=mod)
+    #         self.add_edge(pred_node, arg_node, edge)
+    # def add_mod(self, modifier, center):
+    #     """
+    #     :param target:
+    #     :param source:
+    #     :return:
+    #     """
+    #     edge = GPGEdge(label="modification", mod=False)
+    #     self.add_edge(center, modifier, edge)
+    # def add_function(self, functor, argument, index=None):
+    #     """
+    #     :param functor:
+    #     :param argument:
+    #     :return:
+    #     """
+    #     if index is None:
+    #         index = "1"
+    #     edge_label = "func.arg" #.format(index)
+    #     #        if isinstance(functor.ID, int) or isinstance(functor.ID, str):
+    #     #            raise Exception("Bad ID")
+    #     #        if isinstance(argument.ID, int) or isinstance(argument.ID, str):
+    #     #            raise Exception("Bad ID")
+    #     edge = GPGEdge(label=edge_label, mod=False)
+    #     functor.is_func = True
+    #     self.add_edge(functor, argument, edge)
+    # def add_ref(self, source, ref):
+    #     """
+    #     :param target:
+    #     :param source:
+    #     :return:
+    #     """
+    #     edge = GPGEdge(label="ref", mod=False)
+    #     self.add_edge(source, ref, edge)
+    def add_relation(self, node1, node2, rel, confidence=None):
+        """
+        :param node1:
+        :param node2:
+        :param rel:
+        :return:
+        """
+        if isinstance(rel, str):
+            edge = GPGEdge(rel, confidence=confidence)
+        elif isinstance(rel, GPGEdge):
+            if confidence is None:
+                confidence = rel.confidence
+            edge = GPGEdge(rel.label, confidence=confidence)
+        else:
+            raise Exception("Unknown rel type")
+        self.add_edge(node1, node2, edge)
+    def merge_continuous_spans(self):
+        nodes = list(self.nodes())
+        for node in nodes:
+            if isinstance(node, GPGAuxNode):
+                continue
+            spans = node.spans
+            span_tuple = GPGPhraseNode.merge_continuous_spans(spans)
+            # print('old', spans)
+            # print('new', span_tuple)
+            #del self.spans2node[node.spans]
+            self.modify_node_spans(node, span_tuple)
+            #node.spans = span_list
+            #self.spans2node[node.spans] = node
+    def remove_relation(self, node1, node2):
+        """
+        Remove the edge between two nodes by delegating to the base-class
+        ``remove_edge_between``.
+        :param node1: first node
+        :param node2: second node
+        :return: None
+        """
+        super().remove_edge_between(node1, node2)
+    def relations(self):
+        """
+        Return the edges of the graph.
+        :return: whatever the base-class ``edges()`` returns; other methods
+            of this class unpack each item as ``(node1, edge, node2)``
+        """
+        return super().edges()
+    def node_text(self, node, interval=" "):
+        """
+        @param node:
+        @type node:
+        @return:
+        @rtype:
+        """
+        if isinstance(node, GPGPhraseNode):
+            node_texts = []
+            for span in node.spans:
+                if isinstance(span, str):
+                    node_texts.append(span)
+                elif isinstance(span, tuple) and isinstance(span[0], str):
+                    node_texts.append(span[0])
+                else:
+                    start, end = span
+                    for i in range(start, end + 1):
+                        node_texts.append(self.words[i])
+            node_text = interval.join(node_texts)
+        else:
+            node_text = node.label
+        return node_text
+    def topological_sort(self):
+        """
+        :return:
+        """
+        for id in nx.topological_sort(self.g):
+            yield self.get_node(id)
+    @staticmethod
+    def parse(json_obj):
+        """
+        Parse a JSON object into an OIAGraph.
+        :param json_obj: JSON object representing the graph
+        :param check_valid: Whether to validate the graph after parsing
+        :return: OIAGraph instance
+        """
+        if isinstance(json_obj, str):
+            json_obj = json.loads(json_obj)
+        assert isinstance(json_obj, dict)
+        graph = GPGraph()
+        graph.meta = json_obj["meta"]
+        graph.words = [0] * len(json_obj["words"])
+        if isinstance(json_obj["words"][0], list):
+            for id, word in json_obj["words"]:
+                graph.words[id] = word
+        elif isinstance(json_obj["words"][0], str):
+            for id, word in enumerate(json_obj["words"]):
+                graph.words[id] = word
+        else:
+            raise TypeError("Invalid word format")
+        if len(json_obj["oia"]["nodes"]) == 0:
+            return graph
+        nodes = dict()
+        for node_info in json_obj["oia"]["nodes"]:
+            if isinstance(node_info, list):
+                id, spans, pos = node_info[:3]
+                confidence = node_info[3] if len(node_info) > 3 else None
+            elif isinstance(node_info, dict):
+                spans = node_info['spans']
+                pos = node_info['type']
+                id = len(nodes)
+                confidence = node_info.get('confidence')
+            else:
+                raise TypeError("Invalid node format")
+            if isinstance(spans, str) or (isinstance(spans, (list, tuple)) and len(spans) == 1 and isinstance(spans[0], str)):
+                aux_node = GPGAuxNode(spans[0] if isinstance(spans, (list, tuple)) else spans)
+                aux_node.ID = id
+                node = graph.add_node(aux_node, reuse_id=True)
+            # elif all(span in ArgPlaceholders for span in spans):
+            #     aux_node = OIAAuxNode('Parataxis')
+            #     aux_node.ID = id
+            #     node = graph.add_node(aux_node, reuse_id=True)
+            else:
+                node = GPGPhraseNode(spans)
+                node.ID = id
+                graph.add_node(node, reuse_id=True)
+            node.pos = pos
+            node.confidence = confidence
+            nodes[id] = node
+        graph.node_id_base = max([node.ID for node in graph.nodes()]) + 1
+        for edge_info in json_obj["oia"]["edges"]:
+            if isinstance(edge_info, list):
+                n1_id, edge_label, n2_id = edge_info[:3]
+                confidence = edge_info[3] if len(edge_info) > 3 else None
+            elif isinstance(edge_info, dict):
+                n1_id = edge_info['start']
+                n2_id = edge_info['end']
+                edge_label = edge_info['label']
+                confidence = edge_info.get('confidence')
+            else:
+                raise TypeError("Invalid edge format")
+            node1 = nodes[n1_id]
+            node2 = nodes[n2_id]
+            graph.add_relation(node1, node2, edge_label, confidence)
+        return graph
+    # @staticmethod
+    # def validate(graph):
+    #     """
+    #     Validate the OIAGraph structure.
+    #     Raises an exception if the graph is invalid.
+    #     """
+    #     for node in graph.nodes():
+    #         if isinstance(node, OIAAuxNode):
+    #             if node.label[0] not in ExtraNodeToken:
+    #                 raise Exception(f"Invalid auxiliary node {node.label}")
+    #         elif isinstance(node, OIAWordsNode):
+    #             for span in node.spans:
+    #                 if isinstance(span, str) and span not in ExtraNodeToken:
+    #                     raise Exception(f"Invalid node span {span}")
+    #         if node.pos is not None and node.pos not in NodePoses:
+    #             raise Exception(f"Invalid node type {node.pos}")
+    #     for _, edge, _ in graph.relations():
+    #         if edge.label not in Edges:
+    #             raise Exception(f"Invalid edge {edge.label}")
+    # @staticmethod
+    # def validate(graph):
+    #     """
+    #     Validate the OIAGraph structure.
+    #     Raises an exception if the graph is invalid.
+    #     """
+    #     for node in graph.nodes():
+    #         if isinstance(node, OIAAuxNode):
+    #             if node.label[0] not in ExtraNodeToken:
+    #                 raise Exception(f"Invalid auxiliary node {node.label}")
+    #         elif isinstance(node, OIAWordsNode):
+    #             for span in node.spans:
+    #                 if isinstance(span, str) and span not in ExtraNodeToken:
+    #                     raise Exception(f"Invalid node span {span}")
+    #         if node.pos is not None and node.pos not in NodePoses:
+    #             raise Exception(f"Invalid node type {node.pos}")
+    #     for _, edge, _ in graph.relations():
+    #         if edge.label not in Edges:
+    #             raise Exception(f"Invalid edge {edge.label}")
+    def data_for_label(self):
+        """
+        @return:
+        @rtype:
+        """
+        oia = dict()
+        oia["nodes"] = []
+        node2idx = dict()
+        for idx, node in enumerate(self.nodes()):
+            node2idx[node.ID] = idx
+            # oia["nodes"].append((idx, node.spans, node.pos))
+            if isinstance(node, GPGPhraseNode):
+                oia["nodes"].append((idx, node.spans, node.pos, node.confidence))
+            else:
+                oia["nodes"].append((idx, node.label, node.pos, node.confidence))
+        oia["edges"] = []
+        for idx, (n1, edge, n2) in enumerate(self.relations()):
+            n1_id = node2idx[n1.ID]
+            n2_id = node2idx[n2.ID]
+            oia["edges"].append((n1_id, edge.label, n2_id, edge.confidence))
+        data = dict()
+        data["meta"] = self.meta
+        data['words'] = [(idx, word) for idx, word in enumerate(self.words)]
+        data['oia'] = oia
+        return data
+    def data(self):
+        """
+        @return:
+        @rtype:
+        """
+        oia = dict()
+        oia["nodes"] = []
+        node2idx = dict()
+        for idx, node in enumerate(self.nodes()):
+            node2idx[node.ID] = idx
+            # oia["nodes"].append((idx, node.spans, node.pos))
+            if isinstance(node, GPGPhraseNode):
+                if node.confidence is None:
+                    oia["nodes"].append({'spans': node.spans, 'type': node.pos})
+                else:
+                    oia["nodes"].append({'spans': node.spans, 'type': node.pos, 'confidence': node.confidence})
+            else:
+                if node.confidence is None:
+                    oia["nodes"].append({'spans': node.label, 'type': node.pos})
+                else:
+                    oia["nodes"].append({'spans': node.label, 'type': node.pos, 'confidence': node.confidence})
+        oia["edges"] = []
+        for idx, (n1, edge, n2) in enumerate(self.relations()):
+            n1_id = node2idx[n1.ID]
+            n2_id = node2idx[n2.ID]
+            if edge.confidence is None:
+                oia["edges"].append({'start': n1_id, 'end': n2_id, 'label': edge.label})
+            else:
+                oia["edges"].append({'start': n1_id, 'end': n2_id, 'label': edge.label, 'confidence': edge.confidence})
+        data = dict()
+        data["meta"] = self.meta
+        data['words'] = self.words
+        data['oia'] = oia
+        return data
+    def readable(self):
+        """
+        @return:
+        @rtype:
+        """
+        oia = dict()
+        oia["nodes"] = []
+        node2idx = dict()
+        for idx, node in enumerate(self.nodes()):
+            node2idx[node.ID] = idx
+            # oia["nodes"].append((idx, node.spans, node.pos))
+            if isinstance(node, GPGPhraseNode):
+                if not hasattr(node, 'concept'):
+                    oia["nodes"].append({'nid': node2idx[node.ID], 'spans': sent_spans2list(node.spans),
+                            'type': node.pos, 'text': self.node_text(node, interval='')})
+                else:
+                    oia["nodes"].append({'nid': node2idx[node.ID], 'spans': sent_spans2list(node.spans),
+                            'type': node.pos, 'text': self.node_text(node, interval=''), 'concept': node.concept})
+            else:
+                if not hasattr(node, 'concept'):
+                    oia["nodes"].append({'nid': node2idx[node.ID], 'spans': node.label, 'type': node.pos,
+                            'text': self.node_text(node, interval='')})
+                else:
+                    oia["nodes"].append({'nid': node2idx[node.ID], 'spans': node.label, 'type': node.pos,
+                            'text': self.node_text(node, interval=''), 'concept': node.concept})
+        oia["edges"] = []
+        for idx, (n1, edge, n2) in enumerate(self.relations()):
+            n1_id = node2idx[n1.ID]
+            n2_id = node2idx[n2.ID]
+            if edge.confidence is None:
+                oia["edges"].append({'start': n1_id, 'end': n2_id, 'label': edge.label})
+            else:
+                oia["edges"].append({'start': n1_id, 'end': n2_id, 'label': edge.label, 'confidence': edge.confidence})
+        data = dict()
+        data["meta"] = self.meta
+        data['words'] = self.words
+        data['oia'] = oia
+        return data
+    # @staticmethod
+    # def parse_readable(yaml_obj, check_valid=True):
+    #     """
+    #     :param oia_graph:
+    #     :return:
+    #     """
+    #     if isinstance(yaml_obj, str):
+    #         yaml_obj = json.loads(yaml_obj)
+    #     assert isinstance(yaml_obj, dict)
+    #     graph = OIAGraph()
+    #     graph.meta = yaml_obj["meta"]
+    #     graph.words = yaml_obj['words']
+    #     for node_info in yaml_obj["oia"]["nodes"]:
+    #         spans = node_info['spans']
+    #         if isinstance(spans, str):
+    #             if check_valid:
+    #                 if spans not in ExtraNodeToken:
+    #                     raise Exception("Invalid node {0} ".format(spans))
+    #             aux_node = OIAAuxNode(spans)
+    #             aux_node.ID = node_info['nid']
+    #             aux_node.pos = node_info['type']
+    #             if 'concept' in node_info:
+    #                 aux_node.concept = node_info['concept']
+    #             node = graph.add_node(aux_node, reuse_id=True)
+    #         else:
+    #             node_spans = list()
+    #             for span in spans:
+    #                 if 'label' in span:
+    #                     node_spans.append(span['label'])
+    #                 else:
+    #                     assert 'start' in span and 'end' in span
+    #                     node_spans.append((span['start'], span['end']))
+    #             for span in node_spans:
+    #                 if check_valid:
+    #                     if isinstance(span, str) and span not in ExtraNodeToken:
+    #                         raise Exception("Invalid node {0} ".format(span))
+    #             node = OIAWordsNode(node_spans)
+    #             node.ID = node_info['nid']
+    #             node.pos = node_info['type']
+    #             if 'concept' in node_info:
+    #                 node.concept = node_info['concept']
+    #             graph.add_node(node, reuse_id=True)
+    #     for edge_info in yaml_obj["oia"]["edges"]:
+    #         edge_label = edge_info['label']
+    #         if check_valid:
+    #             if edge_label not in Edges:
+    #                 raise Exception("Invalid edge {0} ".format(edge_label))
+    #         start = graph.get_node(edge_info['start'])
+    #         end = graph.get_node(edge_info['end'])
+    #         graph.add_relation(start, end, edge_label)
+    #     return graph
+    def save(self, output_file_path):
+        """
+        :param output_file_path:
+        :return:
+        """
+        data = self.data()
+        with open(output_file_path, "w", encoding="UTF8") as output_file:
+            json.dump(data, output_file, cls=CompactJSONEncoder, ensure_ascii=False)
+class GPGraphVisualizer(GraphVisualizer):
+    """
+    GPGraphVisualizer
+    """
+    def __init__(self, debug=False):
+        self.debug = debug
+    def escape(self, node_text):
+        """
+        @param node_text:
+        @return:
+        """
+        special_tokens = '{}<>"'
+        for token in special_tokens:
+            node_text = node_text.replace(token, "\\" + token)
+        return node_text
+    def node_label(self, graph, node, no_text=False, *args, **kwargs):
+        """
+        :param node:
+        :param dep_graph:
+        :return:
+        """
+        components = []
+        components.append(str(node.ID))
+        if no_text:
+            node_text = ""
+        else:
+            node_text = self.escape(graph.node_text(node))
+        components.append(node_text)
+        if isinstance(node, GPGPhraseNode):
+            span_str = self.escape(str(tuple(node.readable_spans)))
+            components.append(span_str)
+        x = node.pos
+        if x is None:
+            x = 'None'
+        components.append(x)
+        if hasattr(node, 'concept'):
+            components.append(node.concept)
+        label = "{0}".format(" | ".join(components))
+        if self.debug and node.contexts:
+            label = "{{{0}}}".format(" | ".join([label, "\n".join(node.contexts)]))
+        return label
+    def node_style(self, graph, node, *args, **kwargs):
+        """
+        @param node:
+        @return:
+        """
+        style = {}
+        style['shape'] = "record"
+        style['fillcolor'] = "grey"
+        style['style'] = "filled"
+        return style
+    def edge_label(self, graph, edge, *args, **kwargs):
+        """
+        @param edge:
+        @param debug:
+        @return:
+        """
+        edge_label = edge.label
+        if self.debug and edge.contexts:
+            edge_label = "{{{0}}}".format("|".join([edge_label, "\n".join(edge.contexts)]))
+        return edge_label
+    def edge_style(self, graph, edge, *args, **kwargs):
+        """
+        @param node:
+        @return:
+        """
+        style = {}
+        return style
+from typing import List
+import string
+def get_word_positions(sentence: str, words: List[str]) -> List[tuple]:
+    """
+    Get the starting and ending character positions for each word in the sentence.
+    Args:
+        sentence (str): The original sentence.
+        words (List[str]): The list of words in the sentence.
+    Returns:
+        List[tuple]: A list of tuples where each tuple contains the starting and ending positions of a word.
+    """
+    positions = []
+    current_pos = 0
+    for word in words:
+        start_pos = sentence.find(word, current_pos)
+        if start_pos == -1:
+            raise ValueError(f"Word '{word}' in [{words}]not found in the sentence [{sentence}] starting from position {current_pos}.")
+        end_pos = start_pos + len(word)
+        positions.append((start_pos, end_pos))
+        current_pos = end_pos
+    return positions
+def get_node_text(node: GPGPhraseNode, sentence: str, word_positions: List[tuple]):
+    word_indexes = list(node.words(with_aux=True))
+    node_label = ""
+    for w in word_indexes:
+        if isinstance(w, int):
+            word_pos = word_positions[w]
+            word_text = sentence[word_pos[0]: word_pos[1]]
+            if word_pos[0] > 0 and sentence[word_pos[0] - 1] in string.whitespace:
+                word_text = " " + word_text
+            node_label += word_text
+        else:
+            node_label += w # should add space ? += (" " + " ")?
+    return node_label
+class GraphValidator(object):
+    @property
+    def name(self):
+        return None
+    """
+    check whether the graph is valid, and return the severity of the error, and details of the error
+    The severity of the error:
+    *  is an float between 0 and 1. The larger the value, the more severe the error.
+    *  0 means the graph is perfect in the aspect of the check
+    """
+    def validate(self, graph: GPGraph):
+        pass

lingua/structure/utils.py ADDED Viewed

	@@ -0,0 +1,116 @@

+"""
+utilities for graph manipulation and visualization
+"""
+import json
+from typing import Union
+from enum import Enum, auto
+class AutoNamedEnum(Enum):
+    """Enum base class whose ``_generate_next_value_`` hook returns the member name."""
+    # Defined without ``self``: (name, start, count, last_values) is the hook
+    # signature the enum module uses to compute the value of ``auto()``.
+    def _generate_next_value_(name, start, count, last_values):
+        return name
+class CompactJSONEncoder(json.JSONEncoder):
+    """A JSON Encoder that puts small containers on single lines."""
+    CONTAINER_TYPES = (list, tuple, dict)
+    """Container datatypes include primitives or other containers."""
+    MAX_WIDTH = 70
+    """Maximum width of a container that might be put on a single line."""
+    MAX_ITEMS = 3
+    """Maximum number of items in container that might be put on single line."""
+    INDENTATION_CHAR = " "
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.indentation_level = 0
+        if self.indent is None:
+            self.indent = 2
+        self.list_nest_level = 0
+        self.kwargs = kwargs
+    def encode(self, o):
+        """Encode JSON object *o* with respect to single line lists."""
+        if isinstance(o, (list, tuple)):
+            if self._put_on_single_line(o):
+                return "[" + ", ".join(self.encode(el) for el in o) + "]"
+            else:
+                self.indentation_level += 1
+                self.list_nest_level += 1
+                output = [self.indent_str + self.encode(el) for el in o]
+                self.indentation_level -= 1
+                self.list_nest_level -= 1
+                return "[\n" + ",\n".join(output) + "\n" + self.indent_str + "]"
+        elif isinstance(o, dict):
+            if o:
+                if self._put_on_single_line(o):
+                    return "{ " + ", ".join(f"{self.encode(k)}: {self.encode(el)}" for k, el in o.items()) + " }"
+                else:
+                    self.indentation_level += 1
+                    output = [self.indent_str + f"{json.dumps(k)}: {self.encode(v)}" for k, v in o.items()]
+                    self.indentation_level -= 1
+                    return "{\n" + ",\n".join(output) + "\n" + self.indent_str + "}"
+            else:
+                return "{}"
+        elif isinstance(o, float):  # Use scientific notation for floats, where appropiate
+            return format(o, "g")
+#        elif isinstance(o, str):  # escape newlines
+#            o = o.replace("\n", "\\n")
+#            return f'"{o}"'
+        else:
+            return json.dumps(o, **self.kwargs)
+    def _put_on_single_line(self, o):
+        return self._primitives_only(o) and len(o) <= self.MAX_ITEMS and len(str(o)) - 2 <= self.MAX_WIDTH
+    def _primitives_only(self, o: Union[list, tuple, dict]):
+        if self.list_nest_level >= 1:
+            return True
+        if isinstance(o, (list, tuple)):
+            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o)
+        elif isinstance(o, dict):
+            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o.values())
+    @property
+    def indent_str(self) -> str:
+        return self.INDENTATION_CHAR*(self.indentation_level*self.indent)
+def positions2spans(words):
+    """
+    check
+    """
+    if isinstance(words, int):
+        words = tuple([words])
+    elif isinstance(words, (list, tuple)):
+        words = tuple(words)
+    else:
+        raise Exception("the words must be an int or a list of int/str")
+    spans = []
+    idx = 0
+    while idx < len(words):
+        if isinstance(words[idx], str):
+            spans.append(words[idx])
+            idx += 1
+        else:
+            start_idx = idx
+            while start_idx + 1 < len(words) \
+                and (words[start_idx + 1] == words[start_idx] + 1 \
+                    or words[start_idx + 1] == words[start_idx]):
+                start_idx += 1
+            spans.append((words[idx], words[start_idx]))
+            idx = start_idx + 1
+    return tuple(spans)

lingua/utils/__init__.py ADDED Viewed

File without changes

lingua/utils/topology_sorter.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from collections import deque
+from typing import Iterator, List, Tuple
+from lingua.concept.standard import LinguaGraphEdges
+from lingua.structure.gpgraph import GPGraph, GPGNode, GPGEdge, GPGPhraseNode, GPGAuxNode
+class LinguaGraphTopologySorter:
+    """
+    Sorter a LinguaGraph by un-ambiguous topology order .
+    """
+    def __init__(self):
+        """Define edge type priorities"""
+        self.edge_priorities = {
+            # Core semantic relations
+            'pred.arg.1': 0, 'pred.arg.2': 1, 'pred.arg.3': 2, 'pred.arg.4': 3,
+            'func.arg': 4, 'variable': 5, 'body': 6,
+            # Predicate relations
+            'copula': 10, 'appositive': 11, 'discourse': 12, 'ref': 13,
+            'repeat': 14, 'vocative': 15, 'nonsense': 16, 'modification': 17,
+            'parataxis': 18, 'index': 19, 'attribute': 20,
+            # As-predicate relations
+            'as:pred.arg': 30, 'as:pred.arg.1': 31, 'as:pred.arg.2': 32,
+            'as:pred.arg.3': 33, 'as:pred.arg.4': 34, 'as:func.arg': 35,
+            'as:copula': 40, 'as:appositive': 41, 'as:discourse': 42,
+            'as:ref': 43, 'as:repeat': 44, 'as:vocative': 45,
+            'as:nonsense': 46, 'as:parataxis': 47, 'as:index': 48,
+            'as:attribute': 49, 'as:modification': 50, 'as:punctuation': 51, 'punctuation': 52,
+            'as:body': 53, 'as:variable': 54,
+            # Punctuation and structural
+            'local': 60, 'tail': 61,
+        }
+        missing_labels = set(LinguaGraphEdges) - set(self.edge_priorities.keys())
+        if missing_labels:
+            raise ValueError(f"Missing edge labels: {missing_labels}")
+    def _get_edge_rank(self, edge_label: str) -> int:
+        """
+        Rank edge types by importance.
+        """
+        return self.edge_priorities[edge_label]
+    def _get_node_rank_info(self, gpg_graph: GPGraph) -> int:
+        """
+        Get node rank info for a GPGraph.
+        """
+        noderank_info = dict()
+        for node in reversed(list(gpg_graph.topological_sort())):
+            node_words = set()
+            if isinstance(node, GPGPhraseNode):
+                node_words.update(node.words(with_aux=False))
+            for child, _ in gpg_graph.children(node):
+                if isinstance(child, GPGPhraseNode):
+                    node_words.update(child.words(with_aux=False))
+            if node_words:
+                noderank_info[node.ID] = (tuple(sorted(node_words)), "PHRASE")
+            else:
+                assert isinstance(node, GPGAuxNode), "node must be a aux node"
+                noderank_info[node.ID] = ((-100,), node.label)
+        return noderank_info
+    def edges(self, lingua_graph: GPGraph) -> List[Tuple[GPGNode, GPGEdge, GPGNode]]:
+        """
+        Sort edges based on breadth-first search traversal with edge priorities.
+        Ensures uniqueness by processing nodes level-by-level and sorting children
+        by edge priority (with node ID as tie-breaker) before processing.
+        """
+        root = lingua_graph.root
+        visited = set()
+        queue = deque([root])
+        visited.add(root.ID)
+        noderank_info = self._get_node_rank_info(lingua_graph)
+        while queue:
+            node = queue.popleft()
+            # Get all children and sort by edge priority, node rank, then node ID for uniqueness
+            children = list(lingua_graph.children(node))
+            children.sort(key=lambda child_edge: (
+                self._get_edge_rank(child_edge[1].label),  # Primary: edge priority
+                noderank_info[child_edge[0].ID],            # Secondary: node rank (position/label)
+                child_edge[0].ID,                           # Tie-breaker: node ID
+            ))
+            # Yield edges and enqueue children
+            for child, edge in children:
+                yield (node, edge, child)
+                # Add child to queue if not already visited
+                if child.ID not in visited:
+                    visited.add(child.ID)
+                    queue.append(child)
+    def nodes(self, lingua_graph: GPGraph) -> List[GPGNode]:
+        """
+        Sort nodes by topology, ensuring each node is yielded only after all its parents.
+        Maintains the same ordering as edges() for nodes at the same level.
+        """
+        # 1. Calculate in-degrees
+        in_degree = {node.ID: 0 for node in lingua_graph.nodes()}
+        for node in lingua_graph.nodes():
+            for child, _ in lingua_graph.children(node):
+                in_degree[child.ID] += 1
+        # 2. Get rank info
+        noderank_info = self._get_node_rank_info(lingua_graph)
+        # 3. BFS-based Kahn's Algorithm
+        queue = deque([lingua_graph.root])
+        visited = {lingua_graph.root.ID}
+        while queue:
+            node = queue.popleft()
+            yield node
+            # Get children and sort exactly like in edges()
+            children = list(lingua_graph.children(node))
+            children.sort(key=lambda child_edge: (
+                self._get_edge_rank(child_edge[1].label),
+                noderank_info[child_edge[0].ID],
+                child_edge[0].ID,
+            ))
+            for child, edge in children:
+                in_degree[child.ID] -= 1
+                if in_degree[child.ID] == 0:
+                    if child.ID not in visited:
+                        visited.add(child.ID)
+                        queue.append(child)

model.py ADDED Viewed

	@@ -0,0 +1,340 @@

+"""
+Model definitions for Word-Lingua Graph Parser
+This module contains all model-related classes and constants.
+"""
+import torch
+import torch.nn as nn
+from typing import Dict, Optional
+# ============================================================================
+# Node Type Constants
+# ============================================================================
+# Node types (pos) in word-lingua-graph
+# The position of a name in this list is its integer id in NODE_TYPE2ID below,
+# so reordering the list changes the ids.
+NODE_TYPES = [
+    "NominalConstant",
+    "FactualPredicator",
+    "ModificationalFunctor",
+    "PunctuationalConstant",
+    "DeterminerConstant",
+    "ModificationalConstant",
+    "ConjunctionalFunctor",
+    "GeneralFunctor",
+    "LogicalPredicator",
+    "ExpressionFunctor",
+    "SymbolConstant",
+    "OtherConstant",
+    "InterjectionConstant",
+    "AttributePredicator",
+]
+# Build node type mappings
+# name -> id, following the list order
+NODE_TYPE2ID = {nt: i for i, nt in enumerate(NODE_TYPES)}
+# "UNK" takes the id right after the last listed type
+NODE_TYPE2ID["UNK"] = len(NODE_TYPES)
+# id -> name, the inverse mapping (includes "UNK")
+ID2NODE_TYPE = {v: k for k, v in NODE_TYPE2ID.items()}
+# ============================================================================
+# Bilinear Module
+# ============================================================================
+class PairwiseBilinear(nn.Module):
+    """Pairwise bilinear layer for biaffine attention."""
+    def __init__(self, in1_features: int, in2_features: int,
+                 out_features: int, bias_x: bool = True, bias_y: bool = True):
+        super().__init__()
+        self.bias_x = bias_x
+        self.bias_y = bias_y
+        self.in1_features = in1_features
+        self.in2_features = in2_features
+        self.out_features = out_features
+        self.weight = nn.Parameter(
+            torch.zeros(out_features, in1_features + int(bias_x), in2_features + int(bias_y))
+        )
+        bound = 1 / (self.in1_features * self.in2_features) ** 0.25
+        nn.init.uniform_(self.weight, -bound, bound)
+    def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
+        if self.bias_x:
+            x1 = torch.cat([x1, torch.ones_like(x1[..., :1])], dim=-1)
+        if self.bias_y:
+            x2 = torch.cat([x2, torch.ones_like(x2[..., :1])], dim=-1)
+        return torch.einsum('bxi,oij,byj->bxyo', x1, self.weight, x2)
+# ============================================================================
+# MLP Module
+# ============================================================================
+class MLP(nn.Module):
+    """Multi-layer perceptron with dropout."""
+    def __init__(self, input_size: int, hidden_size: int,
+                 dropout: float = 0.1, activation: nn.Module = nn.ReLU):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Linear(input_size, hidden_size),
+            activation(),
+            nn.Dropout(p=dropout)
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.layers(x)
+# ============================================================================
+# Biaffine Layer
+# ============================================================================
+class BiaffineLayer(nn.Module):
+    """Biaffine attention layer for arc and relation prediction."""
+    def __init__(self, input_size: int, label_num: int,
+                 arc_hidden_size: int = 500, rel_hidden_size: int = 100,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.label_num = label_num
+        # MLPs for arc prediction
+        self.mlp_arc_h = MLP(input_size, arc_hidden_size, dropout)
+        self.mlp_arc_d = MLP(input_size, arc_hidden_size, dropout)
+        # MLPs for relation prediction
+        self.mlp_rel_h = MLP(input_size, rel_hidden_size, dropout)
+        self.mlp_rel_d = MLP(input_size, rel_hidden_size, dropout)
+        # Bilinear layers
+        self.arc_atten = PairwiseBilinear(arc_hidden_size, arc_hidden_size, 1)
+        self.rel_atten = PairwiseBilinear(rel_hidden_size, rel_hidden_size, label_num)
+    def forward(self, input: torch.Tensor,
+                attention_mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        """
+        Args:
+            input: (batch, seq_len, hidden_size)
+            attention_mask: (batch, seq_len) - True for valid tokens
+        Returns:
+            dict with arc_logits (batch, seq_len, seq_len) and
+            rel_logits (batch, seq_len, seq_len, label_num)
+        """
+        # Compute representations for arc prediction
+        arc_h = self.mlp_arc_h(input)  # head
+        arc_d = self.mlp_arc_d(input)  # dependent
+        # Compute representations for relation prediction
+        rel_h = self.mlp_rel_h(input)
+        rel_d = self.mlp_rel_d(input)
+        # Compute arc scores: (batch, seq_len, seq_len, 1) -> (batch, seq_len, seq_len)
+        s_arc = self.arc_atten(arc_d, arc_h).squeeze(-1)
+        # Compute relation scores: (batch, seq_len, seq_len, label_num)
+        s_rel = self.rel_atten(rel_d, rel_h)
+        # Create pairwise mask if attention_mask is provided
+        if attention_mask is not None:
+            pairwise_mask = attention_mask.unsqueeze(-1) & attention_mask.unsqueeze(-2)
+            s_rel = s_rel.masked_fill(~pairwise_mask.unsqueeze(-1), -1e9)
+        else:
+            pairwise_mask = None
+        return {
+            "arc_logits": s_arc,
+            "rel_logits": s_rel,
+            "mask": pairwise_mask
+        }
+# ============================================================================
+# Node Classifier (for child_of_whether and is_root)
+# ============================================================================
+class NodeClassifier(nn.Module):
+    """Classifier for node-level binary attributes."""
+    def __init__(self, input_size: int, hidden_size: int = 256,
+                 dropout: float = 0.1, num_classes: int = 2):
+        super().__init__()
+        self.classifier = nn.Sequential(
+            nn.Linear(input_size, hidden_size),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_size, num_classes)
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: (batch, seq_len, hidden_size)
+        Returns:
+            logits: (batch, seq_len, num_classes)
+        """
+        return self.classifier(x)
+# ============================================================================
+# Word-Lingua Parser Model (v2)
+# ============================================================================
+class WordLinguaParserV2(nn.Module):
+    """
+    BERT-based parser for word-lingua-graph prediction.
+    This version adds:
+    - child_of_whether prediction (node-level binary classification)
+    - is_root prediction (node-level binary classification)
+    - node_type prediction (node-level multi-class classification)
+    """
+    def __init__(self, bert_model_name: str, label_num: int,
+                 num_node_types: int = len(NODE_TYPE2ID),
+                 arc_hidden_size: int = 500, rel_hidden_size: int = 100,
+                 node_hidden_size: int = 256,
+                 dropout: float = 0.1, word_pooling: str = "mean"):
+        super().__init__()
+        from transformers import AutoModel
+        self.bert = AutoModel.from_pretrained(bert_model_name)
+        hidden_size = self.bert.config.hidden_size
+        self.word_pooling = word_pooling
+        # Biaffine layer for arc and relation prediction
+        self.biaffine = BiaffineLayer(
+            input_size=hidden_size,
+            label_num=label_num,
+            arc_hidden_size=arc_hidden_size,
+            rel_hidden_size=rel_hidden_size,
+            dropout=dropout
+        )
+        # Node classifiers
+        self.child_of_whether_classifier = NodeClassifier(
+            input_size=hidden_size,
+            hidden_size=node_hidden_size,
+            dropout=dropout,
+            num_classes=2
+        )
+        self.is_root_classifier = NodeClassifier(
+            input_size=hidden_size,
+            hidden_size=node_hidden_size,
+            dropout=dropout,
+            num_classes=2
+        )
+        # Node type classifier (multi-class)
+        self.node_type_classifier = NodeClassifier(
+            input_size=hidden_size,
+            hidden_size=node_hidden_size,
+            dropout=dropout,
+            num_classes=num_node_types
+        )
+        self.dropout = nn.Dropout(dropout)
+    def aggregate_subwords(self, hidden_states: torch.Tensor,
+                           word_to_subword: torch.Tensor,
+                           word_mask: torch.Tensor) -> torch.Tensor:
+        """
+        Aggregate subword embeddings to word-level representations.
+        Memory-efficient implementation that avoids creating large intermediate tensors.
+        """
+        batch_size, max_words, max_subwords = word_to_subword.shape
+        hidden_size = hidden_states.size(-1)
+        device = hidden_states.device
+        # Create output tensor
+        word_repr = torch.zeros(batch_size, max_words, hidden_size, device=device, dtype=hidden_states.dtype)
+        # Get valid subword mask
+        subword_mask = (word_to_subword >= 0)  # [batch, max_words, max_subwords]
+        safe_indices = word_to_subword.clamp(min=0)  # [batch, max_words, max_subwords]
+        if self.word_pooling == "first":
+            # Simply take the first subword for each word
+            first_indices = safe_indices[:, :, 0]  # [batch, max_words]
+            # Gather from hidden_states: [batch, seq_len, hidden] -> [batch, max_words, hidden]
+            gather_indices = first_indices.unsqueeze(-1).expand(-1, -1, hidden_size)
+            word_repr = torch.gather(hidden_states, dim=1, index=gather_indices)
+        elif self.word_pooling == "last":
+            # Take the last valid subword for each word
+            num_subwords = subword_mask.sum(dim=2).clamp(min=1)  # [batch, max_words]
+            last_subword_pos = num_subwords - 1  # [batch, max_words]
+            # Get the index from safe_indices at the last valid position
+            last_indices = torch.gather(safe_indices, dim=2,
+                                        index=last_subword_pos.unsqueeze(-1)).squeeze(-1)  # [batch, max_words]
+            gather_indices = last_indices.unsqueeze(-1).expand(-1, -1, hidden_size)
+            word_repr = torch.gather(hidden_states, dim=1, index=gather_indices)
+        elif self.word_pooling in ["mean", "max"]:
+            # For mean/max pooling, we need to gather all subwords
+            # But we do it more efficiently by flattening and using scatter/gather
+            # Flatten indices: [batch, max_words * max_subwords]
+            flat_indices = safe_indices.view(batch_size, -1)  # [batch, max_words * max_subwords]
+            flat_mask = subword_mask.view(batch_size, -1)  # [batch, max_words * max_subwords]
+            # Gather all subword embeddings at once
+            gather_indices = flat_indices.unsqueeze(-1).expand(-1, -1, hidden_size)
+            flat_embeds = torch.gather(hidden_states, dim=1, index=gather_indices)  # [batch, max_words * max_subwords, hidden]
+            # Reshape back to [batch, max_words, max_subwords, hidden]
+            subword_embeds = flat_embeds.view(batch_size, max_words, max_subwords, hidden_size)
+            subword_mask_expanded = subword_mask.unsqueeze(-1)  # [batch, max_words, max_subwords, 1]
+            if self.word_pooling == "mean":
+                subword_embeds = subword_embeds * subword_mask_expanded.float()
+                word_repr = subword_embeds.sum(dim=2)
+                num_subwords = subword_mask.sum(dim=2, keepdim=True).clamp(min=1).float()
+                word_repr = word_repr / num_subwords
+            else:  # max
+                subword_embeds = subword_embeds.masked_fill(~subword_mask_expanded, float('-inf'))
+                word_repr = subword_embeds.max(dim=2)[0]
+        else:
+            raise ValueError(f"Unknown word_pooling: {self.word_pooling}")
+        return word_repr
+    def forward(self, input_ids: torch.Tensor,
+                attention_mask: torch.Tensor,
+                word_to_subword: torch.Tensor,
+                word_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
+        """
+        Returns:
+            dict with arc_logits, rel_logits, child_of_whether_logits, is_root_logits, node_type_logits
+        """
+        # Get BERT outputs
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        hidden_states = outputs.last_hidden_state
+        hidden_states = self.dropout(hidden_states)
+        # Aggregate subword embeddings to word-level representations
+        word_repr = self.aggregate_subwords(hidden_states, word_to_subword, word_mask)
+        # Apply biaffine attention for arc/relation prediction
+        biaffine_result = self.biaffine(word_repr, word_mask)
+        # Node-level predictions
+        child_of_whether_logits = self.child_of_whether_classifier(word_repr)
+        is_root_logits = self.is_root_classifier(word_repr)
+        node_type_logits = self.node_type_classifier(word_repr)
+        return {
+            "arc_logits": biaffine_result["arc_logits"],
+            "rel_logits": biaffine_result["rel_logits"],
+            "mask": biaffine_result["mask"],
+            "child_of_whether_logits": child_of_whether_logits,
+            "is_root_logits": is_root_logits,
+            "node_type_logits": node_type_logits,
+        }

requirements.txt CHANGED Viewed

@@ -1,6 +1,8 @@
-accelerate
-diffusers
-invisible_watermark
-torch
-transformers
-xformers

+torch>=2.0.0
+transformers>=4.30.0
+gradio>=4.0.0
+huggingface-hub>=0.16.0
+numpy>=1.24.0
+jsonlines>=3.1.0
+networkx>=3.0