File size: 1,551 Bytes
34c8a90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations

from typing import Dict, List, Any, Optional

from mecari.utils.signature import signature_key


def dedup_morphemes(morphs: List[Dict]) -> List[Dict]:
    """Drop duplicate morphemes and return the survivors in a stable order.

    Two morphemes count as duplicates when ``signature_key`` returns the same
    key for both; only the first one encountered is kept.

    The result is sorted by, in order of priority:
      1. ``start_pos`` ascending (missing -> 0),
      2. span length ``end_pos - start_pos`` descending (longer first),
      3. ``surface`` (missing -> ""),
      4. ``reading`` (missing -> ""),
      5. ``pos`` (missing -> "*").

    Args:
        morphs: Morpheme dicts to deduplicate. The list itself is not modified.

    Returns:
        A new list holding the unique morpheme dicts (same objects, not copies).
    """

    def _ordering(morph: Dict):
        begin = morph.get("start_pos", 0)
        length = morph.get("end_pos", 0) - begin
        return (
            begin,
            -length,  # negate so that longer spans sort first
            morph.get("surface", ""),
            morph.get("reading", ""),
            morph.get("pos", "*"),
        )

    # dict keeps insertion order, and setdefault retains the first morpheme
    # seen for every signature while ignoring later repeats.
    first_by_signature: Dict = {}
    for morph in morphs:
        first_by_signature.setdefault(signature_key(morph), morph)

    return sorted(first_by_signature.values(), key=_ordering)


def build_adjacent_edges(morphs: List[Dict]) -> List[Dict]:
    """Build forward edges between morphemes that touch end-to-start.

    An edge ``i -> j`` is emitted when ``i < j`` (list order) and
    ``morphs[i]["end_pos"] == morphs[j]["start_pos"]``. Missing positions are
    treated as 0. Edges are returned ordered by ``source_idx``, then by
    ``target_idx``.

    Candidate targets are looked up through a start-position index instead of
    comparing every pair of morphemes, so the cost follows the number of
    matching positions rather than always being quadratic.

    Args:
        morphs: Morpheme dicts; only ``start_pos`` and ``end_pos`` are read.
            NOTE(review): positions are assumed to be hashable values
            (ints in practice) -- confirm against callers.

    Returns:
        A list of dicts with keys ``source_idx``, ``target_idx`` and
        ``edge_type`` (always ``"forward"``).
    """
    # Index morphemes by start position. enumerate() yields ascending indices,
    # so every bucket is already sorted by index.
    indices_by_start: Dict = {}
    for idx, morph in enumerate(morphs):
        indices_by_start.setdefault(morph.get("start_pos", 0), []).append(idx)

    edges: List[Dict] = []
    for i, source in enumerate(morphs):
        for j in indices_by_start.get(source.get("end_pos", 0), ()):
            # Only link to morphemes that come later in the list.
            if j > i:
                edges.append({"source_idx": i, "target_idx": j, "edge_type": "forward"})
    return edges


def normalize_mecab_candidates(candidates: List[Dict]) -> List[Dict]:
    """Apply shared normalization rules to MeCab candidate dicts.

    Current rule: when a candidate's ``base_form`` is missing, ``None`` or an
    empty string, and its ``surface`` is a non-empty string for which
    ``str.isdigit()`` is true, ``base_form`` is set to the surface.

    The candidate dicts are updated in place.

    Args:
        candidates: Candidate dicts to normalize.

    Returns:
        The same ``candidates`` list object, returned for call chaining.
    """
    for entry in candidates:
        base = entry.get("base_form")
        if not (base is None or base == ""):
            # A base form is already present; leave the candidate untouched.
            continue
        surface = entry.get("surface", "")
        if surface and surface.isdigit():
            entry["base_form"] = surface
    return candidates