File size: 3,525 Bytes
ee749be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""RDFLib-backed KG store for PoC.

Stores nodes and simple edges, and links DocumentChunk IDs to KG entities using
the `mentions` predicate. The graph is written to a Turtle (TTL) file when
`save()` is called.
"""
try:
    from rdflib import Graph, URIRef, Literal, Namespace
    from rdflib.namespace import RDF, RDFS
    _HAS_RDFLIB = True
except Exception:
    _HAS_RDFLIB = False

import logging
import os
import uuid
from typing import List, Dict, Optional

# Base URI of the store's namespace; entity, chunk and relation URIs, as well
# as the `mentions` predicate, are all built by appending to this prefix.
NS_URI = "http://example.org/abalone/"

if _HAS_RDFLIB:
    NS = Namespace(NS_URI)

    class KGStore:
        """RDF graph of entities, relations between them, and chunk mentions.

        Entity URIs are minted under ``NS_URI + "entity/"``, chunk URIs under
        ``NS_URI + "chunk/"`` and relation predicates under
        ``NS_URI + "relation/"``. The graph is loaded from ``path`` on
        construction when that file exists, and is written back only when
        :meth:`save` is called.
        """

        # Characters rdflib rejects inside a URIRef: a URI containing any of
        # them is flagged as invalid and cannot be serialized to N3/Turtle.
        _IRI_UNSAFE = frozenset('<>" {}|\\^`')

        def __init__(self, path: str = "./kg_store.ttl"):
            """Create the store, loading ``path`` as Turtle if it already exists.

            Args:
                path: Location of the Turtle file used by :meth:`save` and read
                    here when present.

            A file that cannot be parsed does not raise: the store starts empty
            and a warning is logged, because a later :meth:`save` will overwrite
            that file.
            """
            self.path = path
            self.graph = Graph()
            if os.path.exists(self.path):
                try:
                    self.graph.parse(self.path, format="turtle")
                except Exception as exc:
                    logging.getLogger(__name__).warning(
                        "Could not parse %s as Turtle (%s); starting with an empty graph. "
                        "Calling save() will overwrite the existing file.",
                        self.path,
                        exc,
                    )
                    # Discard anything a partially successful parse may have added.
                    self.graph = Graph()

        @classmethod
        def _slug(cls, text: str) -> str:
            """Turn free text into a URI path segment.

            The text is stripped, lower-cased and its spaces replaced by
            underscores. Characters that rdflib does not accept in a URIRef, and
            ASCII control characters, are then percent-encoded so the resulting
            URI can be serialized. All other characters are kept as they are.
            """
            base = text.strip().lower().replace(" ", "_")
            return "".join(
                f"%{ord(ch):02X}"
                if (ch in cls._IRI_UNSAFE or ord(ch) <= 0x20 or ord(ch) == 0x7F)
                else ch
                for ch in base
            )

        def _entity_uri(self, label: str) -> URIRef:
            """Return the URI for the entity with the given label."""
            return URIRef(f"{NS_URI}entity/{self._slug(label)}")

        def _chunk_uri(self, chunk_id: str) -> URIRef:
            """Return the URI for a chunk; ``chunk_id`` is embedded verbatim."""
            # NOTE(review): ids are not escaped so that find_chunks_for_entity can
            # return them unchanged; presumably they are URI-safe (e.g. UUIDs) - confirm.
            return URIRef(f"{NS_URI}chunk/{chunk_id}")

        def add_entity(self, label: str, description: Optional[str] = None) -> URIRef:
            """Add (or extend) an entity node and return its URI.

            Args:
                label: Entity label; stored as ``rdfs:label`` and used to derive
                    the entity URI.
                description: Optional text stored under the ``description``
                    predicate; skipped when empty or ``None``.

            Returns:
                The entity's URIRef.
            """
            u = self._entity_uri(label)
            self.graph.add((u, RDFS.label, Literal(label)))
            if description:
                self.graph.add((u, NS.description, Literal(description)))
            return u

        def link_chunk_to_entity(self, chunk_id: str, entity_label: str, sentence: str = "", confidence: float = 0.5):
            """Record that a chunk mentions an entity, creating the entity if needed.

            Args:
                chunk_id: Identifier of the chunk.
                entity_label: Label of the mentioned entity.
                sentence: Sentence in which the mention occurs.
                confidence: Confidence of the link; stored as its string form.
            """
            e = self.add_entity(entity_label)
            c = self._chunk_uri(chunk_id)
            self.graph.add((c, NS.mentions, e))
            # Provenance is attached directly to the chunk node, not to the
            # individual mention, so values from several links on the same chunk
            # accumulate there without recording which entity they belong to.
            self.graph.add((c, NS.sentence, Literal(sentence)))
            self.graph.add((c, NS.confidence, Literal(str(confidence))))

        def add_triple(self, subj_label: str, pred_label: str, obj_label: str, provenance: Optional[Dict] = None):
            """Add a relation between two entities, creating both if needed.

            Args:
                subj_label: Label of the subject entity.
                pred_label: Relation name; normalised like an entity label and
                    placed under the ``relation/`` prefix.
                obj_label: Label of the object entity.
                provenance: Optional mapping; when non-empty its ``str()`` form
                    is stored on the subject node.
            """
            s = self.add_entity(subj_label)
            o = self.add_entity(obj_label)
            p = URIRef(f"{NS_URI}relation/{self._slug(pred_label)}")
            self.graph.add((s, p, o))
            if provenance:
                # store provenance on subject node for simplicity
                self.graph.add((s, NS.provenance, Literal(str(provenance))))

        def save(self):
            """Serialize the graph as Turtle to ``self.path``.

            The parent directory is created first if it does not exist.
            """
            parent = os.path.dirname(os.path.abspath(self.path))
            os.makedirs(parent, exist_ok=True)
            self.graph.serialize(destination=self.path, format="turtle")

        def find_chunks_for_entity(self, entity_label: str) -> List[str]:
            """Return the ids of all chunks linked to the entity via ``mentions``.

            Args:
                entity_label: Label of the entity to look up.

            Returns:
                Chunk ids (the part of each chunk URI after ``chunk/``); empty
                when nothing mentions the entity.
            """
            entity = self._entity_uri(entity_label)
            prefix = NS_URI + "chunk/"
            found = []
            # Match the triple pattern directly instead of formatting the URI
            # into a SPARQL string, which fails on URIs containing '>' etc.
            for subject in self.graph.subjects(NS.mentions, entity):
                uri = str(subject)
                if uri.startswith(prefix):
                    found.append(uri[len(prefix):])
            return found

        def query_entities(self, text: str) -> List[str]:
            """Return URIs of nodes whose ``rdfs:label`` occurs in ``text``.

            Matching is a naive case-insensitive substring test. Blank labels
            are ignored (an empty string is a substring of every text), and each
            URI is returned once even if several of its labels match.
            """
            haystack = text.lower()
            matches = []
            seen = set()
            for subject, _, label in self.graph.triples((None, RDFS.label, None)):
                needle = str(label).lower()
                if not needle.strip() or needle not in haystack:
                    continue
                uri = str(subject)
                if uri not in seen:
                    seen.add(uri)
                    matches.append(uri)
            return matches

else:
    class KGStore:
        """Stand-in defined when rdflib could not be imported.

        Constructing it always raises, so the missing dependency is reported at
        the point of use.
        """

        def __init__(self, *args, **kwargs):
            # Any arguments are accepted and ignored; only the error matters.
            message = "rdflib is required for KGStore. Install with `pip install rdflib`"
            raise RuntimeError(message)