"""RDFLib-backed knowledge-graph (KG) store for the PoC.

Stores nodes and simple edges, and links DocumentChunk IDs to KG entities
using the `mentions` predicate. Persists to a TTL (Turtle) file.
"""
try:
    from rdflib import Graph, URIRef, Literal, Namespace
    from rdflib.namespace import RDF, RDFS

    _HAS_RDFLIB = True
except Exception:
    # Kept deliberately broad: any failure while importing rdflib selects the
    # stub KGStore at the bottom of this module instead of failing at import.
    _HAS_RDFLIB = False

from typing import List, Dict, Optional
import uuid
import os

# Base URI under which every entity, chunk, relation and predicate is minted.
NS_URI = "http://example.org/abalone/"

if _HAS_RDFLIB:
    NS = Namespace(NS_URI)

    # Characters that may not appear inside a Turtle/N3 IRI; control characters
    # and whitespace (code points <= 0x20) are handled separately in _slug.
    _UNSAFE_URI_CHARS = frozenset('<>"{}|\\^`')

    class KGStore:
        """In-memory RDF graph with optional persistence to a Turtle file."""

        def __init__(self, path: str = "./kg_store.ttl"):
            """Create the store, loading ``path`` if it already exists.

            Args:
                path: Location of the Turtle file read here and written by
                    :meth:`save`.
            """
            self.path = path
            self.graph = Graph()
            if os.path.exists(self.path):
                try:
                    self.graph.parse(self.path, format="turtle")
                except Exception:
                    # Best effort: an unreadable file must not prevent start-up,
                    # but say so, because a later save() overwrites that file.
                    import logging

                    logging.getLogger(__name__).warning(
                        "Could not parse %s as Turtle; starting with an empty graph",
                        self.path,
                        exc_info=True,
                    )
                    # Discard anything a partially successful parse left behind.
                    self.graph = Graph()

        @staticmethod
        def _slug(text: str) -> str:
            """Normalise ``text`` into a URI path segment.

            Trims, lower-cases and replaces spaces with underscores, then
            percent-encodes the remaining characters that are not allowed in
            a Turtle IRI. Text that was already URI-safe is returned exactly
            as the plain trim/lower/underscore normalisation would give.
            """
            normalised = text.strip().lower().replace(" ", "_")
            return "".join(
                f"%{ord(ch):02X}"
                if ch in _UNSAFE_URI_CHARS or ord(ch) <= 0x20
                else ch
                for ch in normalised
            )

        def _entity_uri(self, label: str) -> URIRef:
            """Return the URI for the entity named ``label``."""
            return URIRef(f"{NS_URI}entity/{self._slug(label)}")

        def _chunk_uri(self, chunk_id: str) -> URIRef:
            """Return the URI for a document chunk.

            ``chunk_id`` is embedded verbatim so that
            :meth:`find_chunks_for_entity` can hand back the same ID.
            """
            return URIRef(f"{NS_URI}chunk/{chunk_id}")

        def add_entity(self, label: str, description: Optional[str] = None) -> URIRef:
            """Add (or re-assert) an entity and return its URI.

            Args:
                label: Human-readable name, stored as ``rdfs:label``.
                description: Optional text stored under ``NS.description``;
                    skipped when empty or ``None``.
            """
            entity = self._entity_uri(label)
            self.graph.add((entity, RDFS.label, Literal(label)))
            if description:
                self.graph.add((entity, NS.description, Literal(description)))
            return entity

        def link_chunk_to_entity(
            self,
            chunk_id: str,
            entity_label: str,
            sentence: str = "",
            confidence: float = 0.5,
        ):
            """Record that chunk ``chunk_id`` mentions ``entity_label``.

            The entity is created if needed. ``sentence`` and ``confidence``
            are attached to the chunk node itself; ``confidence`` is stored as
            a plain string literal.
            """
            entity = self.add_entity(entity_label)
            chunk = self._chunk_uri(chunk_id)
            self.graph.add((chunk, NS.mentions, entity))
            self.graph.add((chunk, NS.sentence, Literal(sentence)))
            self.graph.add((chunk, NS.confidence, Literal(str(confidence))))

        def add_triple(
            self,
            subj_label: str,
            pred_label: str,
            obj_label: str,
            provenance: Optional[Dict] = None,
        ):
            """Add the edge ``subj_label --pred_label--> obj_label``.

            Both endpoints are created as entities when missing. When
            ``provenance`` is given and non-empty, its ``str()`` form is
            attached to the subject under ``NS.provenance``.
            """
            subject = self.add_entity(subj_label)
            obj = self.add_entity(obj_label)
            # Same normalisation as entity labels, so the relation URI is
            # always serialisable.
            predicate = URIRef(f"{NS_URI}relation/{self._slug(pred_label)}")
            self.graph.add((subject, predicate, obj))
            if provenance:
                self.graph.add((subject, NS.provenance, Literal(str(provenance))))

        def save(self):
            """Serialize the graph to ``self.path`` in Turtle format."""
            self.graph.serialize(destination=self.path, format="turtle")

        def find_chunks_for_entity(self, entity_label: str) -> List[str]:
            """Return the IDs of all chunks that mention ``entity_label``."""
            entity = self._entity_uri(entity_label)
            prefix = NS_URI + "chunk/"
            # Direct triple-pattern lookup rather than a string-built SPARQL
            # query, so unusual characters in a label cannot alter the query.
            subjects = (str(s) for s in self.graph.subjects(NS.mentions, entity))
            return [uri[len(prefix):] for uri in subjects if uri.startswith(prefix)]

        def query_entities(self, text: str) -> List[str]:
            """Return URIs of entities whose label occurs in ``text``.

            Matching is a case-insensitive substring test. Empty labels are
            ignored (they would match every text) and each URI is returned at
            most once, in first-seen order.
            """
            haystack = text.lower()
            matches: Dict[str, None] = {}
            for subject, _, label in self.graph.triples((None, RDFS.label, None)):
                needle = str(label).lower()
                if needle and needle in haystack:
                    matches.setdefault(str(subject), None)
            return list(matches)

else:

    class KGStore:
        """Stub used when rdflib is unavailable; instantiation always fails."""

        def __init__(self, *args, **kwargs):
            raise RuntimeError("rdflib is required for KGStore. Install with `pip install rdflib`")