# cmd0160's picture
# Adding files
# ee749be
"""RDFLib-backed KG store for PoC.
Stores nodes and simple edges; links DocumentChunk IDs to KG entities using
`mentions` predicate. Persists to a TTL file.
"""
import os
import uuid
import warnings
from typing import List, Dict, Optional

# rdflib is optional at import time; KGStore raises on construction without it.
try:
    from rdflib import Graph, URIRef, Literal, Namespace
    from rdflib.namespace import RDF, RDFS

    _HAS_RDFLIB = True
except Exception:
    _HAS_RDFLIB = False
# Base IRI under which every entity, chunk and relation node is minted.
NS_URI = "http://example.org/abalone/"

if _HAS_RDFLIB:
    NS = Namespace(NS_URI)

    class KGStore:
        """Knowledge-graph store backed by an rdflib ``Graph`` and a Turtle file.

        Entities live under ``{NS_URI}entity/``, document chunks under
        ``{NS_URI}chunk/`` and relations under ``{NS_URI}relation/``.  Chunks
        are connected to the entities they mention via the ``mentions``
        predicate.  Nothing is written to disk until :meth:`save` is called.
        """

        # Characters rdflib rejects inside a URIRef when serializing; code
        # points up to U+0020 are additionally escaped because the Turtle
        # IRIREF grammar excludes them.
        _IRI_UNSAFE = frozenset('<>"{}|\\^`')

        def __init__(self, path: str = "./kg_store.ttl"):
            """Create the store, loading ``path`` when it already exists.

            Args:
                path: Location of the Turtle file used for persistence.

            A file that exists but cannot be parsed does not raise: a
            ``RuntimeWarning`` is emitted and the store starts empty.  Note
            that a subsequent :meth:`save` will then overwrite that file.
            """
            self.path = path
            self.graph = Graph()
            if os.path.exists(self.path):
                try:
                    self.graph.parse(self.path, format="turtle")
                except Exception as exc:
                    warnings.warn(
                        f"Could not parse {self.path!r} as Turtle ({exc}); "
                        "starting with an empty graph",
                        RuntimeWarning,
                        stacklevel=2,
                    )
                    # Drop anything a partial parse may have left behind.
                    self.graph = Graph()

        @classmethod
        def _slug(cls, text: str) -> str:
            """Normalize free text into an IRI path segment.

            Trims, lower-cases and replaces spaces with underscores, then
            percent-encodes the characters that would make the resulting IRI
            unserializable.  Text without such characters is returned exactly
            as the plain trim/lower/underscore normalization would produce it.
            """
            normalized = text.strip().lower().replace(" ", "_")
            return "".join(
                f"%{ord(ch):02X}" if ch in cls._IRI_UNSAFE or ch <= " " else ch
                for ch in normalized
            )

        def _entity_uri(self, label: str) -> URIRef:
            """Return the IRI of the entity identified by ``label``."""
            return URIRef(f"{NS_URI}entity/{self._slug(label)}")

        def _chunk_uri(self, chunk_id: str) -> URIRef:
            """Return the IRI of the chunk ``chunk_id`` (used verbatim)."""
            return URIRef(f"{NS_URI}chunk/{chunk_id}")

        def add_entity(self, label: str, description: Optional[str] = None) -> URIRef:
            """Ensure an entity node exists and return its IRI.

            Args:
                label: Human-readable name; stored as ``rdfs:label`` and also
                    used (normalized) to derive the IRI.
                description: Optional text stored under ``description``.

            Returns:
                The entity's IRI.
            """
            entity = self._entity_uri(label)
            self.graph.add((entity, RDFS.label, Literal(label)))
            if description:
                self.graph.add((entity, NS.description, Literal(description)))
            return entity

        def link_chunk_to_entity(self, chunk_id: str, entity_label: str, sentence: str = "", confidence: float = 0.5):
            """Record that chunk ``chunk_id`` mentions ``entity_label``.

            The entity is created if needed.  ``sentence`` and ``confidence``
            are attached to the chunk node itself (not to the individual
            mention), so several links from one chunk accumulate values on the
            same node.

            Args:
                chunk_id: Identifier of the document chunk.
                entity_label: Label of the mentioned entity.
                sentence: Supporting sentence; omitted from the graph if empty.
                confidence: Score for the link; stored as its string form to
                    stay compatible with previously saved files.
            """
            entity = self.add_entity(entity_label)
            chunk = self._chunk_uri(chunk_id)
            self.graph.add((chunk, NS.mentions, entity))
            if sentence:
                self.graph.add((chunk, NS.sentence, Literal(sentence)))
            self.graph.add((chunk, NS.confidence, Literal(str(confidence))))

        def add_triple(self, subj_label: str, pred_label: str, obj_label: str, provenance: Optional[Dict] = None):
            """Add ``subject --predicate--> object`` between two entities.

            Both entities are created if needed; the predicate IRI is derived
            from ``pred_label`` under ``{NS_URI}relation/``.

            Args:
                subj_label: Label of the subject entity.
                pred_label: Name of the relation.
                obj_label: Label of the object entity.
                provenance: Optional metadata; its ``str()`` form is stored on
                    the subject node under ``provenance``.
            """
            subject = self.add_entity(subj_label)
            obj = self.add_entity(obj_label)
            predicate = URIRef(f"{NS_URI}relation/{self._slug(pred_label)}")
            self.graph.add((subject, predicate, obj))
            if provenance:
                # Stored on the subject node for simplicity.
                self.graph.add((subject, NS.provenance, Literal(str(provenance))))

        def save(self):
            """Serialize the graph as Turtle to ``self.path``.

            The parent directory is created first when it does not exist.
            """
            parent = os.path.dirname(self.path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            self.graph.serialize(destination=self.path, format="turtle")

        def find_chunks_for_entity(self, entity_label: str) -> List[str]:
            """Return the IDs of all chunks that mention ``entity_label``.

            Uses a direct triple-pattern lookup rather than a string-built
            SPARQL query, so unusual characters in the label cannot break or
            alter the query.
            """
            entity = self._entity_uri(entity_label)
            prefix = NS_URI + "chunk/"
            return [
                str(subject)[len(prefix):]
                for subject in self.graph.subjects(predicate=NS.mentions, object=entity)
                if str(subject).startswith(prefix)
            ]

        def query_entities(self, text: str) -> List[str]:
            """Return IRIs of entities whose label occurs in ``text``.

            Matching is a naive case-insensitive substring test.  Each IRI is
            returned once even if several of its labels match, and empty
            labels are ignored because they would match every text.
            """
            haystack = text.lower()
            matches: Dict[str, None] = {}
            for subject, _, label in self.graph.triples((None, RDFS.label, None)):
                needle = str(label).lower()
                if needle and needle in haystack:
                    matches.setdefault(str(subject), None)
            return list(matches)

else:

    class KGStore:
        """Placeholder used when rdflib is missing; construction always fails."""

        def __init__(self, *args, **kwargs):
            raise RuntimeError("rdflib is required for KGStore. Install with `pip install rdflib`")