# cmd0160's picture
# Adding files
# ee749be
"""LLM-backed triple/entity extractor for PoC.
This module provides a small wrapper that asks the LLM (via LangChain ChatOpenAI)
to extract a small set of triples from a text chunk. It returns a list of dicts:
{"subject": ..., "predicate": ..., "object": ..., "sentence": ..., "confidence": float}
The implementation is intentionally conservative and small for a Spaces-compatible PoC.
"""
from typing import List, Dict
import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
def _parse_json_payload(raw: str):
    """Decode *raw* as JSON, falling back to the outermost ``[...]`` span.

    Returns the decoded value, or an empty list when nothing parseable is found.
    """
    try:
        return json.loads(raw)
    except (ValueError, RecursionError):
        # Not pure JSON (e.g. wrapped in prose or a Markdown fence); try to
        # salvage the array embedded in the reply.
        pass
    start = raw.find("[")
    end = raw.rfind("]")
    if start == -1 or end <= start:
        return []
    try:
        return json.loads(raw[start:end + 1])
    except (ValueError, RecursionError):
        return []


def _coerce_confidence(value, default: float = 0.5) -> float:
    """Convert *value* to a float clamped to [0.0, 1.0], or *default* if unusable."""
    try:
        conf = float(value)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, containers, or ints too large for a float.
        return default
    if conf != conf:
        # json.loads accepts the non-standard NaN literal; it is not a usable score.
        return default
    return min(1.0, max(0.0, conf))


def _clean_triples(data, max_triples: int) -> List[Dict]:
    """Normalise decoded JSON into at most *max_triples* well-formed triple dicts."""
    if isinstance(data, dict):
        if "subject" in data or "s" in data:
            # The model returned a single triple object instead of an array.
            data = [data]
        else:
            # The model wrapped the array, e.g. {"triples": [...]}.
            data = next((v for v in data.values() if isinstance(v, list)), [])
    if not isinstance(data, list):
        # Scalars (null, numbers, strings) carry no triples.
        return []

    cleaned: List[Dict] = []
    for item in data:
        if len(cleaned) >= max_triples:
            break
        if not isinstance(item, dict):
            continue
        # Accept the short aliases s/p/o that models sometimes emit.
        subj = item.get("subject") or item.get("s")
        pred = item.get("predicate") or item.get("p")
        obj = item.get("object") or item.get("o")
        if not (subj and pred and obj):
            continue
        cleaned.append({
            "subject": str(subj),
            "predicate": str(pred),
            "object": str(obj),
            "sentence": str(item.get("sentence") or ""),
            "confidence": _coerce_confidence(item.get("confidence")),
        })
    return cleaned


def extract_triples_with_llm(text: str, max_triples: int = 6, model_name: str = "gpt-3.5-turbo") -> List[Dict]:
    """Extract factual triples from *text* using a chat LLM.

    Note: requires OPENAI_API_KEY in env for ChatOpenAI to work.

    Args:
        text: The text chunk to analyse. Blank or empty text yields ``[]``
            without calling the model.
        max_triples: Upper bound on the number of triples returned. It is
            stated in the prompt and also enforced on the parsed result.
            Values ``<= 0`` yield ``[]`` without calling the model.
        model_name: Model identifier passed to ``ChatOpenAI``.

    Returns:
        A list of dicts with the keys ``subject``, ``predicate``, ``object``,
        ``sentence`` (all ``str``) and ``confidence`` (``float`` clamped to
        [0.0, 1.0], defaulting to 0.5 when missing or invalid). Items lacking
        a subject, predicate or object are dropped. Unparseable model output
        yields ``[]``.
    """
    # Nothing to extract: skip the (billable) model call entirely.
    if max_triples <= 0 or not text or not text.strip():
        return []

    prompt = (
        "You are an assistant that extracts factual triples from a short text.\n"
        "Return a JSON array where each element is an object with keys: subject, predicate, object, sentence, confidence.\n"
        "Be concise and only return JSON. Confidence should be a float between 0.0 and 1.0.\n"
        f"Limit results to at most {max_triples} triples.\n\n"
        "Text:\n<<<TEXT_START>>>\n"
        + text
        + "\n<<<TEXT_END>>>\n"
    )
    # System message to instruct the output format strictly.
    system = SystemMessage(content="You output only JSON arrays. Do not add any extra text.")
    human = HumanMessage(content=prompt)
    # temperature=0.0 is passed to the model for this extraction call.
    llm = ChatOpenAI(model_name=model_name, temperature=0.0)
    resp = llm([system, human])
    raw = (resp.content or "").strip()

    return _clean_triples(_parse_json_payload(raw), max_triples)