|
|
"""LLM-backed triple/entity extractor for the PoC.

This module provides a small wrapper that asks the LLM (via LangChain ChatOpenAI)
to extract a small set of triples from a text chunk. It returns a list of dicts:

    {"subject": ..., "predicate": ..., "object": ..., "sentence": ..., "confidence": float}

The implementation is intentionally conservative and small for a Spaces-compatible PoC.
"""
|
|
from typing import List, Dict |
|
|
import json |
|
|
|
|
|
from langchain.chat_models import ChatOpenAI |
|
|
from langchain.schema import HumanMessage, SystemMessage |
|
|
|
|
|
|
|
|
def _coerce_confidence(value, default: float = 0.5) -> float:
    """Convert a model-supplied confidence into a float within [0.0, 1.0].

    Missing, non-numeric, or NaN values fall back to ``default``; values
    outside the range are clamped to the nearest bound.
    """
    if value is None:
        return default
    try:
        score = float(value)
    except (TypeError, ValueError, OverflowError):
        return default
    if score != score:  # NaN is the only float that is not equal to itself
        return default
    return min(1.0, max(0.0, score))


def _parse_json_array(raw: str) -> list:
    """Parse the model reply into a list, tolerating text around the array.

    Returns an empty list when no JSON list (or single JSON object) can be
    recovered from ``raw``.
    """
    try:
        data = json.loads(raw)
    except ValueError:
        # The reply is not pure JSON (e.g. wrapped in prose or a code fence):
        # retry on the outermost [...] span.
        start = raw.find("[")
        end = raw.rfind("]")
        if start == -1 or end <= start:
            return []
        try:
            data = json.loads(raw[start:end + 1])
        except ValueError:
            return []

    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        # A lone object is treated as a one-element array instead of being
        # iterated key by key.
        return [data]
    # Scalars (numbers, strings, null, booleans) carry no triples.
    return []


def _normalize_triples(items: list, max_triples: int) -> List[Dict]:
    """Keep well-formed triples from ``items``, up to ``max_triples`` entries."""
    cleaned: List[Dict] = []
    for item in items:
        if len(cleaned) >= max_triples:
            break
        if not isinstance(item, dict):
            continue
        # Accept the short aliases s/p/o as a fallback for the full key names.
        subj = item.get("subject") or item.get("s")
        pred = item.get("predicate") or item.get("p")
        obj = item.get("object") or item.get("o")
        if not (subj and pred and obj):
            continue
        cleaned.append({
            "subject": str(subj),
            "predicate": str(pred),
            "object": str(obj),
            "sentence": str(item.get("sentence") or ""),
            "confidence": _coerce_confidence(item.get("confidence")),
        })
    return cleaned


def extract_triples_with_llm(text: str, max_triples: int = 6, model_name: str = "gpt-3.5-turbo") -> List[Dict]:
    """Extract triples from text using a Chat LLM. Returns parsed JSON triples.

    Args:
        text: The text chunk to extract triples from.
        max_triples: Upper bound on the number of triples returned. It is
            stated in the prompt and also enforced on the parsed result.
        model_name: Model identifier passed to ``ChatOpenAI``.

    Returns:
        A list of dicts with the keys ``subject``, ``predicate``, ``object``,
        ``sentence`` (all strings) and ``confidence`` (float in [0.0, 1.0],
        defaulting to 0.5 when missing or unparseable). Entries lacking a
        subject, predicate, or object are dropped. An empty list is returned
        when ``text`` is empty/blank, when ``max_triples`` is not positive,
        or when the reply contains no parseable JSON.

    Note: requires OPENAI_API_KEY in env for ChatOpenAI to work.
    """
    # Nothing to extract: skip the LLM round-trip entirely.
    if max_triples <= 0 or not text or not text.strip():
        return []

    prompt = (
        "You are an assistant that extracts factual triples from a short text.\n"
        "Return a JSON array where each element is an object with keys: subject, predicate, object, sentence, confidence.\n"
        "Be concise and only return JSON. Confidence should be a float between 0.0 and 1.0.\n"
        f"Limit results to at most {max_triples} triples.\n\n"
        "Text:\n<<<TEXT_START>>>\n"
        + text
        + "\n<<<TEXT_END>>>\n"
    )

    system = SystemMessage(content="You output only JSON arrays. Do not add any extra text.")
    human = HumanMessage(content=prompt)

    # temperature=0.0 is requested to keep the extraction as repeatable as possible.
    llm = ChatOpenAI(model_name=model_name, temperature=0.0)
    resp = llm([system, human])
    raw = resp.content.strip()

    return _normalize_triples(_parse_json_array(raw), max_triples)
|
|
|