"""LLM-backed triple/entity extractor for PoC. This module provides a small wrapper that asks the LLM (via LangChain ChatOpenAI) to extract a small set of triples from a text chunk. It returns a list of dicts: {"subject": ..., "predicate": ..., "object": ..., "sentence": ..., "confidence": float} The implementation is intentionally conservative and small for a Spaces-compatible PoC. """ from typing import List, Dict import json from langchain.chat_models import ChatOpenAI from langchain.schema import HumanMessage, SystemMessage def extract_triples_with_llm(text: str, max_triples: int = 6, model_name: str = "gpt-3.5-turbo") -> List[Dict]: """Extract triples from text using a Chat LLM. Returns parsed JSON triples. Note: requires OPENAI_API_KEY in env for ChatOpenAI to work. """ prompt = ( "You are an assistant that extracts factual triples from a short text.\n" "Return a JSON array where each element is an object with keys: subject, predicate, object, sentence, confidence.\n" "Be concise and only return JSON. Confidence should be a float between 0.0 and 1.0.\n" f"Limit results to at most {max_triples} triples.\n\n" "Text:\n<<>>\n" + text + "\n<<>>\n" ) # system message to instruct format strictly system = SystemMessage(content="You output only JSON arrays. Do not add any extra text.") human = HumanMessage(content=prompt) llm = ChatOpenAI(model_name=model_name, temperature=0.0) resp = llm([system, human]) raw = resp.content.strip() # Attempt to find JSON in the output try: data = json.loads(raw) except Exception: # try to find first JSON substring start = raw.find("[") end = raw.rfind("]") if start != -1 and end != -1: try: data = json.loads(raw[start:end+1]) except Exception: data = [] else: data = [] cleaned: List[Dict] = [] for item in data: if not isinstance(item, dict): continue subj = item.get("subject") or item.get("s") pred = item.get("predicate") or item.get("p") obj = item.get("object") or item.get("o") sent = item.get("sentence") or "" conf = item.get("confidence") try: conf = float(conf) if conf is not None else 0.5 except Exception: conf = 0.5 if subj and pred and obj: cleaned.append({ "subject": str(subj), "predicate": str(pred), "object": str(obj), "sentence": str(sent), "confidence": conf, }) return cleaned