Spaces:
Running
Running
Komalpreet Kaur committed on
feat: use robust Pydantic structured output for 100% stable graph relationships extraction
Browse files- app/services/neocortex.py +24 -23
- scratch/inspect_llm_output.py +45 -0
- scratch/test_json_failures.py +52 -0
- scratch/test_real_extraction.py +36 -0
- scratch/test_structured_output.py +38 -0
app/services/neocortex.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
import json
|
| 2 |
import re
|
|
|
|
|
|
|
| 3 |
from langchain_groq import ChatGroq
|
| 4 |
from langchain_core.messages import HumanMessage
|
| 5 |
from app.core.config import settings
|
| 6 |
from app.db.neo4j_driver import neo4j_db
|
| 7 |
|
| 8 |
-
|
| 9 |
# ── Blocked meta-nodes that should never become graph entities ──
|
| 10 |
BLOCKED_NODES = {
|
| 11 |
"USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
|
|
@@ -15,6 +16,15 @@ BLOCKED_NODES = {
|
|
| 15 |
"YES", "NO", "OK", "OKAY",
|
| 16 |
}
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def _clean_text(text: str) -> str:
|
| 20 |
"""Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
|
|
@@ -74,13 +84,10 @@ def _sanitize_relation(rel: str) -> str:
|
|
| 74 |
|
| 75 |
def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
|
| 76 |
"""
|
| 77 |
-
Child-brain knowledge extraction.
|
| 78 |
|
| 79 |
Reads a conversation and extracts simple, clean concept associations —
|
| 80 |
the way a child's brain naturally builds connections between ideas.
|
| 81 |
-
|
| 82 |
-
"I play cricket on Sundays" → KOMAL --PLAYS--> CRICKET, CRICKET --PLAYED_ON--> SUNDAY
|
| 83 |
-
"My dog Baxter loves the park" → BAXTER --IS_A--> DOG, BAXTER --LOVES--> PARK
|
| 84 |
"""
|
| 85 |
if not neo4j_db.driver:
|
| 86 |
print("Knowledge Graph disabled (No DB connection).")
|
|
@@ -96,9 +103,11 @@ def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
|
|
| 96 |
print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
|
| 97 |
return 0
|
| 98 |
|
| 99 |
-
# The user's name is the identity anchor — capitalize for consistency
|
| 100 |
owner = user_id.upper()
|
| 101 |
|
|
|
|
|
|
|
|
|
|
| 102 |
prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
|
| 103 |
|
| 104 |
Think like a child drawing a mind-map:
|
|
@@ -111,30 +120,23 @@ RULES:
|
|
| 111 |
2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
|
| 112 |
3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
|
| 113 |
4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
|
| 114 |
-
5. If the text is just greetings or small talk with zero factual content, return
|
| 115 |
|
| 116 |
Text:
|
| 117 |
-
{clean}
|
| 118 |
-
|
| 119 |
-
Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
|
| 120 |
-
No facts? Return: []"""
|
| 121 |
|
| 122 |
try:
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
match = re.search(r'\[.*\]', content, re.DOTALL)
|
| 127 |
-
if not match:
|
| 128 |
-
print("Neocortex: No JSON array in LLM response.")
|
| 129 |
return 0
|
| 130 |
|
| 131 |
-
triples = json.loads(match.group(0))
|
| 132 |
stored_count = 0
|
| 133 |
|
| 134 |
-
for t in triples:
|
| 135 |
-
subj = str(t.
|
| 136 |
-
rel = _sanitize_relation(str(t.
|
| 137 |
-
obj = str(t.
|
| 138 |
|
| 139 |
# Validate both nodes
|
| 140 |
if not _is_valid_node(subj) or not _is_valid_node(obj):
|
|
@@ -164,7 +166,6 @@ def retrieve_graph_context(query: str, user_id: str = "default_user"):
|
|
| 164 |
if not neo4j_db.driver:
|
| 165 |
return [], []
|
| 166 |
|
| 167 |
-
# Naive keyword matching: if any node name is in the query, pull its connections.
|
| 168 |
cypher = """
|
| 169 |
MATCH (n:Entity)-[r]->(m:Entity)
|
| 170 |
WHERE (n.user_id = $user_id)
|
|
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
+
from typing import List
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
from langchain_groq import ChatGroq
|
| 6 |
from langchain_core.messages import HumanMessage
|
| 7 |
from app.core.config import settings
|
| 8 |
from app.db.neo4j_driver import neo4j_db
|
| 9 |
|
|
|
|
| 10 |
# ── Blocked meta-nodes that should never become graph entities ──
|
| 11 |
BLOCKED_NODES = {
|
| 12 |
"USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
|
|
|
|
| 16 |
"YES", "NO", "OK", "OKAY",
|
| 17 |
}
|
| 18 |
|
| 19 |
+
# ── Pydantic Models for Structured LLM Output ──
|
| 20 |
+
class RelationshipTriple(BaseModel):
    # One subject --relation--> object edge in the structured LLM output.
    # Documented with `#` comments rather than a docstring so the schema
    # generated from this model stays exactly as before.

    # Source node of the edge.
    subject: str = Field(
        description="The subject entity (1-3 words, short CAPITALIZED concept, e.g. KOMAL, BAXTER, CRICKET)",
    )
    # Edge label connecting subject to object.
    relation: str = Field(
        description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS, HAS, OWNS",
    )
    # Target node of the edge. The field name shadows the builtin `object`
    # inside this class body only; it is kept because it is part of the schema.
    object: str = Field(
        description="The object entity (1-3 words, short CAPITALIZED concept, e.g. DELHI, DOG, CRICKET)",
    )
|
| 24 |
+
|
| 25 |
+
class KnowledgeGraphExtraction(BaseModel):
    # Top-level container bound to the LLM via `with_structured_output`;
    # an empty `triples` list means nothing was extracted.
    triples: List[RelationshipTriple] = Field(
        description="List of simple extracted concept relationships",
    )
|
| 27 |
+
|
| 28 |
|
| 29 |
def _clean_text(text: str) -> str:
|
| 30 |
"""Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
|
|
|
|
| 84 |
|
| 85 |
def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
|
| 86 |
"""
|
| 87 |
+
Child-brain knowledge extraction with 100% structurally guaranteed JSON output.
|
| 88 |
|
| 89 |
Reads a conversation and extracts simple, clean concept associations —
|
| 90 |
the way a child's brain naturally builds connections between ideas.
|
|
|
|
|
|
|
|
|
|
| 91 |
"""
|
| 92 |
if not neo4j_db.driver:
|
| 93 |
print("Knowledge Graph disabled (No DB connection).")
|
|
|
|
| 103 |
print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
|
| 104 |
return 0
|
| 105 |
|
|
|
|
| 106 |
owner = user_id.upper()
|
| 107 |
|
| 108 |
+
# Bind the structured Pydantic model to the Groq LLM
|
| 109 |
+
structured_llm = llm.with_structured_output(KnowledgeGraphExtraction)
|
| 110 |
+
|
| 111 |
prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
|
| 112 |
|
| 113 |
Think like a child drawing a mind-map:
|
|
|
|
| 120 |
2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
|
| 121 |
3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
|
| 122 |
4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
|
| 123 |
+
5. If the text is just greetings or small talk with zero factual content, return an empty triples list.
|
| 124 |
|
| 125 |
Text:
|
| 126 |
+
{clean}"""
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
try:
|
| 129 |
+
result = structured_llm.invoke([HumanMessage(content=prompt)])
|
| 130 |
+
if not result or not result.triples:
|
| 131 |
+
print("Neocortex: No triples extracted.")
|
|
|
|
|
|
|
|
|
|
| 132 |
return 0
|
| 133 |
|
|
|
|
| 134 |
stored_count = 0
|
| 135 |
|
| 136 |
+
for t in result.triples:
|
| 137 |
+
subj = str(t.subject).strip().upper()
|
| 138 |
+
rel = _sanitize_relation(str(t.relation))
|
| 139 |
+
obj = str(t.object).strip().upper()
|
| 140 |
|
| 141 |
# Validate both nodes
|
| 142 |
if not _is_valid_node(subj) or not _is_valid_node(obj):
|
|
|
|
| 166 |
if not neo4j_db.driver:
|
| 167 |
return [], []
|
| 168 |
|
|
|
|
| 169 |
cypher = """
|
| 170 |
MATCH (n:Entity)-[r]->(m:Entity)
|
| 171 |
WHERE (n.user_id = $user_id)
|
scratch/inspect_llm_output.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
# Add project root to path
|
| 7 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 8 |
+
|
| 9 |
+
from langchain_groq import ChatGroq
|
| 10 |
+
from langchain_core.messages import HumanMessage
|
| 11 |
+
from app.core.config import settings
|
| 12 |
+
|
| 13 |
+
def extract_and_print(text: str, user_id: str = "default_user"):
    """Send the legacy JSON-array extraction prompt to Groq and print the raw reply.

    This is a debugging aid: it shows exactly what goes to the model and what
    comes back, without any parsing or database writes.

    Args:
        text: The utterance to extract concept connections from. It is inserted
            into the prompt verbatim.
        user_id: Name of the speaker; upper-cased and used as the owner node
            name in the prompt.

    Returns:
        None. Everything is written to stdout.
    """
    # Fall back to a placeholder so the client can still be constructed when
    # no key is configured (presumably the request itself then fails — verify).
    api_key = settings.GROQ_API_KEY or "dummy_key"
    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)

    owner = user_id.upper()
    prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.

Think like a child drawing a mind-map:
- "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs

RULES:
1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
5. If the text is just greetings or small talk with zero factual content, return: []

Text:
{text}

Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
No facts? Return: []"""

    # The banner used to be printed with nothing under it; show the prompt so
    # the output can be read against what the model was actually given.
    print("--- Prompt sent to LLM ---")
    print(prompt)

    response = llm.invoke([HumanMessage(content=prompt)])
    content = response.content.strip()
    print("--- LLM Output Content ---")
    print(content)
    print("--------------------------")


# Guarded so that importing this module does not trigger a network call.
if __name__ == "__main__":
    extract_and_print("My dog Baxter likes chasing tennis balls in Delhi", "komal")
|
scratch/test_json_failures.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
from langchain_groq import ChatGroq
|
| 6 |
+
from langchain_core.messages import HumanMessage
|
| 7 |
+
from app.core.config import settings
|
| 8 |
+
|
| 9 |
+
def extract_and_inspect(runs: int = 5):
    """Call the LLM repeatedly and report whether each reply contains valid JSON.

    The legacy free-text prompt is sent `runs` times with a fixed sample
    sentence. For every reply the raw content is printed, then the first
    `[` through last `]` span is parsed with `json.loads` and the outcome is
    reported. A reply with no bracketed span is reported as a failure too,
    since that is one of the failure modes this script exists to detect.

    Args:
        runs: Number of LLM calls to make. Defaults to 5.

    Returns:
        None. Everything is written to stdout.
    """
    # Fall back to a placeholder so the client can still be constructed when
    # no key is configured (presumably the request itself then fails — verify).
    api_key = settings.GROQ_API_KEY or "dummy_key"
    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)

    owner = "KOMAL"
    inp = "My dog Baxter likes chasing tennis balls in Delhi"
    prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.

Think like a child drawing a mind-map:
- "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs

RULES:
1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
5. If the text is just greetings or small talk with zero factual content, return: []

Text:
{inp}

Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
No facts? Return: []"""

    # Greedy match from the first '[' to the last ']' (DOTALL spans newlines);
    # compiled once because it is reused on every run.
    array_pattern = re.compile(r'\[.*\]', re.DOTALL)

    for i in range(runs):
        response = llm.invoke([HumanMessage(content=prompt)])
        content = response.content.strip()
        print(f"\n--- Run {i+1} Output ---")
        print(content)

        match = array_pattern.search(content)
        if not match:
            # Previously this case printed nothing, hiding a real failure.
            print("Valid JSON: No (no JSON array found in response)")
            continue

        json_str = match.group(0)
        try:
            json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Valid JSON: No (Error: {e})")
            print(f"Extracted string: {json_str!r}")
        else:
            print("Valid JSON: Yes")


# Guarded so that importing this module does not trigger network calls.
if __name__ == "__main__":
    extract_and_inspect()
|
scratch/test_real_extraction.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import asyncio
|
| 5 |
+
|
| 6 |
+
# Add project root to path
|
| 7 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 8 |
+
|
| 9 |
+
from app.services.neocortex import extract_and_store_knowledge
|
| 10 |
+
from app.db.neo4j_driver import neo4j_db
|
| 11 |
+
|
| 12 |
+
async def main():
    """Run `extract_and_store_knowledge` on sample sentences against live services.

    Connects the shared Neo4j driver first if it is not already set, then
    feeds each sample through the extractor and prints the value it returns
    or the exception it raised.
    """
    print("Testing neocortex on real DB and Groq connection...")

    # Open the connection ourselves only when the shared driver is missing.
    if not neo4j_db.driver:
        print("Neo4j driver is not connected. Attempting connect...")
        from app.core.config import settings
        print(f"Connecting to {settings.NEO4J_URI}...")
        neo4j_db.connect(
            settings.NEO4J_URI,
            settings.NEO4J_USER,
            settings.NEO4J_PASSWORD,
        )

    samples = (
        "I love coding in Python and playing cricket on Sundays",
        "My dog Baxter likes chasing tennis balls in Delhi",
    )

    for sample in samples:
        print(f"\n========================================\nInput: '{sample}'")
        try:
            stored = extract_and_store_knowledge(sample, "komal")
            print(f"Result: extracted {stored} relationships successfully.")
        except Exception as exc:
            # Keep going so one failing sample does not hide the others.
            print(f"Exception during execution: {exc}")


if __name__ == "__main__":
    asyncio.run(main())
|
scratch/test_structured_output.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Add project root to path
|
| 5 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
from typing import List
|
| 9 |
+
from langchain_groq import ChatGroq
|
| 10 |
+
from app.core.config import settings
|
| 11 |
+
|
| 12 |
+
class RelationshipTriple(BaseModel):
    # One subject --relation--> object edge in the structured LLM output.
    # Documented with `#` comments rather than a docstring so the schema
    # generated from this model stays exactly as before.

    # Source node of the edge.
    subject: str = Field(
        description="The subject entity (1-3 words, UPPERCASE concept)",
    )
    # Edge label connecting subject to object.
    relation: str = Field(
        description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS",
    )
    # Target node of the edge. The field name shadows the builtin `object`
    # inside this class body only; it is kept because it is part of the schema.
    object: str = Field(
        description="The object entity (1-3 words, UPPERCASE concept)",
    )
|
| 16 |
+
|
| 17 |
+
class KnowledgeGraphExtraction(BaseModel):
    # Top-level container passed to `with_structured_output`; holds every
    # triple the model extracted from one input.
    triples: List[RelationshipTriple] = Field(
        description="List of extracted concept relationships",
    )
|
| 19 |
+
|
| 20 |
+
def test_structured_output():
    """Smoke-check that ChatGroq can be bound to the Pydantic extraction schema.

    Binds `KnowledgeGraphExtraction` with `with_structured_output`, invokes the
    bound model on one sample sentence, and prints the parsed triples. Any
    exception raised along the way is printed instead of propagated.
    """
    groq_key = settings.GROQ_API_KEY or "dummy_key"
    chat_model = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_key)

    try:
        extractor = chat_model.with_structured_output(KnowledgeGraphExtraction)
        # NOTE(review): this message is printed after binding only, before any
        # request has been made.
        print("Success: ChatGroq.with_structured_output is fully supported!")

        # Test it on a simple prompt
        extraction = extractor.invoke("My dog Baxter likes chasing tennis balls in Delhi")
        print("Result object:", extraction)
        print("Extracted triples:")
        for triple in extraction.triples:
            print(f"- {triple.subject} --{triple.relation}--> {triple.object}")
    except Exception as exc:
        print("Failed to run structured output:", exc)


if __name__ == "__main__":
    test_structured_output()
|