Komalpreet Kaur committed on
Commit
7c1bea9
·
unverified ·
1 Parent(s): dc97067

feat: use robust Pydantic structured output for 100% stable graph relationship extraction

Browse files
app/services/neocortex.py CHANGED
@@ -1,11 +1,12 @@
1
  import json
2
  import re
 
 
3
  from langchain_groq import ChatGroq
4
  from langchain_core.messages import HumanMessage
5
  from app.core.config import settings
6
  from app.db.neo4j_driver import neo4j_db
7
 
8
-
9
  # ── Blocked meta-nodes that should never become graph entities ──
10
  BLOCKED_NODES = {
11
  "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
@@ -15,6 +16,15 @@ BLOCKED_NODES = {
15
  "YES", "NO", "OK", "OKAY",
16
  }
17
 
 
 
 
 
 
 
 
 
 
18
 
19
  def _clean_text(text: str) -> str:
20
  """Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
@@ -74,13 +84,10 @@ def _sanitize_relation(rel: str) -> str:
74
 
75
  def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
76
  """
77
- Child-brain knowledge extraction.
78
 
79
  Reads a conversation and extracts simple, clean concept associations —
80
  the way a child's brain naturally builds connections between ideas.
81
-
82
- "I play cricket on Sundays" → KOMAL --PLAYS--> CRICKET, CRICKET --PLAYED_ON--> SUNDAY
83
- "My dog Baxter loves the park" → BAXTER --IS_A--> DOG, BAXTER --LOVES--> PARK
84
  """
85
  if not neo4j_db.driver:
86
  print("Knowledge Graph disabled (No DB connection).")
@@ -96,9 +103,11 @@ def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
96
  print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
97
  return 0
98
 
99
- # The user's name is the identity anchor — capitalize for consistency
100
  owner = user_id.upper()
101
 
 
 
 
102
  prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
103
 
104
  Think like a child drawing a mind-map:
@@ -111,30 +120,23 @@ RULES:
111
  2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
112
  3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
113
  4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
114
- 5. If the text is just greetings or small talk with zero factual content, return: []
115
 
116
  Text:
117
- {clean}
118
-
119
- Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
120
- No facts? Return: []"""
121
 
122
  try:
123
- response = llm.invoke([HumanMessage(content=prompt)])
124
- content = response.content.strip()
125
-
126
- match = re.search(r'\[.*\]', content, re.DOTALL)
127
- if not match:
128
- print("Neocortex: No JSON array in LLM response.")
129
  return 0
130
 
131
- triples = json.loads(match.group(0))
132
  stored_count = 0
133
 
134
- for t in triples:
135
- subj = str(t.get("subject", "")).strip().upper()
136
- rel = _sanitize_relation(str(t.get("relation", "")))
137
- obj = str(t.get("object", "")).strip().upper()
138
 
139
  # Validate both nodes
140
  if not _is_valid_node(subj) or not _is_valid_node(obj):
@@ -164,7 +166,6 @@ def retrieve_graph_context(query: str, user_id: str = "default_user"):
164
  if not neo4j_db.driver:
165
  return [], []
166
 
167
- # Naive keyword matching: if any node name is in the query, pull its connections.
168
  cypher = """
169
  MATCH (n:Entity)-[r]->(m:Entity)
170
  WHERE (n.user_id = $user_id)
 
1
  import json
2
  import re
3
+ from typing import List
4
+ from pydantic import BaseModel, Field
5
  from langchain_groq import ChatGroq
6
  from langchain_core.messages import HumanMessage
7
  from app.core.config import settings
8
  from app.db.neo4j_driver import neo4j_db
9
 
 
10
  # ── Blocked meta-nodes that should never become graph entities ──
11
  BLOCKED_NODES = {
12
  "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
 
16
  "YES", "NO", "OK", "OKAY",
17
  }
18
 
19
+ # ── Pydantic Models for Structured LLM Output ──
20
+ class RelationshipTriple(BaseModel):
21
+ subject: str = Field(description="The subject entity (1-3 words, short CAPITALIZED concept, e.g. KOMAL, BAXTER, CRICKET)")
22
+ relation: str = Field(description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS, HAS, OWNS")
23
+ object: str = Field(description="The object entity (1-3 words, short CAPITALIZED concept, e.g. DELHI, DOG, CRICKET)")
24
+
25
+ class KnowledgeGraphExtraction(BaseModel):
26
+ triples: List[RelationshipTriple] = Field(description="List of simple extracted concept relationships")
27
+
28
 
29
  def _clean_text(text: str) -> str:
30
  """Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
 
84
 
85
  def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
86
  """
87
+ Child-brain knowledge extraction with 100% structurally guaranteed JSON output.
88
 
89
  Reads a conversation and extracts simple, clean concept associations —
90
  the way a child's brain naturally builds connections between ideas.
 
 
 
91
  """
92
  if not neo4j_db.driver:
93
  print("Knowledge Graph disabled (No DB connection).")
 
103
  print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
104
  return 0
105
 
 
106
  owner = user_id.upper()
107
 
108
+ # Bind the structured Pydantic model to the Groq LLM
109
+ structured_llm = llm.with_structured_output(KnowledgeGraphExtraction)
110
+
111
  prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
112
 
113
  Think like a child drawing a mind-map:
 
120
  2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
121
  3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
122
  4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
123
+ 5. If the text is just greetings or small talk with zero factual content, return an empty triples list.
124
 
125
  Text:
126
+ {clean}"""
 
 
 
127
 
128
  try:
129
+ result = structured_llm.invoke([HumanMessage(content=prompt)])
130
+ if not result or not result.triples:
131
+ print("Neocortex: No triples extracted.")
 
 
 
132
  return 0
133
 
 
134
  stored_count = 0
135
 
136
+ for t in result.triples:
137
+ subj = str(t.subject).strip().upper()
138
+ rel = _sanitize_relation(str(t.relation))
139
+ obj = str(t.object).strip().upper()
140
 
141
  # Validate both nodes
142
  if not _is_valid_node(subj) or not _is_valid_node(obj):
 
166
  if not neo4j_db.driver:
167
  return [], []
168
 
 
169
  cypher = """
170
  MATCH (n:Entity)-[r]->(m:Entity)
171
  WHERE (n.user_id = $user_id)
scratch/inspect_llm_output.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import json
4
+ import re
5
+
6
+ # Add project root to path
7
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
+
9
+ from langchain_groq import ChatGroq
10
+ from langchain_core.messages import HumanMessage
11
+ from app.core.config import settings
12
+
13
+ def extract_and_print(text: str, user_id: str = "default_user"):
14
+ api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
15
+ llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
16
+
17
+ owner = user_id.upper()
18
+ prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
19
+
20
+ Think like a child drawing a mind-map:
21
+ - "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
22
+ - Extract only SHORT concept names (1-3 words). Never use full sentences as names.
23
+ - Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
24
+
25
+ RULES:
26
+ 1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
27
+ 2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
28
+ 3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
29
+ 4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
30
+ 5. If the text is just greetings or small talk with zero factual content, return: []
31
+
32
+ Text:
33
+ {text}
34
+
35
+ Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
36
+ No facts? Return: []"""
37
+
38
+ print(f"--- Prompt sent to LLM ---")
39
+ response = llm.invoke([HumanMessage(content=prompt)])
40
+ content = response.content.strip()
41
+ print(f"--- LLM Output Content ---")
42
+ print(content)
43
+ print(f"--------------------------")
44
+
45
+ extract_and_print("My dog Baxter likes chasing tennis balls in Delhi", "komal")
scratch/test_json_failures.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import json
4
+ import re
5
+ from langchain_groq import ChatGroq
6
+ from langchain_core.messages import HumanMessage
7
+ from app.core.config import settings
8
+
9
+ def extract_and_inspect():
10
+ api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
11
+ llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
12
+
13
+ owner = "KOMAL"
14
+ inp = "My dog Baxter likes chasing tennis balls in Delhi"
15
+ prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
16
+
17
+ Think like a child drawing a mind-map:
18
+ - "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
19
+ - Extract only SHORT concept names (1-3 words). Never use full sentences as names.
20
+ - Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
21
+
22
+ RULES:
23
+ 1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
24
+ 2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
25
+ 3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
26
+ 4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
27
+ 5. If the text is just greetings or small talk with zero factual content, return: []
28
+
29
+ Text:
30
+ {inp}
31
+
32
+ Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
33
+ No facts? Return: []"""
34
+
35
+ # We do 5 calls to see if it ever generates invalid JSON
36
+ for i in range(5):
37
+ response = llm.invoke([HumanMessage(content=prompt)])
38
+ content = response.content.strip()
39
+ print(f"\n--- Run {i+1} Output ---")
40
+ print(content)
41
+
42
+ match = re.search(r'\[.*\]', content, re.DOTALL)
43
+ if match:
44
+ json_str = match.group(0)
45
+ try:
46
+ json.loads(json_str)
47
+ print("Valid JSON: Yes")
48
+ except Exception as e:
49
+ print(f"Valid JSON: No (Error: {e})")
50
+ print(f"Extracted string: {json_str!r}")
51
+
52
+ extract_and_inspect()
scratch/test_real_extraction.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import json
4
+ import asyncio
5
+
6
+ # Add project root to path
7
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
+
9
+ from app.services.neocortex import extract_and_store_knowledge
10
+ from app.db.neo4j_driver import neo4j_db
11
+
12
+ async def main():
13
+ print("Testing neocortex on real DB and Groq connection...")
14
+
15
+ # Check driver
16
+ if not neo4j_db.driver:
17
+ print("Neo4j driver is not connected. Attempting connect...")
18
+ from app.core.config import settings
19
+ print(f"Connecting to {settings.NEO4J_URI}...")
20
+ neo4j_db.connect(settings.NEO4J_URI, settings.NEO4J_USER, settings.NEO4J_PASSWORD)
21
+
22
+ test_inputs = [
23
+ "I love coding in Python and playing cricket on Sundays",
24
+ "My dog Baxter likes chasing tennis balls in Delhi",
25
+ ]
26
+
27
+ for inp in test_inputs:
28
+ print(f"\n========================================\nInput: '{inp}'")
29
+ try:
30
+ triples = extract_and_store_knowledge(inp, "komal")
31
+ print(f"Result: extracted {triples} relationships successfully.")
32
+ except Exception as e:
33
+ print(f"Exception during execution: {e}")
34
+
35
+ if __name__ == "__main__":
36
+ asyncio.run(main())
scratch/test_structured_output.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ # Add project root to path
5
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
+
7
+ from pydantic import BaseModel, Field
8
+ from typing import List
9
+ from langchain_groq import ChatGroq
10
+ from app.core.config import settings
11
+
12
+ class RelationshipTriple(BaseModel):
13
+ subject: str = Field(description="The subject entity (1-3 words, UPPERCASE concept)")
14
+ relation: str = Field(description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS")
15
+ object: str = Field(description="The object entity (1-3 words, UPPERCASE concept)")
16
+
17
+ class KnowledgeGraphExtraction(BaseModel):
18
+ triples: List[RelationshipTriple] = Field(description="List of extracted concept relationships")
19
+
20
+ def test_structured_output():
21
+ api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
22
+ llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
23
+
24
+ try:
25
+ structured_llm = llm.with_structured_output(KnowledgeGraphExtraction)
26
+ print("Success: ChatGroq.with_structured_output is fully supported!")
27
+
28
+ # Test it on a simple prompt
29
+ result = structured_llm.invoke("My dog Baxter likes chasing tennis balls in Delhi")
30
+ print("Result object:", result)
31
+ print("Extracted triples:")
32
+ for t in result.triples:
33
+ print(f"- {t.subject} --{t.relation}--> {t.object}")
34
+ except Exception as e:
35
+ print("Failed to run structured output:", e)
36
+
37
+ if __name__ == "__main__":
38
+ test_structured_output()