Komalpreet Kaur committed on
Commit
209f9ba
·
unverified ·
1 Parent(s): 7ab617e

refactor: child-brain knowledge extraction - simple concept nodes only

Browse files
Files changed (1) hide show
  1. app/services/neocortex.py +82 -49
app/services/neocortex.py CHANGED
@@ -1,13 +1,55 @@
1
  import json
 
2
  from langchain_groq import ChatGroq
3
  from langchain_core.messages import HumanMessage
4
  from app.core.config import settings
5
  from app.db.neo4j_driver import neo4j_db
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
8
  """
9
- The Neocortex extraction.
10
- Takes plain text, finds logical triples, and stores them in Neo4j.
 
 
 
 
 
11
  """
12
  if not neo4j_db.driver:
13
  print("Knowledge Graph disabled (No DB connection).")
@@ -16,82 +58,73 @@ def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
16
  api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
17
  llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
18
 
19
- # Strip conversational prefixes so the LLM sees clean text, not chat format.
20
- import re as _re
21
- clean_text = _re.sub(r'^(User|Soma|Assistant|AI|Human):\s*', '', text, flags=_re.MULTILINE).strip()
22
-
23
- # Skip extraction for very short inputs (e.g. just a name) — not enough
24
- # content to contain meaningful relationships.
25
- if len(clean_text.split()) < 5:
26
- print(f"Neocortex: Input too short for extraction ({len(clean_text.split())} words), skipping.")
27
  return 0
28
 
29
- prompt = f"""Extract real-world knowledge entities and their factual relationships from the text below.
 
30
 
31
- You must extract ONLY concrete knowledge β€” people, places, organizations, concepts, skills, events, and their relationships.
32
 
33
- STRICT RULES:
34
- 1. DO NOT create nodes for conversational actors (e.g. "USER", "SOMA", "AI", "ASSISTANT", "BOT").
35
- 2. DO NOT create relationships about who said what, who asked whom, or who responded to whom.
36
- 3. ONLY extract factual, real-world knowledge that someone would put in an encyclopedia or knowledge base.
37
- 4. Entity names must be SHORT (1-3 words max), CAPITALIZED, and represent real concepts β€” NOT sentences or phrases.
38
- 5. If the text is just casual chat with no real knowledge content, return an empty array: []
39
 
40
- Text:
41
- {clean_text}
 
 
 
 
42
 
43
- Return ONLY a valid JSON array: [{{"subject": "ENTITY", "relation": "RELATION", "object": "ENTITY"}}]
44
- If no real-world knowledge exists, return: []"""
45
 
46
- # Nodes to block — meta-conversational entities that pollute the graph
47
- BLOCKED_NODES = {
48
- "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
49
- "CHATBOT", "NEURAL CORE", "COGNITIVE CONSOLE", "BRAIN",
50
- }
51
 
52
  try:
53
  response = llm.invoke([HumanMessage(content=prompt)])
54
  content = response.content.strip()
55
 
56
- # Use regex to find the JSON array in case the LLM added conversational text
57
- import re
58
  match = re.search(r'\[.*\]', content, re.DOTALL)
59
  if not match:
60
- print("No JSON array found in LLM response.")
61
  return 0
62
 
63
- json_str = match.group(0)
64
- triples = json.loads(json_str)
65
  stored_count = 0
66
 
67
  for t in triples:
68
  subj = str(t.get("subject", "")).strip().upper()
69
- rel = str(t.get("relation", "")).strip().upper()
70
- obj = str(t.get("object", "")).strip().upper()
71
 
72
- # Neo4j relation names can't have spaces or special non-alphanumeric chars
73
- rel = rel.replace(" ", "_").replace("-", "_")
74
-
75
- # Block meta-conversational nodes and overly long node names
76
- if subj in BLOCKED_NODES or obj in BLOCKED_NODES:
77
  continue
78
- if len(subj) > 50 or len(obj) > 50: # Node names shouldn't be sentences
79
  continue
80
 
81
- if subj and rel and obj:
82
- cypher = f"""
83
- MERGE (s:Entity {{name: $subject, user_id: $user_id}})
84
- MERGE (o:Entity {{name: $object, user_id: $user_id}})
85
- MERGE (s)-[r:`{rel}`]->(o)
86
- """
87
- neo4j_db.query(cypher, {"subject": subj, "object": obj, "user_id": user_id})
88
- stored_count += 1
89
 
90
  return stored_count
91
  except Exception as e:
92
- print(f"Error in Neocortex extraction: {e}")
93
  return 0
94
 
 
95
  def retrieve_graph_context(query: str, user_id: str = "default_user"):
96
  """
97
  Search the Knowledge Graph for entities mentioned in the query.
 
1
  import json
2
+ import re
3
  from langchain_groq import ChatGroq
4
  from langchain_core.messages import HumanMessage
5
  from app.core.config import settings
6
  from app.db.neo4j_driver import neo4j_db
7
 
8
+
9
+ # ── Blocked meta-nodes that should never become graph entities ──
10
+ BLOCKED_NODES = {
11
+ "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
12
+ "CHATBOT", "NEURAL CORE", "COGNITIVE CONSOLE", "BRAIN",
13
+ "QUESTION", "ANSWER", "RESPONSE", "MESSAGE", "CHAT",
14
+ "CONVERSATION", "HELLO", "HI", "HEY", "THANKS", "THANK YOU",
15
+ "YES", "NO", "OK", "OKAY",
16
+ }
17
+
18
+
19
+ def _clean_text(text: str) -> str:
20
+ """Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
21
+ cleaned = re.sub(r'^(User|Soma|Assistant|AI|Human):\s*', '', text, flags=re.MULTILINE)
22
+ return cleaned.strip()
23
+
24
+
25
+ def _is_valid_node(name: str) -> bool:
26
+ """Check if a node name is a clean, short concept — not junk."""
27
+ if not name or name in BLOCKED_NODES:
28
+ return False
29
+ if len(name) > 40: # Nodes must be short concepts, not sentences
30
+ return False
31
+ if len(name.split()) > 4: # Max 4 words
32
+ return False
33
+ return True
34
+
35
+
36
+ def _sanitize_relation(rel: str) -> str:
37
+ """Clean a relation name for Neo4j compatibility."""
38
+ rel = rel.upper().strip()
39
+ rel = re.sub(r'[^A-Z0-9_]', '_', rel) # Only alphanumeric + underscore
40
+ rel = re.sub(r'_+', '_', rel).strip('_') # Collapse multiple underscores
41
+ return rel or "RELATED_TO"
42
+
43
+
44
  def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
45
  """
46
+ Child-brain knowledge extraction.
47
+
48
+ Reads a conversation and extracts simple, clean concept associations —
49
+ the way a child's brain naturally builds connections between ideas.
50
+
51
+ "I play cricket on Sundays" → KOMAL --PLAYS--> CRICKET, CRICKET --PLAYED_ON--> SUNDAY
52
+ "My dog Baxter loves the park" → BAXTER --IS_A--> DOG, BAXTER --LOVES--> PARK
53
  """
54
  if not neo4j_db.driver:
55
  print("Knowledge Graph disabled (No DB connection).")
 
58
  api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
59
  llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
60
 
61
+ clean = _clean_text(text)
62
+
63
+ # Need at least a few words to extract anything meaningful
64
+ if len(clean.split()) < 5:
65
+ print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
 
 
 
66
  return 0
67
 
68
+ # The user's name is the identity anchor — capitalize for consistency
69
+ owner = user_id.upper()
70
 
71
+ prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
72
 
73
+ Think like a child drawing a mind-map:
74
+ - "{owner}" is the person speaking. If they say "I like X" β†’ {owner} --LIKES--> X
75
+ - Extract only SHORT concept names (1-3 words). Never use full sentences as names.
76
+ - Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
 
 
77
 
78
+ RULES:
79
+ 1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
80
+ 2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
81
+ 3. "I" or "my" in the text refers to "{owner}" β€” always use "{owner}" as the node name for the speaker.
82
+ 4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
83
+ 5. If the text is just greetings or small talk with zero factual content, return: []
84
 
85
+ Text:
86
+ {clean}
87
 
88
+ Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
89
+ No facts? Return: []"""
 
 
 
90
 
91
  try:
92
  response = llm.invoke([HumanMessage(content=prompt)])
93
  content = response.content.strip()
94
 
 
 
95
  match = re.search(r'\[.*\]', content, re.DOTALL)
96
  if not match:
97
+ print("Neocortex: No JSON array in LLM response.")
98
  return 0
99
 
100
+ triples = json.loads(match.group(0))
 
101
  stored_count = 0
102
 
103
  for t in triples:
104
  subj = str(t.get("subject", "")).strip().upper()
105
+ rel = _sanitize_relation(str(t.get("relation", "")))
106
+ obj = str(t.get("object", "")).strip().upper()
107
 
108
+ # Validate both nodes
109
+ if not _is_valid_node(subj) or not _is_valid_node(obj):
 
 
 
110
  continue
111
+ if subj == obj: # Self-loops are meaningless
112
  continue
113
 
114
+ cypher = f"""
115
+ MERGE (s:Entity {{name: $subject, user_id: $user_id}})
116
+ MERGE (o:Entity {{name: $object, user_id: $user_id}})
117
+ MERGE (s)-[r:`{rel}`]->(o)
118
+ """
119
+ neo4j_db.query(cypher, {"subject": subj, "object": obj, "user_id": user_id})
120
+ stored_count += 1
 
121
 
122
  return stored_count
123
  except Exception as e:
124
+ print(f"Neocortex extraction error: {e}")
125
  return 0
126
 
127
+
128
  def retrieve_graph_context(query: str, user_id: str = "default_user"):
129
  """
130
  Search the Knowledge Graph for entities mentioned in the query.