Spaces:
Running
Running
Komalpreet Kaur committed on
refactor: child-brain knowledge extraction - simple concept nodes only
Browse files- app/services/neocortex.py +82 -49
app/services/neocortex.py
CHANGED
|
@@ -1,13 +1,55 @@
|
|
| 1 |
import json
|
|
|
|
| 2 |
from langchain_groq import ChatGroq
|
| 3 |
from langchain_core.messages import HumanMessage
|
| 4 |
from app.core.config import settings
|
| 5 |
from app.db.neo4j_driver import neo4j_db
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
|
| 8 |
"""
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"""
|
| 12 |
if not neo4j_db.driver:
|
| 13 |
print("Knowledge Graph disabled (No DB connection).")
|
|
@@ -16,82 +58,73 @@ def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
|
|
| 16 |
api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
|
| 17 |
llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# content to contain meaningful relationships.
|
| 25 |
-
if len(clean_text.split()) < 5:
|
| 26 |
-
print(f"Neocortex: Input too short for extraction ({len(clean_text.split())} words), skipping.")
|
| 27 |
return 0
|
| 28 |
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
4. Entity names must be SHORT (1-3 words max), CAPITALIZED, and represent real concepts β NOT sentences or phrases.
|
| 38 |
-
5. If the text is just casual chat with no real knowledge content, return an empty array: []
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
"USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
|
| 49 |
-
"CHATBOT", "NEURAL CORE", "COGNITIVE CONSOLE", "BRAIN",
|
| 50 |
-
}
|
| 51 |
|
| 52 |
try:
|
| 53 |
response = llm.invoke([HumanMessage(content=prompt)])
|
| 54 |
content = response.content.strip()
|
| 55 |
|
| 56 |
-
# Use regex to find the JSON array in case the LLM added conversational text
|
| 57 |
-
import re
|
| 58 |
match = re.search(r'\[.*\]', content, re.DOTALL)
|
| 59 |
if not match:
|
| 60 |
-
print("No JSON array
|
| 61 |
return 0
|
| 62 |
|
| 63 |
-
|
| 64 |
-
triples = json.loads(json_str)
|
| 65 |
stored_count = 0
|
| 66 |
|
| 67 |
for t in triples:
|
| 68 |
subj = str(t.get("subject", "")).strip().upper()
|
| 69 |
-
rel
|
| 70 |
-
obj
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
# Block meta-conversational nodes and overly long node names
|
| 76 |
-
if subj in BLOCKED_NODES or obj in BLOCKED_NODES:
|
| 77 |
continue
|
| 78 |
-
if
|
| 79 |
continue
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
stored_count += 1
|
| 89 |
|
| 90 |
return stored_count
|
| 91 |
except Exception as e:
|
| 92 |
-
print(f"
|
| 93 |
return 0
|
| 94 |
|
|
|
|
| 95 |
def retrieve_graph_context(query: str, user_id: str = "default_user"):
|
| 96 |
"""
|
| 97 |
Search the Knowledge Graph for entities mentioned in the query.
|
|
|
|
| 1 |
import json
|
| 2 |
+
import re
|
| 3 |
from langchain_groq import ChatGroq
|
| 4 |
from langchain_core.messages import HumanMessage
|
| 5 |
from app.core.config import settings
|
| 6 |
from app.db.neo4j_driver import neo4j_db
|
| 7 |
|
| 8 |
+
|
| 9 |
+
# -- Blocked meta-nodes that should never become graph entities --
|
| 10 |
+
# Upper-case names that describe the conversation itself rather than anything
# worth remembering. Entries are grouped by theme; membership is what matters.
BLOCKED_NODES = {
    # Participants and aliases of the assistant
    "USER",
    "SOMA",
    "AI",
    "ASSISTANT",
    "BOT",
    "HUMAN",
    "SYSTEM",
    "CHATBOT",
    "NEURAL CORE",
    "COGNITIVE CONSOLE",
    "BRAIN",
    # Chat mechanics
    "QUESTION",
    "ANSWER",
    "RESPONSE",
    "MESSAGE",
    "CHAT",
    "CONVERSATION",
    # Greetings and thanks
    "HELLO",
    "HI",
    "HEY",
    "THANKS",
    "THANK YOU",
    # Bare acknowledgements
    "YES",
    "NO",
    "OK",
    "OKAY",
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _clean_text(text: str) -> str:
|
| 20 |
+
"""Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
|
| 21 |
+
cleaned = re.sub(r'^(User|Soma|Assistant|AI|Human):\s*', '', text, flags=re.MULTILINE)
|
| 22 |
+
return cleaned.strip()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _is_valid_node(name: str) -> bool:
    """Return True if *name* is usable as a graph node: a short concept label.

    The checks run on a normalised form of the name (surrounding whitespace
    removed, upper-cased), so the verdict does not depend on the caller having
    normalised it first. For names that are already stripped and upper-case
    the result is the same as a plain comparison.

    Args:
        name: Candidate node name.

    Returns:
        False when the name is not a string, is empty or whitespace-only,
        appears in ``BLOCKED_NODES``, is longer than 40 characters, or has
        more than 4 words; True otherwise.
    """
    if not isinstance(name, str):
        return False

    label = name.strip().upper()
    # Whitespace-only names collapse to "" here and are rejected with the
    # genuinely empty ones.
    if not label or label in BLOCKED_NODES:
        return False

    # Nodes are short concepts, not sentences: cap both characters and words.
    return len(label) <= 40 and len(label.split()) <= 4
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _sanitize_relation(rel: str) -> str:
|
| 37 |
+
"""Clean a relation name for Neo4j compatibility."""
|
| 38 |
+
rel = rel.upper().strip()
|
| 39 |
+
rel = re.sub(r'[^A-Z0-9_]', '_', rel) # Only alphanumeric + underscore
|
| 40 |
+
rel = re.sub(r'_+', '_', rel).strip('_') # Collapse multiple underscores
|
| 41 |
+
return rel or "RELATED_TO"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
|
| 45 |
"""
|
| 46 |
+
Child-brain knowledge extraction.
|
| 47 |
+
|
| 48 |
+
Reads a conversation and extracts simple, clean concept associations —
|
| 49 |
+
the way a child's brain naturally builds connections between ideas.
|
| 50 |
+
|
| 51 |
+
"I play cricket on Sundays" → KOMAL --PLAYS--> CRICKET, CRICKET --PLAYED_ON--> SUNDAY
|
| 52 |
+
"My dog Baxter loves the park" → BAXTER --IS_A--> DOG, BAXTER --LOVES--> PARK
|
| 53 |
"""
|
| 54 |
if not neo4j_db.driver:
|
| 55 |
print("Knowledge Graph disabled (No DB connection).")
|
|
|
|
| 58 |
api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
|
| 59 |
llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
|
| 60 |
|
| 61 |
+
clean = _clean_text(text)
|
| 62 |
+
|
| 63 |
+
# Need at least a few words to extract anything meaningful
|
| 64 |
+
if len(clean.split()) < 5:
|
| 65 |
+
print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
|
|
|
|
|
|
|
|
|
|
| 66 |
return 0
|
| 67 |
|
| 68 |
+
# The user's name is the identity anchor — capitalize for consistency
|
| 69 |
+
owner = user_id.upper()
|
| 70 |
|
| 71 |
+
prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
|
| 72 |
|
| 73 |
+
Think like a child drawing a mind-map:
|
| 74 |
+
- "{owner}" is the person speaking. If they say "I like X" β {owner} --LIKES--> X
|
| 75 |
+
- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
|
| 76 |
+
- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
RULES:
|
| 79 |
+
1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
|
| 80 |
+
2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
|
| 81 |
+
3. "I" or "my" in the text refers to "{owner}" β always use "{owner}" as the node name for the speaker.
|
| 82 |
+
4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
|
| 83 |
+
5. If the text is just greetings or small talk with zero factual content, return: []
|
| 84 |
|
| 85 |
+
Text:
|
| 86 |
+
{clean}
|
| 87 |
|
| 88 |
+
Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
|
| 89 |
+
No facts? Return: []"""
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
try:
|
| 92 |
response = llm.invoke([HumanMessage(content=prompt)])
|
| 93 |
content = response.content.strip()
|
| 94 |
|
|
|
|
|
|
|
| 95 |
match = re.search(r'\[.*\]', content, re.DOTALL)
|
| 96 |
if not match:
|
| 97 |
+
print("Neocortex: No JSON array in LLM response.")
|
| 98 |
return 0
|
| 99 |
|
| 100 |
+
triples = json.loads(match.group(0))
|
|
|
|
| 101 |
stored_count = 0
|
| 102 |
|
| 103 |
for t in triples:
|
| 104 |
subj = str(t.get("subject", "")).strip().upper()
|
| 105 |
+
rel = _sanitize_relation(str(t.get("relation", "")))
|
| 106 |
+
obj = str(t.get("object", "")).strip().upper()
|
| 107 |
|
| 108 |
+
# Validate both nodes
|
| 109 |
+
if not _is_valid_node(subj) or not _is_valid_node(obj):
|
|
|
|
|
|
|
|
|
|
| 110 |
continue
|
| 111 |
+
if subj == obj: # Self-loops are meaningless
|
| 112 |
continue
|
| 113 |
|
| 114 |
+
cypher = f"""
|
| 115 |
+
MERGE (s:Entity {{name: $subject, user_id: $user_id}})
|
| 116 |
+
MERGE (o:Entity {{name: $object, user_id: $user_id}})
|
| 117 |
+
MERGE (s)-[r:`{rel}`]->(o)
|
| 118 |
+
"""
|
| 119 |
+
neo4j_db.query(cypher, {"subject": subj, "object": obj, "user_id": user_id})
|
| 120 |
+
stored_count += 1
|
|
|
|
| 121 |
|
| 122 |
return stored_count
|
| 123 |
except Exception as e:
|
| 124 |
+
print(f"Neocortex extraction error: {e}")
|
| 125 |
return 0
|
| 126 |
|
| 127 |
+
|
| 128 |
def retrieve_graph_context(query: str, user_id: str = "default_user"):
|
| 129 |
"""
|
| 130 |
Search the Knowledge Graph for entities mentioned in the query.
|