# Soma / app / services / neocortex.py
# Komalpreet Kaur
# feat: implement responsive, theme-adaptive visitor analytics modal dialog box
# df43d43 unverified
import json
import re
from typing import List
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage
from app.core.config import settings
from app.db.neo4j_driver import neo4j_db
# ── Meta / chat vocabulary that must never be stored as a graph entity ──
# Entries are ALL CAPS; membership is tested with an exact string match.
BLOCKED_NODES = {
    # Conversation participants and product/system names
    "USER",
    "SOMA",
    "AI",
    "ASSISTANT",
    "BOT",
    "HUMAN",
    "SYSTEM",
    "CHATBOT",
    "NEURAL CORE",
    "COGNITIVE CONSOLE",
    "BRAIN",
    # Words describing the chat itself
    "QUESTION",
    "ANSWER",
    "RESPONSE",
    "MESSAGE",
    "CHAT",
    "CONVERSATION",
    # Greetings and pleasantries
    "HELLO",
    "HI",
    "HEY",
    "THANKS",
    "THANK YOU",
    # Bare acknowledgements
    "YES",
    "NO",
    "OK",
    "OKAY",
}
# ── Pydantic Models for Structured LLM Output ──
class RelationshipTriple(BaseModel):
    # One extracted fact, modelled as a directed edge: subject --relation--> object.
    # The Field descriptions are runtime strings attached to the schema; keep
    # their wording in sync with the rules stated in the extraction prompt.
    # NOTE(review): in the code visible in this file the LLM reply is parsed by
    # hand with `json`, so this model does not appear to be used for
    # validation — confirm before relying on it.

    # Source node of the edge.
    subject: str = Field(description="The subject entity (1-3 words, short CAPITALIZED concept, e.g. KOMAL, BAXTER, CRICKET)")
    # Edge label (verb/action).
    relation: str = Field(description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS, HAS, OWNS")
    # Target node of the edge. The field name shadows the `object` builtin
    # inside the class body, but it is the JSON key requested from the LLM.
    object: str = Field(description="The object entity (1-3 words, short CAPITALIZED concept, e.g. DELHI, DOG, CRICKET)")
class KnowledgeGraphExtraction(BaseModel):
    # Top-level container for an extraction result: mirrors the
    # {"triples": [...]} JSON shape that the extraction prompt asks for.

    # Zero or more subject/relation/object facts.
    triples: List[RelationshipTriple] = Field(description="List of simple extracted concept relationships")
def _clean_text(text: str) -> str:
"""Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
cleaned = re.sub(r'^(User|Soma|Assistant|AI|Human):\s*', '', text, flags=re.MULTILINE)
return cleaned.strip()
def _is_valid_node(name: str) -> bool:
"""
STRICT validation: only allow clean, short concept names as graph nodes.
Blocks sentences, conversational text, and anything that isn't a real concept.
"""
if not name or name in BLOCKED_NODES:
return False
# Hard length limits — concepts are SHORT
if len(name) > 30 or len(name.split()) > 3:
return False
# Block anything with sentence punctuation (periods, question marks, exclamation, commas)
if re.search(r'[.!?,;:\'"()]', name):
return False
# Block anything that looks like a sentence/phrase (contains common filler words)
FILLER_WORDS = {
"THE", "A", "AN", "IS", "ARE", "WAS", "WERE", "BE", "BEEN",
"HAVE", "HAS", "HAD", "DO", "DOES", "DID", "WILL", "WOULD",
"COULD", "SHOULD", "MAY", "MIGHT", "SHALL", "CAN",
"THIS", "THAT", "THESE", "THOSE", "IT", "ITS",
"VERY", "REALLY", "JUST", "ALSO", "TOO", "SO",
"HOW", "WHAT", "WHERE", "WHEN", "WHY", "WHO",
"YOUR", "MY", "OUR", "THEIR", "HIS", "HER",
"NOT", "BUT", "AND", "OR", "IF", "THEN",
"THERE", "HERE", "NICE", "MEET", "GOING",
"ABOUT", "WITH", "FROM", "INTO", "OVER",
}
words = set(name.split())
# If more than half the words are filler, it's a sentence not a concept
filler_count = len(words & FILLER_WORDS)
if filler_count >= 2 or (len(words) == 1 and name in FILLER_WORDS):
return False
# Must contain at least one letter
if not re.search(r'[A-Z]', name):
return False
return True
def _sanitize_relation(rel: str) -> str:
"""Clean a relation name for Neo4j compatibility."""
rel = rel.upper().strip()
rel = re.sub(r'[^A-Z0-9_]', '_', rel) # Only alphanumeric + underscore
rel = re.sub(r'_+', '_', rel).strip('_') # Collapse multiple underscores
return rel or "RELATED_TO"
# Cypher template that upserts both endpoint nodes and the edge between them.
# Node names and the user id travel as query parameters; only the
# relationship type is substituted into the query text, so it must already be
# restricted to [A-Z0-9_] (see _sanitize_relation) before formatting.
_MERGE_TRIPLE_CYPHER = """
MERGE (s:Entity {{name: $subject, user_id: $user_id}})
MERGE (o:Entity {{name: $object, user_id: $user_id}})
MERGE (s)-[r:`{rel}`]->(o)
"""


def _parse_triples(content: str):
    """Return the ``triples`` list found in a raw LLM reply, or None if unparseable."""
    start = content.find("{")
    if start == -1:
        print("Neocortex: No JSON block found in LLM response.")
        return None
    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary or a closing code fence after the object does
        # not break parsing the way a greedy "{...}" match would.
        data, _ = json.JSONDecoder().raw_decode(content, start)
    except ValueError as e:
        print(f"Neocortex: Failed to parse JSON block: {e}")
        return None
    triples = data.get("triples", []) if isinstance(data, dict) else []
    # Anything other than a list (string, object, null) is treated as "no facts".
    return triples if isinstance(triples, list) else []


def _triple_field(triple: dict, key: str) -> str:
    """Read one field of a triple as a stripped string; missing or null becomes ''."""
    value = triple.get(key)
    return "" if value is None else str(value).strip()


def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
    """Extract simple concept relationships from *text* and store them in Neo4j.

    The text is stripped of chat prefixes, sent to a Groq-hosted LLM with a
    prompt asking for subject/relation/object triples as JSON, and each triple
    that survives validation is MERGEd into the graph as
    ``(:Entity)-[:RELATION]->(:Entity)`` scoped by ``user_id``.

    The JSON is NOT structurally guaranteed: the reply is free text that is
    parsed on a best-effort basis, and malformed entries are skipped.

    Args:
        text: Conversation text; lines may carry ``User:``/``Soma:`` style prefixes.
        user_id: Owner of the stored nodes. Its upper-cased form is used in the
            prompt as the speaker's node name; the value as given is stored as
            the ``user_id`` property.

    Returns:
        The number of triples written. 0 when the graph is disabled, the input
        has fewer than 3 words, or nothing usable was extracted. If an error
        occurs part-way through, the count of triples already written is
        returned (errors are printed, never raised from the LLM/DB phase).
    """
    if not neo4j_db.driver:
        print("Knowledge Graph disabled (No DB connection).")
        return 0

    clean = _clean_text(text)
    # Need at least 3 words to extract a relationship (e.g., "I like apples").
    # Checked before building the LLM client so trivial inputs cost nothing.
    word_count = len(clean.split())
    if word_count < 3:
        print(f"Neocortex: Input too short ({word_count} words), skipping.")
        return 0

    api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)

    owner = user_id.upper()
    prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
Think like a child drawing a mind-map:
- "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
RULES:
1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
5. If the text is just greetings or small talk with zero factual content, return an empty triples list.
Text:
{clean}
Return the extracted facts ONLY as a valid JSON block in this exact format:
{{
"triples": [
{{"subject": "SUBJECT", "relation": "RELATION", "object": "OBJECT"}}
]
}}
Do not write any other explanation or thoughts outside the JSON block. If there are no facts, return: {{"triples": []}}"""

    stored_count = 0
    try:
        response = llm.invoke([HumanMessage(content=prompt)])
        triples_data = _parse_triples(response.content.strip())
        if triples_data is None:
            return 0
        if not triples_data:
            print("Neocortex: No triples extracted.")
            return 0

        for t in triples_data:
            # The model occasionally emits strings or nulls inside the list;
            # skip them instead of aborting the whole batch.
            if not isinstance(t, dict):
                continue

            subj = _triple_field(t, "subject").upper()
            rel = _sanitize_relation(_triple_field(t, "relation"))
            obj = _triple_field(t, "object").upper()

            # Validate both nodes
            if not _is_valid_node(subj) or not _is_valid_node(obj):
                continue
            if subj == obj:  # Self-loops are meaningless
                continue

            neo4j_db.query(
                _MERGE_TRIPLE_CYPHER.format(rel=rel),
                {"subject": subj, "object": obj, "user_id": user_id},
            )
            stored_count += 1
    except Exception as e:
        # Best-effort boundary: report the failure but still tell the caller
        # how many triples were committed before it happened.
        print(f"Neocortex extraction error: {e}")
    return stored_count
# Selects edges between two of the user's entities where either endpoint's
# name occurs (case-insensitively) as a substring of the query text.
_GRAPH_CONTEXT_CYPHER = """
MATCH (n:Entity)-[r]->(m:Entity)
WHERE (n.user_id = $user_id)
AND (m.user_id = $user_id)
AND (toLower($query) CONTAINS toLower(n.name) OR toLower($query) CONTAINS toLower(m.name))
RETURN n.name AS s, type(r) AS rel, m.name AS o
LIMIT 15
"""


def retrieve_graph_context(query: str, user_id: str = "default_user"):
    """Look up stored facts whose entities are mentioned in *query*.

    Args:
        query: Free-text user query; entity names are matched against it as
            case-insensitive substrings.
        user_id: Only entities carrying this ``user_id`` are considered.

    Returns:
        A ``(context_strings, touched_entities)`` pair: up to 15 facts
        rendered as ``"SUBJECT [RELATION] OBJECT"`` and the de-duplicated
        entity names appearing in them (in no particular order). Both lists
        are empty when the graph is disabled, nothing matches, or the lookup
        fails (the error is printed, not raised).
    """
    if not neo4j_db.driver:
        return [], []

    params = {"query": query, "user_id": user_id}
    try:
        rows = neo4j_db.query(_GRAPH_CONTEXT_CYPHER, params)
        if not rows:
            return [], []

        facts = []
        entities = set()
        for row in rows:
            subject, relation, target = row['s'], row['rel'], row['o']
            facts.append(f"{subject} [{relation}] {target}")
            entities.update((subject, target))
        return facts, list(entities)
    except Exception as e:
        print(f"Error retrieving from Neocortex: {e}")
        return [], []