Spaces:

komalsohal
/

Soma

Running

App Files Files Community

Komalpreet Kaur committed 21 days ago

Commit

209f9ba

unverified ·

1 Parent(s): 7ab617e

refactor: child-brain knowledge extraction - simple concept nodes only

Browse files

Files changed (1) hide show

app/services/neocortex.py +82 -49

app/services/neocortex.py CHANGED Viewed

@@ -1,13 +1,55 @@
 import json
 from langchain_groq import ChatGroq
 from langchain_core.messages import HumanMessage
 from app.core.config import settings
 from app.db.neo4j_driver import neo4j_db
 def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
     """
-    The Neocortex extraction.
-    Takes plain text, finds logical triples, and stores them in Neo4j.
     """
     if not neo4j_db.driver:
         print("Knowledge Graph disabled (No DB connection).")
@@ -16,82 +58,73 @@ def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
     api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
     llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
-    # Strip conversational prefixes so the LLM sees clean text, not chat format.
-    import re as _re
-    clean_text = _re.sub(r'^(User|Soma|Assistant|AI|Human):\s*', '', text, flags=_re.MULTILINE).strip()
-    # Skip extraction for very short inputs (e.g. just a name) — not enough
-    # content to contain meaningful relationships.
-    if len(clean_text.split()) < 5:
-        print(f"Neocortex: Input too short for extraction ({len(clean_text.split())} words), skipping.")
         return 0
-    prompt = f"""Extract real-world knowledge entities and their factual relationships from the text below.
-You must extract ONLY concrete knowledge — people, places, organizations, concepts, skills, events, and their relationships.
-STRICT RULES:
-1. DO NOT create nodes for conversational actors (e.g. "USER", "SOMA", "AI", "ASSISTANT", "BOT").
-2. DO NOT create relationships about who said what, who asked whom, or who responded to whom.
-3. ONLY extract factual, real-world knowledge that someone would put in an encyclopedia or knowledge base.
-4. Entity names must be SHORT (1-3 words max), CAPITALIZED, and represent real concepts — NOT sentences or phrases.
-5. If the text is just casual chat with no real knowledge content, return an empty array: []
-Text:
-{clean_text}
-Return ONLY a valid JSON array: [{{"subject": "ENTITY", "relation": "RELATION", "object": "ENTITY"}}]
-If no real-world knowledge exists, return: []"""
-    # Nodes to block — meta-conversational entities that pollute the graph
-    BLOCKED_NODES = {
-        "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
-        "CHATBOT", "NEURAL CORE", "COGNITIVE CONSOLE", "BRAIN",
-    }
     try:
         response = llm.invoke([HumanMessage(content=prompt)])
         content = response.content.strip()
-        # Use regex to find the JSON array in case the LLM added conversational text
-        import re
         match = re.search(r'\[.*\]', content, re.DOTALL)
         if not match:
-            print("No JSON array found in LLM response.")
             return 0
-        json_str = match.group(0)
-        triples = json.loads(json_str)
         stored_count = 0
         for t in triples:
             subj = str(t.get("subject", "")).strip().upper()
-            rel = str(t.get("relation", "")).strip().upper()
-            obj = str(t.get("object", "")).strip().upper()
-            # Neo4j relation names can't have spaces or special non-alphanumeric chars
-            rel = rel.replace(" ", "_").replace("-", "_")
-            # Block meta-conversational nodes and overly long node names
-            if subj in BLOCKED_NODES or obj in BLOCKED_NODES:
                 continue
-            if len(subj) > 50 or len(obj) > 50:  # Node names shouldn't be sentences
                 continue
-            if subj and rel and obj:
-                cypher = f"""
-                MERGE (s:Entity {{name: $subject, user_id: $user_id}})
-                MERGE (o:Entity {{name: $object, user_id: $user_id}})
-                MERGE (s)-[r:`{rel}`]->(o)
-                """
-                neo4j_db.query(cypher, {"subject": subj, "object": obj, "user_id": user_id})
-                stored_count += 1
         return stored_count
     except Exception as e:
-        print(f"Error in Neocortex extraction: {e}")
         return 0
 def retrieve_graph_context(query: str, user_id: str = "default_user"):
     """
     Search the Knowledge Graph for entities mentioned in the query.

 import json
+import re
 from langchain_groq import ChatGroq
 from langchain_core.messages import HumanMessage
 from app.core.config import settings
 from app.db.neo4j_driver import neo4j_db
# ── Blocked meta-nodes that should never become graph entities ──
# Chat roles, UI labels, greetings and acknowledgements. Entries are upper-case
# because candidate node names are upper-cased before the membership test.
# A frozenset keeps this shared constant immutable at runtime.
BLOCKED_NODES = frozenset({
    # Conversational actors / system labels
    "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
    "CHATBOT", "NEURAL CORE", "COGNITIVE CONSOLE", "BRAIN",
    # Words describing the chat itself
    "QUESTION", "ANSWER", "RESPONSE", "MESSAGE", "CHAT",
    "CONVERSATION",
    # Greetings and acknowledgements
    "HELLO", "HI", "HEY", "THANKS", "THANK YOU",
    "YES", "NO", "OK", "OKAY",
})
+def _clean_text(text: str) -> str:
+    """Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
+    cleaned = re.sub(r'^(User|Soma|Assistant|AI|Human):\s*', '', text, flags=re.MULTILINE)
+    return cleaned.strip()
+def _is_valid_node(name: str) -> bool:
+    """Check if a node name is a clean, short concept — not junk."""
+    if not name or name in BLOCKED_NODES:
+        return False
+    if len(name) > 40:      # Nodes must be short concepts, not sentences
+        return False
+    if len(name.split()) > 4:  # Max 4 words
+        return False
+    return True
+def _sanitize_relation(rel: str) -> str:
+    """Clean a relation name for Neo4j compatibility."""
+    rel = rel.upper().strip()
+    rel = re.sub(r'[^A-Z0-9_]', '_', rel)  # Only alphanumeric + underscore
+    rel = re.sub(r'_+', '_', rel).strip('_')  # Collapse multiple underscores
+    return rel or "RELATED_TO"
 def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
     """
+    Child-brain knowledge extraction.
+    Reads a conversation and extracts simple, clean concept associations —
+    the way a child's brain naturally builds connections between ideas.
+    "I play cricket on Sundays" → KOMAL --PLAYS--> CRICKET, CRICKET --PLAYED_ON--> SUNDAY
+    "My dog Baxter loves the park" → BAXTER --IS_A--> DOG, BAXTER --LOVES--> PARK
     """
     if not neo4j_db.driver:
         print("Knowledge Graph disabled (No DB connection).")
     api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
     llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
+    clean = _clean_text(text)
+    # Need at least a few words to extract anything meaningful
+    if len(clean.split()) < 5:
+        print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
         return 0
+    # The user's name is the identity anchor — capitalize for consistency
+    owner = user_id.upper()
+    prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
+Think like a child drawing a mind-map:
+- "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
+- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
+- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
+RULES:
+1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
+2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
+3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
+4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
+5. If the text is just greetings or small talk with zero factual content, return: []
+Text:
+{clean}
+Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
+No facts? Return: []"""
     try:
         response = llm.invoke([HumanMessage(content=prompt)])
         content = response.content.strip()
         match = re.search(r'\[.*\]', content, re.DOTALL)
         if not match:
+            print("Neocortex: No JSON array in LLM response.")
             return 0
+        triples = json.loads(match.group(0))
         stored_count = 0
         for t in triples:
             subj = str(t.get("subject", "")).strip().upper()
+            rel  = _sanitize_relation(str(t.get("relation", "")))
+            obj  = str(t.get("object", "")).strip().upper()
+            # Validate both nodes
+            if not _is_valid_node(subj) or not _is_valid_node(obj):
                 continue
+            if subj == obj:  # Self-loops are meaningless
                 continue
+            cypher = f"""
+            MERGE (s:Entity {{name: $subject, user_id: $user_id}})
+            MERGE (o:Entity {{name: $object, user_id: $user_id}})
+            MERGE (s)-[r:`{rel}`]->(o)
+            """
+            neo4j_db.query(cypher, {"subject": subj, "object": obj, "user_id": user_id})
+            stored_count += 1
         return stored_count
     except Exception as e:
+        print(f"Neocortex extraction error: {e}")
         return 0
 def retrieve_graph_context(query: str, user_id: str = "default_user"):
     """
     Search the Knowledge Graph for entities mentioned in the query.