Spaces:

komalsohal
/

Soma

Running

App Files Files Community

Komalpreet Kaur committed 15 days ago

Commit

7c1bea9

unverified ·

1 Parent(s): dc97067

feat: use robust Pydantic structured output for 100% stable graph relationships extraction

Browse files

Files changed (5) hide show

app/services/neocortex.py +24 -23
scratch/inspect_llm_output.py +45 -0
scratch/test_json_failures.py +52 -0
scratch/test_real_extraction.py +36 -0
scratch/test_structured_output.py +38 -0

app/services/neocortex.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import json
 import re
 from langchain_groq import ChatGroq
 from langchain_core.messages import HumanMessage
 from app.core.config import settings
 from app.db.neo4j_driver import neo4j_db
 # ── Blocked meta-nodes that should never become graph entities ──
 BLOCKED_NODES = {
     "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
@@ -15,6 +16,15 @@ BLOCKED_NODES = {
     "YES", "NO", "OK", "OKAY",
 }
 def _clean_text(text: str) -> str:
     """Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
@@ -74,13 +84,10 @@ def _sanitize_relation(rel: str) -> str:
 def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
     """
-    Child-brain knowledge extraction.
     Reads a conversation and extracts simple, clean concept associations —
     the way a child's brain naturally builds connections between ideas.
-    "I play cricket on Sundays" → KOMAL --PLAYS--> CRICKET, CRICKET --PLAYED_ON--> SUNDAY
-    "My dog Baxter loves the park" → BAXTER --IS_A--> DOG, BAXTER --LOVES--> PARK
     """
     if not neo4j_db.driver:
         print("Knowledge Graph disabled (No DB connection).")
@@ -96,9 +103,11 @@ def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
         print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
         return 0
-    # The user's name is the identity anchor — capitalize for consistency
     owner = user_id.upper()
     prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
 Think like a child drawing a mind-map:
@@ -111,30 +120,23 @@ RULES:
 2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
 3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
 4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
-5. If the text is just greetings or small talk with zero factual content, return: []
 Text:
-{clean}
-Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
-No facts? Return: []"""
     try:
-        response = llm.invoke([HumanMessage(content=prompt)])
-        content = response.content.strip()
-        match = re.search(r'\[.*\]', content, re.DOTALL)
-        if not match:
-            print("Neocortex: No JSON array in LLM response.")
             return 0
-        triples = json.loads(match.group(0))
         stored_count = 0
-        for t in triples:
-            subj = str(t.get("subject", "")).strip().upper()
-            rel  = _sanitize_relation(str(t.get("relation", "")))
-            obj  = str(t.get("object", "")).strip().upper()
             # Validate both nodes
             if not _is_valid_node(subj) or not _is_valid_node(obj):
@@ -164,7 +166,6 @@ def retrieve_graph_context(query: str, user_id: str = "default_user"):
     if not neo4j_db.driver:
         return [], []
-    # Naive keyword matching: if any node name is in the query, pull its connections.
     cypher = """
     MATCH (n:Entity)-[r]->(m:Entity)
     WHERE (n.user_id = $user_id)

 import json
 import re
+from typing import List
+from pydantic import BaseModel, Field
 from langchain_groq import ChatGroq
 from langchain_core.messages import HumanMessage
 from app.core.config import settings
 from app.db.neo4j_driver import neo4j_db
 # ── Blocked meta-nodes that should never become graph entities ──
 BLOCKED_NODES = {
     "USER", "SOMA", "AI", "ASSISTANT", "BOT", "HUMAN", "SYSTEM",
     "YES", "NO", "OK", "OKAY",
 }
+# ── Pydantic Models for Structured LLM Output ──
+class RelationshipTriple(BaseModel):
+    subject: str = Field(description="The subject entity (1-3 words, short CAPITALIZED concept, e.g. KOMAL, BAXTER, CRICKET)")
+    relation: str = Field(description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS, HAS, OWNS")
+    object: str = Field(description="The object entity (1-3 words, short CAPITALIZED concept, e.g. DELHI, DOG, CRICKET)")
+class KnowledgeGraphExtraction(BaseModel):
+    triples: List[RelationshipTriple] = Field(description="List of simple extracted concept relationships")
 def _clean_text(text: str) -> str:
     """Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
 def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
     """
+    Child-brain knowledge extraction with 100% structurally guaranteed JSON output.
     Reads a conversation and extracts simple, clean concept associations —
     the way a child's brain naturally builds connections between ideas.
     """
     if not neo4j_db.driver:
         print("Knowledge Graph disabled (No DB connection).")
         print(f"Neocortex: Input too short ({len(clean.split())} words), skipping.")
         return 0
     owner = user_id.upper()
+    # Bind the structured Pydantic model to the Groq LLM
+    structured_llm = llm.with_structured_output(KnowledgeGraphExtraction)
     prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
 Think like a child drawing a mind-map:
 2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
 3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
 4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
+5. If the text is just greetings or small talk with zero factual content, return an empty triples list.
 Text:
+{clean}"""
     try:
+        result = structured_llm.invoke([HumanMessage(content=prompt)])
+        if not result or not result.triples:
+            print("Neocortex: No triples extracted.")
             return 0
         stored_count = 0
+        for t in result.triples:
+            subj = str(t.subject).strip().upper()
+            rel  = _sanitize_relation(str(t.relation))
+            obj  = str(t.object).strip().upper()
             # Validate both nodes
             if not _is_valid_node(subj) or not _is_valid_node(obj):
     if not neo4j_db.driver:
         return [], []
     cypher = """
     MATCH (n:Entity)-[r]->(m:Entity)
     WHERE (n.user_id = $user_id)

scratch/inspect_llm_output.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import sys
+import os
+import json
+import re
+# Add project root to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from langchain_groq import ChatGroq
+from langchain_core.messages import HumanMessage
+from app.core.config import settings
+def extract_and_print(text: str, user_id: str = "default_user") -> None:
+    """Send the free-form JSON extraction prompt to Groq and print the raw reply.
+
+    Debugging aid: nothing is parsed or stored. Both the exact prompt and the
+    model's unprocessed text are printed so the two can be compared by eye.
+
+    Args:
+        text: The sentence(s) to extract relationships from; inserted verbatim
+            into the prompt.
+        user_id: Speaker identity; upper-cased and used as the owner node name
+            inside the prompt.
+    """
+    # Use a placeholder key when settings.GROQ_API_KEY is empty or unset.
+    api_key = settings.GROQ_API_KEY or "dummy_key"
+    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
+    owner = user_id.upper()
+    prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
+Think like a child drawing a mind-map:
+- "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
+- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
+- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
+RULES:
+1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
+2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
+3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
+4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
+5. If the text is just greetings or small talk with zero factual content, return: []
+Text:
+{text}
+Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
+No facts? Return: []"""
+    # Show the prompt under its header; previously the header was printed
+    # with nothing beneath it.
+    print("--- Prompt sent to LLM ---")
+    print(prompt)
+    response = llm.invoke([HumanMessage(content=prompt)])
+    content = response.content.strip()
+    print("--- LLM Output Content ---")
+    print(content)
+    print("--------------------------")
+if __name__ == "__main__":
+    # Guarded so that importing this module does not fire a live Groq request.
+    extract_and_print("My dog Baxter likes chasing tennis balls in Delhi", "komal")

scratch/test_json_failures.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import sys
+import os
+import json
+import re
+# Add project root to path so that `app.*` resolves when this script is run
+# directly from the scratch/ directory.
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from langchain_groq import ChatGroq
+from langchain_core.messages import HumanMessage
+from app.core.config import settings
+def extract_and_inspect() -> None:
+    """Send the same extraction prompt five times and report JSON validity per run.
+
+    For every reply the raw text is printed, the first-'['-to-last-']' span is
+    pulled out with a regex, and that span is fed to ``json.loads``. A run is
+    reported as invalid both when the span fails to parse and when the reply
+    contains no bracketed span at all.
+    """
+    # Use a placeholder key when settings.GROQ_API_KEY is empty or unset.
+    api_key = settings.GROQ_API_KEY or "dummy_key"
+    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
+    owner = "KOMAL"
+    inp = "My dog Baxter likes chasing tennis balls in Delhi"
+    prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.
+Think like a child drawing a mind-map:
+- "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
+- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
+- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs
+RULES:
+1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
+2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
+3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
+4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
+5. If the text is just greetings or small talk with zero factual content, return: []
+Text:
+{inp}
+Return ONLY a JSON array of simple connections: [{{"subject": "NODE", "relation": "VERB", "object": "NODE"}}]
+No facts? Return: []"""
+    # We do 5 calls to see if it ever generates invalid JSON
+    for i in range(5):
+        response = llm.invoke([HumanMessage(content=prompt)])
+        content = response.content.strip()
+        print(f"\n--- Run {i+1} Output ---")
+        print(content)
+        match = re.search(r'\[.*\]', content, re.DOTALL)
+        if not match:
+            # A reply with no array at all is also a failure; report it
+            # instead of skipping the run silently.
+            print("Valid JSON: No (no JSON array found in output)")
+            continue
+        json_str = match.group(0)
+        try:
+            json.loads(json_str)
+        except json.JSONDecodeError as e:
+            print(f"Valid JSON: No (Error: {e})")
+            print(f"Extracted string: {json_str!r}")
+        else:
+            print("Valid JSON: Yes")
+if __name__ == "__main__":
+    # Guarded so that importing this module does not fire five live Groq requests.
+    extract_and_inspect()

scratch/test_real_extraction.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import sys
+import os
+import json
+import asyncio
+# Add project root to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from app.services.neocortex import extract_and_store_knowledge
+from app.db.neo4j_driver import neo4j_db
+async def main():
+    """Run extract_and_store_knowledge on two sample sentences and print each outcome.
+
+    If ``neo4j_db`` has no driver yet, a connection is attempted first using
+    the NEO4J_* values from ``app.core.config.settings``. Each sample is
+    processed independently: an exception from one is printed and does not
+    stop the next.
+    """
+    print("Testing neocortex on real DB and Groq connection...")
+    # Check driver
+    if not neo4j_db.driver:
+        print("Neo4j driver is not connected. Attempting connect...")
+        # Settings are only needed on this branch, hence the local import.
+        from app.core.config import settings
+        uri = settings.NEO4J_URI
+        print(f"Connecting to {uri}...")
+        neo4j_db.connect(uri, settings.NEO4J_USER, settings.NEO4J_PASSWORD)
+    samples = (
+        "I love coding in Python and playing cricket on Sundays",
+        "My dog Baxter likes chasing tennis balls in Delhi",
+    )
+    for sample in samples:
+        print(f"\n========================================\nInput: '{sample}'")
+        try:
+            # The return value is echoed as the number of relationships.
+            stored = extract_and_store_knowledge(sample, "komal")
+            print(f"Result: extracted {stored} relationships successfully.")
+        except Exception as exc:
+            print(f"Exception during execution: {exc}")
+if __name__ == "__main__":
+    asyncio.run(main())

scratch/test_structured_output.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import sys
+import os
+# Add project root to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from pydantic import BaseModel, Field
+from typing import List
+from langchain_groq import ChatGroq
+from app.core.config import settings
+class RelationshipTriple(BaseModel):
+    subject: str = Field(description="The subject entity (1-3 words, UPPERCASE concept)")
+    relation: str = Field(description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS")
+    object: str = Field(description="The object entity (1-3 words, UPPERCASE concept)")
+class KnowledgeGraphExtraction(BaseModel):
+    triples: List[RelationshipTriple] = Field(description="List of extracted concept relationships")
+def test_structured_output() -> None:
+    """Check that ChatGroq can return a KnowledgeGraphExtraction for a sample sentence.
+
+    Binds the Pydantic schema with ``with_structured_output``, invokes the
+    model once, and prints the extracted triples. Any exception raised while
+    binding or invoking is printed rather than propagated.
+    """
+    # Use a placeholder key when settings.GROQ_API_KEY is empty or unset.
+    api_key = settings.GROQ_API_KEY or "dummy_key"
+    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
+    try:
+        structured_llm = llm.with_structured_output(KnowledgeGraphExtraction)
+        # Test it on a simple prompt
+        result = structured_llm.invoke("My dog Baxter likes chasing tennis balls in Delhi")
+    except Exception as e:
+        print("Failed to run structured output:", e)
+        return
+    if result is None:
+        # No parsed object came back; report that explicitly rather than
+        # failing on `result.triples` below.
+        print("Failed to run structured output: no structured result returned")
+        return
+    # Report success only once invoke() has returned a result; binding the
+    # schema alone does not exercise the model.
+    print("Success: ChatGroq.with_structured_output is fully supported!")
+    print("Result object:", result)
+    print("Extracted triples:")
+    for t in result.triples:
+        print(f"- {t.subject} --{t.relation}--> {t.object}")
+if __name__ == "__main__":
+    test_structured_output()