Spaces:

komalsohal
/

Soma

Running

Komalpreet Kaur

feat: implement responsive, theme-adaptive visitor analytics modal dialog box

df43d43 unverified 13 days ago

8.29 kB

	import json
	import re
	from typing import List
	from pydantic import BaseModel, Field
	from langchain_groq import ChatGroq
	from langchain_core.messages import HumanMessage
	from app.core.config import settings
	from app.db.neo4j_driver import neo4j_db

	# ── Meta-node names that must never be stored as graph entities ──
	# Entries are upper-case; _is_valid_node rejects any name found here.
	BLOCKED_NODES = {
	    # Conversation participants and system/product names
	    "USER",
	    "SOMA",
	    "AI",
	    "ASSISTANT",
	    "BOT",
	    "HUMAN",
	    "SYSTEM",
	    "CHATBOT",
	    "NEURAL CORE",
	    "COGNITIVE CONSOLE",
	    "BRAIN",
	    # Words describing the chat itself
	    "QUESTION",
	    "ANSWER",
	    "RESPONSE",
	    "MESSAGE",
	    "CHAT",
	    "CONVERSATION",
	    # Greetings and courtesies
	    "HELLO",
	    "HI",
	    "HEY",
	    "THANKS",
	    "THANK YOU",
	    # Bare acknowledgements
	    "YES",
	    "NO",
	    "OK",
	    "OKAY",
	}

	# ── Pydantic Models for Structured LLM Output ──
	class RelationshipTriple(BaseModel):
	    # One directed edge of the knowledge graph: subject --relation--> object.
	    # Documented with comments rather than a docstring so the schema that
	    # pydantic derives from this class stays exactly as it was.

	    # Source node of the edge.
	    subject: str = Field(
	        description="The subject entity (1-3 words, short CAPITALIZED concept, e.g. KOMAL, BAXTER, CRICKET)",
	    )
	    # Edge label.
	    relation: str = Field(
	        description="The relationship verb/action, e.g. LIKES, LIVES_IN, PLAYS, HAS, OWNS",
	    )
	    # Target node of the edge.
	    object: str = Field(
	        description="The object entity (1-3 words, short CAPITALIZED concept, e.g. DELHI, DOG, CRICKET)",
	    )

	class KnowledgeGraphExtraction(BaseModel):
	    # Top-level container for an extraction result: a list of
	    # RelationshipTriple items under the single key "triples".
	    triples: List[RelationshipTriple] = Field(
	        description="List of simple extracted concept relationships",
	    )


	def _clean_text(text: str) -> str:
	"""Strip chat-format prefixes so the LLM sees pure content, not 'User: ...'."""
	cleaned = re.sub(r'^(User\|Soma\|Assistant\|AI\|Human):\s*', '', text, flags=re.MULTILINE)
	return cleaned.strip()


	# Common function words; their presence suggests a sentence fragment rather
	# than a concept name. Built once at import time instead of on every call.
	_FILLER_WORDS = frozenset({
	    "THE", "A", "AN", "IS", "ARE", "WAS", "WERE", "BE", "BEEN",
	    "HAVE", "HAS", "HAD", "DO", "DOES", "DID", "WILL", "WOULD",
	    "COULD", "SHOULD", "MAY", "MIGHT", "SHALL", "CAN",
	    "THIS", "THAT", "THESE", "THOSE", "IT", "ITS",
	    "VERY", "REALLY", "JUST", "ALSO", "TOO", "SO",
	    "HOW", "WHAT", "WHERE", "WHEN", "WHY", "WHO",
	    "YOUR", "MY", "OUR", "THEIR", "HIS", "HER",
	    "NOT", "BUT", "AND", "OR", "IF", "THEN",
	    "THERE", "HERE", "NICE", "MEET", "GOING",
	    "ABOUT", "WITH", "FROM", "INTO", "OVER",
	})


	def _is_valid_node(name: str) -> bool:
	    """
	    STRICT validation: only allow clean, short concept names as graph nodes.

	    ``name`` is expected to be already upper-cased; the blocklist, the
	    filler-word list and the final letter check are all upper-case only.

	    Returns False when the name:
	    - is empty or listed in BLOCKED_NODES,
	    - is longer than 30 characters or has more than 3 words,
	    - contains any of ``. ! ? , ; : ' " ( )``,
	    - contains two or more distinct filler words, or consists only of
	      filler words,
	    - contains no letter A-Z.
	    """
	    if not name or name in BLOCKED_NODES:
	        return False

	    # Hard length limits — concepts are SHORT
	    if len(name) > 30 or len(name.split()) > 3:
	        return False

	    # Sentence punctuation, quotes and parentheses never belong in a concept name
	    if re.search(r'[.!?,;:\'"()]', name):
	        return False

	    # Distinct words only: a repeated word counts once.
	    words = set(name.split())
	    filler_count = len(words & _FILLER_WORDS)
	    # One filler next to a real word is tolerated (e.g. "MY MOM"); two or
	    # more distinct fillers, or nothing but filler (including a repeated
	    # filler such as "THE THE"), marks the name as phrase-like.
	    if filler_count >= 2 or words <= _FILLER_WORDS:
	        return False

	    # Must contain at least one (upper-case ASCII) letter
	    return re.search(r'[A-Z]', name) is not None


	def _sanitize_relation(rel: str) -> str:
	"""Clean a relation name for Neo4j compatibility."""
	rel = rel.upper().strip()
	rel = re.sub(r'[^A-Z0-9_]', '_', rel) # Only alphanumeric + underscore
	rel = re.sub(r'_+', '_', rel).strip('_') # Collapse multiple underscores
	return rel or "RELATED_TO"


	def _parse_triples(content: str):
	    """Pull the raw ``triples`` list out of an LLM reply.

	    Returns the list stored under the ``"triples"`` key (possibly empty),
	    or ``None`` when no JSON object could be found or decoded. In the
	    ``None`` case the reason has already been printed.
	    """
	    # Greedy match from the first "{" to the last "}" so that any text the
	    # model wrote around the JSON object is ignored.
	    match = re.search(r'\{.*\}', content, re.DOTALL)
	    if not match:
	        print("Neocortex: No JSON block found in LLM response.")
	        return None

	    try:
	        data = json.loads(match.group(0))
	    except ValueError as e:  # json.JSONDecodeError is a ValueError
	        print(f"Neocortex: Failed to parse JSON block: {e}")
	        return None

	    triples = data.get("triples", []) if isinstance(data, dict) else []
	    # A non-list value (null, string, ...) is treated as "no triples".
	    return triples if isinstance(triples, list) else []


	def extract_and_store_knowledge(text: str, user_id: str = "default_user"):
	    """
	    Extract simple concept relationships from a conversation and store them in Neo4j.

	    The text is cleaned of chat prefixes, sent to the Groq LLM with a prompt
	    asking for JSON triples, and the reply is parsed on a best-effort basis
	    (the JSON object is located with a regex; nothing enforces the schema).
	    Each triple whose subject and object pass ``_is_valid_node`` is merged
	    into the graph as ``(subject)-[RELATION]->(object)``, scoped to ``user_id``.

	    Args:
	        text: Conversation text to mine for facts.
	        user_id: Owner of the stored entities; its upper-cased form is also
	            the node name the prompt tells the model to use for the speaker.

	    Returns:
	        Number of triples written. 0 when the DB is unavailable, the input
	        has fewer than 3 words, the reply contains no usable JSON/triples,
	        or any error occurs during the LLM call or storage.
	    """
	    if not neo4j_db.driver:
	        print("Knowledge Graph disabled (No DB connection).")
	        return 0

	    clean = _clean_text(text)

	    # Need at least 3 words to extract a relationship (e.g., "I like apples")
	    word_count = len(clean.split())
	    if word_count < 3:
	        print(f"Neocortex: Input too short ({word_count} words), skipping.")
	        return 0

	    # Build the client only once there is something worth sending.
	    api_key = settings.GROQ_API_KEY if settings.GROQ_API_KEY else "dummy_key"
	    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)

	    owner = user_id.upper()

	    prompt = f"""You are a child's brain learning about the world. Read the text and pick out SIMPLE facts as connections between concepts.

	Think like a child drawing a mind-map:
	- "{owner}" is the person speaking. If they say "I like X" → {owner} --LIKES--> X
	- Extract only SHORT concept names (1-3 words). Never use full sentences as names.
	- Focus on: people, places, things, hobbies, foods, animals, feelings, skills, jobs

	RULES:
	1. Nodes must be 1-3 word concept names, ALL CAPS. Example: "CRICKET", "DELHI", "MOM", "CODING"
	2. Relations must be simple verbs: LIKES, IS_A, LIVES_IN, PLAYS, WORKS_AT, HAS, KNOWS, STUDIES, etc.
	3. "I" or "my" in the text refers to "{owner}" — always use "{owner}" as the node name for the speaker.
	4. DO NOT create nodes named "USER", "SOMA", "AI", "ASSISTANT", or any chat/bot terms.
	5. If the text is just greetings or small talk with zero factual content, return an empty triples list.

	Text:
	{clean}

	Return the extracted facts ONLY as a valid JSON block in this exact format:
	{{
	"triples": [
	{{"subject": "SUBJECT", "relation": "RELATION", "object": "OBJECT"}}
	]
	}}
	Do not write any other explanation or thoughts outside the JSON block. If there are no facts, return: {{"triples": []}}"""

	    try:
	        response = llm.invoke([HumanMessage(content=prompt)])
	        triples_data = _parse_triples(response.content.strip())

	        if triples_data is None:
	            # Parse failure; already reported by _parse_triples.
	            return 0
	        if not triples_data:
	            print("Neocortex: No triples extracted.")
	            return 0

	        stored_count = 0

	        for t in triples_data:
	            # One malformed entry must not discard the rest of the batch.
	            if not isinstance(t, dict):
	                continue

	            # "or ''" keeps a JSON null from turning into the node "NONE".
	            subj = str(t.get("subject") or "").strip().upper()
	            rel = _sanitize_relation(str(t.get("relation") or ""))
	            obj = str(t.get("object") or "").strip().upper()

	            # Validate both nodes
	            if not _is_valid_node(subj) or not _is_valid_node(obj):
	                continue
	            if subj == obj:  # Self-loops are meaningless
	                continue

	            # Relationship types cannot be query parameters, so the type is
	            # interpolated; _sanitize_relation limits it to [A-Z0-9_].
	            cypher = f"""
	MERGE (s:Entity {{name: $subject, user_id: $user_id}})
	MERGE (o:Entity {{name: $object, user_id: $user_id}})
	MERGE (s)-[r:`{rel}`]->(o)
	"""
	            neo4j_db.query(cypher, {"subject": subj, "object": obj, "user_id": user_id})
	            stored_count += 1

	        return stored_count
	    except Exception as e:
	        # Best-effort: extraction must never break the caller.
	        print(f"Neocortex extraction error: {e}")
	        return 0


	def retrieve_graph_context(query: str, user_id: str = "default_user"):
	    """
	    Search the Knowledge Graph for entities mentioned in the query.

	    Matches every stored relationship, for this ``user_id``, whose subject
	    or object name occurs (case-insensitively) as a substring of ``query``.
	    At most 15 relationships are returned.

	    Returns:
	        ``(context_strings, touched_entities)`` where ``context_strings`` is
	        a list of ``"SUBJECT [RELATION] OBJECT"`` lines and
	        ``touched_entities`` lists each entity name once, in first-seen
	        order. Both are empty when the DB is unavailable, nothing matches,
	        or the lookup fails.
	    """
	    if not neo4j_db.driver:
	        return [], []

	    cypher = """
	MATCH (n:Entity)-[r]->(m:Entity)
	WHERE (n.user_id = $user_id)
	AND (m.user_id = $user_id)
	AND (toLower($query) CONTAINS toLower(n.name) OR toLower($query) CONTAINS toLower(m.name))
	RETURN n.name AS s, type(r) AS rel, m.name AS o
	LIMIT 15
	"""
	    try:
	        results = neo4j_db.query(cypher, {"query": query, "user_id": user_id})
	        if not results:
	            return [], []

	        context = []
	        # dict keys de-duplicate while keeping insertion order, so the
	        # returned list is stable across runs (a set's order is not).
	        touched = {}
	        for res in results:
	            context.append(f"{res['s']} [{res['rel']}] {res['o']}")
	            touched[res['s']] = None
	            touched[res['o']] = None

	        return context, list(touched)
	    except Exception as e:
	        # Best-effort: a failed lookup yields no context rather than an error.
	        print(f"Error retrieving from Neocortex: {e}")
	        return [], []