Komalpreet Kaur committed on
Commit
4bbe74d
·
unverified ·
1 Parent(s): b8fda7c

fix: bulletproof node validation - block ALL sentences and filler text from graph

Browse files
Files changed (1) hide show
  1. app/services/neocortex.py +34 -3
app/services/neocortex.py CHANGED
@@ -23,13 +23,44 @@ def _clean_text(text: str) -> str:
23
 
24
 
25
  def _is_valid_node(name: str) -> bool:
26
- """Check if a node name is a clean, short concept — not junk."""
 
 
 
27
  if not name or name in BLOCKED_NODES:
28
  return False
29
- if len(name) > 40: # Nodes must be short concepts, not sentences
 
 
30
  return False
31
- if len(name.split()) > 4: # Max 4 words
 
 
32
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  return True
34
 
35
 
 
23
 
24
 
25
  def _is_valid_node(name: str) -> bool:
26
+ """
27
+ STRICT validation: only allow clean, short concept names as graph nodes.
28
+ Blocks sentences, conversational text, and anything that isn't a real concept.
29
+ """
30
  if not name or name in BLOCKED_NODES:
31
  return False
32
+
33
+ # Hard length limits — concepts are SHORT
34
+ if len(name) > 30 or len(name.split()) > 3:
35
  return False
36
+
37
+ # Block anything with sentence punctuation (periods, question marks, exclamation, commas)
38
+ if re.search(r'[.!?,;:\'"()]', name):
39
  return False
40
+
41
+ # Block anything that looks like a sentence/phrase (contains common filler words)
42
+ FILLER_WORDS = {
43
+ "THE", "A", "AN", "IS", "ARE", "WAS", "WERE", "BE", "BEEN",
44
+ "HAVE", "HAS", "HAD", "DO", "DOES", "DID", "WILL", "WOULD",
45
+ "COULD", "SHOULD", "MAY", "MIGHT", "SHALL", "CAN",
46
+ "THIS", "THAT", "THESE", "THOSE", "IT", "ITS",
47
+ "VERY", "REALLY", "JUST", "ALSO", "TOO", "SO",
48
+ "HOW", "WHAT", "WHERE", "WHEN", "WHY", "WHO",
49
+ "YOUR", "MY", "OUR", "THEIR", "HIS", "HER",
50
+ "NOT", "BUT", "AND", "OR", "IF", "THEN",
51
+ "THERE", "HERE", "NICE", "MEET", "GOING",
52
+ "ABOUT", "WITH", "FROM", "INTO", "OVER",
53
+ }
54
+ words = set(name.split())
55
+ # If more than half the words are filler, it's a sentence not a concept
56
+ filler_count = len(words & FILLER_WORDS)
57
+ if filler_count >= 2 or (len(words) == 1 and name in FILLER_WORDS):
58
+ return False
59
+
60
+ # Must contain at least one letter
61
+ if not re.search(r'[A-Z]', name):
62
+ return False
63
+
64
  return True
65
 
66