Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,130 @@ import stanza
|
|
| 4 |
# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
|
| 5 |
# Shared English Stanza pipeline, limited to the processors listed below
# (a fuller processor list is kept commented out just above).
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,pos,lemma,constituency',
    force=True,
    verbose=True,
)
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from utils import *
|
| 8 |
from datasets import load_dataset
|
| 9 |
|
|
|
|
| 4 |
# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
|
| 5 |
# Shared English Stanza pipeline, limited to the processors listed below
# (a fuller processor list is kept commented out just above).
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,pos,lemma,constituency',
    force=True,
    verbose=True,
)
|
| 6 |
|
| 7 |
+
##################################################################
|
| 8 |
+
# @title def analyze_sentence3
|
| 9 |
+
def analyze_sentence3(sentence, existing_rules, debug=False):
    """Parse *sentence* and suggest constituency rules missing from *existing_rules*.

    The sentence is run through the module-level ``nlp`` pipeline and only the
    constituency tree of the first parsed sentence is inspected.  Every
    non-leaf node whose label is not in ``EXCLUDE_NODES`` (defined outside
    this function) and that has at least two children yields a candidate
    pattern ``(label, child_labels)``.  For nodes other than ``'S'`` whose
    last child is a punctuation label, a second pattern without that trailing
    child is also proposed.  Labels that occur at least twice but received no
    specific pattern get a wildcard pattern ``(label, None)``.  Finally,
    specific patterns whose last child label is ``"."`` are discarded.

    Args:
        sentence: Text to analyze.
        existing_rules: Container of already-known patterns; only membership
            tests (``pattern in existing_rules``) are performed on it.
        debug: Accepted for backward compatibility; currently unused.

    Returns:
        A ``(suggestions, suggestions_dict)`` tuple.  ``suggestions`` maps
        each new pattern to an example text taken from the tree;
        ``suggestions_dict`` maps the same patterns to a lambda-stub string.
        Both are empty when nothing is missing or nothing could be parsed.
    """
    print(f"\n🔍 Analyzing: '{sentence}'")
    print("=" * 50)

    doc = nlp(sentence)
    if not doc.sentences:
        # Nothing was parsed (e.g. empty input); indexing sentences[0] would
        # raise IndexError, so report it and return empty results instead.
        print("⚠️ No sentence found to analyze.")
        return {}, {}

    tree = doc.sentences[0].constituency
    suggestions = {}

    punctuation_labels = frozenset([
        '.', ',', ':', ';', '!', '?', '\'', '"', '`', '-', '--', '#', '$',
        '(', ')', '[', ']', '{', '}',
    ])

    def is_punctuation_label(label):
        """Return True if *label* is one of the punctuation labels."""
        return label in punctuation_labels

    def has_at_least_two_children(child_labels):
        """Return True if *child_labels* holds at least two non-None labels."""
        if child_labels is None:
            return False
        return sum(label is not None for label in child_labels) >= 2

    def _suggest(pattern, node):
        """Record *pattern* with the node's text unless it is already known."""
        if pattern not in existing_rules and pattern not in suggestions:
            suggestions[pattern] = ' '.join(node.leaf_labels())

    def _traverse(node):
        """Walk the tree and collect specific patterns for eligible nodes."""
        if node.is_leaf():
            return
        if node.label not in EXCLUDE_NODES:
            child_labels = tuple(child.label for child in node.children)
            if has_at_least_two_children(child_labels):
                # Every eligible node proposes its full child sequence.
                _suggest((node.label, child_labels), node)

                # 'S' nodes stay punctuation-inclusive; other nodes also get
                # a variant without a trailing punctuation child.
                if (node.label != 'S'
                        and is_punctuation_label(child_labels[-1])
                        and not is_punctuation_label(node.label)):
                    trimmed_labels = child_labels[:-1]
                    if has_at_least_two_children(trimmed_labels):
                        _suggest((node.label, trimmed_labels), node)
        for child in node.children:
            _traverse(child)

    _traverse(tree)

    label_counts = {}

    def _count_labels(node):
        """Count occurrences of each non-excluded, non-leaf label."""
        if not node.is_leaf() and node.label not in EXCLUDE_NODES:
            label_counts[node.label] = label_counts.get(node.label, 0) + 1
        for child in node.children:
            _count_labels(child)

    def _find_example(node, target_label):
        """Return the text of the first non-leaf node labelled *target_label*."""
        if not node.is_leaf() and node.label == target_label:
            return ' '.join(node.leaf_labels())
        for child in node.children:
            result = _find_example(child, target_label)
            if result:
                return result
        return None

    _count_labels(tree)

    # Wildcards are only offered for labels seen at least twice that did not
    # get any specific pattern suggested above.
    for label, count in label_counts.items():
        if count < 2:
            continue
        wildcard_pattern = (label, None)
        has_specific_pattern = any(pattern[0] == label for pattern in suggestions)
        if (has_specific_pattern
                or wildcard_pattern in existing_rules
                or wildcard_pattern in suggestions):
            continue
        example_text = _find_example(tree, label)
        if example_text:
            suggestions[wildcard_pattern] = example_text

    # Drop specific patterns whose child list ends with '.'.  Wildcard
    # patterns carry None instead of a child tuple and must bypass the
    # subscript, otherwise this filter raises TypeError.
    suggestions = {
        pattern: example
        for pattern, example in suggestions.items()
        if pattern[1] is None or pattern[1][-1] != "."
    }

    suggestions_dict = {}
    if suggestions:
        print("💡 Suggested new rules:")
        for pattern, example in suggestions.items():
            if pattern[1] is None:  # Wildcard rule
                value = f"lambda node, *children: ... # e.g., '{example}'"
            else:
                child_vars = ', '.join(f'c{i}' for i in range(len(pattern[1])))
                value = f"lambda node, {child_vars}: ... # e.g., '{example}'"
            print(f" {pattern}: {value}")
            suggestions_dict[pattern] = value
    else:
        print("✅ All patterns already handled!")

    return suggestions, suggestions_dict
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
|
| 131 |
from utils import *
|
| 132 |
from datasets import load_dataset
|
| 133 |
|