Juna190825 committed on
Commit
f9984cf
·
verified ·
1 Parent(s): 3693a28

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -0
app.py CHANGED
@@ -4,6 +4,130 @@ import stanza
4
  # nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
5
  nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,constituency', force=True, verbose=True)
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from utils import *
8
  from datasets import load_dataset
9
 
 
4
  # nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
5
  nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,constituency', force=True, verbose=True)
6
 
7
+ ##################################################################
8
+ # @title def analyze_sentence3
9
def analyze_sentence3(sentence, existing_rules, debug=False):
    """Analyze a sentence and suggest rules missing from ``existing_rules``.

    The sentence is parsed with the module-level ``nlp`` pipeline and the
    constituency tree of the first parsed sentence is walked. For every
    non-leaf node whose label is not in ``EXCLUDE_NODES`` and which has at
    least two children, the pattern ``(label, child_labels)`` is suggested
    when it is neither in ``existing_rules`` nor already suggested. For
    nodes other than ``'S'`` whose last child is a punctuation label, a
    second pattern without that trailing child is suggested as well (if
    two or more children remain). Labels that occur at least twice and got
    no specific suggestion receive a wildcard pattern ``(label, None)``.
    Finally, patterns whose last child label is ``"."`` are dropped.

    Args:
        sentence: Text to parse. Only the first sentence found is analyzed.
        existing_rules: Container of ``(label, child_labels)`` keys that are
            already handled; only membership (``in``) is tested.
        debug: Accepted for backward compatibility; currently unused.

    Returns:
        A ``(suggestions, suggestions_dict)`` tuple. ``suggestions`` maps each
        pattern to the example text it was found on; ``suggestions_dict`` maps
        each pattern to a lambda-stub string for that pattern.
    """
    print(f"\n🔍 Analyzing: '{sentence}'")
    print("=" * 50)

    doc = nlp(sentence)
    if not doc.sentences:
        # Nothing was parsed (e.g. empty input); indexing sentences[0] would
        # raise IndexError, so report "nothing to suggest" instead.
        print("✅ All patterns already handled!")
        return {}, {}

    tree = doc.sentences[0].constituency
    suggestions = {}
    label_counts = {}

    punctuation_labels = frozenset([
        '.', ',', ':', ';', '!', '?', '\'', '"', '`', '-', '--', '#', '$',
        '(', ')', '[', ']', '{', '}',
    ])

    def is_punctuation_label(label):
        """Return True when ``label`` is one of the punctuation POS tags."""
        return label in punctuation_labels

    def has_at_least_two_children(child_labels):
        """Return True when there are at least two non-None child labels."""
        if child_labels is None:
            return False
        return sum(1 for label in child_labels if label is not None) >= 2

    def _suggest(pattern, node):
        """Record ``pattern`` with the node's text unless it is already known."""
        if pattern not in existing_rules and pattern not in suggestions:
            suggestions[pattern] = ' '.join(node.leaf_labels())

    def _traverse(node):
        """Collect pattern suggestions and label counts in pre-order."""
        if node.is_leaf():
            return

        if node.label not in EXCLUDE_NODES:
            label_counts[node.label] = label_counts.get(node.label, 0) + 1
            child_labels = tuple(child.label for child in node.children)

            if has_at_least_two_children(child_labels):
                # Every node, 'S' included, gets the punctuation-inclusive rule.
                _suggest((node.label, child_labels), node)

                # Non-'S' nodes additionally get a rule without a trailing
                # punctuation child, provided 2+ children remain.
                if (node.label != 'S'
                        and is_punctuation_label(child_labels[-1])
                        and not is_punctuation_label(node.label)):
                    trimmed = child_labels[:-1]
                    if has_at_least_two_children(trimmed):
                        _suggest((node.label, trimmed), node)

        # Descend even below excluded labels so their subtrees are analyzed.
        for child in node.children:
            _traverse(child)

    def _find_example(node, target_label):
        """Return the text of the first non-leaf node labelled ``target_label``."""
        if not node.is_leaf() and node.label == target_label:
            return ' '.join(node.leaf_labels())
        for child in node.children:
            result = _find_example(child, target_label)
            if result:
                return result
        return None

    _traverse(tree)

    # Wildcards only for labels seen multiple times that have no specific
    # pattern suggested (avoids wildcards for rare constructions).
    for label, count in label_counts.items():
        if count < 2:
            continue
        wildcard_pattern = (label, None)
        has_specific_pattern = any(pattern[0] == label for pattern in suggestions)
        if (has_specific_pattern
                or wildcard_pattern in existing_rules
                or wildcard_pattern in suggestions):
            continue
        example_text = _find_example(tree, label)
        if example_text:
            suggestions[wildcard_pattern] = example_text

    # Drop patterns whose last child is '.'. Wildcard patterns carry None
    # instead of a child tuple and must be kept without being indexed
    # (indexing None here used to raise TypeError).
    suggestions = {
        pattern: example
        for pattern, example in suggestions.items()
        if not pattern[1] or pattern[1][-1] != "."
    }

    suggestions_dict = {}
    if suggestions:
        print("💡 Suggested new rules:")
        for pattern, example in suggestions.items():
            if pattern[1] is None:  # Wildcard rule
                value = f"lambda node, *children: ... # e.g., '{example}'"
            else:
                child_vars = ', '.join(f'c{i}' for i in range(len(pattern[1])))
                value = f"lambda node, {child_vars}: ... # e.g., '{example}'"
            print(f" {pattern}: {value}")
            suggestions_dict[pattern] = value
    else:
        print("✅ All patterns already handled!")

    return suggestions, suggestions_dict
127
+
128
+
129
+
130
+
131
  from utils import *
132
  from datasets import load_dataset
133