# @title Initialize stanza
import stanza
# try:
# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
# Build the English pipeline once at module level; the constituency parses it
# produces are what analyze_sentence3 consumes.  REUSE_RESOURCES reuses
# already-downloaded models instead of fetching them again on every run.
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,constituency', force=True, verbose=True,download_method=stanza.DownloadMethod.REUSE_RESOURCES)
# Nodes that should NOT have transformation rules
# (tree root plus punctuation / bracket POS tags).
EXCLUDE_NODES = {'ROOT', '.', ',', ':', ';', '!', '?', '``', "''", '-LRB-', '-RRB-'}
##################################################################
def google_sheet_to_df(sheet_url):
    """Download a Google Sheet as CSV and return it as a pandas DataFrame.

    Supported URL forms:
      - default first sheet:
        "https://docs.google.com/spreadsheets/d/<id>/edit?usp=drive_link"
      - a specific tab (open the tab first so its gid appears in the URL):
        "https://docs.google.com/spreadsheets/d/<id>/edit?gid=<gid>#gid=<gid>"

    The sheet must be link-readable.  The "#gid=..." fragment is dropped
    (the gid *query* parameter before the '#' is what selects the tab) and
    the interactive "edit" endpoint is rewritten to the CSV export endpoint.
    """
    import pandas as pd
    # Strip the URL fragment; keep the query string (it may carry the gid).
    base_url = sheet_url.split("#")[0]
    if "edit?" in base_url:
        # Rewrite only the first occurrence so a pathological URL containing
        # "edit?" elsewhere is not corrupted.
        export_url = base_url.replace("edit?", "export?format=csv&", 1)
    else:
        # Tolerate share links that end in ".../edit" with no query string;
        # previously these were passed through unchanged and failed downstream.
        export_url = base_url.replace("/edit", "/export?format=csv", 1)
    return pd.read_csv(export_url)
# URL of the Google Sheet
# sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?usp=drive_link"
# NOTE: the gid query parameter selects a specific tab; google_sheet_to_df
# strips the trailing "#gid=..." fragment before downloading.
sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?gid=1816125758#gid=1816125758"
##################################################################
# @title def load_syntactic_rules
import csv
import ast
import unicodedata
def load_syntactic_rules():
    """Load transformation rules from the shared Google Sheet.

    Each sheet row holds a 'Key' (string repr of a pattern tuple) and a
    'Function' (string source of a lambda).  Returns a dict mapping the
    parsed key tuple -> the evaluated lambda.  Rows that fail to parse are
    reported and skipped; duplicate keys are reported (later rows silently
    overwrite earlier ones in the returned dict).
    """
    syntactic_rules_df = google_sheet_to_df("https://docs.google.com/spreadsheets/d/1_CYomIRXi-yyzyoke1bh6F6TzJbocYZvWQomUH3AaFA/edit?pli=1&gid=1599015268#gid=1599015268")
    # Drop rows with no key.  (The original did this twice; once suffices.)
    syntactic_rules_df = syntactic_rules_df.dropna(subset=['Key'])
    # Normalize text fields — the sheet can contain NBSPs and other Unicode
    # lookalikes that would break ast.literal_eval / eval below.
    for col in ('Key', 'Function'):
        syntactic_rules_df[col] = syntactic_rules_df[col].apply(
            lambda x: unicodedata.normalize("NFKC", str(x)).strip()
        )
    print(f"Loaded {syntactic_rules_df.shape[0]} syntactic rules.")
    print(syntactic_rules_df.shape)
    # Report duplicate 'Key' values so sheet maintainers can clean them up.
    # (A dead full-row duplicate check that was immediately overwritten has
    # been removed.)
    duplicates1 = syntactic_rules_df[syntactic_rules_df.duplicated(subset=['Key'], keep=False)]
    print("Duplicate rows based on 'Key':")
    print(duplicates1)
    loaded_rules = {}
    for _, row in syntactic_rules_df.iterrows():
        try:
            key = ast.literal_eval(row['Key'])  # safely parse tuple
            # SECURITY: eval() executes arbitrary code pulled from the
            # spreadsheet.  Acceptable only while the sheet is owned and
            # edited exclusively by the maintainers — do not point this at
            # an untrusted document.
            func = eval(row['Function'])  # recreate lambda
            loaded_rules[key] = func
        except Exception as e:
            print(f"Error parsing row: {row}\n{e}")
    return loaded_rules
##################################################################
# @title def analyze_sentence3
def analyze_sentence3(sentence, existing_rules, debug=False):
    """Analyzes a sentence and suggests missing rules, being punctuation-inclusive for 'S' nodes.

    Parses `sentence` with the module-level stanza pipeline `nlp`, walks the
    constituency tree, and collects rule patterns not present in
    `existing_rules`.  A pattern is (node_label, tuple_of_child_labels) for
    specific rules, or (node_label, None) for wildcard rules.

    Returns:
        (suggestions, suggestions_dict): `suggestions` maps pattern ->
        example leaf text; `suggestions_dict` maps pattern -> lambda stub
        string suitable for pasting into the rules sheet.
    """
    print(f"\n🔍 Analyzing: '{sentence}'")
    print("=" * 50)
    doc = nlp(sentence)
    tree = doc.sentences[0].constituency
    suggestions = {}

    def is_punctuation_label(label):
        """Helper function to identify punctuation POS tags."""
        return label in ['.', ',', ':', ';', '!', '?', '\'', '"', '`', '-', '--', '#', '$', '(', ')', '[', ']', '{', '}']

    def has_at_least_two_children(child_labels):
        """Check if there are at least 2 non-None children."""
        if child_labels is None:
            return False
        count = len([label for label in child_labels if label is not None])
        return count >= 2

    def _traverse(node):
        if not node.is_leaf():
            if node.label not in EXCLUDE_NODES:
                child_labels = tuple(child.label for child in node.children)
                # Only process if node has 2+ real children
                if has_at_least_two_children(child_labels):
                    # Every node first gets the punctuation-inclusive pattern.
                    # (The original had identical code in separate 'S' and
                    # non-'S' branches; merged here — behavior unchanged.)
                    pattern_all = (node.label, child_labels)
                    if pattern_all not in existing_rules and pattern_all not in suggestions:
                        suggestions[pattern_all] = ' '.join(node.leaf_labels())
                    # Non-'S' nodes with a trailing punctuation child also get
                    # the pattern with that child stripped.
                    if (node.label != 'S' and
                            is_punctuation_label(child_labels[-1]) and
                            not is_punctuation_label(node.label)):
                        non_punct_child_labels = child_labels[:-1]
                        # Only suggest if 2+ real children remain after stripping
                        if has_at_least_two_children(non_punct_child_labels):
                            pattern_without_punct = (node.label, non_punct_child_labels)
                            if (pattern_without_punct not in existing_rules and
                                    pattern_without_punct not in suggestions):
                                suggestions[pattern_without_punct] = ' '.join(node.leaf_labels())
            for child in node.children:
                _traverse(child)
    _traverse(tree)

    # Wildcard rules: only for labels that appear 2+ times in the tree and
    # collected no specific pattern above (avoids wildcards for rare shapes).
    label_counts = {}
    def _count_labels(node):
        if not node.is_leaf() and node.label not in EXCLUDE_NODES:
            label_counts[node.label] = label_counts.get(node.label, 0) + 1
        for child in node.children:
            _count_labels(child)
    _count_labels(tree)

    def _find_example(node, target_label):
        """Return the leaf text of the first subtree labelled `target_label`, else None."""
        if not node.is_leaf() and node.label == target_label:
            return ' '.join(node.leaf_labels())
        for child in node.children:
            result = _find_example(child, target_label)
            if result:
                return result
        return None

    for label, count in label_counts.items():
        if count >= 2:
            wildcard_pattern = (label, None)
            has_specific_pattern = any(pattern[0] == label for pattern in suggestions.keys())
            if (not has_specific_pattern and
                    wildcard_pattern not in existing_rules and
                    wildcard_pattern not in suggestions):
                example_text = _find_example(tree, label)
                if example_text:
                    suggestions[wildcard_pattern] = example_text

    # Exclude patterns whose child list ends with '.'.
    # BUG FIX: wildcard patterns have k[1] == None; the original
    # `k[1][-1] != "."` raised "TypeError: 'NoneType' object is not
    # subscriptable" whenever any wildcard suggestion existed.
    suggestions = {k: v for k, v in suggestions.items()
                   if k[1] is None or k[1][-1] != "."}

    suggestions_dict = {}
    if suggestions:
        print("💡 Suggested new rules:")
        for pattern, example in suggestions.items():
            if pattern[1] is None:  # Wildcard rule
                value = f"lambda node, *children: ... # e.g., '{example}'"
                print(f" {pattern}: lambda node, *children: ... # e.g., '{example}'")
            else:
                child_vars = ', '.join([f'c{i}' for i in range(len(pattern[1]))])
                value = f"lambda node, {child_vars}: ... # e.g., '{example}'"
                print(f" {pattern}: lambda node, {child_vars}: ... # e.g., '{example}'")
            suggestions_dict[pattern] = value
    else:
        print("✅ All patterns already handled!")
    return suggestions, suggestions_dict