# @title Initialize stanza
import stanza

# Only tokenize/mwt/pos/lemma/constituency are needed for constituency-tree
# rule mining; the heavier processors (depparse, ner, coref, sentiment) are
# deliberately omitted.
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,mwt,pos,lemma,constituency',
    force=True,
    verbose=True,
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

# Nodes that should NOT have transformation rules
EXCLUDE_NODES = {'ROOT', '.', ',', ':', ';', '!', '?', '``', "''", '-LRB-', '-RRB-'}


##################################################################
def google_sheet_to_df(sheet_url):
    """Read a Google Sheet into a pandas DataFrame via its CSV export endpoint.

    Accepted URL forms:
      - default first sheet:  .../edit?usp=drive_link
      - a specific sheet tab: .../edit?gid=<gid>#gid=<gid>

    Args:
        sheet_url: a Google Sheets "edit" URL (the sheet must be readable
            by anyone with the link).

    Returns:
        pandas.DataFrame with the sheet's contents.
    """
    import pandas as pd

    # Strip the "#gid=..." fragment first, then rewrite the edit URL into
    # the CSV export endpoint (keeps any gid= query parameter).
    modified_url = sheet_url.split("#")[0].replace("edit?", "export?format=csv&")
    return pd.read_csv(modified_url)


# URL of the Google Sheet (specific tab selected via gid)
sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?gid=1816125758#gid=1816125758"


##################################################################
# @title def load_syntactic_rules
import csv
import ast
import unicodedata


def load_syntactic_rules():
    """Load syntactic transformation rules from a Google Sheet.

    Each sheet row holds a 'Key' (the repr of a tuple such as
    ``('NP', ('DT', 'NN'))``) and a 'Function' (the source text of a lambda).
    Rows whose Key is missing are dropped; duplicate Keys are printed so
    conflicting rules are visible (the last occurrence wins in the dict).

    Returns:
        dict mapping the parsed tuple key to the callable rebuilt from the
        'Function' column. Rows that fail to parse are reported and skipped.
    """
    syntactic_rules_df = google_sheet_to_df(
        "https://docs.google.com/spreadsheets/d/1_CYomIRXi-yyzyoke1bh6F6TzJbocYZvWQomUH3AaFA/edit?pli=1&gid=1599015268#gid=1599015268"
    )
    syntactic_rules_df = syntactic_rules_df.dropna(subset=['Key'])

    # Normalize text fields: NFKC folds curly quotes / odd whitespace that
    # tend to get pasted into spreadsheets and would break parsing below.
    for col in ('Key', 'Function'):
        syntactic_rules_df[col] = syntactic_rules_df[col].apply(
            lambda x: unicodedata.normalize("NFKC", str(x)).strip()
        )
    print(f"Loaded {syntactic_rules_df.shape[0]} syntactic rules.")
    print(syntactic_rules_df.shape)

    # Report duplicate 'Key' values (keep=False marks every occurrence).
    duplicates1 = syntactic_rules_df[
        syntactic_rules_df.duplicated(subset=['Key'], keep=False)
    ]
    print("Duplicate rows based on 'Key':")
    print(duplicates1)

    loaded_rules = {}
    for _, row in syntactic_rules_df.iterrows():
        try:
            key = ast.literal_eval(row['Key'])  # safely parse tuple
            # SECURITY NOTE: eval() executes arbitrary code from the
            # spreadsheet. This is intentional (rules are stored as lambda
            # source) but the sheet MUST be trusted — do not point this at
            # user-supplied data.
            func = eval(row['Function'])  # recreate lambda
            loaded_rules[key] = func
        except Exception as e:
            print(f"Error parsing row: {row}\n{e}")
    return loaded_rules


##################################################################
# @title def analyze_sentence3
def analyze_sentence3(sentence, existing_rules, debug=False):
    """Analyzes a sentence and suggests missing rules, being punctuation-inclusive for 'S' nodes.

    Parses the sentence with the module-level stanza pipeline, walks the
    constituency tree, and collects (label, child_labels) patterns that are
    not yet covered by ``existing_rules``. For labels that occur at least
    twice without any specific pattern suggested, a wildcard pattern
    ``(label, None)`` is suggested instead.

    Args:
        sentence: the English sentence to analyze.
        existing_rules: dict keyed by (label, child_labels) tuples.
        debug: unused; kept for interface compatibility.

    Returns:
        (suggestions, suggestions_dict):
            suggestions: {pattern: example_text} for each missing pattern.
            suggestions_dict: {pattern: lambda-stub source string}.
    """
    print(f"\nšŸ” Analyzing: '{sentence}'")
    print("=" * 50)

    doc = nlp(sentence)
    tree = doc.sentences[0].constituency
    suggestions = {}

    def is_punctuation_label(label):
        """Helper function to identify punctuation POS tags."""
        return label in ['.', ',', ':', ';', '!', '?', '\'', '"', '`', '-',
                         '--', '#', '$', '(', ')', '[', ']', '{', '}']

    def has_at_least_two_children(child_labels):
        """Check if there are at least 2 non-None children."""
        if child_labels is None:
            return False
        # Count actual children (excluding None values if any)
        return len([label for label in child_labels if label is not None]) >= 2

    def _traverse(node):
        if not node.is_leaf():
            if node.label not in EXCLUDE_NODES:
                child_labels = tuple(child.label for child in node.children)

                # Only process if node has 2+ real children
                if has_at_least_two_children(child_labels):
                    # SPECIAL HANDLING FOR 'S' NODES (include punctuation)
                    if node.label == 'S':
                        pattern = (node.label, child_labels)
                        if pattern not in existing_rules and pattern not in suggestions:
                            example_text = ' '.join(node.leaf_labels())
                            suggestions[pattern] = example_text
                    # STANDARD HANDLING FOR OTHER NODES
                    else:
                        # Suggest rule with all children (including punctuation)
                        pattern_all = (node.label, child_labels)
                        if pattern_all not in existing_rules and pattern_all not in suggestions:
                            example_text = ' '.join(node.leaf_labels())
                            suggestions[pattern_all] = example_text

                        # If last child is punctuation, suggest rule without it
                        if (is_punctuation_label(child_labels[-1])
                                and not is_punctuation_label(node.label)):
                            non_punct_child_labels = child_labels[:-1]
                            # Only suggest if we still have 2+ real children
                            # after removing punctuation
                            if has_at_least_two_children(non_punct_child_labels):
                                pattern_without_punct = (node.label, non_punct_child_labels)
                                if (pattern_without_punct not in existing_rules
                                        and pattern_without_punct not in suggestions):
                                    example_text = ' '.join(node.leaf_labels())
                                    suggestions[pattern_without_punct] = example_text

            for child in node.children:
                _traverse(child)

    _traverse(tree)

    # Only suggest wildcard rules for labels that don't have any specific
    # patterns suggested and appear multiple times in the tree (to avoid
    # suggesting wildcards for rare patterns).
    label_counts = {}

    def _count_labels(node):
        if not node.is_leaf() and node.label not in EXCLUDE_NODES:
            label_counts[node.label] = label_counts.get(node.label, 0) + 1
        for child in node.children:
            _count_labels(child)

    _count_labels(tree)

    def _find_example(node, target_label):
        """Return the leaf text of the first subtree whose root matches target_label."""
        if not node.is_leaf() and node.label == target_label:
            return ' '.join(node.leaf_labels())
        for child in node.children:
            result = _find_example(child, target_label)
            if result:
                return result
        return None

    # Suggest wildcard rules only for labels that appear multiple times
    # and don't have specific rules.
    for label, count in label_counts.items():
        if count >= 2:  # Only suggest wildcard for labels that appear multiple times
            wildcard_pattern = (label, None)
            has_specific_pattern = any(pattern[0] == label for pattern in suggestions.keys())
            if (not has_specific_pattern
                    and wildcard_pattern not in existing_rules
                    and wildcard_pattern not in suggestions):
                example_text = _find_example(tree, label)
                if example_text:
                    suggestions[wildcard_pattern] = example_text

    # Exclude patterns whose child list ends with '.'.
    # BUG FIX: wildcard patterns store None as k[1]; the original filter did
    # k[1][-1] unconditionally and raised TypeError for every wildcard.
    suggestions = {
        k: v for k, v in suggestions.items()
        if k[1] is None or k[1][-1] != "."
    }

    suggestions_dict = {}
    if suggestions:
        print("šŸ’” Suggested new rules:")
        for pattern, example in suggestions.items():
            if pattern[1] is None:  # Wildcard rule
                value = f"lambda node, *children: ... # e.g., '{example}'"
                print(f"  {pattern}: lambda node, *children: ... # e.g., '{example}'")
            else:
                child_vars = ', '.join([f'c{i}' for i in range(len(pattern[1]))])
                value = f"lambda node, {child_vars}: ... # e.g., '{example}'"
                print(f"  {pattern}: lambda node, {child_vars}: ... # e.g., '{example}'")
            suggestions_dict[pattern] = value
    else:
        print("āœ… All patterns already handled!")

    return suggestions, suggestions_dict