Spaces:
Sleeping
Sleeping
# @title Initialize stanza
import stanza
# Earlier experiment: full pipeline incl. depparse/ner/coref/sentiment (kept for reference).
# try:
# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
# Module-level English pipeline with only the processors needed for constituency
# parsing; REUSE_RESOURCES avoids re-downloading models on every run.
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,constituency', force=True, verbose=True,download_method=stanza.DownloadMethod.REUSE_RESOURCES)
# Nodes that should NOT have transformation rules
# (tree root marker plus punctuation / bracket POS tags).
EXCLUDE_NODES = {'ROOT', '.', ',', ':', ';', '!', '?', '``', "''", '-LRB-', '-RRB-'}
| ################################################################## | |
def _sheet_csv_url(sheet_url):
    """Rewrite a Google Sheets 'edit' URL into its CSV-export equivalent.

    Drops any '#gid=...' fragment, then turns the 'edit?' action into
    'export?format=csv&' so remaining query params (e.g. gid) still select
    the tab to export.
    """
    return sheet_url.split("#")[0].replace("edit?", "export?format=csv&")


def google_sheet_to_df(sheet_url):
    """Read a publicly shared Google Sheet into a pandas DataFrame.

    Allowed URL shapes:
      - default first sheet: "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?usp=drive_link"
      - a specific sheet:    "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?gid=1816125758#gid=1816125758"
        (copy the URL after opening the desired tab)

    Returns the sheet contents as a DataFrame (fetched over the network).
    """
    import pandas as pd  # local import keeps the notebook cell self-contained
    return pd.read_csv(_sheet_csv_url(sheet_url))
# URL of the Google Sheet
# sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?usp=drive_link"
# The '#gid=...' fragment pins a specific tab of the spreadsheet.
sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?gid=1816125758#gid=1816125758"
| ################################################################## | |
| # @title def load_syntactic_rules | |
| import csv | |
| import ast | |
| import unicodedata | |
def load_syntactic_rules():
    """Load (constituency-pattern -> lambda) rules from the rules Google Sheet.

    Reads the rules spreadsheet, normalizes the 'Key' and 'Function' text
    columns, reports duplicate keys, then rebuilds each rule as
    ``{parsed_key_tuple: callable}``. Rows that fail to parse are printed
    and skipped.

    Returns:
        dict: key tuple (from ast.literal_eval of 'Key') -> callable
        (from eval of 'Function'). Later duplicate keys overwrite earlier ones.
    """
    syntactic_rules_df = google_sheet_to_df("https://docs.google.com/spreadsheets/d/1_CYomIRXi-yyzyoke1bh6F6TzJbocYZvWQomUH3AaFA/edit?pli=1&gid=1599015268#gid=1599015268")
    # Drop rows with no rule key. (Original code repeated this dropna a
    # second time further down; once is sufficient.)
    syntactic_rules_df = syntactic_rules_df.dropna(subset=['Key'])
    # Normalize text fields: NFKC folds look-alike Unicode (e.g. curly quotes
    # that Sheets inserts) into plain characters so ast/eval can parse them.
    syntactic_rules_df['Key'] = syntactic_rules_df['Key'].apply(
        lambda x: unicodedata.normalize("NFKC", str(x)).strip()
    )
    syntactic_rules_df['Function'] = syntactic_rules_df['Function'].apply(
        lambda x: unicodedata.normalize("NFKC", str(x)).strip()
    )
    print(f"Loaded {syntactic_rules_df.shape[0]} syntactic rules.")
    loaded_rules = {}
    print(syntactic_rules_df.shape)
    # Report duplicate 'Key' values; in the dict below later rows overwrite
    # earlier ones. (The original computed duplicates twice — the first
    # whole-row duplicated() result was discarded immediately.)
    duplicates1 = syntactic_rules_df[syntactic_rules_df.duplicated(subset=['Key'], keep=False)]
    print("Duplicate rows based on 'Key':")
    print(duplicates1)
    for _, row in syntactic_rules_df.iterrows():
        try:
            key = ast.literal_eval(row['Key'])  # safely parse tuple
            # SECURITY NOTE: eval() executes arbitrary code from the
            # spreadsheet; acceptable only while the sheet is maintainer-owned
            # and trusted. Do not point this at untrusted input.
            func = eval(row['Function'])  # recreate lambda
            loaded_rules[key] = func
        except Exception as e:
            print(f"Error parsing row: {row}\n{e}")
    return loaded_rules
| ################################################################## | |
# @title def analyze_sentence3
def analyze_sentence3(sentence, existing_rules, debug=False):
    """Analyzes a sentence and suggests missing rules, being punctuation-inclusive for 'S' nodes.

    Walks the stanza constituency parse of `sentence` and collects
    (label, child_labels) patterns not already covered by `existing_rules`:
      - 'S' nodes keep punctuation children in the suggested pattern;
      - other nodes get the full pattern, plus a variant with a trailing
        punctuation child stripped;
      - labels occurring 2+ times with no specific suggestion get a
        wildcard pattern (label, None).
    Patterns whose child list ends with '.' are dropped at the end.

    Args:
        sentence: Raw text, parsed with the module-level `nlp` pipeline.
        existing_rules: Mapping of already-known patterns; keys are
            (label, child_labels) tuples or (label, None) wildcards.
        debug: Currently unused; kept for interface compatibility.

    Returns:
        (suggestions, suggestions_dict): pattern -> example text, and
        pattern -> lambda-skeleton string suitable for the rules sheet.
    """
    print(f"\n🔍 Analyzing: '{sentence}'")
    print("=" * 50)
    doc = nlp(sentence)
    tree = doc.sentences[0].constituency
    suggestions = {}

    def is_punctuation_label(label):
        """Helper function to identify punctuation POS tags."""
        return label in ['.', ',', ':', ';', '!', '?', '\'', '"', '`', '-', '--', '#', '$', '(', ')', '[', ']', '{', '}']

    def has_at_least_two_children(child_labels):
        """Check if there are at least 2 non-None children."""
        if child_labels is None:
            return False
        # Count actual children (excluding None values if any)
        count = len([label for label in child_labels if label is not None])
        return count >= 2

    def _traverse(node):
        # Collect candidate patterns from every non-excluded internal node.
        if not node.is_leaf():
            if node.label not in EXCLUDE_NODES:
                child_labels = tuple(child.label for child in node.children)
                # Only process if node has 2+ real children
                if has_at_least_two_children(child_labels):
                    if node.label == 'S':
                        # SPECIAL HANDLING FOR 'S' NODES (include punctuation)
                        pattern = (node.label, child_labels)
                        if pattern not in existing_rules and pattern not in suggestions:
                            suggestions[pattern] = ' '.join(node.leaf_labels())
                    else:
                        # STANDARD HANDLING: suggest rule with all children
                        # (including punctuation)
                        pattern_all = (node.label, child_labels)
                        if pattern_all not in existing_rules and pattern_all not in suggestions:
                            suggestions[pattern_all] = ' '.join(node.leaf_labels())
                        # If last child is punctuation, also suggest the rule without it
                        if (is_punctuation_label(child_labels[-1]) and
                                not is_punctuation_label(node.label)):
                            non_punct_child_labels = child_labels[:-1]
                            # Only suggest if 2+ real children remain after stripping
                            if has_at_least_two_children(non_punct_child_labels):
                                pattern_without_punct = (node.label, non_punct_child_labels)
                                if (pattern_without_punct not in existing_rules and
                                        pattern_without_punct not in suggestions):
                                    suggestions[pattern_without_punct] = ' '.join(node.leaf_labels())
            # Recurse even through excluded labels (e.g. ROOT) so the whole
            # tree is visited.
            for child in node.children:
                _traverse(child)

    _traverse(tree)

    # Only suggest wildcard rules for labels that don't have any specific
    # patterns suggested and appear multiple times in the tree (avoids
    # wildcards for rare patterns).
    label_counts = {}

    def _count_labels(node):
        if not node.is_leaf() and node.label not in EXCLUDE_NODES:
            label_counts[node.label] = label_counts.get(node.label, 0) + 1
        for child in node.children:
            _count_labels(child)

    _count_labels(tree)

    for label, count in label_counts.items():
        if count >= 2:  # Only suggest wildcard for labels that appear multiple times
            wildcard_pattern = (label, None)
            has_specific_pattern = any(pattern[0] == label for pattern in suggestions.keys())
            if (not has_specific_pattern and
                    wildcard_pattern not in existing_rules and
                    wildcard_pattern not in suggestions):
                # Find an example subtree for this label
                def _find_example(node, target_label):
                    if not node.is_leaf() and node.label == target_label:
                        return ' '.join(node.leaf_labels())
                    for child in node.children:
                        result = _find_example(child, target_label)
                        if result:
                            return result
                    return None
                example_text = _find_example(tree, label)
                if example_text:
                    suggestions[wildcard_pattern] = example_text

    # Exclude nodes that have '.' at the end of the list of child nodes.
    # BUG FIX: wildcard patterns store None as child_labels, and the original
    # filter `k[1][-1] != "."` raised TypeError on them — guard for None so
    # wildcard suggestions survive the filter.
    suggestions = {k: v for k, v in suggestions.items()
                   if k[1] is None or k[1][-1] != "."}

    suggestions_dict = {}
    if suggestions:
        print("💡 Suggested new rules:")
        for pattern, example in suggestions.items():
            if pattern[1] is None:  # Wildcard rule
                value = f"lambda node, *children: ... # e.g., '{example}'"
                print(f" {pattern}: lambda node, *children: ... # e.g., '{example}'")
            else:
                child_vars = ', '.join([f'c{i}' for i in range(len(pattern[1]))])
                value = f"lambda node, {child_vars}: ... # e.g., '{example}'"
                print(f" {pattern}: lambda node, {child_vars}: ... # e.g., '{example}'")
            suggestions_dict[pattern] = value
    else:
        print("✅ All patterns already handled!")
    return suggestions, suggestions_dict