# @title Initialize stanza
import stanza
# try:
# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
# Build the English pipeline once at module level; the constituency parses it
# produces are what analyze_sentence3 consumes.  REUSE_RESOURCES reuses
# already-downloaded models instead of fetching them again on every run.
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,constituency', force=True, verbose=True,download_method=stanza.DownloadMethod.REUSE_RESOURCES)
# Nodes that should NOT have transformation rules
# (tree root plus punctuation / bracket POS tags).
EXCLUDE_NODES = {'ROOT', '.', ',', ':', ';', '!', '?', '``', "''", '-LRB-', '-RRB-'}
##################################################################
def google_sheet_to_df(sheet_url):
    """Download a Google Sheet as CSV and return it as a pandas DataFrame.

    Supported URL forms:
      - default first sheet:
        "https://docs.google.com/spreadsheets/d/<id>/edit?usp=drive_link"
      - a specific tab (open the tab first so its gid appears in the URL):
        "https://docs.google.com/spreadsheets/d/<id>/edit?gid=<gid>#gid=<gid>"

    The sheet must be link-readable.  The "#gid=..." fragment is dropped
    (the gid *query* parameter before the '#' is what selects the tab) and
    the interactive "edit" endpoint is rewritten to the CSV export endpoint.
    """
    import pandas as pd
    # Strip the URL fragment; keep the query string (it may carry the gid).
    base_url = sheet_url.split("#")[0]
    if "edit?" in base_url:
        # Rewrite only the first occurrence so a pathological URL containing
        # "edit?" elsewhere is not corrupted.
        export_url = base_url.replace("edit?", "export?format=csv&", 1)
    else:
        # Tolerate share links that end in ".../edit" with no query string;
        # previously these were passed through unchanged and failed downstream.
        export_url = base_url.replace("/edit", "/export?format=csv", 1)
    return pd.read_csv(export_url)
# URL of the Google Sheet
# sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?usp=drive_link"
# NOTE: the gid query parameter selects a specific tab; google_sheet_to_df
# strips the trailing "#gid=..." fragment before downloading.
sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?gid=1816125758#gid=1816125758"
##################################################################
# @title def load_syntactic_rules
import csv
import ast
import unicodedata
def load_syntactic_rules():
    """Load transformation rules from the shared Google Sheet.

    Each sheet row holds a 'Key' (string repr of a pattern tuple) and a
    'Function' (string source of a lambda).  Returns a dict mapping the
    parsed key tuple -> the evaluated lambda.  Rows that fail to parse are
    reported and skipped; duplicate keys are reported (later rows silently
    overwrite earlier ones in the returned dict).
    """
    syntactic_rules_df = google_sheet_to_df("https://docs.google.com/spreadsheets/d/1_CYomIRXi-yyzyoke1bh6F6TzJbocYZvWQomUH3AaFA/edit?pli=1&gid=1599015268#gid=1599015268")
    # Drop rows with no key.  (The original did this twice; once suffices.)
    syntactic_rules_df = syntactic_rules_df.dropna(subset=['Key'])
    # Normalize text fields — the sheet can contain NBSPs and other Unicode
    # lookalikes that would break ast.literal_eval / eval below.
    for col in ('Key', 'Function'):
        syntactic_rules_df[col] = syntactic_rules_df[col].apply(
            lambda x: unicodedata.normalize("NFKC", str(x)).strip()
        )
    print(f"Loaded {syntactic_rules_df.shape[0]} syntactic rules.")
    print(syntactic_rules_df.shape)
    # Report duplicate 'Key' values so sheet maintainers can clean them up.
    # (A dead full-row duplicate check that was immediately overwritten has
    # been removed.)
    duplicates1 = syntactic_rules_df[syntactic_rules_df.duplicated(subset=['Key'], keep=False)]
    print("Duplicate rows based on 'Key':")
    print(duplicates1)
    loaded_rules = {}
    for _, row in syntactic_rules_df.iterrows():
        try:
            key = ast.literal_eval(row['Key'])  # safely parse tuple
            # SECURITY: eval() executes arbitrary code pulled from the
            # spreadsheet.  Acceptable only while the sheet is owned and
            # edited exclusively by the maintainers — do not point this at
            # an untrusted document.
            func = eval(row['Function'])  # recreate lambda
            loaded_rules[key] = func
        except Exception as e:
            print(f"Error parsing row: {row}\n{e}")
    return loaded_rules
##################################################################
# @title def analyze_sentence3
def analyze_sentence3(sentence, existing_rules, debug=False):
    """Analyzes a sentence and suggests missing rules, being punctuation-inclusive for 'S' nodes.

    Parses `sentence` with the module-level stanza pipeline `nlp`, walks the
    constituency tree, and collects rule patterns not present in
    `existing_rules`.  A pattern is (node_label, tuple_of_child_labels) for
    specific rules, or (node_label, None) for wildcard rules.

    Returns:
        (suggestions, suggestions_dict): `suggestions` maps pattern ->
        example leaf text; `suggestions_dict` maps pattern -> lambda stub
        string suitable for pasting into the rules sheet.
    """
    print(f"\n🔍 Analyzing: '{sentence}'")
    print("=" * 50)
    doc = nlp(sentence)
    tree = doc.sentences[0].constituency
    suggestions = {}

    def is_punctuation_label(label):
        """Helper function to identify punctuation POS tags."""
        return label in ['.', ',', ':', ';', '!', '?', '\'', '"', '`', '-', '--', '#', '$', '(', ')', '[', ']', '{', '}']

    def has_at_least_two_children(child_labels):
        """Check if there are at least 2 non-None children."""
        if child_labels is None:
            return False
        count = len([label for label in child_labels if label is not None])
        return count >= 2

    def _traverse(node):
        if not node.is_leaf():
            if node.label not in EXCLUDE_NODES:
                child_labels = tuple(child.label for child in node.children)
                # Only process if node has 2+ real children
                if has_at_least_two_children(child_labels):
                    # Every node first gets the punctuation-inclusive pattern.
                    # (The original had identical code in separate 'S' and
                    # non-'S' branches; merged here — behavior unchanged.)
                    pattern_all = (node.label, child_labels)
                    if pattern_all not in existing_rules and pattern_all not in suggestions:
                        suggestions[pattern_all] = ' '.join(node.leaf_labels())
                    # Non-'S' nodes with a trailing punctuation child also get
                    # the pattern with that child stripped.
                    if (node.label != 'S' and
                            is_punctuation_label(child_labels[-1]) and
                            not is_punctuation_label(node.label)):
                        non_punct_child_labels = child_labels[:-1]
                        # Only suggest if 2+ real children remain after stripping
                        if has_at_least_two_children(non_punct_child_labels):
                            pattern_without_punct = (node.label, non_punct_child_labels)
                            if (pattern_without_punct not in existing_rules and
                                    pattern_without_punct not in suggestions):
                                suggestions[pattern_without_punct] = ' '.join(node.leaf_labels())
            for child in node.children:
                _traverse(child)
    _traverse(tree)

    # Wildcard rules: only for labels that appear 2+ times in the tree and
    # collected no specific pattern above (avoids wildcards for rare shapes).
    label_counts = {}
    def _count_labels(node):
        if not node.is_leaf() and node.label not in EXCLUDE_NODES:
            label_counts[node.label] = label_counts.get(node.label, 0) + 1
        for child in node.children:
            _count_labels(child)
    _count_labels(tree)

    def _find_example(node, target_label):
        """Return the leaf text of the first subtree labelled `target_label`, else None."""
        if not node.is_leaf() and node.label == target_label:
            return ' '.join(node.leaf_labels())
        for child in node.children:
            result = _find_example(child, target_label)
            if result:
                return result
        return None

    for label, count in label_counts.items():
        if count >= 2:
            wildcard_pattern = (label, None)
            has_specific_pattern = any(pattern[0] == label for pattern in suggestions.keys())
            if (not has_specific_pattern and
                    wildcard_pattern not in existing_rules and
                    wildcard_pattern not in suggestions):
                example_text = _find_example(tree, label)
                if example_text:
                    suggestions[wildcard_pattern] = example_text

    # Exclude patterns whose child list ends with '.'.
    # BUG FIX: wildcard patterns have k[1] == None; the original
    # `k[1][-1] != "."` raised "TypeError: 'NoneType' object is not
    # subscriptable" whenever any wildcard suggestion existed.
    suggestions = {k: v for k, v in suggestions.items()
                   if k[1] is None or k[1][-1] != "."}

    suggestions_dict = {}
    if suggestions:
        print("💡 Suggested new rules:")
        for pattern, example in suggestions.items():
            if pattern[1] is None:  # Wildcard rule
                value = f"lambda node, *children: ... # e.g., '{example}'"
                print(f" {pattern}: lambda node, *children: ... # e.g., '{example}'")
            else:
                child_vars = ', '.join([f'c{i}' for i in range(len(pattern[1]))])
                value = f"lambda node, {child_vars}: ... # e.g., '{example}'"
                print(f" {pattern}: lambda node, {child_vars}: ... # e.g., '{example}'")
            suggestions_dict[pattern] = value
    else:
        print("✅ All patterns already handled!")
    return suggestions, suggestions_dict