# Source: karuleste / utils.py
# Page-header residue from the hosting site (not code): "Juna190825's picture",
# "Update utils.py", "d8a0ca0 verified".
# @title Initialize stanza
import stanza
# Alternative configuration with the full processor set, kept for reference:
# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)

# Shared English pipeline, built once at import time and reused by the
# analysis helpers in this module.
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,mwt,pos,lemma,constituency',
    force=True,
    verbose=True,
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

# Constituency labels that should NOT have transformation rules:
# the tree root and punctuation / bracket tags.
EXCLUDE_NODES = {
    'ROOT',
    '.', ',', ':', ';', '!', '?',
    '``', "''",
    '-LRB-', '-RRB-',
}
##################################################################
def _sheet_csv_export_url(sheet_url):
    """Translate a Google Sheets browser URL into its CSV-export URL.

    Recognized URLs have the form
    ``https://docs.google.com/spreadsheets/d/<id>[/edit|/view|...]`` with an
    optional query string and fragment. The tab id (``gid``) is taken from the
    query string, or from the ``#gid=...`` fragment when the query has none.
    Any other input falls back to the historical textual rewrite (drop the
    fragment, replace ``edit?`` with ``export?format=csv&``), so plain paths
    and other URLs pass through as before.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(sheet_url)
    segments = parts.path.split("/")
    is_sheet_url = (
        parts.hostname == "docs.google.com"
        and len(segments) >= 4
        and segments[1:3] == ["spreadsheets", "d"]
        # "/d/e/<token>/..." is the published-sheet layout, which is not
        # handled here and is left to the fallback.
        and segments[3] not in ("", "e")
    )
    if not is_sheet_url:
        return sheet_url.split("#")[0].replace("edit?", "export?format=csv&")

    # Keep the caller's query parameters, but force the export format to CSV.
    params = [
        (name, value)
        for name, value in parse_qsl(parts.query, keep_blank_values=True)
        if name != "format"
    ]
    if not any(name == "gid" for name, _ in params):
        # Browser URLs may carry the tab id only in the fragment
        # (".../edit#gid=123"); without it the export returns the first tab.
        fragment_gid = dict(parse_qsl(parts.fragment)).get("gid")
        if fragment_gid:
            params.append(("gid", fragment_gid))

    export_path = "/".join(segments[:4]) + "/export"
    query = urlencode([("format", "csv"), *params])
    return urlunsplit((parts.scheme, parts.netloc, export_path, query, ""))


def google_sheet_to_df(sheet_url):
    """Read a Google Sheet into a pandas DataFrame via its CSV export.

    Accepted URL shapes include the default (first-sheet) link, e.g.
    ``.../spreadsheets/d/<id>/edit?usp=drive_link``, and a per-tab link copied
    after opening that tab, e.g. ``.../edit?gid=<gid>#gid=<gid>`` or
    ``.../edit#gid=<gid>``.

    Args:
        sheet_url: Google Sheets URL (or any location ``pandas.read_csv``
            can read; unrecognized inputs are passed through).

    Returns:
        pandas.DataFrame: The parsed sheet contents.
    """
    import pandas as pd

    return pd.read_csv(_sheet_csv_export_url(sheet_url))
# Module-level Google Sheet URL. None of the functions visible in this part of
# the file read it. The commented-out alternative is the default (first-sheet)
# link; the active value selects one specific tab through its gid.
# sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?usp=drive_link"
sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?gid=1816125758#gid=1816125758"
##################################################################
# @title def load_syntactic_rules
import csv
import ast
import unicodedata
def load_syntactic_rules():
    """Load the syntactic transformation rules from the rules Google Sheet.

    The sheet must provide a ``Key`` column (a Python literal, parsed with
    ``ast.literal_eval``) and a ``Function`` column (a Python expression,
    evaluated to obtain the rule callable). Rows without a ``Key`` are
    dropped, both columns are NFKC-normalized and stripped, and rows sharing
    a ``Key`` are printed for inspection.

    Returns:
        dict: Parsed key -> evaluated function. Rows that fail to parse are
        reported on stdout and skipped; when a key occurs more than once the
        last row wins.
    """
    rules_url = "https://docs.google.com/spreadsheets/d/1_CYomIRXi-yyzyoke1bh6F6TzJbocYZvWQomUH3AaFA/edit?pli=1&gid=1599015268#gid=1599015268"
    # .copy() gives an independent frame, so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning on the dropna() result.
    syntactic_rules_df = google_sheet_to_df(rules_url).dropna(subset=['Key']).copy()

    def _normalize(cell):
        # NFKC folds compatibility characters (e.g. full-width punctuation)
        # into their canonical forms before the text is parsed as Python.
        return unicodedata.normalize("NFKC", str(cell)).strip()

    for column in ('Key', 'Function'):
        syntactic_rules_df[column] = syntactic_rules_df[column].apply(_normalize)

    print(f"Loaded {syntactic_rules_df.shape[0]} syntactic rules.")
    print(syntactic_rules_df.shape)

    # Report every row whose 'Key' occurs more than once.
    duplicate_rows = syntactic_rules_df[
        syntactic_rules_df.duplicated(subset=['Key'], keep=False)
    ]
    print("Duplicate rows based on 'Key':")
    print(duplicate_rows)

    loaded_rules = {}
    for _, row in syntactic_rules_df.iterrows():
        try:
            key = ast.literal_eval(row['Key'])  # literal only, no code execution
            # SECURITY: eval() executes arbitrary code taken from the
            # spreadsheet. Only safe while every editor of the sheet is
            # trusted; do not point this at an untrusted sheet.
            func = eval(row['Function'])
            loaded_rules[key] = func
        except Exception as e:
            # Deliberately broad: a malformed cell can raise almost anything
            # (SyntaxError, NameError, ValueError, TypeError for an
            # unhashable key, ...) and one bad row must not abort the load.
            print(f"Error parsing row: {row}\n{e}")
    return loaded_rules
##################################################################
# @title def analyze_sentence3
def analyze_sentence3(sentence, existing_rules, debug=False):
    """Parse a sentence and suggest constituency patterns that lack a rule.

    The sentence is run through the module-level ``nlp`` pipeline and the
    constituency tree of the first parsed sentence is walked. For every
    internal node whose label is not in ``EXCLUDE_NODES`` and that has at
    least two children, the pattern ``(label, child_labels)`` is suggested
    unless it is already known. For nodes other than ``S`` whose last child
    is a punctuation label, a second pattern without that last child is
    suggested as well (if at least two children remain). Labels that occur
    at least twice but received no specific suggestion get a wildcard
    pattern ``(label, None)``. Finally, every specific pattern whose last
    child label is ``"."`` is discarded; wildcard patterns are kept.

    Args:
        sentence: Text to analyze.
        existing_rules: Container of already-handled patterns; only
            membership (``pattern in existing_rules``) is used.
        debug: Unused; kept so existing callers keep working.

    Returns:
        tuple: ``(suggestions, suggestions_dict)`` where ``suggestions`` maps
        each pattern to an example text (the node's leaves joined by spaces)
        and ``suggestions_dict`` maps it to a lambda stub string.
    """
    print(f"\n🔍 Analyzing: '{sentence}'")
    print("=" * 50)
    doc = nlp(sentence)
    tree = doc.sentences[0].constituency

    punctuation_labels = {
        '.', ',', ':', ';', '!', '?', '\'', '"', '`', '-', '--',
        '#', '$', '(', ')', '[', ']', '{', '}',
    }
    suggestions = {}
    label_counts = {}
    first_node_by_label = {}  # first node (pre-order) seen for each label

    def is_punctuation_label(label):
        """Helper function to identify punctuation POS tags."""
        return label in punctuation_labels

    def has_at_least_two_children(child_labels):
        """Check if there are at least 2 non-None children."""
        if child_labels is None:
            return False
        return sum(label is not None for label in child_labels) >= 2

    def _suggest(pattern, node):
        """Record `pattern` with an example unless it is already known."""
        if pattern not in existing_rules and pattern not in suggestions:
            suggestions[pattern] = ' '.join(node.leaf_labels())

    def _traverse(node):
        if not node.is_leaf() and node.label not in EXCLUDE_NODES:
            label_counts[node.label] = label_counts.get(node.label, 0) + 1
            first_node_by_label.setdefault(node.label, node)

            child_labels = tuple(child.label for child in node.children)
            if has_at_least_two_children(child_labels):
                # Full pattern, punctuation included (all nodes, 'S' too).
                _suggest((node.label, child_labels), node)

                # 'S' nodes stay punctuation-inclusive; other nodes also get
                # a variant without a trailing punctuation child.
                without_last = child_labels[:-1]
                if (node.label != 'S'
                        and is_punctuation_label(child_labels[-1])
                        and not is_punctuation_label(node.label)
                        and has_at_least_two_children(without_last)):
                    _suggest((node.label, without_last), node)
        for child in node.children:
            _traverse(child)

    _traverse(tree)

    # Wildcard rules only for labels that appear multiple times in the tree
    # and did not get any specific pattern suggested above.
    labels_with_specific_pattern = {pattern[0] for pattern in suggestions}
    for label, count in label_counts.items():
        if count < 2 or label in labels_with_specific_pattern:
            continue
        wildcard_pattern = (label, None)
        if wildcard_pattern in existing_rules or wildcard_pattern in suggestions:
            continue
        example_text = ' '.join(first_node_by_label[label].leaf_labels())
        if example_text:
            suggestions[wildcard_pattern] = example_text

    # Exclude patterns that have '.' as the last child. Wildcard patterns
    # have no child tuple (None) and must be kept: indexing None here used to
    # raise TypeError as soon as any wildcard had been suggested.
    suggestions = {
        pattern: example
        for pattern, example in suggestions.items()
        if pattern[1] is None or pattern[1][-1] != "."
    }

    suggestions_dict = {}
    if suggestions:
        print("💡 Suggested new rules:")
        for pattern, example in suggestions.items():
            if pattern[1] is None:  # Wildcard rule
                params = "*children"
            else:
                params = ', '.join(f'c{i}' for i in range(len(pattern[1])))
            value = f"lambda node, {params}: ... # e.g., '{example}'"
            print(f" {pattern}: {value}")
            suggestions_dict[pattern] = value
    else:
        print("✅ All patterns already handled!")
    return suggestions, suggestions_dict