Spaces:

Juna190825
/

karuleste

Sleeping

App Files Files Community

karuleste / utils.py

Juna190825

Update utils.py

d8a0ca0 verified 5 months ago

raw

history blame contribute delete

8.65 kB

	# @title Initialize stanza
	import stanza
	# Earlier, fuller processor list kept for reference (it additionally names
	# depparse, ner, coref and sentiment):
	# try:
	# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse,ner,constituency,coref,sentiment', force=True, verbose=True)
	# Module-level English pipeline, built once when this module is imported and
	# called by analyze_sentence3. Only the tokenize, mwt, pos, lemma and
	# constituency processors are requested.
	# NOTE(review): REUSE_RESOURCES presumably avoids re-downloading models that
	# are already present locally -- confirm against the stanza documentation.
	nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,constituency', force=True, verbose=True,download_method=stanza.DownloadMethod.REUSE_RESOURCES)
	# Nodes that should NOT have transformation rules: the ROOT node plus the
	# punctuation, quote and bracket labels. analyze_sentence3 skips any tree
	# node whose label is in this set.
	EXCLUDE_NODES = {'ROOT', '.', ',', ':', ';', '!', '?', '``', "''", '-LRB-', '-RRB-'}
	##################################################################
	def google_sheet_to_df(sheet_url):
	    """Load one Google Sheets worksheet into a pandas DataFrame via CSV export.

	    Accepted URL shapes (the "edit" link copied from the browser):

	    * ``.../d/<id>/edit?usp=drive_link`` -- no gid, so the default first
	      sheet is exported.
	    * ``.../d/<id>/edit?gid=<gid>#gid=<gid>`` -- link copied after opening
	      a specific sheet.
	    * ``.../d/<id>/edit#gid=<gid>`` or ``.../d/<id>/edit`` -- no query
	      string; the gid, if any, is taken from the fragment.

	    Any other value (for example a local CSV path) is handed to
	    ``pandas.read_csv`` unchanged apart from dropping a ``#fragment``.

	    Args:
	        sheet_url: Edit URL of the spreadsheet (or any other location
	            ``pandas.read_csv`` can read).

	    Returns:
	        The DataFrame parsed from the CSV data.
	    """
	    import re

	    import pandas as pd

	    # The fragment is never sent to the server, so it is split off first and
	    # only consulted below to recover a gid that is missing from the query.
	    base_url, _, fragment = sheet_url.partition("#")

	    if "edit?" in base_url:
	        export_url = base_url.replace("edit?", "export?format=csv&")
	    elif base_url.endswith("/edit"):
	        # Edit link without a query string: the plain "edit?" replacement
	        # would leave it untouched and pandas would fetch the HTML page.
	        export_url = base_url[: -len("edit")] + "export?format=csv"
	    else:
	        export_url = base_url

	    # When the link was rewritten but the sheet is only named in the
	    # fragment (e.g. "edit?usp=sharing#gid=123"), carry the gid over so the
	    # requested sheet is exported instead of the first one.
	    was_rewritten = export_url != base_url
	    if was_rewritten and not re.search(r"[?&]gid=", export_url):
	        gid_match = re.search(r"(?:^|&)gid=([^&]+)", fragment)
	        if gid_match:
	            export_url += f"&gid={gid_match.group(1)}"

	    return pd.read_csv(export_url)

	# URL of the Google Sheet
	# Alternative link form without a gid (default first sheet), kept for reference:
	# sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?usp=drive_link"
	# Active value: link form that names one specific sheet through its gid.
	# NOTE(review): not referenced by the functions visible in this file -- confirm it is still needed.
	sheet_url = "https://docs.google.com/spreadsheets/d/1Pd7F3_6t5ta5rUmQeTQFG7OqQfqJr5E8TEiljE9Cb2g/edit?gid=1816125758#gid=1816125758"

	##################################################################
	# @title def load_syntactic_rules
	import csv
	import ast
	import unicodedata
	def load_syntactic_rules():
	    """Download the syntactic-rule sheet and build the rule lookup table.

	    The sheet is read through ``google_sheet_to_df``. Rows without a
	    ``Key`` are dropped, and the ``Key`` and ``Function`` cells are
	    NFKC-normalised and stripped. Each remaining row is then turned into
	    one dictionary entry: ``Key`` is parsed with ``ast.literal_eval`` and
	    ``Function`` is evaluated as a Python expression.

	    Rows that fail to parse are reported with ``print`` and skipped, so a
	    single bad row does not abort the load. When several rows share a key
	    the last one wins; all such rows are printed beforehand.

	    Returns:
	        dict mapping each parsed key to the object produced by evaluating
	        the row's ``Function`` cell (presumably a lambda).
	    """
	    syntactic_rules_df = google_sheet_to_df("https://docs.google.com/spreadsheets/d/1_CYomIRXi-yyzyoke1bh6F6TzJbocYZvWQomUH3AaFA/edit?pli=1&gid=1599015268#gid=1599015268")
	    # Copy after filtering so the column assignments below write to an
	    # independent frame instead of one derived from the downloaded data.
	    syntactic_rules_df = syntactic_rules_df.dropna(subset=['Key']).copy()

	    # Normalize text fields
	    for column in ('Key', 'Function'):
	        syntactic_rules_df[column] = syntactic_rules_df[column].apply(
	            lambda x: unicodedata.normalize("NFKC", str(x)).strip()
	        )
	    print(f"Loaded {syntactic_rules_df.shape[0]} syntactic rules.")
	    print(syntactic_rules_df.shape)

	    # Report every row whose 'Key' occurs more than once
	    duplicates1 = syntactic_rules_df[syntactic_rules_df.duplicated(subset=['Key'], keep=False)]
	    print("Duplicate rows based on 'Key':")
	    print(duplicates1)

	    loaded_rules = {}
	    for _, row in syntactic_rules_df.iterrows():
	        try:
	            key = ast.literal_eval(row['Key'])  # safely parse tuple
	            # SECURITY: eval() executes whatever expression the sheet
	            # contains. Anyone who can edit the spreadsheet can run
	            # arbitrary code here; only load sheets from trusted editors.
	            func = eval(row['Function'])  # recreate lambda
	            loaded_rules[key] = func
	        except Exception as e:
	            # Best-effort load: report the bad row and keep going.
	            print(f"Error parsing row: {row}\n{e}")

	    return loaded_rules


	##################################################################
	# @title def analyze_sentence3
	def analyze_sentence3(sentence, existing_rules, debug=False):
	    """Suggest constituency-rule patterns that ``existing_rules`` lacks.

	    The sentence is parsed with the module-level ``nlp`` pipeline and only
	    the first parsed sentence is inspected. Every non-leaf node whose
	    label is not in ``EXCLUDE_NODES`` and that has at least two children
	    yields the pattern ``(label, child_labels)``. For labels other than
	    ``'S'`` a second pattern without a trailing punctuation child is also
	    proposed when two or more children remain. A wildcard pattern
	    ``(label, None)`` is proposed for labels that occur at least twice
	    and received no specific suggestion. Finally, patterns whose child
	    tuple ends in ``"."`` are discarded.

	    Args:
	        sentence: Text to analyse.
	        existing_rules: Container of already known patterns; only
	            membership (``pattern in existing_rules``) is used.
	        debug: Accepted for backward compatibility; currently unused.

	    Returns:
	        Tuple ``(suggestions, suggestions_dict)``. ``suggestions`` maps
	        each new pattern to an example text taken from the sentence, and
	        ``suggestions_dict`` maps the same patterns to a lambda template
	        string. Both are empty when nothing is missing.
	    """
	    print(f"\n🔍 Analyzing: '{sentence}'")
	    print("=" * 50)

	    doc = nlp(sentence)
	    if not doc.sentences:
	        # Nothing was parsed (e.g. empty input): there is no tree to
	        # inspect, so report the "nothing missing" outcome instead of
	        # failing on sentences[0].
	        print("✅ All patterns already handled!")
	        return {}, {}
	    tree = doc.sentences[0].constituency
	    suggestions = {}

	    punctuation_labels = frozenset(
	        ['.', ',', ':', ';', '!', '?', '\'', '"', '`', '-', '--', '#', '$', '(', ')', '[', ']', '{', '}']
	    )

	    def is_punctuation_label(label):
	        """Helper function to identify punctuation POS tags."""
	        return label in punctuation_labels

	    def has_at_least_two_children(child_labels):
	        """Check if there are at least 2 non-None children."""
	        if child_labels is None:
	            return False
	        return sum(1 for label in child_labels if label is not None) >= 2

	    def _suggest(pattern, node):
	        """Record ``pattern`` with the node's text unless it is already known."""
	        if pattern not in existing_rules and pattern not in suggestions:
	            suggestions[pattern] = ' '.join(node.leaf_labels())

	    def _traverse(node):
	        if node.is_leaf():
	            return
	        if node.label not in EXCLUDE_NODES:
	            child_labels = tuple(child.label for child in node.children)

	            # Only process if node has 2+ real children
	            if has_at_least_two_children(child_labels):
	                # Rule with all children (including punctuation)
	                _suggest((node.label, child_labels), node)

	                # 'S' nodes stay punctuation-inclusive; for every other
	                # label also offer the variant without a trailing
	                # punctuation child.
	                if (node.label != 'S'
	                        and is_punctuation_label(child_labels[-1])
	                        and not is_punctuation_label(node.label)):
	                    non_punct_child_labels = child_labels[:-1]
	                    # Only suggest if 2+ real children remain
	                    if has_at_least_two_children(non_punct_child_labels):
	                        _suggest((node.label, non_punct_child_labels), node)

	        # Excluded nodes (such as ROOT) are still descended into.
	        for child in node.children:
	            _traverse(child)

	    _traverse(tree)

	    # Wildcard rules are only suggested for labels that appear multiple
	    # times in the tree and have no specific pattern suggested.
	    label_counts = {}

	    def _count_labels(node):
	        if not node.is_leaf() and node.label not in EXCLUDE_NODES:
	            label_counts[node.label] = label_counts.get(node.label, 0) + 1
	        for child in node.children:
	            _count_labels(child)

	    _count_labels(tree)

	    def _find_example(node, target_label):
	        """Return the text of the first node (pre-order) labelled ``target_label``."""
	        if not node.is_leaf() and node.label == target_label:
	            return ' '.join(node.leaf_labels())
	        for child in node.children:
	            result = _find_example(child, target_label)
	            if result:
	                return result
	        return None

	    for label, count in label_counts.items():
	        if count < 2:
	            continue
	        if any(pattern[0] == label for pattern in suggestions):
	            continue
	        wildcard_pattern = (label, None)
	        if wildcard_pattern in existing_rules:
	            continue
	        example_text = _find_example(tree, label)
	        if example_text:
	            suggestions[wildcard_pattern] = example_text

	    # Exclude patterns that have '.' at the end of the list of child nodes.
	    # Wildcard patterns carry None instead of a child tuple and must be
	    # kept as they are; indexing None here used to raise TypeError.
	    suggestions = {
	        pattern: example
	        for pattern, example in suggestions.items()
	        if pattern[1] is None or pattern[1][-1] != "."
	    }

	    suggestions_dict = {}
	    if suggestions:
	        print("💡 Suggested new rules:")
	        for pattern, example in suggestions.items():
	            if pattern[1] is None:  # Wildcard rule
	                value = f"lambda node, *children: ... # e.g., '{example}'"
	            else:
	                child_vars = ', '.join(f'c{i}' for i in range(len(pattern[1])))
	                value = f"lambda node, {child_vars}: ... # e.g., '{example}'"
	            print(f" {pattern}: {value}")
	            suggestions_dict[pattern] = value
	    else:
	        print("✅ All patterns already handled!")

	    return suggestions, suggestions_dict