|
|
|
|
|
|
|
|
""" |
|
|
Convert infix expressions to prefix notation. |
|
|
|
|
|
This script reads the HuggingFace dataset with infix notation and creates |
|
|
a new column with the same expressions in prefix notation, maintaining |
|
|
the same variables and operators from the original prompt. |
|
|
""" |
|
|
|
|
|
import sys |
|
|
import re |
|
|
import argparse |
|
|
from datasets import load_dataset, Dataset, DatasetDict |
|
|
from huggingface_hub import HfApi |
|
|
import sympy |
|
|
from tqdm import tqdm |
|
|
|
|
|
# Make modules in the current and parent directory importable when the script
# is launched directly (same entries, same order, as two separate appends).
sys.path.extend(['.', '..'])
|
|
|
|
|
|
|
|
def _fold_binary(operator, operands):
    """Left-fold prefix operand strings under a binary operator.

    ``['a', 'b', 'c']`` with ``'+'`` becomes ``'+ + a b c'``; a single operand
    is returned unchanged.
    """
    folded = operands[0]
    for operand in operands[1:]:
        folded = f"{operator} {folded} {operand}"
    return folded


def _unevaluated_product(factors):
    """Combine factors into one expression without letting SymPy simplify it.

    An empty list yields 1 and a single factor is returned as-is.
    """
    if not factors:
        return sympy.Integer(1)
    if len(factors) == 1:
        return factors[0]
    # evaluate=False keeps the factors exactly as written; an evaluating Mul
    # may reorder, merge, or distribute them and thereby change the operators
    # that end up in the prefix string.
    return sympy.Mul(*factors, evaluate=False)


def sympy_to_prefix(expr):
    """
    Convert a SymPy expression to prefix notation (Polish notation).

    Tokens are separated by single spaces. N-ary sums and products are
    emitted as left-nested binary operators. A product containing factors
    raised to the power -1 is written as a division, and a sum of exactly one
    positive and one negated term is written as a subtraction.

    Args:
        expr: SymPy expression

    Returns:
        str: Expression in prefix notation

    Examples:
        x_1 + x_2 -> + x_1 x_2
        x_1 * (x_2 + C) -> * x_1 + x_2 C
        sin(x_1**2) -> sin ** x_1 2
    """
    if isinstance(expr, sympy.Symbol):
        return str(expr)

    # Integers are printed exactly; going through float would lose precision
    # for magnitudes above 2**53.
    if isinstance(expr, sympy.Integer):
        return str(int(expr))

    if isinstance(expr, (sympy.Float, sympy.Rational)):
        val = float(expr)
        # Whole-valued floats are printed without a fractional part.
        return str(int(val)) if val.is_integer() else str(val)

    if isinstance(expr, sympy.Mul):
        # Unary negation: a two-factor product with -1 on either side.
        if len(expr.args) == 2:
            if expr.args[0] == -1:
                return f"* -1 {sympy_to_prefix(expr.args[1])}"
            if expr.args[1] == -1:
                return f"* -1 {sympy_to_prefix(expr.args[0])}"

        # Factors of the form base**-1 belong to the denominator.
        numer = []
        denom = []
        for arg in expr.args:
            if isinstance(arg, sympy.Pow) and arg.args[1] == -1:
                denom.append(arg.args[0])
            else:
                numer.append(arg)

        if denom:
            numer_str = sympy_to_prefix(_unevaluated_product(numer))
            denom_str = sympy_to_prefix(_unevaluated_product(denom))
            return f"/ {numer_str} {denom_str}"

        return _fold_binary('*', [sympy_to_prefix(arg) for arg in expr.args])

    if isinstance(expr, sympy.Function):
        func_name = expr.func.__name__.lower()
        args = [sympy_to_prefix(arg) for arg in expr.args]
        # Joining name and arguments together avoids a trailing space when the
        # function has no arguments.
        return ' '.join([func_name] + args)

    if isinstance(expr, sympy.Pow):
        base = sympy_to_prefix(expr.args[0])
        exp_val = sympy_to_prefix(expr.args[1])
        return f"** {base} {exp_val}"

    if isinstance(expr, sympy.Add):
        # Split terms into plain ones and ones negated via a leading -1 factor.
        positive_terms = []
        negative_terms = []
        for arg in expr.args:
            if isinstance(arg, sympy.Mul) and arg.args and arg.args[0] == -1:
                negative_terms.append(_unevaluated_product(list(arg.args[1:])))
            else:
                positive_terms.append(arg)

        # Only the simple "a - b" shape is rendered as a subtraction; every
        # other sum keeps its negated terms as "* -1 term".
        if len(positive_terms) == 1 and len(negative_terms) == 1:
            left = sympy_to_prefix(positive_terms[0])
            right = sympy_to_prefix(negative_terms[0])
            return f"- {left} {right}"

        return _fold_binary('+', [sympy_to_prefix(arg) for arg in expr.args])

    # Generic fallback for any other node that has arguments: use the class
    # name as the operator. str(expr.func) looks like "<class 'pkg.Name'>", so
    # splitting it on '.' would leave a trailing "'>" in the token.
    if hasattr(expr, 'func') and hasattr(expr, 'args') and expr.args:
        func_name = getattr(expr.func, '__name__', str(expr.func)).lower()
        args = [sympy_to_prefix(arg) for arg in expr.args]
        return ' '.join([func_name] + args)

    # Atoms without arguments (e.g. named constants) are printed as-is.
    return str(expr)
|
|
|
|
|
|
|
|
def parse_infix_prompt(prompt_text):
    """
    Parse an infix prompt to extract vars, operators, constants, and expression.

    Each line is matched on its leading ``name:`` label after surrounding
    whitespace is stripped, so indented lines are recognised too. Only the
    leading label is removed from the value. Lines with an unknown label or
    without a colon are ignored, and a repeated label overrides the earlier
    one.

    Args:
        prompt_text: String in format:
            vars: x_1, x_2, ...
            oper: +, -, *, ...
            cons: C
            expr: x_1 + x_2

    Returns:
        dict with keys: vars, oper, cons, expr. ``vars`` and ``oper`` are
        lists of stripped strings, ``cons`` and ``expr`` are stripped strings.
        A key is present only if its line was found.
    """
    list_fields = ('vars', 'oper')
    text_fields = ('cons', 'expr')
    result = {}

    for raw_line in prompt_text.strip().split('\n'):
        # Split on the first colon only, so the value may itself contain ':'.
        label, separator, value = raw_line.strip().partition(':')
        if not separator:
            continue

        value = value.strip()
        if label in list_fields:
            result[label] = [item.strip() for item in value.split(',')]
        elif label in text_fields:
            result[label] = value

    return result
|
|
|
|
|
|
|
|
def convert_infix_to_prefix_prompt(infix_prompt):
    """
    Convert an infix prompt to prefix format.

    The prompt is parsed with parse_infix_prompt, its expression is parsed by
    SymPy without evaluation, converted with sympy_to_prefix, and the prompt is
    rebuilt with the same vars/oper/cons lines.

    Args:
        infix_prompt: String with infix notation prompt

    Returns:
        str: Prompt in prefix notation with same vars/operators, or None if
        the prompt is missing a field or the expression cannot be converted
        (the error is printed).
    """
    parsed = parse_infix_prompt(infix_prompt)

    try:
        expr_str = parsed['expr']

        # Rename the constant placeholder before parsing (presumably so SymPy
        # treats it as an ordinary symbol - confirm). Word boundaries restrict
        # the rename to the standalone token "C" instead of every capital C
        # inside other identifiers.
        expr_str_sympy = re.sub(r'\bC\b', 'C_const', expr_str)

        # NOTE(review): sympify evaluates the string with eval(); only feed it
        # expressions from a trusted dataset.
        sympy_expr = sympy.sympify(expr_str_sympy, evaluate=False)

        prefix_expr = sympy_to_prefix(sympy_expr)

        # Restore the original placeholder name in the output.
        prefix_expr = re.sub(r'\bC_const\b', 'C', prefix_expr)

        prefix_prompt = "\n".join([
            f"vars: {', '.join(parsed['vars'])}",
            f"oper: {', '.join(parsed['oper'])}",
            f"cons: {parsed['cons']}",
            f"expr: {prefix_expr}",
        ])

        return prefix_prompt

    except Exception as e:
        # Use .get(): when the prompt has no 'expr:' line, indexing here would
        # raise a second KeyError out of the handler instead of returning None.
        print(f"Error converting expression: {parsed.get('expr', '<missing expr>')}")
        print(f"Error: {e}")
        return None
|
|
|
|
|
|
|
|
def process_dataset(dataset_name='augustocsc/sintetico_natural',
                    split='test',
                    output_path='./data/processed/700K_prefix_converted'):
    """
    Process the entire dataset, converting infix to prefix.

    Every 'i_prompt_n' value is converted with convert_infix_to_prefix_prompt.
    When a conversion fails, the original infix prompt is stored instead and
    the row is flagged as unsuccessful.

    Args:
        dataset_name: HuggingFace dataset name
        split: Dataset split to process
        output_path: Where to save the converted dataset

    Returns:
        Dataset with new columns 'p_prompt_n_converted' and
        'conversion_success'

    Raises:
        ValueError: If the dataset has no 'i_prompt_n' column.
    """
    print(f"Loading dataset {dataset_name} (split={split})...")
    ds = load_dataset(dataset_name, split=split)

    print(f"Dataset loaded: {len(ds)} examples")
    print(f"Columns: {ds.column_names}")

    if 'i_prompt_n' not in ds.column_names:
        raise ValueError("Column 'i_prompt_n' not found in dataset!")

    converted_prompts = []
    conversion_success = []

    print("\nConverting infix to prefix...")
    for example in tqdm(ds):
        infix_prompt = example['i_prompt_n']
        prefix_prompt = convert_infix_to_prefix_prompt(infix_prompt)

        succeeded = prefix_prompt is not None
        # Keep the column fully populated: failed rows fall back to the
        # original infix prompt and are marked via conversion_success.
        converted_prompts.append(prefix_prompt if succeeded else infix_prompt)
        conversion_success.append(succeeded)

    ds = ds.add_column('p_prompt_n_converted', converted_prompts)
    ds = ds.add_column('conversion_success', conversion_success)

    total = len(conversion_success)
    n_success = sum(conversion_success)
    # Guard against an empty split, which would otherwise divide by zero.
    success_rate = n_success / total * 100 if total else 0.0
    print(f"\nConversion success rate: {success_rate:.2f}% ({n_success}/{total})")

    print(f"\nSaving dataset to {output_path}...")
    ds.save_to_disk(output_path)

    print("\n[OK] Dataset saved successfully!")

    return ds
|
|
|
|
|
|
|
|
def upload_to_hub(dataset, repo_id, token=None):
    """
    Push the converted dataset to the HuggingFace Hub.

    The upload is best-effort: a failure is reported on stdout together with
    troubleshooting hints and is not re-raised.

    Args:
        dataset: Dataset object to upload
        repo_id: Repository ID (e.g., 'username/dataset-name')
        token: HuggingFace API token (optional, uses cached if not provided)
    """
    print(f"\nUploading dataset to {repo_id}...")

    try:
        dataset.push_to_hub(repo_id, token=token)
        success_lines = (
            f"[OK] Dataset uploaded successfully to {repo_id}",
            f" View at: https://huggingface.co/datasets/{repo_id}",
        )
        for text in success_lines:
            print(text)
    except Exception as err:
        failure_lines = (
            f"[FAIL] Failed to upload dataset: {err}",
            " Make sure you have write permissions to the repository",
            " You may need to run: huggingface-cli login",
        )
        for text in failure_lines:
            print(text)
|
|
|
|
|
|
|
|
def _build_arg_parser():
    """Create the command-line parser for the conversion script."""
    parser = argparse.ArgumentParser(
        description="Convert infix expressions to prefix notation"
    )
    parser.add_argument(
        '--dataset_name',
        type=str,
        default='augustocsc/sintetico_natural',
        help='HuggingFace dataset name'
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='Dataset split to process'
    )
    parser.add_argument(
        '--output_path',
        type=str,
        default='./1_data/processed/700K_prefix_converted',
        help='Path to save converted dataset'
    )
    parser.add_argument(
        '--upload',
        action='store_true',
        help='Upload converted dataset to HuggingFace Hub'
    )
    parser.add_argument(
        '--repo_id',
        type=str,
        default=None,
        help='Repository ID for upload (e.g., username/dataset-name)'
    )
    parser.add_argument(
        '--test_only',
        action='store_true',
        help='Test conversion on first 10 examples only'
    )
    return parser


def _run_test_mode(dataset_name, split):
    """Convert and print the first 10 examples of a split without saving."""
    print("=" * 60)
    print("TEST MODE: Converting first 10 examples")
    print("=" * 60)

    # Honour the requested split instead of always reading 'test'.
    # NOTE(review): assumes --split is a plain split name with no slice or
    # '+' combination of its own - confirm if such values are ever passed.
    ds = load_dataset(dataset_name, split=f"{split}[:10]")

    for i, example in enumerate(ds):
        print(f"\n{'='*60}")
        print(f"Example {i+1}")
        print(f"{'='*60}")
        print("\nINFIX:")
        print(example['i_prompt_n'])

        prefix_prompt = convert_infix_to_prefix_prompt(example['i_prompt_n'])

        if prefix_prompt is not None:
            print("\nCONVERTED PREFIX:")
            print(prefix_prompt)
            print("\n[OK] Conversion successful")
        else:
            print("\n[FAIL] Conversion failed")


def _print_samples(dataset, limit=3):
    """Print the first few original/converted prompt pairs of a dataset."""
    print("\n" + "=" * 60)
    print("SAMPLE CONVERSIONS (first 3 examples)")
    print("=" * 60)

    has_reference_prefix = 'p_prompt_n' in dataset.column_names
    for i in range(min(limit, len(dataset))):
        row = dataset[i]
        print(f"\n{'='*60}")
        print(f"Example {i+1}")
        print(f"{'='*60}")
        print("\nORIGINAL INFIX:")
        print(row['i_prompt_n'])
        print("\nCONVERTED PREFIX:")
        print(row['p_prompt_n_converted'])

        # Show the dataset's own prefix column, when present, for comparison.
        if has_reference_prefix:
            print("\nORIGINAL PREFIX (from dataset):")
            print(row['p_prompt_n'])


def main():
    """
    Command-line entry point.

    With --test_only, converts and prints the first 10 examples of the chosen
    split and exits. Otherwise processes and saves the whole split, prints
    three sample conversions, and uploads the result when --upload is given
    together with --repo_id.
    """
    args = _build_arg_parser().parse_args()

    if args.test_only:
        _run_test_mode(args.dataset_name, args.split)
        return

    dataset = process_dataset(
        dataset_name=args.dataset_name,
        split=args.split,
        output_path=args.output_path
    )

    _print_samples(dataset)

    if args.upload:
        if args.repo_id is None:
            print("\n[ERROR] --repo_id required for upload")
            print(" Example: --repo_id username/sintetico_natural_prefix_converted")
        else:
            upload_to_hub(dataset, args.repo_id)
    else:
        print("\n" + "=" * 60)
        print("To upload to HuggingFace Hub, run:")
        print(f" python {__file__} --upload --repo_id username/dataset-name")
        print("=" * 60)
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|