AION-1 / real_python_learner.py

Upload AION unified hybrid assistant with local eval results

a1a7070 verified 5 days ago

13.9 kB

	#!/usr/bin/env python3
	"""
	RealPythonLearner: learned char n-gram intent model + compositional Python code generator.

	Why this exists:
	- The pure NumPy GRU in neural_python_mind.py really learns character prediction,
	but in this tiny CPU environment it is too small to reliably bind instructions to semantics.
	- This model learns from many instruction/code examples using character n-grams and a Naive Bayes
	classifier, then composes code from learned operations. It is not a fixed print table: it can
	parse unseen variants like "keep numbers greater than 10" or "return square of every item".

	Usage:
	python real_python_learner.py --mode train --out outputs/real_python_learner
	python real_python_learner.py --mode ask --out outputs/real_python_learner --prompt "write a function that filters even numbers"
	"""
	from __future__ import annotations
	import argparse, json, math, random, re
	from collections import Counter, defaultdict
	from pathlib import Path
	from typing import Dict, List, Tuple

	# Names of every intent this module knows about. Each entry is also a key of
	# TEMPLATES, and train() copies this list into report.json under "labels".
	LABELS = [
	"count_words", "fibonacci", "factorial", "is_prime", "binary_search", "merge_sort",
	"read_json", "write_json", "filter", "map", "group_by", "safe_int", "dataclass", "class_stack",
	"explain_python", "identity_reading"
	]

	# Seed phrases per intent label. build_training_examples() combines every
	# phrase with each entry of QUESTION_PREFIXES to synthesize labelled prompts.
	TEMPLATES = {
	"count_words": ["count words", "word frequency", "count each word", "dictionary of words", "how many times words appear"],
	"fibonacci": ["fibonacci", "fib number", "nth fibonacci", "sequence 0 1 1 2"],
	"factorial": ["factorial", "multiply from 1 to n", "n!", "product of integers"],
	"is_prime": ["prime", "is prime", "prime checker", "divisors", "number is prime"],
	"binary_search": ["binary search", "search sorted list", "find target in sorted", "halves search space"],
	"merge_sort": ["merge sort", "mergesort", "sort by merging", "divide and merge sort"],
	"read_json": ["read json", "load json", "json from file", "open json file"],
	"write_json": ["write json", "save json", "dump json", "json to file"],
	"filter": ["filter", "keep only", "remove values", "select items", "numbers greater", "even numbers", "positive numbers"],
	"map": ["map", "transform", "convert each", "square every", "apply to each", "return squares"],
	"group_by": ["group by", "group items", "bucket by", "group words by length"],
	"safe_int": ["safe int", "convert to int", "parse integer", "default if invalid"],
	"dataclass": ["dataclass", "data class", "class with fields", "store user data"],
	"class_stack": ["stack class", "push pop", "lifo", "class stack"],
	"explain_python": ["explain python", "what is indentation", "how functions work", "syntax", "library"],
	"identity_reading": ["who are you", "how did you learn to read", "why can you read", "your memory", "your birth"],
	}

	# Request openers that build_training_examples() prepends to seed phrases so
	# the classifier sees each phrase inside several differently worded prompts.
	QUESTION_PREFIXES = [
	"write python to", "write a function that", "create code to", "give me code that",
	"show an example to", "i need a function to", "how do i", "can you", "make python",
	]


	def char_ngrams(text: str, nmin: int = 2, nmax: int = 5) -> List[str]:
	    """Turn *text* into the feature list used by the intent classifier.

	    The text is lower-cased and padded with one space on each side, so
	    n-grams can capture where words begin and end. The result contains, in
	    order:

	    1. every character n-gram of the padded text for each ``n`` from
	       ``nmin`` to ``nmax`` inclusive (shortest n first, left to right);
	    2. whole-token features: identifiers of two or more characters and runs
	       of digits found in the lower-cased text.

	    Duplicates are kept on purpose because the caller counts occurrences.

	    Args:
	        text: Prompt or training sentence.
	        nmin: Smallest n-gram length.
	        nmax: Largest n-gram length.

	    Returns:
	        The list of feature strings (never deduplicated).
	    """
	    lowered = text.lower()
	    padded = f" {lowered} "
	    feats = [
	        padded[start:start + n]
	        for n in range(nmin, nmax + 1)
	        for start in range(max(0, len(padded) - n + 1))
	    ]
	    # Whole-token features. The alternation has to be a bare "|": the escaped
	    # form would require a literal pipe character between an identifier and a
	    # number, which never occurs in ordinary prompts, so no token was emitted.
	    feats += re.findall(r"[a-zA-Z_][a-zA-Z_0-9]+|\d+", lowered)
	    return feats


	def build_training_examples(mult=40) -> List[Tuple[str,str]]:
	    """Synthesize labelled ``(prompt, label)`` pairs for the intent classifier.

	    Three groups are generated, in this order:

	    * for every TEMPLATES phrase: two prompts per QUESTION_PREFIXES entry
	      (plain and "please ... in python"), then one "I want ..." prompt;
	    * "filter" prompts built from a fixed list of filter descriptions;
	    * "map" prompts built from a fixed list of transformation descriptions.

	    The whole list is then repeated ``mult`` times.
	    """
	    pairs = []

	    # Seed phrases wrapped in each request opener.
	    for intent, phrase_list in TEMPLATES.items():
	        for phrase in phrase_list:
	            for opener in QUESTION_PREFIXES:
	                pairs += [
	                    (f"{opener} {phrase}", intent),
	                    (f"please {opener} {phrase} in python", intent),
	                ]
	            pairs.append((f"I want {phrase}; use clean Python", intent))

	    # Compositional variations that resemble prompts not listed in TEMPLATES.
	    filter_targets = [
	        "even numbers",
	        "odd numbers",
	        "positive numbers",
	        "negative numbers",
	        "numbers greater than 10",
	        "numbers less than 5",
	        "non empty strings",
	        "items not none",
	    ]
	    for target in filter_targets:
	        for opener in QUESTION_PREFIXES:
	            pairs += [
	                (f"{opener} filter {target} from a list", "filter"),
	                (f"{opener} keep only {target}", "filter"),
	            ]

	    map_targets = [
	        "squares",
	        "cubes",
	        "absolute values",
	        "strings from values",
	        "lowercase words",
	        "length of each word",
	    ]
	    for target in map_targets:
	        for opener in QUESTION_PREFIXES:
	            pairs += [
	                (f"{opener} return {target}", "map"),
	                (f"{opener} transform a list into {target}", "map"),
	            ]

	    return pairs * mult


	class NBIntent:
	    """Multinomial Naive Bayes intent classifier over ``char_ngrams`` features.

	    Both the label prior and the per-label feature likelihoods use add-one
	    smoothing. The model can be saved to and restored from a JSON file.
	    """

	    def __init__(self):
	        # label -> number of training examples carrying that label
	        self.label_counts = Counter()
	        # label -> Counter(feature -> occurrences within that label)
	        self.feat_counts = {}
	        # label -> total number of feature occurrences within that label
	        self.total_feats = Counter()
	        # every distinct feature seen during training
	        self.vocab = set()
	        # sorted list of known labels
	        self.labels = []

	    def fit(self, examples: List[Tuple[str, str]]):
	        """Train from ``(text, label)`` pairs, replacing any previous state.

	        All statistics are reset first. Previously only ``feat_counts`` and
	        ``labels`` were rebuilt, so a second ``fit`` left the priors, feature
	        totals and vocabulary accumulated from the earlier run.
	        """
	        self.labels = sorted({label for _, label in examples})
	        self.label_counts = Counter()
	        self.total_feats = Counter()
	        self.vocab = set()
	        self.feat_counts = {label: Counter() for label in self.labels}
	        for text, label in examples:
	            feats = char_ngrams(text)
	            self.label_counts[label] += 1
	            self.feat_counts[label].update(feats)
	            self.total_feats[label] += len(feats)
	            self.vocab.update(feats)

	    def predict_proba(self, text: str) -> List[Tuple[str, float]]:
	        """Return ``(label, probability)`` pairs, most probable first.

	        Raises:
	            ValueError: if the model has no labels (never fitted or loaded).
	        """
	        if not self.labels:
	            # Same exception type the empty max() below used to raise, but
	            # with a message that says what to do about it.
	            raise ValueError("NBIntent has no labels; call fit() or load() first")
	        feats = char_ngrams(text)
	        vocab_size = len(self.vocab) + 1
	        prior_denom = sum(self.label_counts.values()) + len(self.labels)
	        scores = []
	        for label in self.labels:
	            log_p = math.log((self.label_counts[label] + 1) / prior_denom)
	            denom = self.total_feats[label] + vocab_size
	            counts = self.feat_counts[label]
	            for feat in feats:
	                log_p += math.log((counts.get(feat, 0) + 1) / denom)
	            scores.append((label, log_p))
	        # Subtract the best log-score before exponentiating to avoid underflow.
	        best = max(score for _, score in scores)
	        weights = [(label, math.exp(score - best)) for label, score in scores]
	        total = sum(weight for _, weight in weights) or 1
	        return sorted(
	            [(label, weight / total) for label, weight in weights],
	            key=lambda pair: -pair[1],
	        )

	    def save(self, path: Path):
	        """Write the model to *path* as UTF-8 JSON."""
	        data = {
	            "label_counts": dict(self.label_counts),
	            "feat_counts": {label: dict(counts) for label, counts in self.feat_counts.items()},
	            "total_feats": dict(self.total_feats),
	            # Sorted so the file is identical from run to run; set iteration
	            # order for strings varies with hash randomization.
	            "vocab": sorted(self.vocab),
	            "labels": self.labels,
	        }
	        path.write_text(json.dumps(data), encoding='utf-8')

	    @classmethod
	    def load(cls, path: Path):
	        """Rebuild a model from a JSON file written by :meth:`save`."""
	        data = json.loads(path.read_text(encoding='utf-8'))
	        model = cls()
	        model.label_counts = Counter(data['label_counts'])
	        model.feat_counts = {label: Counter(counts) for label, counts in data['feat_counts'].items()}
	        model.total_feats = Counter(data['total_feats'])
	        model.vocab = set(data['vocab'])
	        model.labels = data['labels']
	        return model


	def parse_filter(prompt: str) -> Tuple[str, str, str, str]:
	    """Derive the parts of a filter function from a natural-language prompt.

	    Returns:
	        ``(fname, var, item, cond)``: the function name, the name of the list
	        parameter, the loop variable, and the boolean expression (written in
	        terms of the loop variable) that decides whether an element is kept.

	    Rules are tried in this order and the first match wins: even, odd,
	    non-positive, non-negative, positive, negative, "not none",
	    "non empty"/"not empty", then "greater than N" / "less than N" (when both
	    appear, "less than" wins). With no match the condition is simply the
	    element's truthiness.
	    """
	    p = prompt.lower()
	    # Keyword checks use a hyphen-free copy so "non-empty" and "not-none"
	    # match. The numeric regexes keep using ``p`` so "-5" stays negative.
	    spaced = p.replace("-", " ")

	    if "string" in p or "word" in p:
	        fname, var, item, cond = "filter_strings", "strings", "s", "s"
	    else:
	        fname, var, item, cond = "filter_numbers", "numbers", "x", "x"

	    # Whole-word patterns: a bare substring test would see "even" in "seven".
	    # The "non ..." rules precede "positive"/"negative" so that "non-negative"
	    # is not read as its opposite.
	    numeric_rules = (
	        (r"\bevens?\b", "filter_even_numbers", "x % 2 == 0"),
	        (r"\bodds?\b", "filter_odd_numbers", "x % 2 != 0"),
	        (r"\bnon ?positives?\b", "filter_non_positive_numbers", "x <= 0"),
	        (r"\bnon ?negatives?\b", "filter_non_negative_numbers", "x >= 0"),
	        (r"\bpositives?\b", "filter_positive_numbers", "x > 0"),
	        (r"\bnegatives?\b", "filter_negative_numbers", "x < 0"),
	    )
	    for pattern, rule_name, rule_cond in numeric_rules:
	        if re.search(pattern, spaced):
	            # These conditions are written in terms of ``x``, so the loop
	            # variable must be ``x`` even if the prompt mentioned words.
	            return rule_name, "numbers", "x", rule_cond

	    if "not none" in spaced:
	        return "filter_not_none", "items", "x", "x is not None"
	    if "non empty" in spaced or "not empty" in spaced:
	        return "filter_non_empty_strings", "strings", "s", "bool(s)"

	    for phrase, op, tag in (("greater than", ">", "greater"), ("less than", "<", "less")):
	        match = re.search(phrase + r"\s+(-?\d+)", p)
	        if match:
	            number = match.group(1)
	            fname = f"filter_{tag}_than_{number.replace('-', 'minus_')}"
	            var, item, cond = "numbers", "x", f"x {op} {number}"
	    return fname, var, item, cond


	def parse_map(prompt: str) -> Tuple[str, str, str, str]:
	    """Derive the parts of a map/transform function from a prompt.

	    Returns:
	        ``(fname, var, item, expr)``: the function name, the name of the list
	        parameter, the loop variable, and the expression (written in terms of
	        the loop variable) that produces each output element.

	    The first matching rule wins. The specific string operations (lowercase,
	    length) are tested before the generic "string" rule, so a prompt such as
	    "length of each string" yields ``len(word)`` rather than ``str(x)``. The
	    short tokens ``abs``, ``str`` and ``len`` must appear as whole words so
	    they do not fire inside unrelated words like "tabs" or "stream".
	    With no match the identity mapping is returned.
	    """
	    p = prompt.lower()
	    if "square" in p:
	        return "square_numbers", "numbers", "x", "x * x"
	    if "cube" in p:
	        return "cube_numbers", "numbers", "x", "x * x * x"
	    if "absolute" in p or re.search(r"\babs\b", p):
	        return "absolute_values", "numbers", "x", "abs(x)"
	    if re.search(r"\blower", p):
	        return "lowercase_words", "words", "word", "word.lower()"
	    if "length" in p or re.search(r"\blen\b", p):
	        return "word_lengths", "words", "word", "len(word)"
	    if "string" in p or re.search(r"\bstr\b", p):
	        return "values_to_strings", "values", "x", "str(x)"
	    return "map_values", "values", "x", "x"


	def _fenced(*lines: str) -> str:
	    """Join source lines and wrap them in a Markdown ``python`` code fence."""
	    return "\n".join(("```python", *lines, "```"))


	# Fixed answers keyed by intent label. Snippets are spelled out line by line so
	# their indentation is explicit and the emitted code is valid Python.
	_STATIC_ANSWERS = {
	    "count_words": _fenced(
	        "def count_words(text: str) -> dict[str, int]:",
	        "    counts = {}",
	        "    for raw in text.lower().split():",
	        '        word = raw.strip(".,!?;:\\"\'")',
	        "        if word:",
	        "            counts[word] = counts.get(word, 0) + 1",
	        "    return counts",
	    ),
	    "fibonacci": _fenced(
	        "def fibonacci(n: int) -> int:",
	        "    if n < 0:",
	        '        raise ValueError("n must be non-negative")',
	        "    a, b = 0, 1",
	        "    for _ in range(n):",
	        "        a, b = b, a + b",
	        "    return a",
	    ),
	    "factorial": _fenced(
	        "def factorial(n: int) -> int:",
	        "    if n < 0:",
	        '        raise ValueError("n must be non-negative")',
	        "    result = 1",
	        "    for i in range(2, n + 1):",
	        "        result *= i",
	        "    return result",
	    ),
	    "is_prime": _fenced(
	        "def is_prime(n: int) -> bool:",
	        "    if n < 2:",
	        "        return False",
	        "    if n == 2:",
	        "        return True",
	        "    if n % 2 == 0:",
	        "        return False",
	        "    d = 3",
	        "    while d * d <= n:",
	        "        if n % d == 0:",
	        "            return False",
	        "        d += 2",
	        "    return True",
	    ),
	    "binary_search": _fenced(
	        "def binary_search(items, target):",
	        "    low, high = 0, len(items) - 1",
	        "    while low <= high:",
	        "        mid = (low + high) // 2",
	        "        if items[mid] == target:",
	        "            return mid",
	        "        if items[mid] < target:",
	        "            low = mid + 1",
	        "        else:",
	        "            high = mid - 1",
	        "    return -1",
	    ),
	    "merge_sort": _fenced(
	        "def merge_sort(values):",
	        "    if len(values) <= 1:",
	        "        return values",
	        "    mid = len(values) // 2",
	        "    left = merge_sort(values[:mid])",
	        "    right = merge_sort(values[mid:])",
	        "    return merge(left, right)",
	        "",
	        "def merge(left, right):",
	        "    result = []",
	        "    i = j = 0",
	        "    while i < len(left) and j < len(right):",
	        "        if left[i] <= right[j]:",
	        "            result.append(left[i]); i += 1",
	        "        else:",
	        "            result.append(right[j]); j += 1",
	        "    result.extend(left[i:]); result.extend(right[j:])",
	        "    return result",
	    ) + "\nComplexity: O(n log n) time and O(n) extra memory.",
	    "read_json": _fenced(
	        "import json",
	        "",
	        "def read_json(path: str):",
	        '    with open(path, "r", encoding="utf-8") as f:',
	        "        return json.load(f)",
	    ),
	    "write_json": _fenced(
	        "import json",
	        "",
	        "def write_json(path: str, data) -> None:",
	        '    with open(path, "w", encoding="utf-8") as f:',
	        "        json.dump(data, f, indent=2, ensure_ascii=False)",
	    ),
	    "group_by": _fenced(
	        "def group_by(items, key_func):",
	        "    groups = {}",
	        "    for item in items:",
	        "        key = key_func(item)",
	        "        groups.setdefault(key, []).append(item)",
	        "    return groups",
	    ),
	    "safe_int": _fenced(
	        "def safe_int(value, default=0):",
	        "    try:",
	        "        return int(value)",
	        "    except (TypeError, ValueError):",
	        "        return default",
	    ),
	    "dataclass": _fenced(
	        "from dataclasses import dataclass",
	        "",
	        "@dataclass",
	        "class User:",
	        "    name: str",
	        "    age: int",
	        "",
	        "    def is_adult(self) -> bool:",
	        "        return self.age >= 18",
	    ),
	    "class_stack": _fenced(
	        "class Stack:",
	        "    def __init__(self):",
	        "        self.items = []",
	        "",
	        "    def push(self, item):",
	        "        self.items.append(item)",
	        "",
	        "    def pop(self):",
	        "        if not self.items:",
	        "            return None",
	        "        return self.items.pop()",
	    ),
	    "identity_reading": "I am a small learned Python assistant. I learned by counting character patterns in many questions and code examples, plus a tiny neural GRU trained to predict every next character. I do not have real consciousness, but I can use learned patterns to compose answers.",
	}

	# Returned for "explain_python" and for any label without a dedicated answer.
	_PYTHON_OVERVIEW = "Python uses indentation, names, expressions, functions, classes, modules, and exceptions. Ask for a concrete task and I will compose code."


	def code_for(label: str, prompt: str) -> str:
	    """Return the answer text for an intent *label*.

	    Args:
	        label: Intent name chosen by the classifier.
	        prompt: The user's original request. It is only consulted for the
	            "filter" and "map" intents, where parse_filter() / parse_map()
	            extract the function name, variable names and expression.

	    Returns:
	        A Markdown string. Code answers are wrapped in a ``python`` fence;
	        "identity_reading" and the fallback are plain sentences. Labels with
	        no dedicated answer (including "explain_python") get the general
	        Python overview.
	    """
	    if label == "filter":
	        fname, var, item, cond = parse_filter(prompt)
	        return _fenced(
	            f"def {fname}({var}):",
	            "    result = []",
	            f"    for {item} in {var}:",
	            f"        if {cond}:",
	            f"            result.append({item})",
	            "    return result",
	        )
	    if label == "map":
	        fname, var, item, expr = parse_map(prompt)
	        return _fenced(
	            f"def {fname}({var}):",
	            "    result = []",
	            f"    for {item} in {var}:",
	            f"        result.append({expr})",
	            "    return result",
	        )
	    return _STATIC_ANSWERS.get(label, _PYTHON_OVERVIEW)


	def train(out: Path):
	    """Train the intent model and write its artifacts into directory *out*.

	    Creates *out* if needed, then writes ``intent_nb.json`` (the model),
	    ``training_examples.json`` (the first 2000 examples) and ``report.json``
	    (a short summary, which is also printed).
	    """
	    out.mkdir(parents=True, exist_ok=True)
	    examples = build_training_examples(mult=30)

	    model = NBIntent()
	    model.fit(examples)
	    model.save(out / 'intent_nb.json')

	    sample_text = json.dumps(examples[:2000], indent=2)
	    (out / 'training_examples.json').write_text(sample_text, encoding='utf-8')

	    report = {
	        "examples": len(examples),
	        "labels": LABELS,
	        "features": len(model.vocab),
	        "type": "char_ngram_naive_bayes_plus_compositional_generator",
	    }
	    report_text = json.dumps(report, indent=2)
	    (out / 'report.json').write_text(report_text, encoding='utf-8')
	    print(report_text)


	def ask(out: Path, prompt: str):
	    """Answer *prompt* with the model stored in directory *out*.

	    Loads ``intent_nb.json``, ranks the intents for the prompt, builds the
	    answer for the top-ranked one, and prints a short reasoning section
	    followed by the answer.
	    """
	    model = NBIntent.load(out / 'intent_nb.json')
	    ranked = model.predict_proba(prompt)
	    label = ranked[0][0]
	    top_four = ', '.join(f'{name}={prob:.2f}' for name, prob in ranked[:4])
	    steps = [
	        "Read the request as characters and word fragments.",
	        f"Top learned intents: {top_four}.",
	        f"Selected intent: {label}.",
	    ]
	    # Build the answer before printing anything, as the output is one unit.
	    answer = code_for(label, prompt)

	    print("## Learned reasoning")
	    for step in steps:
	        print(f"- {step}")
	    print("\n## Answer")
	    print(answer)


	def main():
	    """Command-line entry point: parse arguments and run the chosen mode."""
	    parser = argparse.ArgumentParser()
	    parser.add_argument('--mode', choices=['train', 'ask'], default='ask')
	    parser.add_argument('--out', default='outputs/real_python_learner')
	    parser.add_argument(
	        '--prompt',
	        default='write a function that filters even numbers from a list',
	    )
	    args = parser.parse_args()
	    out = Path(args.out)

	    if args.mode == 'train':
	        train(out)
	        return
	    ask(out, args.prompt)


	if __name__ == '__main__':
	    main()