AION-1 / real_python_learner.py
VoidWalkercero's picture
Upload AION unified hybrid assistant with local eval results
a1a7070 verified
#!/usr/bin/env python3
"""
RealPythonLearner: learned char n-gram intent model + compositional Python code generator.
Why this exists:
- The pure NumPy GRU in neural_python_mind.py really learns character prediction,
but in this tiny CPU environment it is too small to reliably bind instructions to semantics.
- This model learns from many instruction/code examples using character n-grams and a Naive Bayes
classifier, then composes code from learned operations. It is not a fixed print table: it can
parse unseen variants like "keep numbers greater than 10" or "return square of every item".
Usage:
python real_python_learner.py --mode train --out outputs/real_python_learner
python real_python_learner.py --mode ask --out outputs/real_python_learner --prompt "write a function that filters even numbers"
"""
from __future__ import annotations
import argparse, json, math, random, re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
# Intent labels known to the assistant. Each label has a list of seed phrases
# in TEMPLATES, and train() records this list in report.json.
LABELS = [
"count_words", "fibonacci", "factorial", "is_prime", "binary_search", "merge_sort",
"read_json", "write_json", "filter", "map", "group_by", "safe_int", "dataclass", "class_stack",
"explain_python", "identity_reading"
]
# Seed phrases per intent label. build_training_examples() combines every
# phrase with every entry of QUESTION_PREFIXES (plus a few fixed wrappers) to
# synthesize labeled prompts for the Naive Bayes classifier.
TEMPLATES = {
"count_words": ["count words", "word frequency", "count each word", "dictionary of words", "how many times words appear"],
"fibonacci": ["fibonacci", "fib number", "nth fibonacci", "sequence 0 1 1 2"],
"factorial": ["factorial", "multiply from 1 to n", "n!", "product of integers"],
"is_prime": ["prime", "is prime", "prime checker", "divisors", "number is prime"],
"binary_search": ["binary search", "search sorted list", "find target in sorted", "halves search space"],
"merge_sort": ["merge sort", "mergesort", "sort by merging", "divide and merge sort"],
"read_json": ["read json", "load json", "json from file", "open json file"],
"write_json": ["write json", "save json", "dump json", "json to file"],
"filter": ["filter", "keep only", "remove values", "select items", "numbers greater", "even numbers", "positive numbers"],
"map": ["map", "transform", "convert each", "square every", "apply to each", "return squares"],
"group_by": ["group by", "group items", "bucket by", "group words by length"],
"safe_int": ["safe int", "convert to int", "parse integer", "default if invalid"],
"dataclass": ["dataclass", "data class", "class with fields", "store user data"],
"class_stack": ["stack class", "push pop", "lifo", "class stack"],
"explain_python": ["explain python", "what is indentation", "how functions work", "syntax", "library"],
"identity_reading": ["who are you", "how did you learn to read", "why can you read", "your memory", "your birth"],
}
# Request openers prepended to seed phrases by build_training_examples() so
# the classifier sees each intent phrased as several different questions.
QUESTION_PREFIXES = [
"write python to", "write a function that", "create code to", "give me code that",
"show an example to", "i need a function to", "how do i", "can you", "make python",
]
def char_ngrams(text: str, nmin=2, nmax=5) -> List[str]:
    """Turn *text* into classifier features: character n-grams plus word tokens.

    The text is lower-cased and padded with one space on each side so that
    word boundaries show up inside the n-grams. Every window of length
    ``nmin`` through ``nmax`` is emitted (shorter lengths first, left to
    right), followed by the word-like tokens of the lower-cased text:
    identifiers of at least two characters and runs of digits.
    """
    lowered = text.lower()
    padded = " " + lowered + " "
    features = [
        padded[start:start + width]
        for width in range(nmin, nmax + 1)
        for start in range(max(0, len(padded) - width + 1))
    ]
    # Whole-word features complement the character windows.
    features.extend(re.findall(r"[a-zA-Z_][a-zA-Z_0-9]+|\d+", lowered))
    return features
def build_training_examples(mult=40) -> List[Tuple[str,str]]:
    """Synthesize the labeled ``(prompt, label)`` corpus for the intent model.

    Three groups are generated, in this order:

    1. every TEMPLATES phrase wrapped by every QUESTION_PREFIXES opener
       (plain and "please ... in python" forms), plus one "I want ..." form
       per phrase;
    2. extra "filter" prompts built from a list of concrete predicates;
    3. extra "map" prompts built from a list of concrete transformations.

    The whole corpus is then repeated ``mult`` times.
    """
    corpus: List[Tuple[str, str]] = []

    # Group 1: seed phrases crossed with question openers.
    for label, phrases in TEMPLATES.items():
        for phrase in phrases:
            for opener in QUESTION_PREFIXES:
                corpus += [
                    (f"{opener} {phrase}", label),
                    (f"please {opener} {phrase} in python", label),
                ]
            corpus.append((f"I want {phrase}; use clean Python", label))

    # Group 2: compositional variations for the "filter" intent.
    filter_targets = [
        "even numbers", "odd numbers", "positive numbers", "negative numbers",
        "numbers greater than 10", "numbers less than 5", "non empty strings",
        "items not none",
    ]
    corpus.extend(
        pair
        for target in filter_targets
        for opener in QUESTION_PREFIXES
        for pair in (
            (f"{opener} filter {target} from a list", "filter"),
            (f"{opener} keep only {target}", "filter"),
        )
    )

    # Group 3: compositional variations for the "map" intent.
    map_targets = [
        "squares", "cubes", "absolute values", "strings from values",
        "lowercase words", "length of each word",
    ]
    corpus.extend(
        pair
        for target in map_targets
        for opener in QUESTION_PREFIXES
        for pair in (
            (f"{opener} return {target}", "map"),
            (f"{opener} transform a list into {target}", "map"),
        )
    )

    return corpus * mult
class NBIntent:
    """Multinomial Naive Bayes intent classifier over ``char_ngrams`` features.

    State (rebuilt by ``fit`` and restored by ``load``):
        label_counts: Counter of training examples per label.
        feat_counts:  dict mapping each label to a Counter of feature counts.
        total_feats:  Counter of total feature occurrences per label.
        vocab:        set of every distinct feature seen during training.
        labels:       sorted list of label names.
    """

    def __init__(self):
        self.label_counts = Counter()
        self.feat_counts = {}
        self.total_feats = Counter()
        self.vocab = set()
        self.labels = []

    def fit(self, examples: List[Tuple[str,str]]):
        """Train from scratch on ``(text, label)`` pairs.

        Training is not incremental: all statistics are reset first, so a
        second call (or a call on a loaded model) cannot mix fresh feature
        counts with stale label counts, totals or vocabulary.
        """
        self.labels = sorted({label for _, label in examples})
        self.label_counts = Counter()
        self.feat_counts = {label: Counter() for label in self.labels}
        self.total_feats = Counter()
        self.vocab = set()
        for text, label in examples:
            feats = char_ngrams(text)
            self.label_counts[label] += 1
            self.feat_counts[label].update(feats)
            self.total_feats[label] += len(feats)
            self.vocab.update(feats)

    def predict_proba(self, text: str) -> List[Tuple[str,float]]:
        """Return ``(label, probability)`` pairs sorted by descending probability.

        Uses add-one smoothing for both the label prior and the feature
        likelihoods, and normalizes in log space for numerical stability.

        Raises:
            ValueError: if the model has no labels (never fitted or loaded).
        """
        if not self.labels:
            raise ValueError("model has no labels; call fit() or load() before predict_proba()")
        feats = char_ngrams(text)
        vocab_size = len(self.vocab) + 1
        prior_denom = sum(self.label_counts.values()) + len(self.labels)
        scores = []
        for label in self.labels:
            logp = math.log((self.label_counts[label] + 1) / prior_denom)
            denom = self.total_feats[label] + vocab_size
            counts = self.feat_counts[label]
            for feat in feats:
                logp += math.log((counts.get(feat, 0) + 1) / denom)
            scores.append((label, logp))
        # Subtract the best log-score before exponentiating to avoid underflow.
        best = max(score for _, score in scores)
        probs = [(label, math.exp(score - best)) for label, score in scores]
        z = sum(p for _, p in probs) or 1
        return sorted([(label, p / z) for label, p in probs], key=lambda pair: -pair[1])

    def save(self, path: Path):
        """Write the model statistics to ``path`` as UTF-8 JSON."""
        data = {
            "label_counts": dict(self.label_counts),
            "feat_counts": {k: dict(v) for k, v in self.feat_counts.items()},
            "total_feats": dict(self.total_feats),
            # Sorted so the saved file is identical across runs (set order is not).
            "vocab": sorted(self.vocab),
            "labels": self.labels,
        }
        path.write_text(json.dumps(data), encoding='utf-8')

    @classmethod
    def load(cls, path: Path):
        """Rebuild a model from a JSON file previously written by ``save``."""
        data = json.loads(path.read_text(encoding='utf-8'))
        model = cls()
        model.label_counts = Counter(data['label_counts'])
        model.feat_counts = {k: Counter(v) for k, v in data['feat_counts'].items()}
        model.total_feats = Counter(data['total_feats'])
        model.vocab = set(data['vocab'])
        model.labels = data['labels']
        return model
def parse_filter(prompt: str) -> Tuple[str, str, str, str]:
    """Derive the pieces of a filter function from a natural-language prompt.

    Returns:
        ``(fname, var, item, cond)`` where ``fname`` is the function name,
        ``var`` the parameter name, ``item`` the loop variable and ``cond``
        the predicate expression written in terms of ``item``.

    Matching is by substring on the lower-cased prompt, first match wins:
    even / odd / positive / negative, then "not none", then "non empty" or
    "not empty" (hyphenated spellings are accepted for these phrases), then
    "greater than N" / "less than N". If both bounds are present the
    "less than" one is used. With no match, strings are filtered by
    truthiness when the prompt mentions "string" or "word", and numbers by
    truthiness otherwise.
    """
    p = prompt.lower()
    # Hyphens are only normalised for phrase matching; the numeric regexes
    # below must still see "-" so negative thresholds keep their sign.
    phrases = p.replace("-", " ")

    if "string" in p or "word" in p:
        fname, var, item, cond = "filter_strings", "strings", "s", "s"
    else:
        fname, var, item, cond = "filter_numbers", "numbers", "x", "x"

    numeric_rules = (
        ("even", "x % 2 == 0", "filter_even_numbers"),
        ("odd", "x % 2 != 0", "filter_odd_numbers"),
        ("positive", "x > 0", "filter_positive_numbers"),
        ("negative", "x < 0", "filter_negative_numbers"),
    )
    for keyword, expr, name in numeric_rules:
        if keyword in p:
            # Numeric predicates are written in terms of `x`, so the loop
            # variable must be `x` too or the generated code will not run.
            return name, "numbers", "x", expr

    if "not none" in phrases:
        return "filter_not_none", "items", "x", "x is not None"
    if "non empty" in phrases or "not empty" in phrases:
        return "filter_non_empty_strings", "strings", "s", "bool(s)"

    for phrase, op, tag in (("greater than", ">", "greater_than"), ("less than", "<", "less_than")):
        m = re.search(phrase + r"\s+(-?\d+)", p)
        if m:
            bound = m.group(1)
            fname = f"filter_{tag}_{bound.replace('-', 'minus_')}"
            var, item, cond = "numbers", "x", f"x {op} {bound}"
    return fname, var, item, cond
def parse_map(prompt: str) -> Tuple[str,str,str,str]:
    """Derive the pieces of a map function from a natural-language prompt.

    Returns:
        ``(fname, var, item, expr)`` where ``fname`` is the function name,
        ``var`` the parameter name, ``item`` the loop variable and ``expr``
        the per-item expression.

    Rules are tried in order on the lower-cased prompt and the first
    substring match wins. The specific word operations ("lower", "length")
    are tested before the generic "string" rule, so that prompts such as
    "length of each string" or "lowercase strings" yield ``len(word)`` /
    ``word.lower()`` rather than ``str(x)``.
    """
    p = prompt.lower()
    rules = (
        (("square",), ("square_numbers", "numbers", "x", "x * x")),
        (("cube",), ("cube_numbers", "numbers", "x", "x * x * x")),
        (("absolute", "abs"), ("absolute_values", "numbers", "x", "abs(x)")),
        (("lower",), ("lowercase_words", "words", "word", "word.lower()")),
        (("length", "len"), ("word_lengths", "words", "word", "len(word)")),
        (("string", "str"), ("values_to_strings", "values", "x", "str(x)")),
    )
    for keywords, parts in rules:
        if any(keyword in p for keyword in keywords):
            return parts
    # Nothing recognised: identity mapping.
    return "map_values", "values", "x", "x"
# Fixed answers keyed by intent label. The code snippets carry their own
# indentation: they are emitted verbatim, so they must be valid Python as-is.
_STATIC_ANSWERS = {
    "count_words": '''```python
def count_words(text: str) -> dict[str, int]:
    counts = {}
    for raw in text.lower().split():
        word = raw.strip(".,!?;:\\"'")
        if word:
            counts[word] = counts.get(word, 0) + 1
    return counts
```''',
    "fibonacci": '''```python
def fibonacci(n: int) -> int:
    if n < 0:
        raise ValueError("n must be non-negative")
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a
```''',
    "factorial": '''```python
def factorial(n: int) -> int:
    if n < 0:
        raise ValueError("n must be non-negative")
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result
```''',
    "is_prime": '''```python
def is_prime(n: int) -> bool:
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    d = 3
    while d * d <= n:
        if n % d == 0:
            return False
        d += 2
    return True
```''',
    "binary_search": '''```python
def binary_search(items, target):
    low, high = 0, len(items) - 1
    while low <= high:
        mid = (low + high) // 2
        if items[mid] == target:
            return mid
        if items[mid] < target:
            low = mid + 1
        else:
            high = mid - 1
    return -1
```''',
    "merge_sort": '''```python
def merge_sort(values):
    if len(values) <= 1:
        return values
    mid = len(values) // 2
    left = merge_sort(values[:mid])
    right = merge_sort(values[mid:])
    return merge(left, right)
def merge(left, right):
    result = []
    i = j = 0
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            result.append(left[i]); i += 1
        else:
            result.append(right[j]); j += 1
    result.extend(left[i:]); result.extend(right[j:])
    return result
```
Complexity: O(n log n) time and O(n) extra memory.''',
    "read_json": '''```python
import json
def read_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
```''',
    "write_json": '''```python
import json
def write_json(path: str, data) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
```''',
    "group_by": '''```python
def group_by(items, key_func):
    groups = {}
    for item in items:
        key = key_func(item)
        groups.setdefault(key, []).append(item)
    return groups
```''',
    "safe_int": '''```python
def safe_int(value, default=0):
    try:
        return int(value)
    except (TypeError, ValueError):
        return default
```''',
    "dataclass": '''```python
from dataclasses import dataclass
@dataclass
class User:
    name: str
    age: int
    def is_adult(self) -> bool:
        return self.age >= 18
```''',
    "class_stack": '''```python
class Stack:
    def __init__(self):
        self.items = []
    def push(self, item):
        self.items.append(item)
    def pop(self):
        if not self.items:
            return None
        return self.items.pop()
```''',
    "identity_reading": "I am a small learned Python assistant. I learned by counting character patterns in many questions and code examples, plus a tiny neural GRU trained to predict every next character. I do not have real consciousness, but I can use learned patterns to compose answers.",
}


def code_for(label: str, prompt: str) -> str:
    """Return the answer text (usually a fenced Python snippet) for an intent.

    Args:
        label: intent name chosen by the classifier.
        prompt: the original user request; only consulted for the "filter"
            and "map" intents, whose code is composed from the prompt via
            ``parse_filter`` / ``parse_map``.

    Returns:
        A fixed answer for known static intents, a composed snippet for
        "filter" / "map", or a generic Python explanation for any other
        label (including "explain_python").
    """
    if label == "filter":
        fname, var, item, cond = parse_filter(prompt)
        return f'''```python
def {fname}({var}):
    result = []
    for {item} in {var}:
        if {cond}:
            result.append({item})
    return result
```'''
    if label == "map":
        fname, var, item, expr = parse_map(prompt)
        return f'''```python
def {fname}({var}):
    result = []
    for {item} in {var}:
        result.append({expr})
    return result
```'''
    static = _STATIC_ANSWERS.get(label)
    if static is not None:
        return static
    return "Python uses indentation, names, expressions, functions, classes, modules, and exceptions. Ask for a concrete task and I will compose code."
def train(out: Path):
    """Train the intent model and write its artifacts into directory ``out``.

    Creates ``out`` if needed, then writes:
      * ``intent_nb.json`` - the fitted NBIntent model,
      * ``training_examples.json`` - the first 2000 training examples,
      * ``report.json`` - a summary, which is also printed to stdout.
    """
    out.mkdir(parents=True, exist_ok=True)

    examples = build_training_examples(mult=30)
    model = NBIntent()
    model.fit(examples)
    model.save(out / 'intent_nb.json')

    sample_text = json.dumps(examples[:2000], indent=2)
    (out / 'training_examples.json').write_text(sample_text, encoding='utf-8')

    report = {
        "examples": len(examples),
        "labels": LABELS,
        "features": len(model.vocab),
        "type": "char_ngram_naive_bayes_plus_compositional_generator",
    }
    report_text = json.dumps(report, indent=2)
    (out / 'report.json').write_text(report_text, encoding='utf-8')
    print(report_text)
def ask(out: Path, prompt: str):
    """Answer ``prompt`` with the model stored in ``out`` and print the result.

    Loads ``intent_nb.json`` from ``out``, ranks the intents for the prompt,
    builds the answer for the top-ranked intent, then prints a short
    "Learned reasoning" section (top four intents with probabilities)
    followed by the answer.
    """
    model = NBIntent.load(out / 'intent_nb.json')
    ranked = model.predict_proba(prompt)
    label, _ = ranked[0]
    top_four = ', '.join(f'{name}={prob:.2f}' for name, prob in ranked[:4])
    steps = (
        "Read the request as characters and word fragments.",
        f"Top learned intents: {top_four}.",
        f"Selected intent: {label}.",
    )
    # Build the answer before printing anything, as the output is all-or-nothing.
    answer = code_for(label, prompt)

    print("## Learned reasoning")
    for step in steps:
        print(f"- {step}")
    print("\n## Answer")
    print(answer)
def main():
    """Command-line entry point: parse arguments and run ``train`` or ``ask``."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'ask'], default='ask')
    parser.add_argument('--out', default='outputs/real_python_learner')
    parser.add_argument('--prompt', default='write a function that filters even numbers from a list')
    args = parser.parse_args()

    out_dir = Path(args.out)
    if args.mode == 'train':
        train(out_dir)
    else:
        ask(out_dir, args.prompt)


if __name__ == '__main__':
    main()