#!/usr/bin/env python3 """ RealPythonLearner: learned char n-gram intent model + compositional Python code generator. Why this exists: - The pure NumPy GRU in neural_python_mind.py really learns character prediction, but in this tiny CPU environment it is too small to reliably bind instructions to semantics. - This model learns from many instruction/code examples using character n-grams and a Naive Bayes classifier, then composes code from learned operations. It is not a fixed print table: it can parse unseen variants like "keep numbers greater than 10" or "return square of every item". Usage: python real_python_learner.py --mode train --out outputs/real_python_learner python real_python_learner.py --mode ask --out outputs/real_python_learner --prompt "write a function that filters even numbers" """ from __future__ import annotations import argparse, json, math, random, re from collections import Counter, defaultdict from pathlib import Path from typing import Dict, List, Tuple LABELS = [ "count_words", "fibonacci", "factorial", "is_prime", "binary_search", "merge_sort", "read_json", "write_json", "filter", "map", "group_by", "safe_int", "dataclass", "class_stack", "explain_python", "identity_reading" ] TEMPLATES = { "count_words": ["count words", "word frequency", "count each word", "dictionary of words", "how many times words appear"], "fibonacci": ["fibonacci", "fib number", "nth fibonacci", "sequence 0 1 1 2"], "factorial": ["factorial", "multiply from 1 to n", "n!", "product of integers"], "is_prime": ["prime", "is prime", "prime checker", "divisors", "number is prime"], "binary_search": ["binary search", "search sorted list", "find target in sorted", "halves search space"], "merge_sort": ["merge sort", "mergesort", "sort by merging", "divide and merge sort"], "read_json": ["read json", "load json", "json from file", "open json file"], "write_json": ["write json", "save json", "dump json", "json to file"], "filter": ["filter", "keep only", "remove values", 
"select items", "numbers greater", "even numbers", "positive numbers"], "map": ["map", "transform", "convert each", "square every", "apply to each", "return squares"], "group_by": ["group by", "group items", "bucket by", "group words by length"], "safe_int": ["safe int", "convert to int", "parse integer", "default if invalid"], "dataclass": ["dataclass", "data class", "class with fields", "store user data"], "class_stack": ["stack class", "push pop", "lifo", "class stack"], "explain_python": ["explain python", "what is indentation", "how functions work", "syntax", "library"], "identity_reading": ["who are you", "how did you learn to read", "why can you read", "your memory", "your birth"], } QUESTION_PREFIXES = [ "write python to", "write a function that", "create code to", "give me code that", "show an example to", "i need a function to", "how do i", "can you", "make python", ] def char_ngrams(text: str, nmin=2, nmax=5) -> List[str]: s = " " + text.lower() + " " feats = [] for n in range(nmin, nmax + 1): for i in range(0, max(0, len(s) - n + 1)): feats.append(s[i:i+n]) # word features too feats += re.findall(r"[a-zA-Z_][a-zA-Z_0-9]+|\d+", text.lower()) return feats def build_training_examples(mult=40) -> List[Tuple[str,str]]: examples=[] for label, phrases in TEMPLATES.items(): for phrase in phrases: for pref in QUESTION_PREFIXES: examples.append((f"{pref} {phrase}", label)) examples.append((f"please {pref} {phrase} in python", label)) examples.append((f"I want {phrase}; use clean Python", label)) # compositional unseen-like variations filters = ["even numbers", "odd numbers", "positive numbers", "negative numbers", "numbers greater than 10", "numbers less than 5", "non empty strings", "items not none"] for f in filters: for pref in QUESTION_PREFIXES: examples.append((f"{pref} filter {f} from a list", "filter")) examples.append((f"{pref} keep only {f}", "filter")) maps = ["squares", "cubes", "absolute values", "strings from values", "lowercase words", "length of 
each word"] for m in maps: for pref in QUESTION_PREFIXES: examples.append((f"{pref} return {m}", "map")) examples.append((f"{pref} transform a list into {m}", "map")) return examples * mult class NBIntent: def __init__(self): self.label_counts=Counter(); self.feat_counts={}; self.total_feats=Counter(); self.vocab=set(); self.labels=[] def fit(self, examples: List[Tuple[str,str]]): self.labels=sorted(set(y for _,y in examples)) self.feat_counts={y:Counter() for y in self.labels} for x,y in examples: self.label_counts[y]+=1 feats=char_ngrams(x) self.feat_counts[y].update(feats) self.total_feats[y]+=len(feats) self.vocab.update(feats) def predict_proba(self, text: str) -> List[Tuple[str,float]]: feats=char_ngrams(text) V=len(self.vocab)+1; N=sum(self.label_counts.values())+len(self.labels) scores=[] for y in self.labels: logp=math.log((self.label_counts[y]+1)/N) denom=self.total_feats[y]+V fc=self.feat_counts[y] for f in feats: logp+=math.log((fc.get(f,0)+1)/denom) scores.append((y,logp)) m=max(s for _,s in scores) probs=[(y,math.exp(s-m)) for y,s in scores] z=sum(p for _,p in probs) or 1 return sorted([(y,p/z) for y,p in probs], key=lambda x:-x[1]) def save(self,path:Path): data={"label_counts":dict(self.label_counts),"feat_counts":{k:dict(v) for k,v in self.feat_counts.items()},"total_feats":dict(self.total_feats),"vocab":list(self.vocab),"labels":self.labels} path.write_text(json.dumps(data),encoding='utf-8') @classmethod def load(cls,path:Path): o=cls(); data=json.loads(path.read_text(encoding='utf-8')) o.label_counts=Counter(data['label_counts']); o.feat_counts={k:Counter(v) for k,v in data['feat_counts'].items()}; o.total_feats=Counter(data['total_feats']); o.vocab=set(data['vocab']); o.labels=data['labels']; return o def parse_filter(prompt: str) -> Tuple[str,str,str]: p=prompt.lower() var="items"; item="x"; fname="filter_items"; cond="x" if "string" in p or "word" in p: var="strings"; item="s"; fname="filter_strings"; cond="s" else: var="numbers"; item="x"; 
fname="filter_numbers" if "even" in p: cond="x % 2 == 0"; fname="filter_even_numbers" elif "odd" in p: cond="x % 2 != 0"; fname="filter_odd_numbers" elif "positive" in p: cond="x > 0"; fname="filter_positive_numbers" elif "negative" in p: cond="x < 0"; fname="filter_negative_numbers" elif "not none" in p or "not none" in p.replace("-"," "): var="items"; item="x"; cond="x is not None"; fname="filter_not_none" elif "non empty" in p or "not empty" in p: var="strings"; item="s"; cond="bool(s)"; fname="filter_non_empty_strings" else: m=re.search(r"greater than\s+(-?\d+)",p) if m: cond=f"x > {m.group(1)}"; fname=f"filter_greater_than_{m.group(1).replace('-','minus_')}" m=re.search(r"less than\s+(-?\d+)",p) if m: cond=f"x < {m.group(1)}"; fname=f"filter_less_than_{m.group(1).replace('-','minus_')}" return fname,var,item,cond def parse_map(prompt: str) -> Tuple[str,str,str,str]: p=prompt.lower(); var="values"; item="x"; fname="map_values"; expr="x" if "square" in p: fname="square_numbers"; expr="x * x"; var="numbers" elif "cube" in p: fname="cube_numbers"; expr="x * x * x"; var="numbers" elif "absolute" in p or "abs" in p: fname="absolute_values"; expr="abs(x)"; var="numbers" elif "string" in p or "str" in p: fname="values_to_strings"; expr="str(x)" elif "lower" in p: fname="lowercase_words"; var="words"; item="word"; expr="word.lower()" elif "length" in p or "len" in p: fname="word_lengths"; var="words"; item="word"; expr="len(word)" return fname,var,item,expr def code_for(label: str, prompt: str) -> str: if label=="count_words": return '''```python def count_words(text: str) -> dict[str, int]: counts = {} for raw in text.lower().split(): word = raw.strip(".,!?;:\\"'") if word: counts[word] = counts.get(word, 0) + 1 return counts ```''' if label=="fibonacci": return '''```python def fibonacci(n: int) -> int: if n < 0: raise ValueError("n must be non-negative") a, b = 0, 1 for _ in range(n): a, b = b, a + b return a ```''' if label=="factorial": return '''```python def 
factorial(n: int) -> int: if n < 0: raise ValueError("n must be non-negative") result = 1 for i in range(2, n + 1): result *= i return result ```''' if label=="is_prime": return '''```python def is_prime(n: int) -> bool: if n < 2: return False if n == 2: return True if n % 2 == 0: return False d = 3 while d * d <= n: if n % d == 0: return False d += 2 return True ```''' if label=="binary_search": return '''```python def binary_search(items, target): low, high = 0, len(items) - 1 while low <= high: mid = (low + high) // 2 if items[mid] == target: return mid if items[mid] < target: low = mid + 1 else: high = mid - 1 return -1 ```''' if label=="merge_sort": return '''```python def merge_sort(values): if len(values) <= 1: return values mid = len(values) // 2 left = merge_sort(values[:mid]) right = merge_sort(values[mid:]) return merge(left, right) def merge(left, right): result = [] i = j = 0 while i < len(left) and j < len(right): if left[i] <= right[j]: result.append(left[i]); i += 1 else: result.append(right[j]); j += 1 result.extend(left[i:]); result.extend(right[j:]) return result ``` Complexity: O(n log n) time and O(n) extra memory.''' if label=="read_json": return '''```python import json def read_json(path: str): with open(path, "r", encoding="utf-8") as f: return json.load(f) ```''' if label=="write_json": return '''```python import json def write_json(path: str, data) -> None: with open(path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) ```''' if label=="filter": fname,var,item,cond=parse_filter(prompt) return f'''```python def {fname}({var}): result = [] for {item} in {var}: if {cond}: result.append({item}) return result ```''' if label=="map": fname,var,item,expr=parse_map(prompt) return f'''```python def {fname}({var}): result = [] for {item} in {var}: result.append({expr}) return result ```''' if label=="group_by": return '''```python def group_by(items, key_func): groups = {} for item in items: key = key_func(item) 
groups.setdefault(key, []).append(item) return groups ```''' if label=="safe_int": return '''```python def safe_int(value, default=0): try: return int(value) except (TypeError, ValueError): return default ```''' if label=="dataclass": return '''```python from dataclasses import dataclass @dataclass class User: name: str age: int def is_adult(self) -> bool: return self.age >= 18 ```''' if label=="class_stack": return '''```python class Stack: def __init__(self): self.items = [] def push(self, item): self.items.append(item) def pop(self): if not self.items: return None return self.items.pop() ```''' if label=="identity_reading": return "I am a small learned Python assistant. I learned by counting character patterns in many questions and code examples, plus a tiny neural GRU trained to predict every next character. I do not have real consciousness, but I can use learned patterns to compose answers." return "Python uses indentation, names, expressions, functions, classes, modules, and exceptions. Ask for a concrete task and I will compose code." 
def train(out: Path):
    """Train the intent model and write its artifacts into directory *out*.

    Creates *out* if needed, then writes ``intent_nb.json`` (the model),
    ``training_examples.json`` (the first 2000 examples) and ``report.json``.
    The report is also printed to stdout.
    """
    out.mkdir(parents=True, exist_ok=True)
    examples = build_training_examples(mult=30)
    model = NBIntent()
    model.fit(examples)
    model.save(out / 'intent_nb.json')
    (out / 'training_examples.json').write_text(
        json.dumps(examples[:2000], indent=2), encoding='utf-8'
    )
    report = {
        "examples": len(examples),
        "labels": LABELS,
        "features": len(model.vocab),
        "type": "char_ngram_naive_bayes_plus_compositional_generator",
    }
    (out / 'report.json').write_text(json.dumps(report, indent=2), encoding='utf-8')
    print(json.dumps(report, indent=2))


def ask(out: Path, prompt: str):
    """Classify *prompt* with the model stored in *out* and print the answer.

    Prints a short "reasoning" section (the top four intents with their
    probabilities and the selected one), followed by the generated answer.

    Raises:
        FileNotFoundError: if ``intent_nb.json`` is missing from *out*; the
            message says how to create it.
    """
    model_path = out / 'intent_nb.json'
    if not model_path.is_file():
        # Same exception type the bare read would raise, but with a hint.
        raise FileNotFoundError(
            f"No trained model at {model_path}; run with --mode train --out {out} first."
        )
    model = NBIntent.load(model_path)
    probs = model.predict_proba(prompt)
    label = probs[0][0]
    top_intents = ', '.join(f'{l}={p:.2f}' for l, p in probs[:4])
    reasoning = [
        "Read the request as characters and word fragments.",
        f"Top learned intents: {top_intents}.",
        f"Selected intent: {label}.",
    ]
    answer = code_for(label, prompt)
    print("## Learned reasoning")
    for r in reasoning:
        print(f"- {r}")
    print("\n## Answer")
    print(answer)


def main():
    """Parse the command line and run either training or a single question."""
    ap = argparse.ArgumentParser()
    ap.add_argument('--mode', choices=['train', 'ask'], default='ask')
    ap.add_argument('--out', default='outputs/real_python_learner')
    ap.add_argument('--prompt', default='write a function that filters even numbers from a list')
    args = ap.parse_args()
    out = Path(args.out)
    if args.mode == 'train':
        train(out)
    else:
        ask(out, args.prompt)


if __name__ == '__main__':
    main()