| |
"""
RealPythonLearner: learned char n-gram intent model + compositional Python code generator.

Why this exists:
- The pure NumPy GRU in neural_python_mind.py really learns character prediction,
  but in this tiny CPU environment it is too small to reliably bind instructions to semantics.
- This model learns from many instruction/intent examples using character n-grams and a
  Naive Bayes classifier, then composes code from learned operations. It is not a fixed print
  table: it can parse unseen variants like "keep numbers greater than 10" or
  "return square of every item".

Usage:
    python real_python_learner.py --mode train --out outputs/real_python_learner
    python real_python_learner.py --mode ask --out outputs/real_python_learner --prompt "write a function that filters even numbers"
"""
| from __future__ import annotations |
| import argparse, json, math, random, re |
| from collections import Counter, defaultdict |
| from pathlib import Path |
| from typing import Dict, List, Tuple |
|
|
# Intent names reported by train(); each one is also a key of TEMPLATES.
LABELS = [
    "count_words",
    "fibonacci",
    "factorial",
    "is_prime",
    "binary_search",
    "merge_sort",
    "read_json",
    "write_json",
    "filter",
    "map",
    "group_by",
    "safe_int",
    "dataclass",
    "class_stack",
    "explain_python",
    "identity_reading",
]
|
|
# Seed phrases per intent. build_training_examples() combines every phrase
# with each entry of QUESTION_PREFIXES to synthesise the training set.
TEMPLATES = {
    "count_words": [
        "count words",
        "word frequency",
        "count each word",
        "dictionary of words",
        "how many times words appear",
    ],
    "fibonacci": ["fibonacci", "fib number", "nth fibonacci", "sequence 0 1 1 2"],
    "factorial": ["factorial", "multiply from 1 to n", "n!", "product of integers"],
    "is_prime": ["prime", "is prime", "prime checker", "divisors", "number is prime"],
    "binary_search": [
        "binary search",
        "search sorted list",
        "find target in sorted",
        "halves search space",
    ],
    "merge_sort": ["merge sort", "mergesort", "sort by merging", "divide and merge sort"],
    "read_json": ["read json", "load json", "json from file", "open json file"],
    "write_json": ["write json", "save json", "dump json", "json to file"],
    "filter": [
        "filter",
        "keep only",
        "remove values",
        "select items",
        "numbers greater",
        "even numbers",
        "positive numbers",
    ],
    "map": [
        "map",
        "transform",
        "convert each",
        "square every",
        "apply to each",
        "return squares",
    ],
    "group_by": ["group by", "group items", "bucket by", "group words by length"],
    "safe_int": ["safe int", "convert to int", "parse integer", "default if invalid"],
    "dataclass": ["dataclass", "data class", "class with fields", "store user data"],
    "class_stack": ["stack class", "push pop", "lifo", "class stack"],
    "explain_python": [
        "explain python",
        "what is indentation",
        "how functions work",
        "syntax",
        "library",
    ],
    "identity_reading": [
        "who are you",
        "how did you learn to read",
        "why can you read",
        "your memory",
        "your birth",
    ],
}
|
|
# Request openers that build_training_examples() prepends to each seed phrase.
QUESTION_PREFIXES = [
    "write python to",
    "write a function that",
    "create code to",
    "give me code that",
    "show an example to",
    "i need a function to",
    "how do i",
    "can you",
    "make python",
]
|
|
|
|
def char_ngrams(text: str, nmin=2, nmax=5) -> List[str]:
    """Return the classifier features for *text*.

    The text is lowercased and padded with one space on each side, then every
    character n-gram of length ``nmin``..``nmax`` (inclusive) is emitted,
    shortest lengths first. Word-like tokens of the lowercased text
    (identifiers of two or more characters, or digit runs) are appended
    after the n-grams.
    """
    lowered = text.lower()
    padded = f" {lowered} "
    grams = [
        padded[start:start + size]
        for size in range(nmin, nmax + 1)
        # An empty range results when the padded text is shorter than `size`.
        for start in range(len(padded) - size + 1)
    ]
    grams.extend(re.findall(r"[a-zA-Z_][a-zA-Z_0-9]+|\d+", lowered))
    return grams
|
|
|
|
def build_training_examples(mult=40) -> List[Tuple[str,str]]:
    """Synthesise ``(instruction, label)`` training pairs.

    Every TEMPLATES phrase is combined with each QUESTION_PREFIXES entry in
    two phrasings, plus one "I want ..." variant per phrase. Extra filter and
    map instructions are then added for a fixed set of targets. The whole
    list is repeated ``mult`` times before it is returned.
    """
    base: List[Tuple[str, str]] = []

    for label, phrases in TEMPLATES.items():
        for phrase in phrases:
            for prefix in QUESTION_PREFIXES:
                base += [
                    (f"{prefix} {phrase}", label),
                    (f"please {prefix} {phrase} in python", label),
                ]
            base.append((f"I want {phrase}; use clean Python", label))

    filter_targets = (
        "even numbers",
        "odd numbers",
        "positive numbers",
        "negative numbers",
        "numbers greater than 10",
        "numbers less than 5",
        "non empty strings",
        "items not none",
    )
    for target in filter_targets:
        for prefix in QUESTION_PREFIXES:
            base += [
                (f"{prefix} filter {target} from a list", "filter"),
                (f"{prefix} keep only {target}", "filter"),
            ]

    map_targets = (
        "squares",
        "cubes",
        "absolute values",
        "strings from values",
        "lowercase words",
        "length of each word",
    )
    for target in map_targets:
        for prefix in QUESTION_PREFIXES:
            base += [
                (f"{prefix} return {target}", "map"),
                (f"{prefix} transform a list into {target}", "map"),
            ]

    return base * mult
|
|
|
|
class NBIntent:
    """Multinomial Naive Bayes intent classifier over ``char_ngrams`` features.

    State:
        label_counts: number of training examples seen per label.
        feat_counts: per-label ``Counter`` of feature occurrences.
        total_feats: per-label total number of feature occurrences.
        vocab: every distinct feature seen during training.
        labels: sorted list of the known labels.
    """

    def __init__(self):
        self.label_counts = Counter()
        self.feat_counts = {}
        self.total_feats = Counter()
        self.vocab = set()
        self.labels = []

    def fit(self, examples: List[Tuple[str, str]]):
        """Train from ``(text, label)`` pairs, discarding any earlier fit.

        All statistics are rebuilt from scratch so that calling ``fit`` twice
        does not mix stale label/feature totals with fresh per-label counts.
        """
        self.labels = sorted({label for _, label in examples})
        self.label_counts = Counter()
        self.total_feats = Counter()
        self.vocab = set()
        self.feat_counts = {label: Counter() for label in self.labels}
        for text, label in examples:
            feats = char_ngrams(text)
            self.label_counts[label] += 1
            self.feat_counts[label].update(feats)
            self.total_feats[label] += len(feats)
            self.vocab.update(feats)

    def predict_proba(self, text: str) -> List[Tuple[str, float]]:
        """Return ``(label, probability)`` pairs, most likely first.

        Raises:
            ValueError: if the model has no labels (never fitted or loaded).
        """
        if not self.labels:
            raise ValueError("NBIntent has no labels; call fit() or load() first")
        feats = char_ngrams(text)
        # Add-one smoothing for both the class prior and feature likelihoods.
        vocab_size = len(self.vocab) + 1
        prior_denom = sum(self.label_counts.values()) + len(self.labels)
        scores = []
        for label in self.labels:
            logp = math.log((self.label_counts[label] + 1) / prior_denom)
            denom = self.total_feats[label] + vocab_size
            counts = self.feat_counts[label]
            for feat in feats:
                logp += math.log((counts.get(feat, 0) + 1) / denom)
            scores.append((label, logp))
        # Subtract the best log-score before exponentiating to avoid underflow.
        best = max(score for _, score in scores)
        weights = [(label, math.exp(score - best)) for label, score in scores]
        total = sum(weight for _, weight in weights) or 1
        return sorted(
            [(label, weight / total) for label, weight in weights],
            key=lambda pair: -pair[1],
        )

    def save(self, path: Path):
        """Write the model to *path* as UTF-8 JSON."""
        data = {
            "label_counts": dict(self.label_counts),
            "feat_counts": {label: dict(counts) for label, counts in self.feat_counts.items()},
            "total_feats": dict(self.total_feats),
            # Sorted so the saved file is identical from run to run.
            "vocab": sorted(self.vocab),
            "labels": self.labels,
        }
        path.write_text(json.dumps(data), encoding='utf-8')

    @classmethod
    def load(cls, path: Path):
        """Build a model from a JSON file previously written by ``save``."""
        data = json.loads(path.read_text(encoding='utf-8'))
        model = cls()
        model.label_counts = Counter(data['label_counts'])
        model.feat_counts = {label: Counter(counts) for label, counts in data['feat_counts'].items()}
        model.total_feats = Counter(data['total_feats'])
        model.vocab = set(data['vocab'])
        model.labels = data['labels']
        return model
|
|
|
|
def parse_filter(prompt: str) -> Tuple[str, str, str, str]:
    """Derive the pieces of a filter function from a natural-language prompt.

    Returns:
        ``(fname, var, item, cond)``: the function name, the name of the list
        parameter, the loop variable, and the boolean expression that keeps
        an element.

    Numeric predicates (even/odd/positive/negative/greater than/less than)
    are always expressed against ``x`` over ``numbers`` so that the condition
    never refers to a loop variable that the generated code does not define.
    When both "greater than N" and "less than N" appear, "less than" wins.
    """
    p = prompt.lower()
    if "string" in p or "word" in p:
        fname, var, item, cond = "filter_strings", "strings", "s", "s"
    else:
        fname, var, item, cond = "filter_numbers", "numbers", "x", "x"

    # (fname, cond) of a numeric predicate, once one has been recognised.
    numeric = None
    for keyword, keyword_cond, keyword_fname in (
        ("even", "x % 2 == 0", "filter_even_numbers"),
        ("odd", "x % 2 != 0", "filter_odd_numbers"),
        ("positive", "x > 0", "filter_positive_numbers"),
        ("negative", "x < 0", "filter_negative_numbers"),
    ):
        if keyword in p:
            numeric = (keyword_fname, keyword_cond)
            break

    if numeric is None:
        # Treat "not-none" / "non-empty" like their space-separated spellings.
        dehyphenated = p.replace("-", " ")
        if "not none" in dehyphenated:
            return "filter_not_none", "items", "x", "x is not None"
        if "non empty" in dehyphenated or "not empty" in dehyphenated:
            return "filter_non_empty_strings", "strings", "s", "bool(s)"

        # Match on the raw prompt so a leading minus sign stays with its number.
        m = re.search(r"greater than\s+(-?\d+)", p)
        if m:
            bound = m.group(1)
            numeric = (f"filter_greater_than_{bound.replace('-', 'minus_')}", f"x > {bound}")
        m = re.search(r"less than\s+(-?\d+)", p)
        if m:
            bound = m.group(1)
            numeric = (f"filter_less_than_{bound.replace('-', 'minus_')}", f"x < {bound}")

    if numeric is None:
        return fname, var, item, cond
    fname, cond = numeric
    return fname, "numbers", "x", cond
|
|
|
|
def parse_map(prompt: str) -> Tuple[str, str, str, str]:
    """Derive the pieces of a map function from a natural-language prompt.

    Returns:
        ``(fname, var, item, expr)``: the function name, the name of the list
        parameter, the loop variable, and the expression computed per element.

    Rules are tried in order and the first keyword found in the lowercased
    prompt wins. "lower" and "length" are checked before the generic "str"
    substring so that requests such as "length of each string" are not
    mistaken for a plain conversion to ``str``.
    """
    p = prompt.lower()
    rules = (
        ("square", ("square_numbers", "numbers", "x", "x * x")),
        ("cube", ("cube_numbers", "numbers", "x", "x * x * x")),
        # "abs" also covers "absolute".
        ("abs", ("absolute_values", "numbers", "x", "abs(x)")),
        ("lower", ("lowercase_words", "words", "word", "word.lower()")),
        ("length", ("word_lengths", "words", "word", "len(word)")),
        # "str" also covers "string".
        ("str", ("values_to_strings", "values", "x", "str(x)")),
        ("len", ("word_lengths", "words", "word", "len(word)")),
    )
    for keyword, parts in rules:
        if keyword in p:
            return parts
    return "map_values", "values", "x", "x"
|
|
|
|
# Fallback text, used for "explain_python" and for any unrecognised label.
_PYTHON_OVERVIEW = "Python uses indentation, names, expressions, functions, classes, modules, and exceptions. Ask for a concrete task and I will compose code."

# Answers that do not depend on the prompt, keyed by intent label.
_FIXED_ANSWERS = {
    "count_words": '''```python
def count_words(text: str) -> dict[str, int]:
    counts = {}
    for raw in text.lower().split():
        word = raw.strip(".,!?;:\\"'")
        if word:
            counts[word] = counts.get(word, 0) + 1
    return counts
```''',
    "fibonacci": '''```python
def fibonacci(n: int) -> int:
    if n < 0:
        raise ValueError("n must be non-negative")
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a
```''',
    "factorial": '''```python
def factorial(n: int) -> int:
    if n < 0:
        raise ValueError("n must be non-negative")
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result
```''',
    "is_prime": '''```python
def is_prime(n: int) -> bool:
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    d = 3
    while d * d <= n:
        if n % d == 0:
            return False
        d += 2
    return True
```''',
    "binary_search": '''```python
def binary_search(items, target):
    low, high = 0, len(items) - 1
    while low <= high:
        mid = (low + high) // 2
        if items[mid] == target:
            return mid
        if items[mid] < target:
            low = mid + 1
        else:
            high = mid - 1
    return -1
```''',
    "merge_sort": '''```python
def merge_sort(values):
    if len(values) <= 1:
        return values
    mid = len(values) // 2
    left = merge_sort(values[:mid])
    right = merge_sort(values[mid:])
    return merge(left, right)

def merge(left, right):
    result = []
    i = j = 0
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            result.append(left[i]); i += 1
        else:
            result.append(right[j]); j += 1
    result.extend(left[i:]); result.extend(right[j:])
    return result
```
Complexity: O(n log n) time and O(n) extra memory.''',
    "read_json": '''```python
import json

def read_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
```''',
    "write_json": '''```python
import json

def write_json(path: str, data) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
```''',
    "group_by": '''```python
def group_by(items, key_func):
    groups = {}
    for item in items:
        key = key_func(item)
        groups.setdefault(key, []).append(item)
    return groups
```''',
    "safe_int": '''```python
def safe_int(value, default=0):
    try:
        return int(value)
    except (TypeError, ValueError):
        return default
```''',
    "dataclass": '''```python
from dataclasses import dataclass

@dataclass
class User:
    name: str
    age: int

    def is_adult(self) -> bool:
        return self.age >= 18
```''',
    "class_stack": '''```python
class Stack:
    def __init__(self):
        self.items = []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        if not self.items:
            return None
        return self.items.pop()
```''',
    "identity_reading": "I am a small learned Python assistant. I learned by counting character patterns in many questions and code examples, plus a tiny neural GRU trained to predict every next character. I do not have real consciousness, but I can use learned patterns to compose answers.",
}


def code_for(label: str, prompt: str) -> str:
    """Return the answer text for an intent *label*.

    "filter" and "map" answers are composed from the pieces that
    ``parse_filter`` / ``parse_map`` extract from *prompt*. Every other known
    label returns a fixed answer, and unknown labels (as well as
    "explain_python") return a short overview of Python. Code answers are
    wrapped in a Markdown ``python`` fence and keep the indentation needed
    for the snippet to be valid Python.
    """
    if label == "filter":
        fname, var, item, cond = parse_filter(prompt)
        return "\n".join([
            "```python",
            f"def {fname}({var}):",
            "    result = []",
            f"    for {item} in {var}:",
            f"        if {cond}:",
            f"            result.append({item})",
            "    return result",
            "```",
        ])
    if label == "map":
        fname, var, item, expr = parse_map(prompt)
        return "\n".join([
            "```python",
            f"def {fname}({var}):",
            "    result = []",
            f"    for {item} in {var}:",
            f"        result.append({expr})",
            "    return result",
            "```",
        ])
    return _FIXED_ANSWERS.get(label, _PYTHON_OVERVIEW)
|
|
|
|
def train(out: Path):
    """Fit the intent model and write it, a data sample and a report to *out*.

    Files written into *out* (created if missing):
        intent_nb.json          the saved NBIntent model
        training_examples.json  the first 2000 training pairs
        report.json             summary counts, also printed to stdout
    """
    out.mkdir(parents=True, exist_ok=True)

    examples = build_training_examples(mult=30)
    model = NBIntent()
    model.fit(examples)
    model.save(out / 'intent_nb.json')

    sample_text = json.dumps(examples[:2000], indent=2)
    (out / 'training_examples.json').write_text(sample_text, encoding='utf-8')

    report = {
        "examples": len(examples),
        "labels": LABELS,
        "features": len(model.vocab),
        "type": "char_ngram_naive_bayes_plus_compositional_generator",
    }
    report_text = json.dumps(report, indent=2)
    (out / 'report.json').write_text(report_text, encoding='utf-8')
    print(report_text)
|
|
|
|
def ask(out: Path, prompt: str):
    """Classify *prompt* with the model saved in *out* and print the answer.

    Prints a short "Learned reasoning" section (the four most probable
    intents and the selected one) followed by the text from ``code_for``.
    """
    model = NBIntent.load(out / 'intent_nb.json')
    ranked = model.predict_proba(prompt)
    label = ranked[0][0]  # predict_proba returns the most probable intent first

    top_intents = ', '.join(f'{name}={prob:.2f}' for name, prob in ranked[:4])
    reasoning = [
        "Read the request as characters and word fragments.",
        f"Top learned intents: {top_intents}.",
        f"Selected intent: {label}.",
    ]
    answer = code_for(label, prompt)

    print("## Learned reasoning")
    for step in reasoning:
        print(f"- {step}")
    print("\n## Answer")
    print(answer)
|
|
|
|
def main():
    """Parse the command line and run either training or a single query."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'ask'], default='ask')
    parser.add_argument('--out', default='outputs/real_python_learner')
    parser.add_argument(
        '--prompt',
        default='write a function that filters even numbers from a list',
    )
    args = parser.parse_args()

    out = Path(args.out)
    if args.mode == 'train':
        train(out)
        return
    ask(out, args.prompt)


if __name__ == '__main__':
    main()
|
|