File size: 13,906 Bytes

a1a7070

#!/usr/bin/env python3
"""
RealPythonLearner: learned char n-gram intent model + compositional Python code generator.

Why this exists:
- The pure NumPy GRU in neural_python_mind.py really learns character prediction,
  but in this tiny CPU environment it is too small to reliably bind instructions to semantics.
- This model learns from many instruction/code examples using character n-grams and a Naive Bayes
  classifier, then composes code from learned operations. It is not a fixed print table: it can
  parse unseen variants like "keep numbers greater than 10" or "return square of every item".

Usage:
  python real_python_learner.py --mode train --out outputs/real_python_learner
  python real_python_learner.py --mode ask --out outputs/real_python_learner --prompt "write a function that filters even numbers"
"""
from __future__ import annotations
import argparse, json, math, random, re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, List, Tuple

# Intent names known to this script. The classifier derives its own label set
# from the training examples; this list is what train() records in report.json.
LABELS = [
    "count_words", "fibonacci", "factorial", "is_prime", "binary_search", "merge_sort",
    "read_json", "write_json", "filter", "map", "group_by", "safe_int", "dataclass", "class_stack",
    "explain_python", "identity_reading"
]

# Seed phrases per intent label. build_training_examples() combines each phrase
# with every entry of QUESTION_PREFIXES to produce labelled training sentences.
TEMPLATES = {
"count_words": ["count words", "word frequency", "count each word", "dictionary of words", "how many times words appear"],
"fibonacci": ["fibonacci", "fib number", "nth fibonacci", "sequence 0 1 1 2"],
"factorial": ["factorial", "multiply from 1 to n", "n!", "product of integers"],
"is_prime": ["prime", "is prime", "prime checker", "divisors", "number is prime"],
"binary_search": ["binary search", "search sorted list", "find target in sorted", "halves search space"],
"merge_sort": ["merge sort", "mergesort", "sort by merging", "divide and merge sort"],
"read_json": ["read json", "load json", "json from file", "open json file"],
"write_json": ["write json", "save json", "dump json", "json to file"],
"filter": ["filter", "keep only", "remove values", "select items", "numbers greater", "even numbers", "positive numbers"],
"map": ["map", "transform", "convert each", "square every", "apply to each", "return squares"],
"group_by": ["group by", "group items", "bucket by", "group words by length"],
"safe_int": ["safe int", "convert to int", "parse integer", "default if invalid"],
"dataclass": ["dataclass", "data class", "class with fields", "store user data"],
"class_stack": ["stack class", "push pop", "lifo", "class stack"],
"explain_python": ["explain python", "what is indentation", "how functions work", "syntax", "library"],
"identity_reading": ["who are you", "how did you learn to read", "why can you read", "your memory", "your birth"],
}

# Request openers prepended to seed phrases when synthesising training
# sentences in build_training_examples().
QUESTION_PREFIXES = [
    "write python to", "write a function that", "create code to", "give me code that",
    "show an example to", "i need a function to", "how do i", "can you", "make python",
]


def char_ngrams(text: str, nmin=2, nmax=5) -> List[str]:
    """Turn ``text`` into a flat list of character n-gram and word features.

    The text is lowercased and padded with one space on each side; every
    substring of length ``nmin`` through ``nmax`` is then emitted, shorter
    lengths first and left to right within a length. Word-like tokens from
    the unpadded lowercased text (identifier-style runs of two or more
    characters, or runs of digits) are appended after the n-grams.
    Duplicates are kept, so the result can be fed straight into a counter.
    """
    lowered = text.lower()
    padded = f" {lowered} "
    grams = [
        padded[start:start + size]
        for size in range(nmin, nmax + 1)
        # range() of a non-positive count is empty, so over-long sizes add nothing.
        for start in range(len(padded) - size + 1)
    ]
    tokens = re.findall(r"[a-zA-Z_][a-zA-Z_0-9]+|\d+", lowered)
    return grams + tokens


def build_training_examples(mult=40) -> List[Tuple[str,str]]:
    """Synthesise ``(sentence, label)`` training pairs.

    Every seed phrase in ``TEMPLATES`` is combined with every entry of
    ``QUESTION_PREFIXES``; extra filter/map variations are appended after
    that. The finished list is repeated ``mult`` times before being returned.
    """
    pairs: List[Tuple[str, str]] = []

    for intent, seeds in TEMPLATES.items():
        for seed in seeds:
            for prefix in QUESTION_PREFIXES:
                pairs.extend((
                    (f"{prefix} {seed}", intent),
                    (f"please {prefix} {seed} in python", intent),
                    # NOTE: this sentence does not use the prefix, so it is
                    # added once per prefix (i.e. repeated) for every seed.
                    (f"I want {seed}; use clean Python", intent),
                ))

    # Compositional variations for the "filter" intent.
    filter_targets = [
        "even numbers", "odd numbers", "positive numbers", "negative numbers",
        "numbers greater than 10", "numbers less than 5", "non empty strings", "items not none",
    ]
    for target in filter_targets:
        for prefix in QUESTION_PREFIXES:
            pairs.append((f"{prefix} filter {target} from a list", "filter"))
            pairs.append((f"{prefix} keep only {target}", "filter"))

    # Compositional variations for the "map" intent.
    map_targets = [
        "squares", "cubes", "absolute values", "strings from values",
        "lowercase words", "length of each word",
    ]
    for target in map_targets:
        for prefix in QUESTION_PREFIXES:
            pairs.append((f"{prefix} return {target}", "map"))
            pairs.append((f"{prefix} transform a list into {target}", "map"))

    return pairs * mult


class NBIntent:
    """Multinomial Naive Bayes intent classifier over ``char_ngrams`` features.

    All state is kept in plain counters and lists so the model can be written
    to and restored from a single JSON file. Add-one smoothing is used for
    both the label priors and the per-label feature likelihoods.
    """

    def __init__(self):
        self.label_counts = Counter()  # label -> number of training examples
        self.feat_counts = {}          # label -> Counter(feature -> occurrences)
        self.total_feats = Counter()   # label -> total feature occurrences
        self.vocab = set()             # every distinct feature seen in fit()
        self.labels = []               # sorted label names

    def fit(self, examples: List[Tuple[str,str]]):
        """Estimate counts from ``(text, label)`` pairs, replacing any earlier fit.

        All statistics are rebuilt from scratch so that calling ``fit`` twice
        does not leave priors/vocabulary accumulated from the first call next
        to feature counts from the second.
        """
        self.labels = sorted({label for _, label in examples})
        self.label_counts = Counter()
        self.total_feats = Counter()
        self.vocab = set()
        self.feat_counts = {label: Counter() for label in self.labels}
        for text, label in examples:
            feats = char_ngrams(text)
            self.label_counts[label] += 1
            self.feat_counts[label].update(feats)
            self.total_feats[label] += len(feats)
            self.vocab.update(feats)

    def predict_proba(self, text: str) -> List[Tuple[str,float]]:
        """Return ``(label, probability)`` pairs, most probable first.

        Raises:
            ValueError: if the model has no labels (neither fitted nor loaded).
        """
        if not self.labels:
            raise ValueError(
                "NBIntent has no labels; call fit() or load() before predict_proba()"
            )
        feats = char_ngrams(text)
        vocab_size = len(self.vocab) + 1
        prior_denom = sum(self.label_counts.values()) + len(self.labels)
        scores = []
        for label in self.labels:
            logp = math.log((self.label_counts[label] + 1) / prior_denom)
            denom = self.total_feats[label] + vocab_size
            counts = self.feat_counts[label]
            for feat in feats:
                logp += math.log((counts.get(feat, 0) + 1) / denom)
            scores.append((label, logp))
        # Subtract the best log-score before exponentiating to avoid underflow.
        best = max(score for _, score in scores)
        weights = [(label, math.exp(score - best)) for label, score in scores]
        total = sum(weight for _, weight in weights) or 1
        return sorted(
            [(label, weight / total) for label, weight in weights],
            key=lambda pair: -pair[1],
        )

    def save(self, path: Path):
        """Write the model to ``path`` as UTF-8 JSON."""
        data = {
            "label_counts": dict(self.label_counts),
            "feat_counts": {label: dict(counts) for label, counts in self.feat_counts.items()},
            "total_feats": dict(self.total_feats),
            # Sorted so the saved file is identical across runs (set order is not).
            "vocab": sorted(self.vocab),
            "labels": self.labels,
        }
        path.write_text(json.dumps(data), encoding='utf-8')

    @classmethod
    def load(cls, path: Path):
        """Rebuild a model from a JSON file produced by ``save``."""
        data = json.loads(path.read_text(encoding='utf-8'))
        model = cls()
        model.label_counts = Counter(data['label_counts'])
        model.feat_counts = {label: Counter(counts) for label, counts in data['feat_counts'].items()}
        model.total_feats = Counter(data['total_feats'])
        model.vocab = set(data['vocab'])
        model.labels = data['labels']
        return model


def parse_filter(prompt: str) -> Tuple[str,str,str,str]:
    """Derive the pieces of a filter function from a natural-language prompt.

    Returns:
        ``(fname, var, item, cond)``: the generated function's name, its list
        parameter name, the loop variable name, and the boolean expression
        (written in terms of the loop variable) that keeps an element.

    Keyword priority is: even, odd, positive, negative, "not none",
    "non empty"/"not empty", then numeric "greater than N" / "less than N"
    bounds. With no recognised keyword the element's own truthiness is used.
    Numeric predicates always iterate ``numbers`` with loop variable ``x`` so
    the condition never references a name the loop does not define.
    """
    p = prompt.lower()
    # Hyphen-insensitive copy for keyword checks ("non-empty", "not-none").
    # The threshold regexes keep using ``p`` so "-5" stays a negative number.
    words = p.replace("-", " ")

    if "string" in words or "word" in words:
        fname, var, item = "filter_strings", "strings", "s"
    else:
        fname, var, item = "filter_numbers", "numbers", "x"
    cond = item  # default: keep truthy elements

    numeric_rules = (
        ("even", "x % 2 == 0", "filter_even_numbers"),
        ("odd", "x % 2 != 0", "filter_odd_numbers"),
        ("positive", "x > 0", "filter_positive_numbers"),
        ("negative", "x < 0", "filter_negative_numbers"),
    )
    for keyword, rule_cond, rule_name in numeric_rules:
        if keyword in words:
            return rule_name, "numbers", "x", rule_cond

    if "not none" in words:
        return "filter_not_none", "items", "x", "x is not None"
    if "non empty" in words or "not empty" in words:
        return "filter_non_empty_strings", "strings", "s", "bool(s)"

    # Numeric bounds; when both are present they are combined instead of the
    # later one silently replacing the earlier one.
    conds = []
    name_parts = []
    for phrase, op in (("greater than", ">"), ("less than", "<")):
        m = re.search(phrase + r"\s+(-?\d+)", p)
        if m:
            bound = m.group(1)
            conds.append(f"x {op} {bound}")
            name_parts.append(f"{phrase.replace(' ', '_')}_{bound.replace('-', 'minus_')}")
    if conds:
        return "filter_" + "_and_".join(name_parts), "numbers", "x", " and ".join(conds)

    return fname, var, item, cond


def parse_map(prompt: str) -> Tuple[str,str,str,str]:
    """Derive the pieces of a map function from a natural-language prompt.

    Returns:
        ``(fname, var, item, expr)``: the generated function's name, its list
        parameter name, the loop variable name, and the expression appended
        for each element. With no recognised keyword the identity mapping
        ``("map_values", "values", "x", "x")`` is returned.

    Rules are tried in order and matched as substrings of the lowercased
    prompt. The word-specific operations ("lower", "length") are tried before
    the generic "string" conversion, so prompts such as "convert strings to
    lowercase" or "length of each string" are not swallowed by ``str(x)``.
    """
    p = prompt.lower()
    # (keywords, fname, var, item, expr)
    rules = (
        (("square",), "square_numbers", "numbers", "x", "x * x"),
        (("cube",), "cube_numbers", "numbers", "x", "x * x * x"),
        (("absolute", "abs"), "absolute_values", "numbers", "x", "abs(x)"),
        (("lower",), "lowercase_words", "words", "word", "word.lower()"),
        (("length", "len"), "word_lengths", "words", "word", "len(word)"),
        (("string", "str"), "values_to_strings", "values", "x", "str(x)"),
    )
    for keywords, fname, var, item, expr in rules:
        if any(keyword in p for keyword in keywords):
            return fname, var, item, expr
    return "map_values", "values", "x", "x"


# Answers that do not depend on the prompt text, keyed by intent label.
_FIXED_ANSWERS = {
    "count_words": '''```python
def count_words(text: str) -> dict[str, int]:
    counts = {}
    for raw in text.lower().split():
        word = raw.strip(".,!?;:\\"'")
        if word:
            counts[word] = counts.get(word, 0) + 1
    return counts
```''',
    "fibonacci": '''```python
def fibonacci(n: int) -> int:
    if n < 0:
        raise ValueError("n must be non-negative")
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a
```''',
    "factorial": '''```python
def factorial(n: int) -> int:
    if n < 0:
        raise ValueError("n must be non-negative")
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result
```''',
    "is_prime": '''```python
def is_prime(n: int) -> bool:
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    d = 3
    while d * d <= n:
        if n % d == 0:
            return False
        d += 2
    return True
```''',
    "binary_search": '''```python
def binary_search(items, target):
    low, high = 0, len(items) - 1
    while low <= high:
        mid = (low + high) // 2
        if items[mid] == target:
            return mid
        if items[mid] < target:
            low = mid + 1
        else:
            high = mid - 1
    return -1
```''',
    "merge_sort": '''```python
def merge_sort(values):
    if len(values) <= 1:
        return values
    mid = len(values) // 2
    left = merge_sort(values[:mid])
    right = merge_sort(values[mid:])
    return merge(left, right)

def merge(left, right):
    result = []
    i = j = 0
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            result.append(left[i]); i += 1
        else:
            result.append(right[j]); j += 1
    result.extend(left[i:]); result.extend(right[j:])
    return result
```
Complexity: O(n log n) time and O(n) extra memory.''',
    "read_json": '''```python
import json

def read_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
```''',
    "write_json": '''```python
import json

def write_json(path: str, data) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
```''',
    "group_by": '''```python
def group_by(items, key_func):
    groups = {}
    for item in items:
        key = key_func(item)
        groups.setdefault(key, []).append(item)
    return groups
```''',
    "safe_int": '''```python
def safe_int(value, default=0):
    try:
        return int(value)
    except (TypeError, ValueError):
        return default
```''',
    "dataclass": '''```python
from dataclasses import dataclass

@dataclass
class User:
    name: str
    age: int

    def is_adult(self) -> bool:
        return self.age >= 18
```''',
    "class_stack": '''```python
class Stack:
    def __init__(self):
        self.items = []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        if not self.items:
            return None
        return self.items.pop()
```''',
    "identity_reading": "I am a small learned Python assistant. I learned by counting character patterns in many questions and code examples, plus a tiny neural GRU trained to predict every next character. I do not have real consciousness, but I can use learned patterns to compose answers.",
}


def code_for(label: str, prompt: str) -> str:
    """Return the answer text for an intent ``label``.

    Most labels map to a fixed Markdown-fenced snippet (or plain sentence).
    The ``"filter"`` and ``"map"`` labels are composed from the pieces that
    ``parse_filter`` / ``parse_map`` extract from ``prompt``. Any other label,
    including ``"explain_python"``, yields a generic explanatory sentence.
    """
    fixed = _FIXED_ANSWERS.get(label)
    if fixed is not None:
        return fixed

    if label == "filter":
        fname, var, item, cond = parse_filter(prompt)
        return "\n".join([
            "```python",
            f"def {fname}({var}):",
            "    result = []",
            f"    for {item} in {var}:",
            f"        if {cond}:",
            f"            result.append({item})",
            "    return result",
            "```",
        ])

    if label == "map":
        fname, var, item, expr = parse_map(prompt)
        return "\n".join([
            "```python",
            f"def {fname}({var}):",
            "    result = []",
            f"    for {item} in {var}:",
            f"        result.append({expr})",
            "    return result",
            "```",
        ])

    return "Python uses indentation, names, expressions, functions, classes, modules, and exceptions. Ask for a concrete task and I will compose code."


def train(out: Path):
    """Fit the intent model and write its artifacts into directory ``out``.

    Creates ``out`` if needed, then writes ``intent_nb.json`` (the model),
    ``training_examples.json`` (the first 2000 training pairs) and
    ``report.json`` (summary statistics). The report is also printed.
    """
    out.mkdir(parents=True, exist_ok=True)
    examples = build_training_examples(mult=30)

    model = NBIntent()
    model.fit(examples)
    model.save(out / 'intent_nb.json')

    sample_text = json.dumps(examples[:2000], indent=2)
    (out / 'training_examples.json').write_text(sample_text, encoding='utf-8')

    report = {
        "examples": len(examples),
        "labels": LABELS,
        "features": len(model.vocab),
        "type": "char_ngram_naive_bayes_plus_compositional_generator",
    }
    report_text = json.dumps(report, indent=2)
    (out / 'report.json').write_text(report_text, encoding='utf-8')
    print(report_text)


def ask(out: Path, prompt: str):
    """Classify ``prompt`` with the saved model in ``out`` and print the answer.

    Loads ``intent_nb.json`` from ``out``, ranks the intents, and prints a
    short "Learned reasoning" section (top four intents and the chosen one)
    followed by the answer produced by ``code_for`` for the top intent.
    """
    model = NBIntent.load(out / 'intent_nb.json')
    ranked = model.predict_proba(prompt)
    label, _confidence = ranked[0]

    top_four = ', '.join(f'{name}={prob:.2f}' for name, prob in ranked[:4])
    reasoning = [
        "Read the request as characters and word fragments.",
        f"Top learned intents: {top_four}.",
        f"Selected intent: {label}.",
    ]
    # Build the answer before printing anything, so a failure here emits no partial output.
    answer = code_for(label, prompt)

    print("## Learned reasoning")
    for step in reasoning:
        print(f"- {step}")
    print("\n## Answer")
    print(answer)


def main():
    """Command-line entry point: parse arguments and run ``train`` or ``ask``."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'ask'], default='ask')
    parser.add_argument('--out', default='outputs/real_python_learner')
    parser.add_argument('--prompt', default='write a function that filters even numbers from a list')
    args = parser.parse_args()

    out = Path(args.out)
    if args.mode == 'train':
        train(out)
    else:
        ask(out, args.prompt)


if __name__ == '__main__':
    main()