"""Knowledge Base for detecting and learning unknown concepts""" import json import os import re from typing import List, Dict, Set, Optional, Tuple from datetime import datetime from config import KNOWLEDGE_PATH, DATA_DIR class KnowledgeBase: """Manages known and unknown programming concepts""" def __init__(self): self.known_concepts: Dict[str, dict] = {} self.unknown_concepts: Dict[str, dict] = {} self.concept_examples: Dict[str, List[str]] = {} self.learning_queue: List[dict] = [] # Built-in Python knowledge self._init_builtin_knowledge() # Load saved knowledge self.load() def _init_builtin_knowledge(self): """Initialize with Python built-in knowledge""" # Python keywords python_keywords = [ 'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except', 'finally', 'with', 'as', 'import', 'from', 'return', 'yield', 'raise', 'pass', 'break', 'continue', 'lambda', 'and', 'or', 'not', 'in', 'is', 'None', 'True', 'False', 'global', 'nonlocal', 'assert', 'del', 'async', 'await' ] # Python built-in functions builtin_functions = [ 'print', 'len', 'range', 'int', 'str', 'float', 'list', 'dict', 'set', 'tuple', 'bool', 'type', 'isinstance', 'issubclass', 'hasattr', 'getattr', 'setattr', 'delattr', 'callable', 'iter', 'next', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed', 'sum', 'min', 'max', 'abs', 'round', 'pow', 'divmod', 'input', 'open', 'file', 'repr', 'hash', 'id', 'dir', 'vars', 'locals', 'globals', 'eval', 'exec', 'compile', 'format', 'chr', 'ord', 'bin', 'hex', 'oct', 'slice', 'object', 'super', 'property', 'staticmethod', 'classmethod', 'all', 'any' ] # Python standard library modules stdlib_modules = [ 'os', 'sys', 'json', 're', 'math', 'random', 'datetime', 'time', 'collections', 'itertools', 'functools', 'operator', 'string', 'io', 'pathlib', 'shutil', 'glob', 'pickle', 'csv', 'sqlite3', 'urllib', 'http', 'email', 'html', 'xml', 'logging', 'unittest', 'threading', 'multiprocessing', 'subprocess', 'socket', 'asyncio', 'typing', 'dataclasses', 'abc', 'copy', 'pprint', 'textwrap', 'struct', 'codecs', 'unicodedata', 'hashlib', 'hmac', 'secrets' ] # Common third-party libraries common_libraries = { 'numpy': 'Numerical computing library', 'pandas': 'Data analysis library', 'tensorflow': 'Machine learning framework', 'pytorch': 'Deep learning framework', 'torch': 'PyTorch deep learning', 'keras': 'High-level neural network API', 'sklearn': 'Machine learning library', 'scikit-learn': 'Machine learning library', 'matplotlib': 'Plotting library', 'seaborn': 'Statistical visualization', 'requests': 'HTTP library', 'flask': 'Web framework', 'django': 'Web framework', 'fastapi': 'Modern web framework', 'sqlalchemy': 'Database ORM', 'beautifulsoup': 'Web scraping', 'selenium': 'Browser automation', 'pytest': 'Testing framework', 'pillow': 'Image processing', 'opencv': 'Computer vision', 'cv2': 'OpenCV library', 'scipy': 'Scientific computing', 'nltk': 'Natural language toolkit', 'spacy': 'NLP library', 'transformers': 'Hugging Face transformers', 'gradio': 'ML demo interface', 'streamlit': 'Data app framework' } # Add to known concepts for kw in python_keywords: self.known_concepts[kw] = { 'type': 'keyword', 'category': 'python_builtin', 'learned_at': 'builtin' } for func in builtin_functions: self.known_concepts[func] = { 'type': 'function', 'category': 'python_builtin', 'learned_at': 'builtin' } for mod in stdlib_modules: self.known_concepts[mod] = { 'type': 'module', 'category': 'python_stdlib', 'learned_at': 'builtin' } for lib, desc in common_libraries.items(): self.known_concepts[lib] = { 'type': 'library', 'category': 'third_party', 'description': desc, 'learned_at': 'builtin' } def save(self): """Save knowledge base to file""" data = { 'known_concepts': self.known_concepts, 'unknown_concepts': self.unknown_concepts, 'concept_examples': self.concept_examples, 'learning_queue': self.learning_queue } with open(KNOWLEDGE_PATH, 'w') as f: json.dump(data, f, indent=2, default=str) def load(self): """Load knowledge base from file""" if os.path.exists(KNOWLEDGE_PATH): try: with open(KNOWLEDGE_PATH, 'r') as f: data = json.load(f) # Merge with built-in (don't overwrite) saved_known = data.get('known_concepts', {}) for k, v in saved_known.items(): if k not in self.known_concepts: self.known_concepts[k] = v self.unknown_concepts = data.get('unknown_concepts', {}) self.concept_examples = data.get('concept_examples', {}) self.learning_queue = data.get('learning_queue', []) except Exception as e: print(f"Error loading knowledge base: {e}") def extract_concepts(self, code: str) -> Set[str]: """Extract programming concepts from code""" concepts = set() # Extract identifiers identifiers = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*', code) concepts.update(identifiers) # Extract import statements imports = re.findall(r'(?:from|import)\s+([a-zA-Z_][a-zA-Z0-9_]*)', code) concepts.update(imports) # Extract function/class names definitions = re.findall(r'(?:def|class)\s+([a-zA-Z_][a-zA-Z0-9_]*)', code) concepts.update(definitions) return concepts def check_knowledge(self, text: str) -> Tuple[List[str], List[str]]: """Check text for known and unknown concepts""" concepts = self.extract_concepts(text) known = [] unknown = [] for concept in concepts: # Skip very short or common words if len(concept) < 2: continue if concept.lower() in ['a', 'an', 'the', 'is', 'it', 'to', 'of']: continue if concept.lower() in self.known_concepts or concept in self.known_concepts: known.append(concept) else: # Check if it looks like a library/framework name if self._looks_like_library(concept): unknown.append(concept) return known, unknown def _looks_like_library(self, name: str) -> bool: """Check if a name looks like a library/module name""" # Skip common variable names common_vars = [ 'self', 'cls', 'args', 'kwargs', 'result', 'data', 'value', 'item', 'items', 'key', 'keys', 'val', 'vals', 'obj', 'func', 'arr', 'lst', 'num', 'count', 'index', 'idx', 'temp', 'tmp', 'i', 'j', 'k', 'n', 'm', 'x', 'y', 'z', 'a', 'b', 'c' ] if name.lower() in common_vars: return False # Skip ALL_CAPS (likely constants) if name.isupper(): return False # Skip _private and __dunder__ if name.startswith('_'): return False # Looks like a library if: # - lowercase # - contains underscore # - known library patterns if name.islower() and len(name) > 3: return True return False def add_unknown(self, concept: str, context: str = ""): """Add an unknown concept to the learning queue""" if concept not in self.unknown_concepts: self.unknown_concepts[concept] = { 'first_seen': datetime.now().isoformat(), 'times_seen': 1, 'contexts': [context] if context else [], 'status': 'pending' } # Add to learning queue self.learning_queue.append({ 'concept': concept, 'context': context, 'timestamp': datetime.now().isoformat() }) else: self.unknown_concepts[concept]['times_seen'] += 1 if context and context not in self.unknown_concepts[concept]['contexts']: self.unknown_concepts[concept]['contexts'].append(context) self.save() def teach_concept( self, concept: str, concept_type: str, description: str, example_code: str, category: str = "user_taught" ): """Teach the model a new concept""" # Add to known concepts self.known_concepts[concept] = { 'type': concept_type, 'category': category, 'description': description, 'learned_at': datetime.now().isoformat() } # Add example if concept not in self.concept_examples: self.concept_examples[concept] = [] self.concept_examples[concept].append(example_code) # Remove from unknown if concept in self.unknown_concepts: del self.unknown_concepts[concept] # Remove from learning queue self.learning_queue = [ item for item in self.learning_queue if item['concept'] != concept ] self.save() return True def get_learning_queue(self) -> List[dict]: """Get concepts waiting to be learned""" return self.learning_queue def get_unknown_concepts(self) -> Dict[str, dict]: """Get all unknown concepts""" return self.unknown_concepts def get_example_code(self, concept: str) -> List[str]: """Get example code for a concept""" return self.concept_examples.get(concept, []) def get_all_examples(self) -> str: """Get all example code for training""" all_code = [] for concept, examples in self.concept_examples.items(): for example in examples: all_code.append(f"# Example of {concept}\n{example}") return "\n\n".join(all_code) def get_statistics(self) -> dict: """Get knowledge base statistics""" return { 'known_concepts': len(self.known_concepts), 'unknown_concepts': len(self.unknown_concepts), 'concepts_with_examples': len(self.concept_examples), 'total_examples': sum(len(v) for v in self.concept_examples.values()), 'learning_queue_size': len(self.learning_queue), 'categories': self._count_categories() } def _count_categories(self) -> dict: """Count concepts by category""" categories = {} for concept, info in self.known_concepts.items(): cat = info.get('category', 'unknown') categories[cat] = categories.get(cat, 0) + 1 return categories # Global knowledge base instance knowledge_base = KnowledgeBase()