# veda-programming / knowledge_base.py
# vedaco's picture
# Create knowledge_base.py
# ada3e2e verified
"""Knowledge Base for detecting and learning unknown concepts"""
import json
import os
import re
from typing import List, Dict, Set, Optional, Tuple
from datetime import datetime
from config import KNOWLEDGE_PATH, DATA_DIR
class KnowledgeBase:
    """Manages known and unknown programming concepts.

    The instance keeps four pieces of state, all persisted as one JSON
    document at ``KNOWLEDGE_PATH``:

    * ``known_concepts``   -- name -> metadata dict (built-ins plus taught ones)
    * ``unknown_concepts`` -- name -> sighting metadata for unrecognised names
    * ``concept_examples`` -- name -> list of example code strings
    * ``learning_queue``   -- list of pending ``{'concept', 'context',
      'timestamp'}`` records
    """

    # Words that check_knowledge() ignores outright.
    _STOP_WORDS = frozenset({'a', 'an', 'the', 'is', 'it', 'to', 'of'})

    # Names that are treated as ordinary variables, never as library names.
    _COMMON_VARS = frozenset({
        'self', 'cls', 'args', 'kwargs', 'result', 'data', 'value',
        'item', 'items', 'key', 'keys', 'val', 'vals', 'obj', 'func',
        'arr', 'lst', 'num', 'count', 'index', 'idx', 'temp', 'tmp',
        'i', 'j', 'k', 'n', 'm', 'x', 'y', 'z', 'a', 'b', 'c',
    })

    # Compiled once; used by extract_concepts() on every call.
    _IDENTIFIER_RE = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')

    def __init__(self):
        """Seed the built-in concepts, then merge in whatever was saved."""
        self.known_concepts: Dict[str, dict] = {}
        self.unknown_concepts: Dict[str, dict] = {}
        self.concept_examples: Dict[str, List[str]] = {}
        self.learning_queue: List[dict] = []
        # Built-ins go in first so load() cannot overwrite them.
        self._init_builtin_knowledge()
        self.load()

    def _init_builtin_knowledge(self):
        """Register Python keywords, built-ins, stdlib modules and libraries."""
        python_keywords = [
            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try',
            'except', 'finally', 'with', 'as', 'import', 'from', 'return',
            'yield', 'raise', 'pass', 'break', 'continue', 'lambda', 'and',
            'or', 'not', 'in', 'is', 'None', 'True', 'False', 'global',
            'nonlocal', 'assert', 'del', 'async', 'await',
        ]
        builtin_functions = [
            'print', 'len', 'range', 'int', 'str', 'float', 'list', 'dict',
            'set', 'tuple', 'bool', 'type', 'isinstance', 'issubclass',
            'hasattr', 'getattr', 'setattr', 'delattr', 'callable', 'iter',
            'next', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed',
            'sum', 'min', 'max', 'abs', 'round', 'pow', 'divmod', 'input',
            'open', 'file', 'repr', 'hash', 'id', 'dir', 'vars', 'locals',
            'globals', 'eval', 'exec', 'compile', 'format', 'chr', 'ord',
            'bin', 'hex', 'oct', 'slice', 'object', 'super', 'property',
            'staticmethod', 'classmethod', 'all', 'any',
        ]
        stdlib_modules = [
            'os', 'sys', 'json', 're', 'math', 'random', 'datetime', 'time',
            'collections', 'itertools', 'functools', 'operator', 'string',
            'io', 'pathlib', 'shutil', 'glob', 'pickle', 'csv', 'sqlite3',
            'urllib', 'http', 'email', 'html', 'xml', 'logging', 'unittest',
            'threading', 'multiprocessing', 'subprocess', 'socket', 'asyncio',
            'typing', 'dataclasses', 'abc', 'copy', 'pprint', 'textwrap',
            'struct', 'codecs', 'unicodedata', 'hashlib', 'hmac', 'secrets',
        ]
        common_libraries = {
            'numpy': 'Numerical computing library',
            'pandas': 'Data analysis library',
            'tensorflow': 'Machine learning framework',
            'pytorch': 'Deep learning framework',
            'torch': 'PyTorch deep learning',
            'keras': 'High-level neural network API',
            'sklearn': 'Machine learning library',
            'scikit-learn': 'Machine learning library',
            'matplotlib': 'Plotting library',
            'seaborn': 'Statistical visualization',
            'requests': 'HTTP library',
            'flask': 'Web framework',
            'django': 'Web framework',
            'fastapi': 'Modern web framework',
            'sqlalchemy': 'Database ORM',
            'beautifulsoup': 'Web scraping',
            'selenium': 'Browser automation',
            'pytest': 'Testing framework',
            'pillow': 'Image processing',
            'opencv': 'Computer vision',
            'cv2': 'OpenCV library',
            'scipy': 'Scientific computing',
            'nltk': 'Natural language toolkit',
            'spacy': 'NLP library',
            'transformers': 'Hugging Face transformers',
            'gradio': 'ML demo interface',
            'streamlit': 'Data app framework',
        }

        # The three plain lists share one record shape; only type/category differ.
        for names, concept_type, category in (
            (python_keywords, 'keyword', 'python_builtin'),
            (builtin_functions, 'function', 'python_builtin'),
            (stdlib_modules, 'module', 'python_stdlib'),
        ):
            for name in names:
                self.known_concepts[name] = {
                    'type': concept_type,
                    'category': category,
                    'learned_at': 'builtin',
                }

        # Third-party libraries additionally carry a description.
        for lib, desc in common_libraries.items():
            self.known_concepts[lib] = {
                'type': 'library',
                'category': 'third_party',
                'description': desc,
                'learned_at': 'builtin',
            }

    def save(self):
        """Write the knowledge base to ``KNOWLEDGE_PATH`` as JSON.

        The document is written to a sibling ``.tmp`` file and then moved
        into place with ``os.replace``, so an interrupted write cannot leave
        a truncated file behind. The parent directory is created if needed.

        Raises:
            OSError: if the directory or file cannot be written.
        """
        data = {
            'known_concepts': self.known_concepts,
            'unknown_concepts': self.unknown_concepts,
            'concept_examples': self.concept_examples,
            'learning_queue': self.learning_queue,
        }
        directory = os.path.dirname(KNOWLEDGE_PATH)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Same directory as the target, so the final rename stays on one
        # filesystem.
        tmp_path = f"{KNOWLEDGE_PATH}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)
        os.replace(tmp_path, KNOWLEDGE_PATH)

    def load(self):
        """Load the knowledge base from ``KNOWLEDGE_PATH`` if it exists.

        Saved known concepts are merged in without overwriting entries that
        are already present (the built-ins). The other three sections replace
        the current state. Loading is best-effort: an unreadable or malformed
        file is reported with ``print`` and otherwise ignored, and any
        section with an unexpected type is treated as empty.
        """
        if not os.path.exists(KNOWLEDGE_PATH):
            return
        try:
            with open(KNOWLEDGE_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading knowledge base: {e}")
            return
        if not isinstance(data, dict):
            print("Error loading knowledge base: top-level JSON value is not an object")
            return

        # Merge with built-in (don't overwrite).
        saved_known = data.get('known_concepts')
        if isinstance(saved_known, dict):
            for name, info in saved_known.items():
                if isinstance(info, dict):
                    self.known_concepts.setdefault(name, info)

        saved_unknown = data.get('unknown_concepts')
        self.unknown_concepts = (
            {k: v for k, v in saved_unknown.items() if isinstance(v, dict)}
            if isinstance(saved_unknown, dict) else {}
        )

        saved_examples = data.get('concept_examples')
        self.concept_examples = (
            {k: v for k, v in saved_examples.items() if isinstance(v, list)}
            if isinstance(saved_examples, dict) else {}
        )

        saved_queue = data.get('learning_queue')
        self.learning_queue = (
            [item for item in saved_queue if isinstance(item, dict)]
            if isinstance(saved_queue, list) else []
        )

    def extract_concepts(self, code: str) -> Set[str]:
        """Return every identifier-like token found in ``code``.

        A single identifier scan is sufficient: names that follow
        ``import``/``from``/``def``/``class`` are themselves identifiers, so
        they are already part of the result.
        """
        return set(self._IDENTIFIER_RE.findall(code))

    def check_knowledge(self, text: str) -> Tuple[List[str], List[str]]:
        """Split the concepts found in ``text`` into known and unknown.

        Tokens shorter than two characters and a small stop-word list are
        skipped. A token counts as known when it, or its lowercase form, is
        a key of ``known_concepts``. Any other token is reported as unknown
        only if ``_looks_like_library`` accepts it.

        Returns:
            ``(known, unknown)``, each sorted alphabetically so the result
            does not depend on set iteration order.
        """
        known: List[str] = []
        unknown: List[str] = []
        for concept in sorted(self.extract_concepts(text)):
            lowered = concept.lower()
            if len(concept) < 2 or lowered in self._STOP_WORDS:
                continue
            if lowered in self.known_concepts or concept in self.known_concepts:
                known.append(concept)
            elif self._looks_like_library(concept):
                unknown.append(concept)
        return known, unknown

    def _looks_like_library(self, name: str) -> bool:
        """Heuristically decide whether ``name`` could be a library/module.

        Rejected: common variable names, ALL_CAPS names (likely constants)
        and names starting with an underscore. Accepted: anything else that
        is entirely lowercase and longer than three characters.
        """
        if name.lower() in self._COMMON_VARS:
            return False
        if name.isupper():
            return False
        if name.startswith('_'):
            return False
        return name.islower() and len(name) > 3

    def add_unknown(self, concept: str, context: str = ""):
        """Record a sighting of an unknown concept and persist the change.

        A first sighting creates the entry and appends a record to the
        learning queue. Later sightings bump ``times_seen`` and remember any
        new, non-empty ``context``.

        Args:
            concept: Name of the unrecognised concept.
            context: Optional text in which the concept was seen.
        """
        entry = self.unknown_concepts.get(concept)
        if entry is None:
            now = datetime.now().isoformat()
            self.unknown_concepts[concept] = {
                'first_seen': now,
                'times_seen': 1,
                'contexts': [context] if context else [],
                'status': 'pending',
            }
            self.learning_queue.append({
                'concept': concept,
                'context': context,
                'timestamp': datetime.now().isoformat(),
            })
        else:
            # Entries loaded from disk may lack these keys; fill them in
            # rather than failing with KeyError.
            entry['times_seen'] = entry.get('times_seen', 0) + 1
            contexts = entry.setdefault('contexts', [])
            if context and context not in contexts:
                contexts.append(context)
        self.save()

    def teach_concept(
        self,
        concept: str,
        concept_type: str,
        description: str,
        example_code: str,
        category: str = "user_taught"
    ):
        """Teach the model a new concept and persist the change.

        The concept is (re)registered as known, ``example_code`` is appended
        to its examples, and it is removed from both the unknown set and the
        learning queue.

        Args:
            concept: Name of the concept.
            concept_type: Stored under the ``'type'`` key.
            description: Stored under the ``'description'`` key.
            example_code: Example snippet to attach to the concept.
            category: Stored under the ``'category'`` key.

        Returns:
            Always ``True``.
        """
        self.known_concepts[concept] = {
            'type': concept_type,
            'category': category,
            'description': description,
            'learned_at': datetime.now().isoformat(),
        }
        self.concept_examples.setdefault(concept, []).append(example_code)
        self.unknown_concepts.pop(concept, None)
        self.learning_queue = [
            item for item in self.learning_queue
            if item.get('concept') != concept
        ]
        self.save()
        return True

    def get_learning_queue(self) -> List[dict]:
        """Return the live list of concepts waiting to be learned."""
        return self.learning_queue

    def get_unknown_concepts(self) -> Dict[str, dict]:
        """Return the live mapping of all unknown concepts."""
        return self.unknown_concepts

    def get_example_code(self, concept: str) -> List[str]:
        """Return the examples stored for ``concept`` (empty list if none)."""
        return self.concept_examples.get(concept, [])

    def get_all_examples(self) -> str:
        """Return every example, each with a header, joined by blank lines."""
        return "\n\n".join(
            f"# Example of {concept}\n{example}"
            for concept, examples in self.concept_examples.items()
            for example in examples
        )

    def get_statistics(self) -> dict:
        """Return size counters for each part of the knowledge base."""
        return {
            'known_concepts': len(self.known_concepts),
            'unknown_concepts': len(self.unknown_concepts),
            'concepts_with_examples': len(self.concept_examples),
            'total_examples': sum(len(v) for v in self.concept_examples.values()),
            'learning_queue_size': len(self.learning_queue),
            'categories': self._count_categories(),
        }

    def _count_categories(self) -> dict:
        """Count known concepts per ``'category'`` (missing -> ``'unknown'``)."""
        categories: Dict[str, int] = {}
        for info in self.known_concepts.values():
            cat = info.get('category', 'unknown')
            categories[cat] = categories.get(cat, 0) + 1
        return categories
# Global knowledge base instance.
# NOTE: constructed at import time; KnowledgeBase.__init__ seeds the built-in
# concepts and then calls load(), which reads KNOWLEDGE_PATH if that file exists.
knowledge_base = KnowledgeBase()