Spaces:
Running
Running
| """Knowledge Base for detecting and learning unknown concepts""" | |
| import json | |
| import os | |
| import re | |
| from typing import List, Dict, Set, Optional, Tuple | |
| from datetime import datetime | |
| from config import KNOWLEDGE_PATH, DATA_DIR | |
class KnowledgeBase:
    """Manages known and unknown programming concepts.

    The knowledge base starts from a built-in table of Python keywords,
    built-in functions, standard-library modules and common third-party
    libraries, then merges in whatever was previously persisted as JSON.
    Names that are not known but look like library names can be queued
    for learning with ``add_unknown`` and promoted with ``teach_concept``.

    Args:
        path: Optional location of the JSON store. When omitted, the
            module-level ``KNOWLEDGE_PATH`` is used (looked up when the
            file is actually read or written).
    """

    # Words that check_knowledge() ignores outright.
    _STOP_WORDS = frozenset({'a', 'an', 'the', 'is', 'it', 'to', 'of'})

    # Everyday variable names that must never be reported as libraries.
    _COMMON_VARS = frozenset({
        'self', 'cls', 'args', 'kwargs', 'result', 'data', 'value',
        'item', 'items', 'key', 'keys', 'val', 'vals', 'obj', 'func',
        'arr', 'lst', 'num', 'count', 'index', 'idx', 'temp', 'tmp',
        'i', 'j', 'k', 'n', 'm', 'x', 'y', 'z', 'a', 'b', 'c',
    })

    # ASCII identifier pattern used to tokenize code.
    _IDENTIFIER_RE = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')

    # Per-instance storage override; None means "use KNOWLEDGE_PATH".
    _path: Optional[str] = None

    def __init__(self, path: Optional[str] = None):
        self._path = path
        self.known_concepts: Dict[str, dict] = {}
        self.unknown_concepts: Dict[str, dict] = {}
        self.concept_examples: Dict[str, List[str]] = {}
        self.learning_queue: List[dict] = []
        # Built-in Python knowledge
        self._init_builtin_knowledge()
        # Load saved knowledge
        self.load()

    def _storage_path(self) -> str:
        """Return the file path used by ``save`` and ``load``."""
        # KNOWLEDGE_PATH is resolved here, not at class-definition time, so
        # an explicit path never requires the module constant to exist.
        chosen = KNOWLEDGE_PATH if self._path is None else self._path
        return os.fspath(chosen)

    def _init_builtin_knowledge(self):
        """Populate ``known_concepts`` with the built-in Python vocabulary."""
        # Python keywords
        python_keywords = [
            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try',
            'except', 'finally', 'with', 'as', 'import', 'from', 'return',
            'yield', 'raise', 'pass', 'break', 'continue', 'lambda', 'and',
            'or', 'not', 'in', 'is', 'None', 'True', 'False', 'global',
            'nonlocal', 'assert', 'del', 'async', 'await',
        ]
        # Python built-in functions ('file' is a Python 2 name; it is kept so
        # that it is not reported as an unknown library)
        builtin_functions = [
            'print', 'len', 'range', 'int', 'str', 'float', 'list', 'dict',
            'set', 'tuple', 'bool', 'type', 'isinstance', 'issubclass',
            'hasattr', 'getattr', 'setattr', 'delattr', 'callable', 'iter',
            'next', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed',
            'sum', 'min', 'max', 'abs', 'round', 'pow', 'divmod', 'input',
            'open', 'file', 'repr', 'hash', 'id', 'dir', 'vars', 'locals',
            'globals', 'eval', 'exec', 'compile', 'format', 'chr', 'ord',
            'bin', 'hex', 'oct', 'slice', 'object', 'super', 'property',
            'staticmethod', 'classmethod', 'all', 'any',
        ]
        # Python standard library modules
        stdlib_modules = [
            'os', 'sys', 'json', 're', 'math', 'random', 'datetime', 'time',
            'collections', 'itertools', 'functools', 'operator', 'string',
            'io', 'pathlib', 'shutil', 'glob', 'pickle', 'csv', 'sqlite3',
            'urllib', 'http', 'email', 'html', 'xml', 'logging', 'unittest',
            'threading', 'multiprocessing', 'subprocess', 'socket', 'asyncio',
            'typing', 'dataclasses', 'abc', 'copy', 'pprint', 'textwrap',
            'struct', 'codecs', 'unicodedata', 'hashlib', 'hmac', 'secrets',
        ]
        # Common third-party libraries
        common_libraries = {
            'numpy': 'Numerical computing library',
            'pandas': 'Data analysis library',
            'tensorflow': 'Machine learning framework',
            'pytorch': 'Deep learning framework',
            'torch': 'PyTorch deep learning',
            'keras': 'High-level neural network API',
            'sklearn': 'Machine learning library',
            'scikit-learn': 'Machine learning library',
            'matplotlib': 'Plotting library',
            'seaborn': 'Statistical visualization',
            'requests': 'HTTP library',
            'flask': 'Web framework',
            'django': 'Web framework',
            'fastapi': 'Modern web framework',
            'sqlalchemy': 'Database ORM',
            'beautifulsoup': 'Web scraping',
            'selenium': 'Browser automation',
            'pytest': 'Testing framework',
            'pillow': 'Image processing',
            'opencv': 'Computer vision',
            'cv2': 'OpenCV library',
            'scipy': 'Scientific computing',
            'nltk': 'Natural language toolkit',
            'spacy': 'NLP library',
            'transformers': 'Hugging Face transformers',
            'gradio': 'ML demo interface',
            'streamlit': 'Data app framework',
        }

        # Add to known concepts
        plain_groups = (
            (python_keywords, 'keyword', 'python_builtin'),
            (builtin_functions, 'function', 'python_builtin'),
            (stdlib_modules, 'module', 'python_stdlib'),
        )
        for names, concept_type, category in plain_groups:
            for name in names:
                self.known_concepts[name] = {
                    'type': concept_type,
                    'category': category,
                    'learned_at': 'builtin',
                }
        for lib, desc in common_libraries.items():
            self.known_concepts[lib] = {
                'type': 'library',
                'category': 'third_party',
                'description': desc,
                'learned_at': 'builtin',
            }

    def save(self):
        """Save the knowledge base to its JSON file.

        The data is written to a sibling temporary file and then moved over
        the real file, so an interrupted write cannot leave a truncated
        store behind. The parent directory is created if it is missing.

        Raises:
            OSError: If the directory or file cannot be written.
        """
        data = {
            'known_concepts': self.known_concepts,
            'unknown_concepts': self.unknown_concepts,
            'concept_examples': self.concept_examples,
            'learning_queue': self.learning_queue,
        }
        path = self._storage_path()
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        tmp_path = path + '.tmp'
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, default=str)
            os.replace(tmp_path, path)
        finally:
            # Only still present when the dump or the replace failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def load(self):
        """Load the knowledge base from its JSON file, if one exists.

        Loading is best-effort: an unreadable or malformed file is reported
        on stdout and the in-memory state is left as it was. Saved known
        concepts never overwrite built-in entries. The unknown concepts,
        examples and learning queue are replaced by the saved values, or
        reset to empty when the saved section is missing or has the wrong
        type.
        """
        path = self._storage_path()
        if not os.path.exists(path):
            return

        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON and text-decoding errors.
            print(f"Error loading knowledge base: {e}")
            return

        if not isinstance(data, dict):
            print("Error loading knowledge base: expected a JSON object")
            return

        # Merge with built-in (don't overwrite)
        saved_known = data.get('known_concepts')
        if isinstance(saved_known, dict):
            for name, info in saved_known.items():
                self.known_concepts.setdefault(name, info)

        saved_unknown = data.get('unknown_concepts')
        self.unknown_concepts = (
            saved_unknown if isinstance(saved_unknown, dict) else {}
        )

        saved_examples = data.get('concept_examples')
        self.concept_examples = (
            saved_examples if isinstance(saved_examples, dict) else {}
        )

        saved_queue = data.get('learning_queue')
        if isinstance(saved_queue, list):
            # Entries that are not dicts cannot be processed later on.
            self.learning_queue = [
                entry for entry in saved_queue if isinstance(entry, dict)
            ]
        else:
            self.learning_queue = []

    def extract_concepts(self, code: str) -> Set[str]:
        """Return the set of ASCII identifiers that occur in ``code``.

        Imported module names and ``def``/``class`` names are identifiers
        themselves, so a single identifier scan already yields them.
        Empty input yields an empty set.
        """
        if not code:
            return set()
        return set(self._IDENTIFIER_RE.findall(code))

    def check_knowledge(self, text: str) -> Tuple[List[str], List[str]]:
        """Split the identifiers found in ``text`` into known and unknown.

        Single-character names and a small set of stop words are ignored.
        A name counts as known when it, or its lowercase form, is a key of
        ``known_concepts``. Any other name is reported as unknown only when
        it looks like a library name; the rest are dropped.

        Returns:
            ``(known, unknown)``, each sorted alphabetically so the result
            is the same from one run to the next.
        """
        known = []
        unknown = []
        for concept in sorted(self.extract_concepts(text)):
            lowered = concept.lower()
            # Skip very short or common words
            if len(concept) < 2 or lowered in self._STOP_WORDS:
                continue
            if lowered in self.known_concepts or concept in self.known_concepts:
                known.append(concept)
            elif self._looks_like_library(concept):
                unknown.append(concept)
        return known, unknown

    def _looks_like_library(self, name: str) -> bool:
        """Return True if ``name`` looks like a library/module name.

        A name qualifies when it is entirely lowercase and longer than three
        characters, and is not a common variable name, an ALL_CAPS constant,
        or a ``_private`` / ``__dunder__`` name.
        """
        # Skip common variable names
        if name.lower() in self._COMMON_VARS:
            return False
        # Skip ALL_CAPS (likely constants)
        if name.isupper():
            return False
        # Skip _private and __dunder__
        if name.startswith('_'):
            return False
        return name.islower() and len(name) > 3

    def add_unknown(self, concept: str, context: str = ""):
        """Record a sighting of an unknown concept and persist the change.

        The first sighting creates the entry and appends the concept to the
        learning queue. Later sightings increment ``times_seen`` and record
        ``context`` if it is non-empty and not already stored.
        """
        entry = self.unknown_concepts.get(concept)
        if entry is None:
            now = datetime.now().isoformat()
            self.unknown_concepts[concept] = {
                'first_seen': now,
                'times_seen': 1,
                'contexts': [context] if context else [],
                'status': 'pending',
            }
            # Add to learning queue
            self.learning_queue.append({
                'concept': concept,
                'context': context,
                'timestamp': now,
            })
        else:
            # Entries loaded from disk may lack fields; fill them in here.
            entry['times_seen'] = entry.get('times_seen', 0) + 1
            contexts = entry.setdefault('contexts', [])
            if context and context not in contexts:
                contexts.append(context)
        self.save()

    def teach_concept(
        self,
        concept: str,
        concept_type: str,
        description: str,
        example_code: str,
        category: str = "user_taught"
    ):
        """Teach the model a new concept and persist the change.

        The concept is stored (or overwritten) in ``known_concepts``,
        ``example_code`` is appended to its examples, and it is removed from
        the unknown concepts and the learning queue.

        Returns:
            Always ``True``.
        """
        # Add to known concepts
        self.known_concepts[concept] = {
            'type': concept_type,
            'category': category,
            'description': description,
            'learned_at': datetime.now().isoformat(),
        }
        # Add example
        self.concept_examples.setdefault(concept, []).append(example_code)
        # Remove from unknown
        self.unknown_concepts.pop(concept, None)
        # Remove from learning queue (.get tolerates entries without the key)
        self.learning_queue = [
            item for item in self.learning_queue
            if item.get('concept') != concept
        ]
        self.save()
        return True

    def get_learning_queue(self) -> List[dict]:
        """Get concepts waiting to be learned (the live list, not a copy)."""
        return self.learning_queue

    def get_unknown_concepts(self) -> Dict[str, dict]:
        """Get all unknown concepts (the live dict, not a copy)."""
        return self.unknown_concepts

    def get_example_code(self, concept: str) -> List[str]:
        """Get the example code for a concept; empty list if there is none."""
        return self.concept_examples.get(concept, [])

    def get_all_examples(self) -> str:
        """Get all example code as one string for training.

        Each example is prefixed with a ``# Example of <concept>`` line and
        examples are separated by a blank line.
        """
        all_code = [
            f"# Example of {concept}\n{example}"
            for concept, examples in self.concept_examples.items()
            for example in examples
        ]
        return "\n\n".join(all_code)

    def get_statistics(self) -> dict:
        """Get knowledge base statistics as a dict of counts."""
        return {
            'known_concepts': len(self.known_concepts),
            'unknown_concepts': len(self.unknown_concepts),
            'concepts_with_examples': len(self.concept_examples),
            'total_examples': sum(len(v) for v in self.concept_examples.values()),
            'learning_queue_size': len(self.learning_queue),
            'categories': self._count_categories(),
        }

    def _count_categories(self) -> dict:
        """Count known concepts by category.

        Entries without a ``category`` (or that are not dicts) are counted
        under ``'unknown'``.
        """
        categories = {}
        for info in self.known_concepts.values():
            cat = info.get('category', 'unknown') if isinstance(info, dict) else 'unknown'
            categories[cat] = categories.get(cat, 0) + 1
        return categories
# Global knowledge base instance, shared by every importer of this module.
# Constructing it runs KnowledgeBase.__init__, which registers the built-in
# concepts and then loads any previously saved knowledge.
knowledge_base = KnowledgeBase()