Spaces:
Running
Running
File size: 12,179 Bytes
ada3e2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 |
"""Knowledge Base for detecting and learning unknown concepts"""
import json
import os
import re
from typing import List, Dict, Set, Optional, Tuple
from datetime import datetime
from config import KNOWLEDGE_PATH, DATA_DIR
class KnowledgeBase:
    """Track which programming concepts are known, unknown, or queued to learn.

    State lives in four containers that are persisted together as one JSON
    document at ``KNOWLEDGE_PATH``:

    * ``known_concepts`` -- concept name -> metadata dict (``type``,
      ``category``, ``learned_at`` and, for some entries, ``description``).
    * ``unknown_concepts`` -- concept name -> sighting record (``first_seen``,
      ``times_seen``, ``contexts``, ``status``).
    * ``concept_examples`` -- concept name -> list of example code strings.
    * ``learning_queue`` -- list of ``{concept, context, timestamp}`` dicts,
      appended in the order concepts were first reported as unknown.
    """

    # Words that check_knowledge drops outright (reported as neither known
    # nor unknown).
    _STOP_WORDS = frozenset({'a', 'an', 'the', 'is', 'it', 'to', 'of'})

    # Everyday variable names that must never be reported as libraries.
    # Built once here instead of on every _looks_like_library call.
    _COMMON_VARS = frozenset({
        'self', 'cls', 'args', 'kwargs', 'result', 'data', 'value',
        'item', 'items', 'key', 'keys', 'val', 'vals', 'obj', 'func',
        'arr', 'lst', 'num', 'count', 'index', 'idx', 'temp', 'tmp',
        'i', 'j', 'k', 'n', 'm', 'x', 'y', 'z', 'a', 'b', 'c',
    })

    # Patterns used by extract_concepts, compiled once for the whole class.
    _IDENTIFIER_RE = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')
    _IMPORT_RE = re.compile(r'(?:from|import)\s+([a-zA-Z_][a-zA-Z0-9_]*)')
    _DEFINITION_RE = re.compile(r'(?:def|class)\s+([a-zA-Z_][a-zA-Z0-9_]*)')

    def __init__(self):
        """Seed the built-in knowledge, then merge in whatever was saved."""
        self.known_concepts: Dict[str, dict] = {}
        self.unknown_concepts: Dict[str, dict] = {}
        self.concept_examples: Dict[str, List[str]] = {}
        self.learning_queue: List[dict] = []

        # Built-ins go in first so load() can refuse to overwrite them.
        self._init_builtin_knowledge()
        self.load()

    def _init_builtin_knowledge(self):
        """Register Python keywords, built-ins, stdlib modules and common libraries.

        Every entry is stored in ``known_concepts`` with
        ``learned_at == 'builtin'``. The groups are written in the order
        keywords, functions, modules, libraries, so a name appearing in two
        groups keeps the metadata of the later one.
        """
        python_keywords = [
            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try',
            'except', 'finally', 'with', 'as', 'import', 'from', 'return',
            'yield', 'raise', 'pass', 'break', 'continue', 'lambda', 'and',
            'or', 'not', 'in', 'is', 'None', 'True', 'False', 'global',
            'nonlocal', 'assert', 'del', 'async', 'await',
        ]

        builtin_functions = [
            'print', 'len', 'range', 'int', 'str', 'float', 'list', 'dict',
            'set', 'tuple', 'bool', 'type', 'isinstance', 'issubclass',
            'hasattr', 'getattr', 'setattr', 'delattr', 'callable', 'iter',
            'next', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed',
            'sum', 'min', 'max', 'abs', 'round', 'pow', 'divmod', 'input',
            'open', 'file', 'repr', 'hash', 'id', 'dir', 'vars', 'locals',
            'globals', 'eval', 'exec', 'compile', 'format', 'chr', 'ord',
            'bin', 'hex', 'oct', 'slice', 'object', 'super', 'property',
            'staticmethod', 'classmethod', 'all', 'any',
        ]

        stdlib_modules = [
            'os', 'sys', 'json', 're', 'math', 'random', 'datetime', 'time',
            'collections', 'itertools', 'functools', 'operator', 'string',
            'io', 'pathlib', 'shutil', 'glob', 'pickle', 'csv', 'sqlite3',
            'urllib', 'http', 'email', 'html', 'xml', 'logging', 'unittest',
            'threading', 'multiprocessing', 'subprocess', 'socket', 'asyncio',
            'typing', 'dataclasses', 'abc', 'copy', 'pprint', 'textwrap',
            'struct', 'codecs', 'unicodedata', 'hashlib', 'hmac', 'secrets',
        ]

        # Third-party libraries carry a short human-readable description.
        common_libraries = {
            'numpy': 'Numerical computing library',
            'pandas': 'Data analysis library',
            'tensorflow': 'Machine learning framework',
            'pytorch': 'Deep learning framework',
            'torch': 'PyTorch deep learning',
            'keras': 'High-level neural network API',
            'sklearn': 'Machine learning library',
            'scikit-learn': 'Machine learning library',
            'matplotlib': 'Plotting library',
            'seaborn': 'Statistical visualization',
            'requests': 'HTTP library',
            'flask': 'Web framework',
            'django': 'Web framework',
            'fastapi': 'Modern web framework',
            'sqlalchemy': 'Database ORM',
            'beautifulsoup': 'Web scraping',
            'selenium': 'Browser automation',
            'pytest': 'Testing framework',
            'pillow': 'Image processing',
            'opencv': 'Computer vision',
            'cv2': 'OpenCV library',
            'scipy': 'Scientific computing',
            'nltk': 'Natural language toolkit',
            'spacy': 'NLP library',
            'transformers': 'Hugging Face transformers',
            'gradio': 'ML demo interface',
            'streamlit': 'Data app framework',
        }

        for kw in python_keywords:
            self.known_concepts[kw] = {
                'type': 'keyword',
                'category': 'python_builtin',
                'learned_at': 'builtin',
            }
        for func in builtin_functions:
            self.known_concepts[func] = {
                'type': 'function',
                'category': 'python_builtin',
                'learned_at': 'builtin',
            }
        for mod in stdlib_modules:
            self.known_concepts[mod] = {
                'type': 'module',
                'category': 'python_stdlib',
                'learned_at': 'builtin',
            }
        for lib, desc in common_libraries.items():
            self.known_concepts[lib] = {
                'type': 'library',
                'category': 'third_party',
                'description': desc,
                'learned_at': 'builtin',
            }

    def save(self):
        """Write the whole knowledge base to ``KNOWLEDGE_PATH`` as JSON.

        Values that ``json`` cannot serialize natively are converted with
        ``str`` (``default=str``).

        Raises:
            OSError: If the directory or the file cannot be created/written.
        """
        data = {
            'known_concepts': self.known_concepts,
            'unknown_concepts': self.unknown_concepts,
            'concept_examples': self.concept_examples,
            'learning_queue': self.learning_queue,
        }

        # Create the target directory on demand so the first save on a fresh
        # installation does not fail with FileNotFoundError.
        parent = os.path.dirname(KNOWLEDGE_PATH)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(KNOWLEDGE_PATH, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)

    def load(self):
        """Merge the knowledge saved at ``KNOWLEDGE_PATH`` into this instance.

        A missing file is not an error. An unreadable or malformed file is
        reported with ``print`` and leaves the in-memory state untouched.
        Saved known concepts never overwrite entries that already exist
        (i.e. the built-ins); the other three containers are replaced by the
        saved values, falling back to an empty container when a saved value
        is absent or has the wrong type.
        """
        try:
            with open(KNOWLEDGE_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            # Nothing has been saved yet.
            return
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading knowledge base: {e}")
            return

        # Validate before assigning anything so a bad file cannot leave the
        # instance half-updated.
        if not isinstance(data, dict):
            print(
                "Error loading knowledge base: "
                f"expected a JSON object, got {type(data).__name__}"
            )
            return

        saved_known = data.get('known_concepts')
        if isinstance(saved_known, dict):
            for name, info in saved_known.items():
                # Merge with built-in knowledge without overwriting it.
                self.known_concepts.setdefault(name, info)

        saved_unknown = data.get('unknown_concepts')
        self.unknown_concepts = (
            saved_unknown if isinstance(saved_unknown, dict) else {}
        )

        saved_examples = data.get('concept_examples')
        self.concept_examples = (
            saved_examples if isinstance(saved_examples, dict) else {}
        )

        saved_queue = data.get('learning_queue')
        self.learning_queue = (
            saved_queue if isinstance(saved_queue, list) else []
        )

    def extract_concepts(self, code: str) -> Set[str]:
        """Return every identifier-like token found in ``code``.

        The text is scanned as plain text, so words inside comments and
        string literals are included as well.

        Args:
            code: Source code or free text to scan.

        Returns:
            The set of distinct tokens matching ``[a-zA-Z_][a-zA-Z0-9_]*``.
        """
        concepts = set(self._IDENTIFIER_RE.findall(code))
        # Names following import/from and def/class are collected explicitly;
        # they are also identifiers, so they never add tokens beyond the
        # first pass.
        concepts.update(self._IMPORT_RE.findall(code))
        concepts.update(self._DEFINITION_RE.findall(code))
        return concepts

    def check_knowledge(self, text: str) -> Tuple[List[str], List[str]]:
        """Split the concepts found in ``text`` into known and unknown ones.

        Tokens shorter than two characters and a few English stop words are
        ignored. A token counts as known when it, or its lowercase form, is a
        key of ``known_concepts``. Any other token is reported as unknown
        only if ``_looks_like_library`` accepts it; the rest are dropped.

        Args:
            text: Source code or free text to analyse.

        Returns:
            ``(known, unknown)``, each list sorted alphabetically.
        """
        known: List[str] = []
        unknown: List[str] = []

        # Sets of strings iterate in a different order from run to run (hash
        # randomization); sorting makes the result reproducible.
        for concept in sorted(self.extract_concepts(text)):
            if len(concept) < 2:
                continue
            lowered = concept.lower()
            if lowered in self._STOP_WORDS:
                continue
            if lowered in self.known_concepts or concept in self.known_concepts:
                known.append(concept)
            elif self._looks_like_library(concept):
                unknown.append(concept)

        return known, unknown

    def _looks_like_library(self, name: str) -> bool:
        """Heuristically decide whether ``name`` could be a library or module.

        Rejected: common variable names (compared case-insensitively),
        ALL_CAPS names (likely constants) and names starting with an
        underscore. Accepted: any remaining name longer than three characters
        whose cased characters are all lowercase (digits and underscores are
        allowed, e.g. ``my_lib2``).

        Args:
            name: Candidate identifier.

        Returns:
            True if the name passes the heuristic, False otherwise.
        """
        if name.lower() in self._COMMON_VARS:
            return False
        if name.isupper():
            return False
        if name.startswith('_'):
            return False
        return name.islower() and len(name) > 3

    def add_unknown(self, concept: str, context: str = ""):
        """Record a sighting of an unknown concept and persist the change.

        The first sighting creates a ``pending`` record and appends the
        concept to the learning queue. Later sightings only bump
        ``times_seen`` and remember ``context`` if it is non-empty and new.

        Args:
            concept: Name of the unknown concept.
            context: Optional snippet showing where the concept was seen.

        Raises:
            OSError: If the knowledge base cannot be written to disk.
        """
        entry = self.unknown_concepts.get(concept)
        if entry is None:
            now = datetime.now().isoformat()
            self.unknown_concepts[concept] = {
                'first_seen': now,
                'times_seen': 1,
                'contexts': [context] if context else [],
                'status': 'pending',
            }
            self.learning_queue.append({
                'concept': concept,
                'context': context,
                'timestamp': now,
            })
        else:
            # Records loaded from disk may lack keys; default them instead of
            # raising KeyError.
            entry['times_seen'] = entry.get('times_seen', 0) + 1
            contexts = entry.setdefault('contexts', [])
            if context and context not in contexts:
                contexts.append(context)

        self.save()

    def teach_concept(
        self,
        concept: str,
        concept_type: str,
        description: str,
        example_code: str,
        category: str = "user_taught"
    ):
        """Mark ``concept`` as known, store its example and persist the change.

        Any existing entry for ``concept`` in ``known_concepts`` is replaced.
        The concept is removed from ``unknown_concepts`` and from the
        learning queue.

        Args:
            concept: Name of the concept being taught.
            concept_type: Kind of concept, stored under ``'type'``.
            description: Human-readable explanation.
            example_code: Example snippet; blank or already stored examples
                are not added again.
            category: Category label, ``"user_taught"`` by default.

        Returns:
            True, always.

        Raises:
            OSError: If the knowledge base cannot be written to disk.
        """
        self.known_concepts[concept] = {
            'type': concept_type,
            'category': category,
            'description': description,
            'learned_at': datetime.now().isoformat(),
        }

        # Mirror add_unknown's handling of contexts: blank or repeated
        # examples would only add noise to get_all_examples().
        if example_code and example_code.strip():
            examples = self.concept_examples.setdefault(concept, [])
            if example_code not in examples:
                examples.append(example_code)

        self.unknown_concepts.pop(concept, None)
        self.learning_queue = [
            item for item in self.learning_queue
            if item.get('concept') != concept
        ]

        self.save()
        return True

    def get_learning_queue(self) -> List[dict]:
        """Return the live list of concepts waiting to be learned."""
        return self.learning_queue

    def get_unknown_concepts(self) -> Dict[str, dict]:
        """Return the live mapping of unknown concepts to their records."""
        return self.unknown_concepts

    def get_example_code(self, concept: str) -> List[str]:
        """Return the stored examples for ``concept`` (empty list if none)."""
        return self.concept_examples.get(concept, [])

    def get_all_examples(self) -> str:
        """Return every stored example as one text blob for training.

        Each example is preceded by a ``# Example of <concept>`` line and
        examples are separated by a blank line.
        """
        return "\n\n".join(
            f"# Example of {concept}\n{example}"
            for concept, examples in self.concept_examples.items()
            for example in examples
        )

    def get_statistics(self) -> dict:
        """Return summary counts describing the knowledge base.

        Returns:
            A dict with the keys ``known_concepts``, ``unknown_concepts``,
            ``concepts_with_examples``, ``total_examples``,
            ``learning_queue_size`` and ``categories`` (category -> count).
        """
        return {
            'known_concepts': len(self.known_concepts),
            'unknown_concepts': len(self.unknown_concepts),
            'concepts_with_examples': len(self.concept_examples),
            'total_examples': sum(len(v) for v in self.concept_examples.values()),
            'learning_queue_size': len(self.learning_queue),
            'categories': self._count_categories(),
        }

    def _count_categories(self) -> dict:
        """Count known concepts per ``category`` (``'unknown'`` if unset)."""
        categories: Dict[str, int] = {}
        for info in self.known_concepts.values():
            cat = info.get('category', 'unknown')
            categories[cat] = categories.get(cat, 0) + 1
        return categories
# Global knowledge base instance, constructed once when this module is imported.
knowledge_base = KnowledgeBase()