Spaces:
Running
Running
Create knowledge_base.py
Browse files- knowledge_base.py +326 -0
knowledge_base.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Knowledge Base for detecting and learning unknown concepts"""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from typing import List, Dict, Set, Optional, Tuple
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
|
| 9 |
+
from config import KNOWLEDGE_PATH, DATA_DIR
|
| 10 |
+
|
| 11 |
+
class KnowledgeBase:
    """Manages known and unknown programming concepts.

    ``known_concepts`` is seeded with Python keywords, built-in functions,
    standard-library modules and a fixed table of third-party libraries, then
    merged with whatever was previously saved at ``KNOWLEDGE_PATH``.
    Identifiers that are not known but look like library names can be queued
    for learning with :meth:`add_unknown` and resolved with
    :meth:`teach_concept`.
    """

    # ASCII identifier pattern. The leading ``\b`` (evaluated with re.ASCII)
    # stops the scan from picking up the tail of a numeric literal, e.g. the
    # "xdeadbeef" in "0xdeadbeef" or the "e10" in "1e10".
    _IDENTIFIER_RE = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*', re.ASCII)

    # Words ignored by check_knowledge (compared case-insensitively).
    _STOPWORDS = frozenset({'a', 'an', 'the', 'is', 'it', 'to', 'of'})

    # Names that are almost always plain variables, never libraries.
    _COMMON_VARIABLE_NAMES = frozenset({
        'self', 'cls', 'args', 'kwargs', 'result', 'data', 'value',
        'item', 'items', 'key', 'keys', 'val', 'vals', 'obj', 'func',
        'arr', 'lst', 'num', 'count', 'index', 'idx', 'temp', 'tmp',
        'i', 'j', 'k', 'n', 'm', 'x', 'y', 'z', 'a', 'b', 'c',
    })

    def __init__(self):
        """Create the knowledge base, seed built-ins and load saved state."""
        self.known_concepts: Dict[str, dict] = {}
        self.unknown_concepts: Dict[str, dict] = {}
        self.concept_examples: Dict[str, List[str]] = {}
        self.learning_queue: List[dict] = []

        # Built-in Python knowledge
        self._init_builtin_knowledge()

        # Load saved knowledge
        self.load()

    def _init_builtin_knowledge(self):
        """Populate ``known_concepts`` with the built-in Python vocabulary."""
        # Python keywords
        python_keywords = [
            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try',
            'except', 'finally', 'with', 'as', 'import', 'from', 'return',
            'yield', 'raise', 'pass', 'break', 'continue', 'lambda', 'and',
            'or', 'not', 'in', 'is', 'None', 'True', 'False', 'global',
            'nonlocal', 'assert', 'del', 'async', 'await',
        ]

        # Python built-in functions
        # NOTE: 'file' was a built-in only in Python 2; it is kept here so the
        # set of known names stays the same as before.
        builtin_functions = [
            'print', 'len', 'range', 'int', 'str', 'float', 'list', 'dict',
            'set', 'tuple', 'bool', 'type', 'isinstance', 'issubclass',
            'hasattr', 'getattr', 'setattr', 'delattr', 'callable', 'iter',
            'next', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed',
            'sum', 'min', 'max', 'abs', 'round', 'pow', 'divmod', 'input',
            'open', 'file', 'repr', 'hash', 'id', 'dir', 'vars', 'locals',
            'globals', 'eval', 'exec', 'compile', 'format', 'chr', 'ord',
            'bin', 'hex', 'oct', 'slice', 'object', 'super', 'property',
            'staticmethod', 'classmethod', 'all', 'any',
        ]

        # Python standard library modules
        stdlib_modules = [
            'os', 'sys', 'json', 're', 'math', 'random', 'datetime', 'time',
            'collections', 'itertools', 'functools', 'operator', 'string',
            'io', 'pathlib', 'shutil', 'glob', 'pickle', 'csv', 'sqlite3',
            'urllib', 'http', 'email', 'html', 'xml', 'logging', 'unittest',
            'threading', 'multiprocessing', 'subprocess', 'socket', 'asyncio',
            'typing', 'dataclasses', 'abc', 'copy', 'pprint', 'textwrap',
            'struct', 'codecs', 'unicodedata', 'hashlib', 'hmac', 'secrets',
        ]

        # Common third-party libraries
        common_libraries = {
            'numpy': 'Numerical computing library',
            'pandas': 'Data analysis library',
            'tensorflow': 'Machine learning framework',
            'pytorch': 'Deep learning framework',
            'torch': 'PyTorch deep learning',
            'keras': 'High-level neural network API',
            'sklearn': 'Machine learning library',
            'scikit-learn': 'Machine learning library',
            'matplotlib': 'Plotting library',
            'seaborn': 'Statistical visualization',
            'requests': 'HTTP library',
            'flask': 'Web framework',
            'django': 'Web framework',
            'fastapi': 'Modern web framework',
            'sqlalchemy': 'Database ORM',
            'beautifulsoup': 'Web scraping',
            'selenium': 'Browser automation',
            'pytest': 'Testing framework',
            'pillow': 'Image processing',
            'opencv': 'Computer vision',
            'cv2': 'OpenCV library',
            'scipy': 'Scientific computing',
            'nltk': 'Natural language toolkit',
            'spacy': 'NLP library',
            'transformers': 'Hugging Face transformers',
            'gradio': 'ML demo interface',
            'streamlit': 'Data app framework',
        }

        # Add to known concepts. The groups are registered in this order so a
        # later group would win if a name ever appeared in two of them.
        for names, concept_type, category in (
            (python_keywords, 'keyword', 'python_builtin'),
            (builtin_functions, 'function', 'python_builtin'),
            (stdlib_modules, 'module', 'python_stdlib'),
        ):
            for name in names:
                self.known_concepts[name] = {
                    'type': concept_type,
                    'category': category,
                    'learned_at': 'builtin',
                }

        for lib, desc in common_libraries.items():
            self.known_concepts[lib] = {
                'type': 'library',
                'category': 'third_party',
                'description': desc,
                'learned_at': 'builtin',
            }

    def save(self):
        """Write the whole knowledge base to ``KNOWLEDGE_PATH`` as JSON.

        Raises:
            OSError: If the file or its parent directory cannot be written.
        """
        data = {
            'known_concepts': self.known_concepts,
            'unknown_concepts': self.unknown_concepts,
            'concept_examples': self.concept_examples,
            'learning_queue': self.learning_queue,
        }

        # Make sure the target directory exists so the first save succeeds.
        directory = os.path.dirname(KNOWLEDGE_PATH)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Explicit UTF-8 keeps the file readable regardless of the locale.
        with open(KNOWLEDGE_PATH, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)

    def load(self):
        """Load saved state from ``KNOWLEDGE_PATH`` if the file exists.

        Loading is best-effort: a missing file is silently ignored, and an
        unreadable or malformed file is reported with ``print`` and leaves
        the in-memory state untouched. Saved known concepts never overwrite
        the built-in entries.
        """
        try:
            with open(KNOWLEDGE_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            # Nothing has been saved yet.
            return
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading knowledge base: {e}")
            return

        if not isinstance(data, dict):
            print(
                "Error loading knowledge base: "
                f"expected a JSON object, got {type(data).__name__}"
            )
            return

        # Merge with built-in (don't overwrite)
        saved_known = data.get('known_concepts')
        if isinstance(saved_known, dict):
            for name, info in saved_known.items():
                self.known_concepts.setdefault(name, info)

        # Only accept sections that have the expected container type, so a
        # damaged file cannot leave an attribute set to None or a wrong type.
        saved_unknown = data.get('unknown_concepts')
        if isinstance(saved_unknown, dict):
            self.unknown_concepts = saved_unknown

        saved_examples = data.get('concept_examples')
        if isinstance(saved_examples, dict):
            self.concept_examples = saved_examples

        saved_queue = data.get('learning_queue')
        if isinstance(saved_queue, list):
            self.learning_queue = saved_queue

    def extract_concepts(self, code: str) -> Set[str]:
        """Return every distinct identifier-like token found in ``code``.

        Imported names and ``def``/``class`` names are ordinary identifiers,
        so the single identifier scan already covers them. Fragments of
        numeric literals (``0xFF``, ``1e10``) are not reported.
        """
        return set(self._IDENTIFIER_RE.findall(code))

    def check_knowledge(self, text: str) -> Tuple[List[str], List[str]]:
        """Split the identifiers in ``text`` into known and unknown concepts.

        Returns:
            A ``(known, unknown)`` pair of lists, each sorted alphabetically
            so the result is the same on every run. ``unknown`` only contains
            names that :meth:`_looks_like_library` accepts; other unrecognised
            identifiers are dropped.
        """
        known = []
        unknown = []

        # Sorting makes the output order deterministic (set order is not).
        for concept in sorted(self.extract_concepts(text)):
            lowered = concept.lower()

            # Skip very short or common words
            if len(concept) < 2 or lowered in self._STOPWORDS:
                continue

            if lowered in self.known_concepts or concept in self.known_concepts:
                known.append(concept)
            elif self._looks_like_library(concept):
                # Check if it looks like a library/framework name
                unknown.append(concept)

        return known, unknown

    def _looks_like_library(self, name: str) -> bool:
        """Return True if ``name`` looks like a library/module name.

        A name qualifies when it is entirely lowercase (underscores and
        digits allowed), longer than three characters, does not start with
        an underscore and is not one of the common variable names.
        """
        # Skip common variable names
        if name.lower() in self._COMMON_VARIABLE_NAMES:
            return False

        # Skip ALL_CAPS (likely constants)
        if name.isupper():
            return False

        # Skip _private and __dunder__
        if name.startswith('_'):
            return False

        return name.islower() and len(name) > 3

    def add_unknown(self, concept: str, context: str = ""):
        """Record a sighting of an unknown concept and persist the change.

        The first sighting creates the record and appends an entry to the
        learning queue; later sightings bump ``times_seen`` and remember any
        new, non-empty ``context``.
        """
        entry = self.unknown_concepts.get(concept)
        if entry is None:
            self.unknown_concepts[concept] = {
                'first_seen': datetime.now().isoformat(),
                'times_seen': 1,
                'contexts': [context] if context else [],
                'status': 'pending',
            }

            # Add to learning queue
            self.learning_queue.append({
                'concept': concept,
                'context': context,
                'timestamp': datetime.now().isoformat(),
            })
        else:
            # Records loaded from disk may lack a field; default it instead
            # of raising KeyError.
            entry['times_seen'] = entry.get('times_seen', 0) + 1
            contexts = entry.setdefault('contexts', [])
            if context and context not in contexts:
                contexts.append(context)

        self.save()

    def teach_concept(
        self,
        concept: str,
        concept_type: str,
        description: str,
        example_code: str,
        category: str = "user_taught"
    ):
        """Teach the model a new concept and persist the change.

        The concept is stored (or replaced) in ``known_concepts``, its example
        is recorded, and it is removed from the unknown set and the learning
        queue.

        Args:
            concept: Name of the concept.
            concept_type: Kind of concept, stored under the ``'type'`` key.
            description: Human-readable description.
            example_code: Example snippet; blank or already-stored examples
                are not added again.
            category: Category label, ``"user_taught"`` by default.

        Returns:
            Always ``True``.
        """
        # Add to known concepts
        self.known_concepts[concept] = {
            'type': concept_type,
            'category': category,
            'description': description,
            'learned_at': datetime.now().isoformat(),
        }

        # Add example (skip blanks and duplicates so get_all_examples stays
        # free of empty or repeated snippets)
        if example_code and example_code.strip():
            examples = self.concept_examples.setdefault(concept, [])
            if example_code not in examples:
                examples.append(example_code)

        # Remove from unknown
        self.unknown_concepts.pop(concept, None)

        # Remove from learning queue
        self.learning_queue = [
            item for item in self.learning_queue
            if item.get('concept') != concept
        ]

        self.save()

        return True

    def get_learning_queue(self) -> List[dict]:
        """Get concepts waiting to be learned (the live internal list)."""
        return self.learning_queue

    def get_unknown_concepts(self) -> Dict[str, dict]:
        """Get all unknown concepts (the live internal dict)."""
        return self.unknown_concepts

    def get_example_code(self, concept: str) -> List[str]:
        """Get example code for a concept, or an empty list if none exist."""
        return self.concept_examples.get(concept, [])

    def get_all_examples(self) -> str:
        """Join every stored example into one string for training.

        Each example is prefixed with a ``# Example of <concept>`` line and
        examples are separated by a blank line.
        """
        return "\n\n".join(
            f"# Example of {concept}\n{example}"
            for concept, examples in self.concept_examples.items()
            for example in examples
        )

    def get_statistics(self) -> dict:
        """Get knowledge base statistics as a dict of counts."""
        return {
            'known_concepts': len(self.known_concepts),
            'unknown_concepts': len(self.unknown_concepts),
            'concepts_with_examples': len(self.concept_examples),
            'total_examples': sum(len(v) for v in self.concept_examples.values()),
            'learning_queue_size': len(self.learning_queue),
            'categories': self._count_categories(),
        }

    def _count_categories(self) -> dict:
        """Count known concepts per ``'category'`` (``'unknown'`` if absent)."""
        categories = {}
        for info in self.known_concepts.values():
            cat = info.get('category', 'unknown')
            categories[cat] = categories.get(cat, 0) + 1
        return categories
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
# Global knowledge base instance
|
| 326 |
+
# Shared instance created at import time; KnowledgeBase.__init__ seeds the
# built-in concepts and then calls load().
knowledge_base = KnowledgeBase()
|