vedaco committed on
Commit
ada3e2e
·
verified ·
1 Parent(s): b84ae93

Create knowledge_base.py

Browse files
Files changed (1) hide show
  1. knowledge_base.py +326 -0
knowledge_base.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Knowledge Base for detecting and learning unknown concepts"""
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ from typing import List, Dict, Set, Optional, Tuple
7
+ from datetime import datetime
8
+
9
+ from config import KNOWLEDGE_PATH, DATA_DIR
10
+
11
class KnowledgeBase:
    """Manages known and unknown programming concepts.

    State held on each instance:

    * ``known_concepts``   -- concept name -> metadata dict (``type``,
      ``category``, ``learned_at`` and optionally ``description``).
    * ``unknown_concepts`` -- concept name -> tracking dict (``first_seen``,
      ``times_seen``, ``contexts``, ``status``).
    * ``concept_examples`` -- concept name -> list of example code strings.
    * ``learning_queue``   -- list of ``{'concept', 'context', 'timestamp'}``
      dicts for concepts that have been seen but not yet taught.

    The state is persisted as JSON at ``KNOWLEDGE_PATH``.
    """

    # Matches every identifier-shaped token in a piece of text.
    _IDENTIFIER_RE = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')

    # Words ignored outright by check_knowledge().
    _STOPWORDS = frozenset(['a', 'an', 'the', 'is', 'it', 'to', 'of'])

    # Names that are almost certainly ordinary variables, never libraries.
    _COMMON_VARIABLE_NAMES = frozenset([
        'self', 'cls', 'args', 'kwargs', 'result', 'data', 'value',
        'item', 'items', 'key', 'keys', 'val', 'vals', 'obj', 'func',
        'arr', 'lst', 'num', 'count', 'index', 'idx', 'temp', 'tmp',
        'i', 'j', 'k', 'n', 'm', 'x', 'y', 'z', 'a', 'b', 'c',
    ])

    def __init__(self):
        self.known_concepts: Dict[str, dict] = {}
        self.unknown_concepts: Dict[str, dict] = {}
        self.concept_examples: Dict[str, List[str]] = {}
        self.learning_queue: List[dict] = []

        # Built-in Python knowledge
        self._init_builtin_knowledge()

        # Load saved knowledge (built-in entries take precedence)
        self.load()

    def _init_builtin_knowledge(self):
        """Seed ``known_concepts`` with keywords, builtins, stdlib modules
        and a fixed list of common third-party libraries."""

        # Python keywords
        python_keywords = [
            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try',
            'except', 'finally', 'with', 'as', 'import', 'from', 'return',
            'yield', 'raise', 'pass', 'break', 'continue', 'lambda', 'and',
            'or', 'not', 'in', 'is', 'None', 'True', 'False', 'global',
            'nonlocal', 'assert', 'del', 'async', 'await'
        ]

        # Python built-in functions
        builtin_functions = [
            'print', 'len', 'range', 'int', 'str', 'float', 'list', 'dict',
            'set', 'tuple', 'bool', 'type', 'isinstance', 'issubclass',
            'hasattr', 'getattr', 'setattr', 'delattr', 'callable', 'iter',
            'next', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed',
            'sum', 'min', 'max', 'abs', 'round', 'pow', 'divmod', 'input',
            'open', 'file', 'repr', 'hash', 'id', 'dir', 'vars', 'locals',
            'globals', 'eval', 'exec', 'compile', 'format', 'chr', 'ord',
            'bin', 'hex', 'oct', 'slice', 'object', 'super', 'property',
            'staticmethod', 'classmethod', 'all', 'any'
        ]

        # Python standard library modules
        stdlib_modules = [
            'os', 'sys', 'json', 're', 'math', 'random', 'datetime', 'time',
            'collections', 'itertools', 'functools', 'operator', 'string',
            'io', 'pathlib', 'shutil', 'glob', 'pickle', 'csv', 'sqlite3',
            'urllib', 'http', 'email', 'html', 'xml', 'logging', 'unittest',
            'threading', 'multiprocessing', 'subprocess', 'socket', 'asyncio',
            'typing', 'dataclasses', 'abc', 'copy', 'pprint', 'textwrap',
            'struct', 'codecs', 'unicodedata', 'hashlib', 'hmac', 'secrets'
        ]

        # Common third-party libraries
        common_libraries = {
            'numpy': 'Numerical computing library',
            'pandas': 'Data analysis library',
            'tensorflow': 'Machine learning framework',
            'pytorch': 'Deep learning framework',
            'torch': 'PyTorch deep learning',
            'keras': 'High-level neural network API',
            'sklearn': 'Machine learning library',
            'scikit-learn': 'Machine learning library',
            'matplotlib': 'Plotting library',
            'seaborn': 'Statistical visualization',
            'requests': 'HTTP library',
            'flask': 'Web framework',
            'django': 'Web framework',
            'fastapi': 'Modern web framework',
            'sqlalchemy': 'Database ORM',
            'beautifulsoup': 'Web scraping',
            'selenium': 'Browser automation',
            'pytest': 'Testing framework',
            'pillow': 'Image processing',
            'opencv': 'Computer vision',
            'cv2': 'OpenCV library',
            'scipy': 'Scientific computing',
            'nltk': 'Natural language toolkit',
            'spacy': 'NLP library',
            'transformers': 'Hugging Face transformers',
            'gradio': 'ML demo interface',
            'streamlit': 'Data app framework'
        }

        # Add to known concepts. Each entry gets its own fresh dict so that
        # later mutation of one concept never leaks into another.
        for kw in python_keywords:
            self.known_concepts[kw] = {
                'type': 'keyword',
                'category': 'python_builtin',
                'learned_at': 'builtin'
            }

        for func in builtin_functions:
            self.known_concepts[func] = {
                'type': 'function',
                'category': 'python_builtin',
                'learned_at': 'builtin'
            }

        for mod in stdlib_modules:
            self.known_concepts[mod] = {
                'type': 'module',
                'category': 'python_stdlib',
                'learned_at': 'builtin'
            }

        for lib, desc in common_libraries.items():
            self.known_concepts[lib] = {
                'type': 'library',
                'category': 'third_party',
                'description': desc,
                'learned_at': 'builtin'
            }

    def save(self):
        """Write the whole knowledge base to ``KNOWLEDGE_PATH`` as JSON.

        The parent directory is created when missing so that the first save
        on a fresh checkout does not fail with ``FileNotFoundError``.

        Raises:
            OSError: if the directory or file cannot be written.
        """
        data = {
            'known_concepts': self.known_concepts,
            'unknown_concepts': self.unknown_concepts,
            'concept_examples': self.concept_examples,
            'learning_queue': self.learning_queue
        }
        parent_dir = os.path.dirname(KNOWLEDGE_PATH)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(KNOWLEDGE_PATH, 'w', encoding='utf-8') as f:
            # default=str stringifies any value json cannot encode natively.
            json.dump(data, f, indent=2, default=str)

    def load(self):
        """Load the knowledge base from ``KNOWLEDGE_PATH`` if it exists.

        A missing file is silently ignored. An unreadable or malformed file
        is reported with ``print`` and leaves the in-memory state untouched.
        Saved known concepts never overwrite entries already present
        (the built-ins); the other three collections are replaced, falling
        back to an empty container when the saved value has the wrong type.
        """
        try:
            with open(KNOWLEDGE_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")
        except FileNotFoundError:
            # Nothing saved yet.
            return
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading knowledge base: {e}")
            return

        # Merge with built-in (don't overwrite)
        saved_known = data.get('known_concepts')
        if isinstance(saved_known, dict):
            for name, info in saved_known.items():
                self.known_concepts.setdefault(name, info)

        saved_unknown = data.get('unknown_concepts')
        self.unknown_concepts = (
            saved_unknown if isinstance(saved_unknown, dict) else {}
        )

        saved_examples = data.get('concept_examples')
        self.concept_examples = (
            saved_examples if isinstance(saved_examples, dict) else {}
        )

        saved_queue = data.get('learning_queue')
        self.learning_queue = (
            saved_queue if isinstance(saved_queue, list) else []
        )

    def extract_concepts(self, code: str) -> Set[str]:
        """Return the set of identifier-shaped tokens found in ``code``.

        Names that follow ``import``/``from``/``def``/``class`` are themselves
        identifier tokens, so a single identifier scan already covers them.
        """
        return set(self._IDENTIFIER_RE.findall(code))

    def check_knowledge(self, text: str) -> Tuple[List[str], List[str]]:
        """Split the concepts found in ``text`` into known and unknown.

        Returns:
            ``(known, unknown)``, each sorted alphabetically so the result is
            reproducible from run to run. A token is *known* when it, or its
            lowercase form, is a key of ``known_concepts``. Any other token is
            reported as *unknown* only if ``_looks_like_library`` accepts it.
            One-character tokens and a few English stopwords are ignored.
        """
        known = []
        unknown = []

        for concept in self.extract_concepts(text):
            # Skip very short or common words
            if len(concept) < 2:
                continue
            lowered = concept.lower()
            if lowered in self._STOPWORDS:
                continue

            if lowered in self.known_concepts or concept in self.known_concepts:
                known.append(concept)
            elif self._looks_like_library(concept):
                unknown.append(concept)

        # extract_concepts returns a set, whose iteration order varies between
        # interpreter runs; sort so callers get a stable ordering.
        known.sort()
        unknown.sort()
        return known, unknown

    def _looks_like_library(self, name: str) -> bool:
        """Heuristically decide whether ``name`` could be a library/module.

        Rejects common variable names (case-insensitive), ALL_CAPS names
        (likely constants) and names starting with an underscore. What is
        left is accepted only when it is all-lowercase and longer than three
        characters.
        """
        if name.lower() in self._COMMON_VARIABLE_NAMES:
            return False

        # Skip ALL_CAPS (likely constants)
        if name.isupper():
            return False

        # Skip _private and __dunder__
        if name.startswith('_'):
            return False

        return name.islower() and len(name) > 3

    def add_unknown(self, concept: str, context: str = ""):
        """Record a sighting of an unknown concept and persist the change.

        The first sighting creates the tracking entry and appends the concept
        to the learning queue. Later sightings bump ``times_seen`` and record
        ``context`` if it is non-empty and not already stored.
        """
        entry = self.unknown_concepts.get(concept)
        if entry is None:
            now = datetime.now().isoformat()
            self.unknown_concepts[concept] = {
                'first_seen': now,
                'times_seen': 1,
                'contexts': [context] if context else [],
                'status': 'pending'
            }

            # Add to learning queue
            self.learning_queue.append({
                'concept': concept,
                'context': context,
                'timestamp': now
            })
        else:
            # Entries loaded from disk may lack fields; tolerate that rather
            # than raising KeyError.
            entry['times_seen'] = entry.get('times_seen', 0) + 1
            contexts = entry.setdefault('contexts', [])
            if context and context not in contexts:
                contexts.append(context)

        self.save()

    def teach_concept(
        self,
        concept: str,
        concept_type: str,
        description: str,
        example_code: str,
        category: str = "user_taught"
    ):
        """Teach the model a new concept.

        Stores (or replaces) the concept in ``known_concepts``, appends
        ``example_code`` to its examples, removes it from the unknown set and
        the learning queue, then saves. Always returns ``True``.
        """

        # Add to known concepts
        self.known_concepts[concept] = {
            'type': concept_type,
            'category': category,
            'description': description,
            'learned_at': datetime.now().isoformat()
        }

        # Add example
        self.concept_examples.setdefault(concept, []).append(example_code)

        # Remove from unknown
        self.unknown_concepts.pop(concept, None)

        # Remove from learning queue (.get tolerates malformed saved items)
        self.learning_queue = [
            item for item in self.learning_queue
            if item.get('concept') != concept
        ]

        self.save()

        return True

    def get_learning_queue(self) -> List[dict]:
        """Get concepts waiting to be learned (the live list, not a copy)."""
        return self.learning_queue

    def get_unknown_concepts(self) -> Dict[str, dict]:
        """Get all unknown concepts (the live dict, not a copy)."""
        return self.unknown_concepts

    def get_example_code(self, concept: str) -> List[str]:
        """Get example code for a concept, or an empty list if none exist."""
        return self.concept_examples.get(concept, [])

    def get_all_examples(self) -> str:
        """Get all example code for training.

        Each example is prefixed with a ``# Example of <concept>`` line and
        the examples are separated by blank lines.
        """
        return "\n\n".join(
            f"# Example of {concept}\n{example}"
            for concept, examples in self.concept_examples.items()
            for example in examples
        )

    def get_statistics(self) -> dict:
        """Get knowledge base statistics (sizes plus per-category counts)."""
        return {
            'known_concepts': len(self.known_concepts),
            'unknown_concepts': len(self.unknown_concepts),
            'concepts_with_examples': len(self.concept_examples),
            'total_examples': sum(len(v) for v in self.concept_examples.values()),
            'learning_queue_size': len(self.learning_queue),
            'categories': self._count_categories()
        }

    def _count_categories(self) -> dict:
        """Count known concepts by their ``category`` field.

        Entries without a category are counted under ``'unknown'``.
        """
        categories = {}
        for info in self.known_concepts.values():
            cat = info.get('category', 'unknown')
            categories[cat] = categories.get(cat, 0) + 1
        return categories
323
+
324
+
325
+ # Global knowledge base instance (created at import time; the constructor calls load())
326
+ knowledge_base = KnowledgeBase()