sadidft committed on
Commit
44c0a7f
·
verified ·
1 Parent(s): 5ba3999

Create config.py

Browse files
Files changed (1) hide show
  1. config.py +341 -0
config.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cogni-Engine v1 — Configuration
3
+ All system parameters centralized here.
4
+ Every module imports from this file.
5
+ """
6
+
7
+ import os
8
+ import secrets
9
+ import hashlib
10
+
11
+
12
+ # ═══════════════════════════════════════════════════════════
13
+ # API SERVER
14
+ # ═══════════════════════════════════════════════════════════
15
+
16
# Port the API server listens on; overridable through the PORT environment variable.
PORT = int(os.environ.get("PORT", 7860))

# An unset API_KEY and an empty API_KEY are treated the same way: both fall
# back to a freshly generated random key. Using `or` (instead of a .get()
# default) prevents an empty string from silently becoming the active key.
API_KEY = os.environ.get("API_KEY") or secrets.token_urlsafe(32)
MAX_REQUEST_SIZE_MB = 50

# Print generated key if not set (first run). The generated key changes on
# every start, hence the hint to configure a persistent one.
if not os.environ.get("API_KEY"):
    print(f"[CONFIG] No API_KEY set. Generated temporary key: {API_KEY}")
    print("[CONFIG] Set API_KEY environment variable for persistent key.")
24
+
25
+
26
+ # ═══════════════════════════════════════════════════════════
27
+ # TiDB CONNECTION
28
+ # ═══════════════════════════════════════════════════════════
29
+
30
# Connection target and credentials; every value can be overridden through
# the environment variable of the same name.
TIDB_HOST = os.getenv("TIDB_HOST", "")
TIDB_PORT = int(os.getenv("TIDB_PORT", 4000))
TIDB_USER = os.getenv("TIDB_USER", "")
TIDB_PASSWORD = os.getenv("TIDB_PASSWORD", "")
TIDB_DATABASE = os.getenv("TIDB_DATABASE", "cogni_engine")
# SSL is on by default; only the (case-insensitive) value "true" enables it.
TIDB_SSL = "true" == os.getenv("TIDB_SSL", "true").lower()

# Connection pool
TIDB_POOL_SIZE = int(os.getenv("TIDB_POOL_SIZE", 5))
TIDB_CONNECT_TIMEOUT = 10
TIDB_READ_TIMEOUT = TIDB_WRITE_TIMEOUT = 30
TIDB_RETRY_ATTEMPTS = 3
# Seconds between retries
TIDB_RETRY_DELAY = 2
44
+
45
+
46
+ # ═══════════════════════════════════════════════════════════
47
+ # VECTOR & EMBEDDING
48
+ # ═══════════════════════════════════════════════════════════
49
+
50
# Dimension of the embedding vector stored per node
VECTOR_DIM = 128
# Character n-gram sizes for hashing
NGRAM_SIZES = [3, 4]
# Hash bucket count for n-gram vectorization (2**13 = 8192)
HASH_BUCKETS = 1 << 13
# Seed for reproducible random projection matrix
RANDOM_PROJECTION_SEED = 42
54
+
55
+
56
+ # ═══════════════════════════════════════════════════════════
57
+ # KNOWLEDGE GRAPH
58
+ # ═══════════════════════════════════════════════════════════
59
+
60
# --- Similarity cut-offs ---
# Minimum cosine similarity to auto-create an edge
SIMILARITY_THRESHOLD = 0.65
# Similarity above this means the nodes are redundant and get merged
MERGE_THRESHOLD = 0.95

# --- Pruning ---
# Edges below this weight get pruned
PRUNE_WEIGHT_THRESHOLD = 0.05
# Below this confidence an edge is a candidate for deletion
MIN_EDGE_CONFIDENCE = 0.05

# --- Traversal and search limits ---
# Maximum hops in a graph walk
MAX_TRAVERSAL_DEPTH = 8
# Maximum reasoning chains used per response
MAX_CHAINS_PER_RESPONSE = 7
# Top-K nodes returned by similarity search
MAX_NODES_PER_SEARCH = 20

# --- In-memory safety limits ---
MAX_GRAPH_MEMORY_NODES = 500_000
MAX_GRAPH_MEMORY_EDGES = 2_000_000
69
+
70
+
71
+ # ═══════════════════════════════════════════════════════════
72
+ # ABSTRACTION
73
+ # ═══════════════════════════════════════════════════════════
74
+
75
# Maximum recursive abstraction levels
MAX_ABSTRACTION_DEPTH = 5
# Cluster size bounds: minimum nodes to form an abstraction, maximum nodes per cluster
CLUSTER_MIN_SIZE, CLUSTER_MAX_SIZE = 3, 50
# Minimum intra-cluster similarity
CLUSTER_SIMILARITY_INTRA = 0.6
# K-means iterations per clustering run
CLUSTER_ITERATIONS = 20
# Minimum confidence for an abstraction node
ABSTRACTION_MIN_CONFIDENCE = 0.5
81
+
82
+
83
+ # ═══════════════════════════════════════════════════════════
84
+ # INFERENCE
85
+ # ═══════════════════════════════════════════════════════════
86
+
87
# Inferred edges below this confidence are not saved
INFERENCE_CONFIDENCE_MIN = 0.3
# Decay per hop: conf(A→C) = conf(A→B) * conf(B→C) * decay
INFERENCE_DECAY = 0.85
# Maximum transitive hops for inference
INFERENCE_MAX_CHAIN_LENGTH = 5
# Minimum similarity for analogical reasoning
ANALOGICAL_SIMILARITY_MIN = 0.7
# Cap on inferences per thinking cycle (prevents explosion)
MAX_INFERENCES_PER_CYCLE = 100
92
+
93
+
94
+ # ═══════════════════════════════════════════════════════════
95
+ # THINKING LOOP
96
+ # ═══════════════════════════════════════════════════════════
97
+
98
# --- Cycle pacing ---
# Seconds between cycles when active (new data)
THINKING_INTERVAL_FAST = 2
# Seconds between cycles when stable
THINKING_INTERVAL_SLOW = 15
# Fewer operations than this counts as "stable" and slows the loop down
THINKING_STABILITY_THRESHOLD = 5
# Nodes/edges processed per sub-phase
THINKING_BATCH_SIZE = 50

# --- Flush to TiDB: every N cycles or every N seconds, whichever comes first ---
SYNC_INTERVAL_CYCLES = 100
SYNC_INTERVAL_SECONDS = 60

# --- Periodic sub-tasks, each run every N cycles ---
SELF_QUESTION_INTERVAL = 50  # self-questioning
VALIDATE_INTERVAL = 25       # validation
COMPRESS_INTERVAL = 100      # compression
107
+
108
+
109
+ # ═══════════════════════════════════════════════════════════
110
+ # WEIGHT DYNAMICS
111
+ # ═══════════════════════════════════════════════════════════
112
+
113
# Multiplier applied when an edge is used in a response
WEIGHT_REINFORCE = 1.05
# Multiplier per decay cycle for unused edges, applied every N cycles
WEIGHT_DECAY_RATE = 0.98
WEIGHT_DECAY_INTERVAL_CYCLES = 500
# Weight bounds: maximum edge/node weight (prevents overflow) and the
# minimum before an item is considered for pruning
WEIGHT_MAX = 10.0
WEIGHT_MIN = 0.01
# Weight bonus per connection for nodes
NODE_WEIGHT_CONNECTION_BONUS = 0.02
# Default confidence by source: knowledge extracted from user chat vs. JSONL files
USER_KNOWLEDGE_CONFIDENCE = 0.6
DATA_KNOWLEDGE_CONFIDENCE = 0.9
121
+
122
+
123
+ # ═══════════════════════════════════════════════════════════
124
+ # LANGUAGE GENERATION
125
+ # ═══════════════════════════════════════════════════════════
126
+
127
# Default response variation (0 = deterministic, 1 = max variety)
DEFAULT_TEMPERATURE = 0.7
# 0 = very casual, 1 = very formal
DEFAULT_FORMALITY = 0.5
# Default output language
DEFAULT_LANGUAGE = "id"

# Segment count bounds for a response (the minimum applies even to simple answers)
MIN_RESPONSE_SEGMENTS = 2
MAX_RESPONSE_SEGMENTS = 8

# Confidence bands that select the tone of the generated language:
#   above HIGH   -> assertive language
#   above MEDIUM -> qualified language
#   above LOW    -> tentative language
#   below 0.30   -> honest uncertainty
CONFIDENCE_HIGH = 0.8
CONFIDENCE_MEDIUM = 0.5
CONFIDENCE_LOW = 0.3
136
+
137
# Segment types available for response construction
SEGMENT_TYPES = [
    "introduction", "main_explanation", "supporting_detail",
    "inference", "context", "acknowledgment_of_uncertainty",
    "suggestion", "conclusion", "example",
    "comparison", "elaboration",
]
151
+
152
# Intent types for query classification. The quoted phrases are sample
# Indonesian queries, with an English gloss in parentheses.
INTENT_TYPES = [
    # "Apa itu X?" / "Jelaskan X" (What is X? / Explain X)
    "explain",
    # "Apa hubungan X dengan Y?" (How is X related to Y?)
    "relation",
    # "Bagaimana cara X?" (How do I X?)
    "how_to",
    # "Bandingkan X dan Y" (Compare X and Y)
    "compare",
    # "Definisi X" (Definition of X)
    "define",
    # "Sebutkan X" (List X)
    "list",
    # "Mengapa X?" (Why X?)
    "cause",
    # "Pendapat tentang X?" (Opinion about X?)
    "opinion",
    # Catch-all
    "general",
    # "Halo" etc. (Hello)
    "greeting",
    # Continues previous context
    "followup",
]
166
+
167
+
168
+ # ═══════════════════════════════════════════════════════════
169
+ # CONVERSATION & SESSION
170
+ # ═══════════════════════════════════════════════════════════
171
+
172
# Number of conversation turns kept in memory
CONTEXT_WINDOW_TURNS = 10
# A session expires after this many minutes of inactivity
SESSION_TIMEOUT_MINUTES = 30
# Maximum simultaneous conversations
MAX_CONCURRENT_SESSIONS = 100
# Seconds between session cleanup sweeps (5 minutes)
SESSION_CLEANUP_INTERVAL = 5 * 60
176
+
177
+
178
+ # ═══════════════════════════════════════════════════════════
179
+ # KEEP-ALIVE
180
+ # ═══════════════════════════════════════════════════════════
181
+
182
# Self-ping period in seconds (every 5 minutes)
KEEP_ALIVE_INTERVAL = 5 * 60
# Can be disabled for local development
KEEP_ALIVE_ENABLED = True
184
+
185
+
186
+ # ═══════════════════════════════════════════════════════════
187
+ # DATA INPUT
188
+ # ═══════════════════════════════════════════════════════════
189
+
190
# The "data" folder that sits next to this module, as an absolute path
DATA_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "data",
)
# File extensions accepted for ingestion
SUPPORTED_DATA_EXTENSIONS = [".jsonl"]
# Seconds between /data/ folder scans
FILE_SCAN_INTERVAL = 30
# Max lines processed per ingest cycle (prevents blocking)
MAX_LINES_PER_INGEST = 10_000
194
+
195
# Core data types, assembled from three groups in a fixed order:
# knowledge, relational, structured.
CORE_DATA_TYPES = [
    # Knowledge
    *(
        "fact", "definition", "explanation", "description", "property",
        "statistic", "measurement", "term", "abbreviation", "jargon",
        "slang", "idiom", "synonym", "antonym", "quote", "rule",
        "example", "analogy", "opinion", "paragraph",
    ),
    # Relational
    *(
        "relation", "cause_effect", "comparison", "hierarchy",
        "composition", "dependency", "contradiction", "timeline",
    ),
    # Structured
    *("process", "procedure", "event", "history", "change", "qa"),
]
208
+
209
# Edge relation types: relations that come from data first, followed by
# relations produced by inference.
EDGE_RELATION_TYPES = [
    # From data
    *(
        "is_a", "part_of", "has", "located_in", "created_by",
        "used_for", "causes", "prevents", "requires", "contains",
        "member_of", "opposite_of", "synonym_of", "defined_as",
        "example_of", "follows", "precedes", "related_to",
    ),
    # From inference
    *("similar_to", "inferred_relation", "instance_of", "analogous_to"),
]
219
+
220
+
221
+ # ═══════════════════════════════════════════════════════════
222
+ # NODE ID GENERATION
223
+ # ═══════════════════════════════════════════════════════════
224
+
225
def generate_node_id(content: str, node_type: str = "") -> str:
    """Generate a deterministic node ID from content.

    The type and the content are each stripped of surrounding whitespace,
    joined as ``"<node_type>:<content>"`` and lower-cased before hashing, so
    inputs that differ only in case or surrounding whitespace map to the
    same ID.

    Args:
        content: Text content of the node.
        node_type: Optional type label mixed into the hash.

    Returns:
        ``"n_"`` followed by the first 16 hex characters of the SHA-256
        digest of the normalized string.
    """
    # Strip each component separately: stripping only the joined string would
    # leave whitespace adjacent to the ":" separator (leading whitespace of
    # content, trailing whitespace of node_type) and produce a different ID
    # for the same logical node.
    normalized = f"{node_type.strip()}:{content.strip()}".lower()
    return "n_" + hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:16]
229
+
230
+
231
def generate_edge_id(from_id: str, to_id: str, relation: str) -> str:
    """Build a deterministic edge ID from its endpoints and relation.

    The three components are joined with ``|`` and hashed with SHA-256.

    Returns:
        ``"e_"`` followed by the first 16 hex characters of the digest.
    """
    key = "{}|{}|{}".format(from_id, to_id, relation)
    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
    return f"e_{digest[:16]}"
235
+
236
+
237
def generate_chain_id(path: list) -> str:
    """Build a deterministic chain ID from a path.

    Each path element is converted with ``str()``, the pieces are joined
    with ``|`` and the result is hashed with SHA-256.

    Returns:
        ``"c_"`` followed by the first 16 hex characters of the digest.
    """
    joined = "|".join(map(str, path))
    hasher = hashlib.sha256()
    hasher.update(joined.encode("utf-8"))
    return "c_" + hasher.hexdigest()[:16]
241
+
242
+
243
def generate_session_id() -> str:
    """Return a fresh random session ID.

    Returns:
        ``"s_"`` followed by 24 random hex characters (12 random bytes).
    """
    token = secrets.token_hex(12)
    return f"s_{token}"
246
+
247
+
248
+ # ═══════════════════════════════════════════════════════════
249
+ # INTELLIGENCE SCORE WEIGHTS
250
+ # ═══════════════════════════════════════════════════════════
251
+
252
# Weight of each metric in the intelligence score.
INTELLIGENCE_WEIGHTS = dict(
    log_nodes=0.15,
    log_edges=0.15,
    avg_connections=0.15,
    max_abstraction_depth=0.15,
    avg_chain_length=0.15,
    inference_ratio=0.10,
    avg_confidence=0.15,
)
261
+
262
+
263
+ # ═══════════════════════════════════════════════════════════
264
+ # LOGGING
265
+ # ═══════════════════════════════════════════════════════════
266
+
267
# Log level name, taken from the LOG_LEVEL environment variable
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
# Boolean switches: only the (case-insensitive) value "true" turns them on.
# Thinking details default to off, API request logging defaults to on.
LOG_THINKING_DETAILS = "true" == os.getenv("LOG_THINKING", "false").lower()
LOG_API_REQUESTS = "true" == os.getenv("LOG_API", "true").lower()
270
+
271
+
272
+ # ═══════════════════════════════════════════════════════════
273
+ # STARTUP VALIDATION
274
+ # ═══════════════════════════════════════════════════════════
275
+
276
def validate_config():
    """Validate critical configuration on startup.

    Checks that the TiDB connection settings are present, makes sure the
    data directory exists (creating it when missing), and sanity-checks a
    few tuning parameters. Every finding is printed with a
    ``[CONFIG WARNING]`` or ``[CONFIG ERROR]`` prefix.

    Returns:
        bool: True when no errors were found (warnings are allowed),
        False otherwise.
    """
    errors = []
    warnings = []

    # TiDB — required for persistence
    if not TIDB_HOST:
        errors.append("TIDB_HOST not set. Database persistence will not work.")
    if not TIDB_USER:
        errors.append("TIDB_USER not set.")
    if not TIDB_PASSWORD:
        errors.append("TIDB_PASSWORD not set.")

    # Data directory. isdir() rather than exists(): if DATA_DIR exists but is
    # a regular file, makedirs() raises and the problem is reported instead
    # of being silently accepted.
    if not os.path.isdir(DATA_DIR):
        try:
            os.makedirs(DATA_DIR, exist_ok=True)
        except OSError as e:
            errors.append(f"Cannot create data directory {DATA_DIR}: {e}")
        else:
            warnings.append(f"Created data directory: {DATA_DIR}")

    # Sanity checks on parameters (warnings only; they never fail validation)
    if not 32 <= VECTOR_DIM <= 1024:
        warnings.append(f"VECTOR_DIM={VECTOR_DIM} outside recommended range [32, 1024]")

    if not 0.3 <= SIMILARITY_THRESHOLD <= 0.95:
        warnings.append(f"SIMILARITY_THRESHOLD={SIMILARITY_THRESHOLD} may cause too many/few connections")

    if not 0.5 <= INFERENCE_DECAY <= 0.99:
        warnings.append(f"INFERENCE_DECAY={INFERENCE_DECAY} outside recommended range [0.5, 0.99]")

    if MAX_ABSTRACTION_DEPTH > 10:
        warnings.append(f"MAX_ABSTRACTION_DEPTH={MAX_ABSTRACTION_DEPTH} very high, may cause slow clustering")

    # Report
    for w in warnings:
        print(f"[CONFIG WARNING] {w}")
    for e in errors:
        print(f"[CONFIG ERROR] {e}")

    if errors:
        print(f"[CONFIG] {len(errors)} error(s) found. System may not function correctly.")
        return False

    print(f"[CONFIG] Validation passed. {len(warnings)} warning(s).")
    return True
322
+
323
+
324
def print_config_summary():
    """Print a configuration summary that leaves out secrets.

    The TiDB host is reported only as SET / NOT SET; credentials and the
    API key are not printed.
    """
    rule = "=" * 55
    tidb_host_state = "SET" if TIDB_HOST else "NOT SET"
    keep_alive_state = "ON" if KEEP_ALIVE_ENABLED else "OFF"

    report = (
        rule,
        " COGNI-ENGINE v1 — Configuration",
        rule,
        f" Port: {PORT}",
        f" TiDB Host: {tidb_host_state}",
        f" TiDB Database: {TIDB_DATABASE}",
        f" Vector Dim: {VECTOR_DIM}",
        f" Data Dir: {DATA_DIR}",
        f" Similarity Thresh: {SIMILARITY_THRESHOLD}",
        f" Max Traversal: {MAX_TRAVERSAL_DEPTH}",
        f" Max Abstraction: {MAX_ABSTRACTION_DEPTH}",
        f" Think Fast: {THINKING_INTERVAL_FAST}s",
        f" Think Slow: {THINKING_INTERVAL_SLOW}s",
        f" Keep-Alive: {keep_alive_state}",
        f" Log Level: {LOG_LEVEL}",
        rule,
    )
    for line in report:
        print(line)