""" KNOWLEDGE STORE — Multi-Container Persistent Knowledge Base Docker SDK / FastAPI — no Gradio, no CSP Containers & their knowledge decay models: medical — fast decay (outdated = dangerous). Half-life 180 days. legal — slow decay (laws change rarely). Half-life 730 days. company — mixed: SOPs stable (HL 365), market/people data volatile (HL 30). research — citation boost on create, then slow decay. HL 540 days. tech — very fast decay (versions). HL 90 days. prompts — no decay (prompts are reusable). history — ANTI-decay: value increases with age. personal — moderate decay (preferences drift). HL 180 days. finance — extreme decay (market data). HL 7 days. operations — moderate. HL 180 days. Knowledge Value Score = base_importance * time_factor(container) * access_bonus Time factor varies per container and uses exponential decay / growth. Search types: keyword — simple full-text (TF-IDF-like scoring) time — recency or historical filter tag — exact/prefix tag match container — container-scoped list semantic — keyword with cosine-like tf scoring (no embeddings, pure Python) value — sorted by current knowledge value score MCP tools: ks_write, ks_read, ks_search, ks_list, ks_delete, ks_containers, ks_stats, ks_top_value """ import os, uuid, json, math, time, re, asyncio from pathlib import Path from datetime import datetime, timezone from typing import Optional, List from collections import defaultdict, Counter from fastapi import FastAPI, HTTPException, Request from fastapi.responses import JSONResponse, HTMLResponse, StreamingResponse BASE = Path(__file__).parent STORE = BASE / "store" STORE.mkdir(exist_ok=True) # ── Container definitions ───────────────────────────────────────── CONTAINERS = { "medical": { "label": "Medical", "icon": "⚕", # caduceus-ish "color": "#ef4444", "description": "Clinical guidelines, drug refs, protocols, case notes", "decay_model": "exponential", "half_life_days": 180, "warn_after_days": 90, "folders": ["guidelines", "drugs", "protocols", 
"cases", "research"], "note": "Outdated medical info can be dangerous. Review regularly.", "badge": "CRITICAL-DECAY", }, "legal": { "label": "Legal", "icon": "⚖", "color": "#8b5cf6", "description": "Contracts, regulations, compliance, case law, GDPR", "decay_model": "slow_exponential", "half_life_days": 730, "warn_after_days": 365, "folders": ["contracts", "regulations", "gdpr", "caselaw", "templates"], "note": "Laws change slowly but verify jurisdiction and amendment dates.", "badge": "SLOW-DECAY", }, "company": { "label": "Company", "icon": "🏢", "color": "#0ea5e9", "description": "SOPs, org charts, projects, market intel, people", "decay_model": "tiered", # folder-dependent "half_life_days": 180, "warn_after_days": 90, "folders": ["sop", "projects", "people", "market", "strategy"], "folder_half_lives": {"sop":365, "projects":90, "people":60, "market":14, "strategy":180}, "note": "Market and people data decay fast. SOPs are more stable.", "badge": "TIERED-DECAY", }, "research": { "label": "Research", "icon": "🔬", "color": "#06b6d4", "description": "Papers, experiments, hypotheses, datasets, findings", "decay_model": "citation_curve", # peaks at 30 days then slow decay "half_life_days": 540, "peak_days": 30, "warn_after_days": 365, "folders": ["papers", "experiments", "datasets", "hypotheses", "notes"], "note": "New research has highest relevance. Classic papers retain value.", "badge": "CITATION-CURVE", }, "tech": { "label": "Tech / Docs", "icon": "💻", "color": "#22d3ee", "description": "API docs, code snippets, architecture, DevOps, configs", "decay_model": "versioned_decay", "half_life_days": 90, "warn_after_days": 45, "folders": ["api", "snippets", "architecture", "devops", "configs"], "note": "Software versions change fast. 
Tag with version numbers.", "badge": "FAST-DECAY", }, "prompts": { "label": "Prompts", "icon": "⚡", "color": "#f59e0b", "description": "LLM prompts, system instructions, few-shot examples, chains", "decay_model": "stable", # no decay "half_life_days": None, "warn_after_days": None, "folders": ["system", "chains", "fewshot", "templates", "experiments"], "note": "Prompts are reusable. Value does not decay.", "badge": "STABLE", }, "history": { "label": "History / Archive", "icon": "🕮", "color": "#d97706", "description": "Historical records, past decisions, retrospectives, logs", "decay_model": "anti_decay", # increases in value with age "half_life_days": None, "warn_after_days": None, "folders": ["decisions", "retrospectives", "logs", "milestones", "archive"], "note": "Historical context becomes MORE valuable over time.", "badge": "ANTI-DECAY", }, "personal": { "label": "Personal", "icon": "👤", "color": "#ec4899", "description": "Goals, notes, preferences, journals, ideas", "decay_model": "drift_decay", "half_life_days": 180, "warn_after_days": 120, "folders": ["goals", "notes", "ideas", "journal", "preferences"], "note": "Preferences and goals drift over time. Review periodically.", "badge": "DRIFT-DECAY", }, "finance": { "label": "Finance", "icon": "📈", "color": "#10b981", "description": "Market data, reports, forecasts, invoices, budgets", "decay_model": "extreme_decay", "half_life_days": 7, "warn_after_days": 3, "folders": ["market", "reports", "forecasts", "invoices", "budgets"], "note": "Market data decays within hours. Financial reports within weeks.", "badge": "EXTREME-DECAY", }, "operations": { "label": "Operations", "icon": "⚙", "color": "#84cc16", "description": "Runbooks, incidents, on-call, monitoring, deployments", "decay_model": "operational_decay", "half_life_days": 180, "warn_after_days": 60, "folders": ["runbooks", "incidents", "oncall", "monitoring", "deployments"], "note": "Runbooks age fast in fast-moving infra. 
# ── Knowledge value scoring ───────────────────────────────────────
_SECONDS_PER_DAY = 86400


def _age_days(doc: dict) -> float:
    """Return the age of *doc* in days, based on its ``created_at`` timestamp.

    A missing or ``None`` timestamp counts as "created now". Timestamps in the
    future (clock skew, bad imports) are clamped to an age of zero so that the
    growth and citation curves never see a negative age.
    """
    now = time.time()
    created = doc.get("created_at")
    if created is None:
        created = now
    return max(0.0, (now - created) / _SECONDS_PER_DAY)


def _half_life_decay(age_days: float, half_life_days: float) -> float:
    """Return ``0.5 ** (age_days / half_life_days)``."""
    return math.exp(-math.log(2) * age_days / half_life_days)


def knowledge_value(doc: dict) -> float:
    """Compute the current 0-100 value score for a document.

    The score is a weighted sum (not a product) of three components, each
    clamped to the range 0..1:

    * importance / 10            - weight 0.5
    * log1p(access_count) / 10   - weight 0.1
    * container time factor      - weight 0.4

    The time factor depends on the ``decay_model`` of the document's
    container; an unknown container is scored with the ``tech`` settings.

    Args:
        doc: Document dict. Reads ``container``, ``importance``,
            ``access_count``, ``created_at`` and, for tiered containers,
            ``folder``.

    Returns:
        The score rounded to one decimal place.
    """
    cfg = CONTAINERS.get(doc.get("container", "tech"), CONTAINERS["tech"])

    raw_importance = doc.get("importance", 5)
    base = float(5 if raw_importance is None else raw_importance) / 10.0
    # Clamp so an out-of-range importance cannot push the score past 100.
    base = max(0.0, min(1.0, base))

    # log1p raises for values <= -1, so never feed it a negative counter.
    accesses = max(0, doc.get("access_count", 0) or 0)
    access_bonus = min(1.0, math.log1p(accesses) / 10)

    age_days = _age_days(doc)
    model = cfg.get("decay_model", "exponential")
    hl = cfg.get("half_life_days") or 365

    if model == "stable":
        t_factor = 1.0
    elif model == "anti_decay":
        # Value grows with age: 0.5 for a brand-new document, approaching 1.0.
        t_factor = 0.5 + 0.5 * math.tanh(age_days / 365)
    elif model == "citation_curve":
        peak = cfg.get("peak_days", 30) or 0
        if peak > 0 and age_days <= peak:
            # Linear ramp from 0.6 up to 1.0 at the peak.
            t_factor = 0.6 + 0.4 * (age_days / peak)
        else:
            t_factor = _half_life_decay(age_days - peak, hl)
    elif model == "tiered":
        # Per-folder half-life; fall back to the container half-life when the
        # folder is not listed (or is listed with an unusable value of 0).
        folder_half_lives = cfg.get("folder_half_lives", {})
        folder_hl = folder_half_lives.get(doc.get("folder", "")) or hl
        t_factor = _half_life_decay(age_days, folder_hl)
    elif model == "extreme_decay":
        t_factor = _half_life_decay(age_days, max(1, hl))
    else:
        # Standard exponential decay.
        t_factor = _half_life_decay(age_days, hl)

    t_factor = max(0.0, min(1.0, t_factor))
    score = (base * 0.5 + access_bonus * 0.1 + t_factor * 0.4) * 100
    return round(score, 1)


def freshness_label(doc: dict) -> str:
    """Classify a document's freshness from its age and container settings.

    Returns:
        ``"STABLE"`` or ``"ARCHIVAL"`` for the ``stable`` / ``anti_decay``
        models; ``"OK"`` when the container defines no ``warn_after_days``
        (this includes unknown containers); otherwise ``"STALE"`` once the age
        exceeds twice the warning threshold, ``"AGING"`` once it exceeds the
        threshold, and ``"FRESH"`` below it.
    """
    cfg = CONTAINERS.get(doc.get("container", "tech"), {})
    model = cfg.get("decay_model", "exponential")
    if model == "stable":
        return "STABLE"
    if model == "anti_decay":
        return "ARCHIVAL"

    warn = cfg.get("warn_after_days")
    if not warn:
        return "OK"

    age_days = _age_days(doc)
    if age_days > warn * 2:
        return "STALE"
    if age_days > warn:
        return "AGING"
    return "FRESH"


# ── Storage utils ─────────────────────────────────────────────────
def now_ts():
    """Return the current Unix time as whole seconds."""
    return int(time.time())


def _within_store(path: Path) -> bool:
    """Return True when *path* resolves to STORE itself or a location below it."""
    root = STORE.resolve()
    target = path.resolve()
    return target == root or root in target.parents


def doc_path(container: str, folder: str, did: str, create: bool = True) -> Path:
    """Return the JSON file path for a document.

    Args:
        container: Container directory name below STORE.
        folder: Folder name below the container.
        did: Document id; the file is named ``<did>.json``.
        create: When true (the default) the folder is created if missing.
            Pass ``False`` for lookups so that reads do not create
            directories.

    Raises:
        ValueError: If the components resolve to a location outside STORE
            (for example via ``..`` or an absolute path).
    """
    directory = STORE / container / folder
    target = directory / f"{did}.json"
    # The components can come from request data, so refuse anything that
    # would leave the store directory.
    if not _within_store(target):
        raise ValueError(f"document path escapes the store: {container!r}/{folder!r}/{did!r}")
    if create:
        directory.mkdir(parents=True, exist_ok=True)
    return target


def read_doc(container: str, folder: str, did: str) -> Optional[dict]:
    """Load a document from disk.

    Returns:
        The parsed document, or ``None`` when the file does not exist or the
        requested location lies outside STORE. A file that exists but holds
        invalid JSON still raises, as before.
    """
    try:
        path = doc_path(container, folder, did, create=False)
    except ValueError:
        return None
    try:
        # Read directly instead of exists()-then-read to avoid the race.
        raw = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return None
    return json.loads(raw)


def write_doc(doc: dict):
    """Persist *doc* as JSON, stamping ``updated_at`` with the current time.

    The file is written as UTF-8 explicitly: the JSON is produced with
    ``ensure_ascii=False``, so relying on the platform default encoding can
    fail or corrupt non-ASCII text.

    Raises:
        ValueError: If the document's container/folder/id point outside STORE.
    """
    doc["updated_at"] = now_ts()
    payload = json.dumps(doc, indent=2, ensure_ascii=False)
    doc_path(doc["container"], doc["folder"], doc["id"]).write_text(
        payload, encoding="utf-8"
    )


def all_docs(container: str = "", folder: str = "", limit: int = 500) -> List[dict]:
    """Load stored documents, optionally restricted to a container and folder.

    Files are visited in reverse path order. Unreadable files, invalid JSON
    and JSON values that are not objects are skipped (best effort).

    Args:
        container: Container directory to scan; empty scans the whole store.
        folder: When set, keep only documents whose ``folder`` field matches.
        limit: Maximum number of documents to return; values <= 0 return
            an empty list.
    """
    base = STORE / container if container else STORE
    if limit <= 0 or not _within_store(base) or not base.is_dir():
        return []

    found: List[dict] = []
    for path in sorted(base.rglob("*.json"), reverse=True):
        try:
            doc = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSON and Unicode decoding errors.
            continue
        if not isinstance(doc, dict):
            continue
        if folder and doc.get("folder") != folder:
            continue
        found.append(doc)
        if len(found) >= limit:
            break
    return found


def _clean_text(value) -> str:
    """Return *value* as a stripped string; ``None`` becomes an empty string."""
    return "" if value is None else str(value).strip()


def _normalize_tags(raw) -> List[str]:
    """Return tags as stripped, lower-cased, non-empty strings.

    Accepts ``None``, a comma-separated string, or any iterable of values.
    """
    if raw is None:
        return []
    if isinstance(raw, str):
        raw = raw.split(",")
    cleaned = (str(item).strip().lower() for item in raw)
    return [tag for tag in cleaned if tag]


def new_doc(data: dict) -> dict:
    """Create a document from user-supplied *data*, persist it and return it.

    Missing or empty fields fall back to defaults: container ``"tech"``, the
    first configured folder of the container (``"general"`` for unknown
    containers), title ``"Untitled"`` and importance 5. Importance is clamped
    to 0..10. ``body`` falls back to ``content``.

    Raises:
        ValueError: If ``importance`` is not convertible to ``int`` or the
            container/folder point outside STORE.
    """
    container = data.get("container") or "tech"
    cfg = CONTAINERS.get(container, {})
    folders = cfg.get("folders", ["general"])
    folder = data.get("folder") or (folders[0] if folders else "general")

    raw_importance = data.get("importance")
    importance = 5 if raw_importance is None else int(raw_importance)

    created = now_ts()
    doc = {
        "id": uuid.uuid4().hex[:10],
        "container": container,
        "folder": folder,
        "title": _clean_text(data.get("title")) or "Untitled",
        "body": _clean_text(data.get("body") or data.get("content")),
        "summary": _clean_text(data.get("summary")),
        "tags": _normalize_tags(data.get("tags")),
        "importance": max(0, min(10, importance)),
        "author": _clean_text(data.get("author")),
        "source": _clean_text(data.get("source")),
        "version": _clean_text(data.get("version")),
        "expires_hint": data.get("expires_hint"),  # ISO date string, optional
        "links": data.get("links", []),  # related doc IDs
        "metadata": data.get("metadata", {}),
        "access_count": 0,
        "created_at": created,
        "updated_at": created,
        "last_accessed": None,
    }
    write_doc(doc)
    return doc


# ── Search engine ─────────────────────────────────────────────────
_TOKEN_RE = re.compile(r"[a-zA-Z0-9\u00C0-\u024F]+")


def tokenize(text: str) -> List[str]:
    """Lower-case *text* and split it into alphanumeric tokens.

    Tokens are runs of ASCII letters/digits and characters in the range
    U+00C0..U+024F. ``None`` or empty input yields an empty list.
    """
    return _TOKEN_RE.findall((text or "").lower())


def tf_score(query_tokens: List[str], doc: dict) -> float:
    """Score *doc* against *query_tokens* using term frequencies.

    For every query token the score adds its relative frequency over the
    document's title, body, summary and tags, plus 2 for each occurrence of
    the token in the title. Fields that are missing or ``None`` count as
    empty.
    """
    title = str(doc.get("title") or "")
    tags = doc.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]
    haystack = " ".join([
        title,
        str(doc.get("body") or ""),
        str(doc.get("summary") or ""),
        " ".join(str(tag) for tag in tags),
    ])

    counts = Counter(tokenize(haystack))
    total = sum(counts.values()) or 1
    title_counts = Counter(tokenize(title))

    score = sum(counts[token] / total for token in query_tokens)
    # Title boost: a flat +2 per title occurrence, on top of the frequency term.
    score += sum(title_counts[token] * 2 for token in query_tokens)
    return score


def search_docs(query: str = "", container: str = "", folder: str = "",
                tag: str = "", author: str = "", sort_by: str = "relevance",
                freshness: str = "", limit: int = 20) -> List[dict]:
    """Filter and rank stored documents.

    Args:
        query: Free-text query; documents scoring 0 against it are dropped.
        container: Restrict the scan to one container.
        folder: Restrict to documents with this ``folder`` value.
        tag: Keep only documents carrying exactly this tag (case-insensitive).
        author: Keep only documents by this author (case-insensitive).
        sort_by: ``"value"``, ``"newest"``, ``"oldest"``, ``"importance"``;
            anything else sorts by relevance, then knowledge value.
        freshness: ``"fresh"`` keeps FRESH documents, ``"stale"`` keeps STALE
            and AGING ones; other values do not filter.
        limit: Maximum number of results; negative values are treated as 0.

    Returns:
        At most *limit* documents. Only the first 500 stored documents are
        considered.
    """
    query_tokens = tokenize(query) if query else []
    wanted_tag = tag.strip().lower() if tag else ""
    wanted_author = author.lower() if author else ""
    wanted_freshness = freshness.strip().lower() if freshness else ""

    ranked = []
    for doc in all_docs(container, folder, 500):
        if wanted_tag and wanted_tag not in (doc.get("tags") or []):
            continue
        if wanted_author and str(doc.get("author") or "").lower() != wanted_author:
            continue
        if wanted_freshness:
            label = freshness_label(doc)
            if wanted_freshness == "fresh" and label != "FRESH":
                continue
            if wanted_freshness == "stale" and label not in ("STALE", "AGING"):
                continue
        score = tf_score(query_tokens, doc) if query_tokens else 1.0
        if query_tokens and score == 0:
            continue
        ranked.append((score, doc))

    if sort_by == "value":
        ranked.sort(key=lambda pair: -knowledge_value(pair[1]))
    elif sort_by == "newest":
        ranked.sort(key=lambda pair: -(pair[1].get("created_at") or 0))
    elif sort_by == "oldest":
        ranked.sort(key=lambda pair: pair[1].get("created_at") or 0)
    elif sort_by == "importance":
        ranked.sort(key=lambda pair: (-pair[1].get("importance", 5), -pair[0]))
    else:
        ranked.sort(key=lambda pair: (-pair[0], -knowledge_value(pair[1])))

    # A negative limit would otherwise slice from the end of the list.
    return [doc for _, doc in ranked[:max(0, limit)]]
Architecture", "body":"GPU workers use a polling architecture. Workers call GET /api/queue every 2 seconds to check for pending jobs. On job acquisition, worker POSTs result to /api/results/{job_id}. No inbound connections required — fully firewall-friendly. LM Studio listens on localhost:1234. Jobs include: model_id, prompt, max_tokens, temperature, stream flag.", "summary":"Firewall-friendly polling design for GPU inference workers", "tags":["ki-fusion-labs","gpu","architecture","llm","inference"],"importance":9,"author":"christof","version":"v2"}, {"container":"tech","folder":"api","title":"FORGE Skill Registry API Reference", "body":"POST /api/v1/skills — create skill\nGET /api/v1/skills — list (filter: ?category=&tag=)\nGET /api/v1/skills/{id} — get\nPATCH /api/v1/skills/{id} — update\nDELETE /api/v1/skills/{id} — delete\nGET /mcp/sse — MCP SSE stream\nPOST /mcp — MCP JSON-RPC\n\nSkill schema: {id, name, description, category, code, input_schema, output_schema, tags, version, author}", "summary":"FORGE MCP skill registry REST endpoints","tags":["forge","api","mcp","skills"],"importance":8,"author":"christof","version":"1.0"}, {"container":"tech","folder":"devops","title":"HF Spaces Docker SDK Deployment Guide", "body":"CRITICAL: Use sdk: docker in README.md, NOT sdk: gradio.\nGradio SDK CSP blocks ALL """