# backend/app/services/memory_manager.py
"""
New service: MemoryManager
Makes Janus genuinely self-improving by maintaining structured memory across sessions:
1. Case deduplication β€” prevents learning from near-identical queries
2. Topic clustering β€” groups cases by domain/intent
3. Pattern extraction β€” identifies what kinds of queries work well vs poorly
4. Knowledge summarization β€” periodically summarizes case clusters into compact knowledge
5. Semantic similarity check β€” uses simple TF-IDF (no GPU needed) to find similar past cases
This replaces the raw JSON scan in adaptive_intelligence.py that caused the O(n)
discovery loop β€” MemoryManager maintains an index so lookups are O(1).
"""
from __future__ import annotations
import json
import math
import logging
import pathlib
import re
import time
from collections import defaultdict
from typing import Any
logger = logging.getLogger(__name__)
try:
from app.config import DATA_DIR
except ImportError:
DATA_DIR = pathlib.Path(__file__).parent.parent / "data"
MEMORY_DIR = pathlib.Path(DATA_DIR) / "memory"
INDEX_FILE = pathlib.Path(DATA_DIR) / "adaptive" / "memory_index.json"
class MemoryManager:
"""
Indexed case memory with similarity search.
    Zero external dependencies; uses plain TF-IDF for similarity.
"""
def __init__(self):
MEMORY_DIR.mkdir(parents=True, exist_ok=True)
INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
        self._index: dict[str, dict] = {}   # case_id -> metadata
        self._tfidf: dict[str, dict] = {}   # case_id -> term frequencies
        self._idf: dict[str, float] = {}    # term -> idf score
self._load_index()
# ── Public API ─────────────────────────────────────────────────────────
def add_case(self, case: dict) -> str:
"""Index a new case. Returns case_id."""
case_id = case.get("id") or case.get("case_id") or _make_id()
query = case.get("query") or case.get("user_input") or ""
domain = case.get("domain") or case.get("route", {}).get("domain", "general")
quality = case.get("quality_score", 0.5)
# Check for near-duplicate
similar = self.find_similar(query, top_k=1)
if similar and similar[0]["score"] > 0.92:
logger.debug("MemoryManager: skipping near-duplicate case (%.2f similar to %s)",
similar[0]["score"], similar[0]["case_id"])
return similar[0]["case_id"]
# Index metadata
self._index[case_id] = {
"id": case_id,
"query": query[:200],
"domain": domain,
"quality": quality,
"ts": time.time(),
}
# Index TF-IDF terms
self._tfidf[case_id] = _term_freq(_tokenise(query))
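        # Recomputing IDF and persisting on every add is O(total vocabulary);
        # acceptable while the store is small, since cases arrive one at a time.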
self._recompute_idf()
self._save_index()
return case_id
def find_similar(self, query: str, top_k: int = 5) -> list[dict]:
"""Return top_k most similar past cases with cosine similarity scores."""
if not self._tfidf:
return []
query_tf = _term_freq(_tokenise(query))
scores = []
for cid, doc_tf in self._tfidf.items():
score = _cosine(query_tf, doc_tf, self._idf)
scores.append((score, cid))
scores.sort(reverse=True)
return [
{"case_id": cid, "score": round(score, 4),
**self._index.get(cid, {})}
for score, cid in scores[:top_k]
if score > 0.05
]
def get_domain_stats(self) -> dict[str, Any]:
"""Return per-domain case counts and average quality."""
domains: dict[str, list] = defaultdict(list)
for meta in self._index.values():
domains[meta.get("domain", "general")].append(meta.get("quality", 0.5))
return {
domain: {
"count": len(scores),
"avg_quality": round(sum(scores) / len(scores), 3) if scores else 0,
}
for domain, scores in domains.items()
}
def get_frequent_patterns(self, min_freq: int = 3) -> list[dict]:
"""
        Extract query terms that appear in ≥ min_freq indexed cases.
Used to build skills / prompt specialisations.
"""
term_counts: dict[str, int] = defaultdict(int)
term_cases: dict[str, list] = defaultdict(list)
        for cid, meta in self._index.items():
            # Deduplicate tokens within a query so each case counts a term once
            for token in set(_tokenise(meta.get("query", ""))):
                if len(token) >= 4:
                    term_counts[token] += 1
                    term_cases[token].append(cid)
patterns = []
for term, count in term_counts.items():
if count >= min_freq:
patterns.append({
"term": term,
"count": count,
"case_ids": term_cases[term][:10],
})
return sorted(patterns, key=lambda x: -x["count"])
def total_cases(self) -> int:
return len(self._index)
# ── Internal ───────────────────────────────────────────────────────────
def _recompute_idf(self):
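        # Smoothed IDF: idf = log((N + 1) / (df + 1)) + 1, so a term appearing in
        # every document still gets weight 1.0, while a term in one of four
        # documents gets log(5/2) + 1 ≈ 1.92.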
N = max(len(self._tfidf), 1)
term_doc_count: dict[str, int] = defaultdict(int)
for doc_tf in self._tfidf.values():
for term in doc_tf:
term_doc_count[term] += 1
self._idf = {
term: math.log((N + 1) / (count + 1)) + 1
for term, count in term_doc_count.items()
}
def _save_index(self):
try:
INDEX_FILE.write_text(json.dumps({
"index": self._index,
"tfidf": self._tfidf,
"idf": self._idf,
"saved": time.time(),
}, indent=2))
except Exception as exc:
logger.warning("MemoryManager: failed to save index: %s", exc)
def _load_index(self):
if not INDEX_FILE.exists():
# Bootstrap from existing case files
self._bootstrap_from_cases()
return
try:
data = json.loads(INDEX_FILE.read_text())
self._index = data.get("index", {})
self._tfidf = data.get("tfidf", {})
self._idf = data.get("idf", {})
logger.info("MemoryManager: loaded %d indexed cases", len(self._index))
except Exception as exc:
logger.warning("MemoryManager: index load failed (%s), rebuilding", exc)
self._bootstrap_from_cases()
def _bootstrap_from_cases(self):
"""Build index from existing case JSON files on first run."""
if not MEMORY_DIR.exists():
return
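        # Cap the bootstrap scan at 500 case files so first start-up stays fast
        # even when the memory directory has grown large.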
for f in list(MEMORY_DIR.glob("*.json"))[:500]:
try:
case = json.loads(f.read_text())
case_id = case.get("id") or case.get("case_id") or f.stem
query = case.get("query") or case.get("user_input") or ""
self._index[case_id] = {
"id": case_id,
"query": query[:200],
"domain": case.get("domain", "general"),
"quality": case.get("quality_score", 0.5),
"ts": case.get("timestamp", time.time()),
}
self._tfidf[case_id] = _term_freq(_tokenise(query))
            except Exception as exc:
                logger.debug("MemoryManager: skipping unreadable case file %s (%s)", f.name, exc)
if self._tfidf:
self._recompute_idf()
self._save_index()
logger.info("MemoryManager: bootstrapped %d cases from disk", len(self._index))
# ── TF-IDF helpers ────────────────────────────────────────────────────────────
_STOPWORDS = {
"the","a","an","is","in","of","for","to","and","or","it","its",
"this","that","with","on","at","by","from","be","are","was","were",
"what","how","why","when","who","which","can","will","do","does",
"i","me","my","we","you","your","he","she","they","them","their",
"about","tell","explain","show","get","give","make","take","find",
}
def _tokenise(text: str) -> list[str]:
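    # e.g. _tokenise("How does TF-IDF work?") -> ["idf", "work"]
    # ("how"/"does" are stopwords; "tf" is shorter than the three letters the regex requires)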
text = (text or "").lower()
tokens = re.findall(r'[a-z]{3,}', text)
return [t for t in tokens if t not in _STOPWORDS]
def _term_freq(tokens: list[str]) -> dict[str, float]:
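    # e.g. _term_freq(["idf", "work", "idf"]) -> {"idf": 2/3, "work": 1/3}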
if not tokens:
return {}
counts: dict[str, int] = defaultdict(int)
for t in tokens:
counts[t] += 1
total = len(tokens)
return {t: c / total for t, c in counts.items()}
def _cosine(a: dict[str, float], b: dict[str, float], idf: dict[str, float]) -> float:
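    # IDF-weighted cosine: every component is scaled by its idf, so shared rare
    # terms dominate the score while ubiquitous terms contribute little.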
common = set(a) & set(b)
if not common:
return 0.0
dot = sum(a[t] * b[t] * idf.get(t, 1.0) ** 2 for t in common)
mag_a = math.sqrt(sum((v * idf.get(t, 1.0)) ** 2 for t, v in a.items()))
mag_b = math.sqrt(sum((v * idf.get(t, 1.0)) ** 2 for t, v in b.items()))
if mag_a == 0 or mag_b == 0:
return 0.0
return dot / (mag_a * mag_b)
def _make_id() -> str:
    # Random 12-hex-char case id; uuid4 already supplies the entropy, no hashing needed.
    import uuid
    return uuid.uuid4().hex[:12]
# Singleton
memory_manager = MemoryManager()
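
# A minimal smoke-test sketch (hypothetical queries; note that add_case persists
# the index under DATA_DIR, so run this against a throwaway data directory):
if __name__ == "__main__":
    cid = memory_manager.add_case({
        "query": "how do I deploy a docker container",
        "domain": "devops",
        "quality_score": 0.8,
    })
    print("indexed:", cid)
    print("similar:", memory_manager.find_similar("deploying docker containers", top_k=3))
    print("domains:", memory_manager.get_domain_stats())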