Spaces:

build-small-hackathon
/

loosecanvas

Sleeping

loosecanvas / scripts /text_graph_article_probe.py

Joshua Sundance Bailey

loosecanvas: local AI thought-mapping canvas with a trust-tagged knowledge graph

6d1438c 16 days ago

4.41 kB

	"""Throwaway: measure text->graph extraction on a realistic large (wiki-sized) input.

	Builds a ~24 KB multi-paragraph article with many DISTINCT concepts (so dedup does
	not collapse it), then times the real adapter end-to-end and reports chunk count,
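	node/edge totals, and wall time. The result is printed under a SEQUENTIAL label as
	the baseline for comparing against the proposed concurrent path; this script itself
	performs only that single timed run. Not a unit test; safe to delete.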
	"""

	from __future__ import annotations

	import asyncio
	import time

	from loosecanvas.extractors.text_graph_adapter import (
	_chunk_text,
	extract_graph_from_text,
	)
	from loosecanvas.llm_client import LLMClient

	# Seed corpus for the synthetic article: 12 paragraphs, each on a different ML/CS
	# topic, so the generated text carries a wide concept vocabulary (mimics the breadth
	# of a long wiki article). _build_article cycles through these in order, prefixing
	# each with a numbered section header, until the target size (~24 KB by default) is
	# reached; the paragraph text itself repeats verbatim, only the header number varies.
	# Each entry is one string built from adjacent literals (implicit concatenation).
	_PARAS = [
	    # Gradient descent / learning rate
	    "Gradient descent is an iterative optimization algorithm that minimizes a loss "
	    "function by stepping parameters in the direction of the negative gradient. The "
	    "learning rate controls the step size and critically affects convergence.",
	    # Backpropagation
	    "Backpropagation computes gradients of the loss with respect to every weight using "
	    "reverse-mode automatic differentiation and the chain rule, enabling efficient "
	    "training of deep neural networks.",
	    # Overfitting and regularization
	    "Overfitting occurs when a model memorizes noise in the training data rather than "
	    "the underlying signal. Regularization techniques such as dropout and weight decay "
	    "counteract overfitting and improve generalization.",
	    # Convolutional networks
	    "Convolutional neural networks apply learnable filters across spatial dimensions, "
	    "exploiting translation invariance. Pooling layers downsample feature maps and "
	    "reduce computational cost while preserving salient activations.",
	    # Recurrent networks / LSTM
	    "Recurrent neural networks process sequences by maintaining a hidden state. Long "
	    "short-term memory units use gating mechanisms to mitigate the vanishing gradient "
	    "problem that plagues vanilla recurrent architectures.",
	    # Transformers / attention
	    "The transformer architecture replaces recurrence with self-attention, allowing "
	    "each token to attend to every other token. Multi-head attention captures diverse "
	    "relationships, and positional encodings inject sequence order.",
	    # Reinforcement learning
	    "Reinforcement learning trains an agent to maximize cumulative reward through "
	    "interaction with an environment. The value function estimates expected return, "
	    "while the policy maps states to actions.",
	    # Support vector machines
	    "Support vector machines find the maximum-margin hyperplane separating classes. "
	    "The kernel trick maps inputs into a higher-dimensional space where a linear "
	    "separator corresponds to a nonlinear boundary in the original space.",
	    # Principal component analysis
	    "Principal component analysis reduces dimensionality by projecting data onto the "
	    "orthogonal directions of greatest variance. The eigenvectors of the covariance "
	    "matrix define these principal components.",
	    # Bayesian inference
	    "Bayesian inference updates a prior distribution into a posterior using observed "
	    "evidence via Bayes' theorem. The likelihood quantifies how probable the data are "
	    "under a given hypothesis.",
	    # Decision trees / random forests
	    "Decision trees recursively partition the feature space using information gain or "
	    "Gini impurity. Random forests aggregate many decorrelated trees to reduce variance "
	    "and improve robustness.",
	    # Generative adversarial networks
	    "Generative adversarial networks pit a generator against a discriminator in a "
	    "minimax game. The generator learns to synthesize realistic samples while the "
	    "discriminator learns to distinguish real from fake.",
	]


	def _build_article(target_chars: int = 24000) -> str:
	    """Assemble the synthetic article fed to the extractor.

	    Sections are emitted one at a time, each consisting of a numbered
	    ``== Section N ==`` header (surrounded by blank lines) followed by the
	    next paragraph from ``_PARAS``, wrapping around when the list is exhausted.
	    Emission stops as soon as the accumulated length reaches ``target_chars``,
	    so the result is at least that long and overshoots by less than one section.

	    Args:
	        target_chars: Minimum length of the article in characters. A value
	            of zero or less yields an empty string because no section is emitted.

	    Returns:
	        All emitted sections concatenated in order.
	    """
	    sections: list[str] = []
	    total_len = 0
	    section_no = 0
	    while total_len < target_chars:
	        # Pick the topic with the zero-based counter, then bump it so the
	        # header shows a one-based section number.
	        topic = _PARAS[section_no % len(_PARAS)]
	        section_no += 1
	        section = f"\n\n== Section {section_no} ==\n\n{topic}"
	        sections.append(section)
	        total_len += len(section)
	    return "".join(sections)


	async def main() -> None:
	    """Run one timed extraction over the synthetic article and print the stats.

	    Prints two lines: the article length with the number of chunks that
	    ``_chunk_text`` produces for it, then the wall time of a single
	    ``extract_graph_from_text`` call together with the node and edge counts of
	    the returned graph and the average time per chunk.

	    NOTE(review): the per-chunk average divides by the chunk count computed
	    here; it presumably matches the chunking the adapter does internally --
	    confirm against ``extract_graph_from_text``.
	    """
	    text = _build_article()
	    n_chunks = len(_chunk_text(text))
	    print(f"article chars: {len(text)} chunks: {n_chunks}")

	    # Build the client before starting the clock so only the extraction call is timed.
	    llm = LLMClient()
	    started = time.perf_counter()
	    result = await extract_graph_from_text(text, "ML Survey", llm)
	    elapsed = time.perf_counter() - started

	    # max(1, ...) guards the division if chunking ever yields no chunks.
	    per_chunk = elapsed / max(1, n_chunks)
	    summary = (
	        f"SEQUENTIAL: wall={elapsed:.1f}s nodes={len(result.nodes)} edges={len(result.edges)} "
	        f"(~{per_chunk:.1f}s/chunk)"
	    )
	    print(summary)


	# Script entry point: run the probe only when this file is executed directly,
	# not when it is imported. asyncio.run drives the async main() to completion.
	if __name__ == "__main__":
	    asyncio.run(main())