"""Throwaway benchmark: time text->graph extraction on a large, wiki-sized input.

Assembles a roughly 24 KB multi-section article out of twelve distinct ML/CS
topic paragraphs (the wide concept vocabulary is meant to keep dedup from
collapsing it), runs the real adapter end-to-end once, and prints the chunk
count, node/edge totals, and wall-clock time.

NOTE(review): the original description says this "compares sequential vs the
proposed concurrent path", but the script performs a single timed run and
labels it SEQUENTIAL. Presumably the comparison is made by re-running against
a different adapter build -- confirm.

Not a unit test; safe to delete.
"""

from __future__ import annotations

import asyncio
import time

from loosecanvas.extractors.text_graph_adapter import (
    _chunk_text,
    extract_graph_from_text,
)
from loosecanvas.llm_client import LLMClient

# Twelve distinct ML/CS topic paragraphs. `_build_article` cycles through them
# under numbered section headers until the target size is reached, so the text
# stays broad in vocabulary the way a long wiki article would.
_PARAS = [
    "Gradient descent is an iterative optimization algorithm that minimizes a loss "
    "function by stepping parameters in the direction of the negative gradient. The "
    "learning rate controls the step size and critically affects convergence.",
    "Backpropagation computes gradients of the loss with respect to every weight using "
    "reverse-mode automatic differentiation and the chain rule, enabling efficient "
    "training of deep neural networks.",
    "Overfitting occurs when a model memorizes noise in the training data rather than "
    "the underlying signal. Regularization techniques such as dropout and weight decay "
    "counteract overfitting and improve generalization.",
    "Convolutional neural networks apply learnable filters across spatial dimensions, "
    "exploiting translation invariance. Pooling layers downsample feature maps and "
    "reduce computational cost while preserving salient activations.",
    "Recurrent neural networks process sequences by maintaining a hidden state. Long "
    "short-term memory units use gating mechanisms to mitigate the vanishing gradient "
    "problem that plagues vanilla recurrent architectures.",
    "The transformer architecture replaces recurrence with self-attention, allowing "
    "each token to attend to every other token. Multi-head attention captures diverse "
    "relationships, and positional encodings inject sequence order.",
    "Reinforcement learning trains an agent to maximize cumulative reward through "
    "interaction with an environment. The value function estimates expected return, "
    "while the policy maps states to actions.",
    "Support vector machines find the maximum-margin hyperplane separating classes. "
    "The kernel trick maps inputs into a higher-dimensional space where a linear "
    "separator corresponds to a nonlinear boundary in the original space.",
    "Principal component analysis reduces dimensionality by projecting data onto the "
    "orthogonal directions of greatest variance. The eigenvectors of the covariance "
    "matrix define these principal components.",
    "Bayesian inference updates a prior distribution into a posterior using observed "
    "evidence via Bayes' theorem. The likelihood quantifies how probable the data are "
    "under a given hypothesis.",
    "Decision trees recursively partition the feature space using information gain or "
    "Gini impurity. Random forests aggregate many decorrelated trees to reduce variance "
    "and improve robustness.",
    "Generative adversarial networks pit a generator against a discriminator in a "
    "minimax game. The generator learns to synthesize realistic samples while the "
    "discriminator learns to distinguish real from fake.",
]


def _build_article(target_chars: int = 24000) -> str:
    """Return a synthetic article of at least ``target_chars`` characters.

    Sections are appended one at a time, each a numbered ``== Section N ==``
    header followed by the next paragraph from ``_PARAS`` (cycling once the
    list is exhausted). Appending stops as soon as the running length reaches
    ``target_chars``, so the result normally overshoots the target by part of
    one section. A non-positive target yields the empty string.
    """
    sections: list[str] = []
    written = 0
    while written < target_chars:
        # The 1-based section number doubles as the cursor into _PARAS.
        number = len(sections) + 1
        paragraph = _PARAS[(number - 1) % len(_PARAS)]
        section = f"\n\n== Section {number} ==\n\n{paragraph}"
        sections.append(section)
        written += len(section)
    return "".join(sections)


async def main() -> None:
    """Build the article, run one timed extraction, and print the summary.

    The chunk count comes from a separate ``_chunk_text`` call made only for
    reporting; presumably the adapter chunks the article the same way
    internally -- confirm before trusting the per-chunk figure.
    """
    article = _build_article()
    chunk_count = len(_chunk_text(article))
    print(f"article chars: {len(article)} chunks: {chunk_count}")

    client = LLMClient()

    started = time.perf_counter()
    graph = await extract_graph_from_text(article, "ML Survey", client)
    elapsed = time.perf_counter() - started

    node_count = len(graph.nodes)
    edge_count = len(graph.edges)
    # Guard the divisor so an article that produces no chunks still reports.
    per_chunk = elapsed / max(1, chunk_count)
    print(
        f"SEQUENTIAL: wall={elapsed:.1f}s nodes={node_count} edges={edge_count} "
        f"(~{per_chunk:.1f}s/chunk)"
    )


if __name__ == "__main__":
    asyncio.run(main())