Spaces:

build-small-hackathon
/

loosecanvas

Sleeping

File size: 4,407 Bytes

6d1438c

"""Throwaway: measure text->graph extraction on a realistic large (wiki-sized) input.

Builds a ~24 KB multi-paragraph article with many DISTINCT concepts (so dedup does
not collapse it), then times the real adapter end-to-end and reports chunk count,
node/edge totals, and wall time. Meant to support comparing sequential vs the proposed
concurrent path; as written it times a single extraction run, labelled SEQUENTIAL.
Not a unit test; safe to delete.
"""

from __future__ import annotations

import asyncio
import time

from loosecanvas.extractors.text_graph_adapter import (
    _chunk_text,
    extract_graph_from_text,
)
from loosecanvas.llm_client import LLMClient

# Corpus of 12 distinct ML/CS topic paragraphs. _build_article cycles through them,
# prefixing each with a numbered section header, until the target size is reached
# (~24 KB by default). The breadth of topics keeps a wide concept vocabulary
# (mimics a long wiki article's breadth).
_PARAS = [
    # Optimization: gradient descent and the learning rate.
    "Gradient descent is an iterative optimization algorithm that minimizes a loss "
    "function by stepping parameters in the direction of the negative gradient. The "
    "learning rate controls the step size and critically affects convergence.",
    # Backpropagation.
    "Backpropagation computes gradients of the loss with respect to every weight using "
    "reverse-mode automatic differentiation and the chain rule, enabling efficient "
    "training of deep neural networks.",
    # Overfitting and regularization.
    "Overfitting occurs when a model memorizes noise in the training data rather than "
    "the underlying signal. Regularization techniques such as dropout and weight decay "
    "counteract overfitting and improve generalization.",
    # Convolutional neural networks.
    "Convolutional neural networks apply learnable filters across spatial dimensions, "
    "exploiting translation invariance. Pooling layers downsample feature maps and "
    "reduce computational cost while preserving salient activations.",
    # Recurrent neural networks and LSTM.
    "Recurrent neural networks process sequences by maintaining a hidden state. Long "
    "short-term memory units use gating mechanisms to mitigate the vanishing gradient "
    "problem that plagues vanilla recurrent architectures.",
    # Transformers and attention.
    "The transformer architecture replaces recurrence with self-attention, allowing "
    "each token to attend to every other token. Multi-head attention captures diverse "
    "relationships, and positional encodings inject sequence order.",
    # Reinforcement learning.
    "Reinforcement learning trains an agent to maximize cumulative reward through "
    "interaction with an environment. The value function estimates expected return, "
    "while the policy maps states to actions.",
    # Support vector machines.
    "Support vector machines find the maximum-margin hyperplane separating classes. "
    "The kernel trick maps inputs into a higher-dimensional space where a linear "
    "separator corresponds to a nonlinear boundary in the original space.",
    # Principal component analysis.
    "Principal component analysis reduces dimensionality by projecting data onto the "
    "orthogonal directions of greatest variance. The eigenvectors of the covariance "
    "matrix define these principal components.",
    # Bayesian inference.
    "Bayesian inference updates a prior distribution into a posterior using observed "
    "evidence via Bayes' theorem. The likelihood quantifies how probable the data are "
    "under a given hypothesis.",
    # Decision trees and random forests.
    "Decision trees recursively partition the feature space using information gain or "
    "Gini impurity. Random forests aggregate many decorrelated trees to reduce variance "
    "and improve robustness.",
    # Generative adversarial networks.
    "Generative adversarial networks pit a generator against a discriminator in a "
    "minimax game. The generator learns to synthesize realistic samples while the "
    "discriminator learns to distinguish real from fake.",
]


def _build_article(target_chars: int = 24000) -> str:
    parts: list[str] = []
    i = 0
    size = 0
    while size < target_chars:
        para = _PARAS[i % len(_PARAS)]
        header = f"\n\n== Section {i + 1} ==\n\n"
        block = header + para
        parts.append(block)
        size += len(block)
        i += 1
    return "".join(parts)


async def main() -> None:
    """Time one end-to-end graph extraction over the synthetic article.

    Prints the article length and chunk count first, then the wall time, the
    node/edge totals of the returned graph, and the average seconds per chunk.
    """
    article = _build_article()
    # Chunking here is only for reporting; the adapter below receives the full article.
    chunks = _chunk_text(article)
    print(f"article chars: {len(article)}  chunks: {len(chunks)}")

    llm = LLMClient()
    started = time.perf_counter()
    graph = await extract_graph_from_text(article, "ML Survey", llm)
    dt = time.perf_counter() - started

    # max(1, ...) keeps the per-chunk average defined when there are no chunks.
    summary = (
        f"SEQUENTIAL: wall={dt:.1f}s  nodes={len(graph.nodes)} edges={len(graph.edges)}  "
        f"(~{dt / max(1, len(chunks)):.1f}s/chunk)"
    )
    print(summary)


# Run the measurement only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())