File size: 6,121 Bytes
ad6d452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""Example: Dependency Bloat & Overlap Scanner

Demonstrates the branching / map-reduce pattern in Acorn.

The DependencyScanner orchestrator fans out one PackageAnalyzerBranch per
dependency (map phase), collects the PackageProfile results, groups packages
by overlapping purpose, and returns a PruningPlan (reduce phase).

Supports both requirements.txt (pip) and package.json (npm) formats.
"""

import re

import requests
from pydantic import BaseModel, Field

from acorn import Module, tool


# ---------------------------------------------------------------------------
# Schemas
# ---------------------------------------------------------------------------


class PackageInput(BaseModel):
    # Input schema used by PackageAnalyzerBranch (its `initial_input`):
    # identifies one dependency and the registry ecosystem it comes from.
    # Kept as `#` comments rather than a docstring so the generated schema
    # is unchanged.
    package_name: str = Field(
        description="Package name as it appears in the dependency file",
    )
    ecosystem: str = Field(
        description="'pip' for PyPI packages, 'npm' for Node packages",
    )

class PackageProfile(BaseModel):
    # Output schema of PackageAnalyzerBranch (its `final_output`): a short
    # functional summary of a single package.
    name: str = Field(
        description="Package name",
    )
    primary_purpose: str = Field(
        description="One-line description of what the package does",
    )
    categories: list[str] = Field(
        description="1-3 functional tags, e.g. ['http', 'networking']",
    )
    key_features: list[str] = Field(
        description="3-5 bullet points describing key features",
    )


class RedundancyGroup(BaseModel):
    # One cluster of packages that serve the same purpose, together with the
    # keep/remove recommendation for that cluster. Used as the element type
    # of PruningPlan.redundancy_groups.
    purpose: str = Field(
        description="Shared purpose, e.g. 'HTTP client'",
    )
    packages: list[str] = Field(
        description="the packages in this group",
    )
    keep: str = Field(
        description="Recommended package to keep",
    )
    remove: list[str] = Field(
        description="Recommended packages to remove",
    )
    reason: str = Field(
        description="Why the recommended package wins",
    )


class DependencyFileInput(BaseModel):
    # Input schema of the DependencyScanner orchestrator (its
    # `initial_input`): the raw text of the dependency file to scan.
    file_content: str = Field(description="Contents of a requirements.txt or package.json file")

class PruningPlan(BaseModel):
    # Output schema of the DependencyScanner orchestrator (its
    # `final_output`): the narrative summary, the per-purpose redundancy
    # groups, and a flat removal list.
    summary: str = Field(
        description="Narrative paragraph summarising findings and recommendations",
    )
    redundancy_groups: list[RedundancyGroup] = Field(
        description="Groups of overlapping packages",
    )
    packages_to_remove: list[str] = Field(
        description="Flat list of packages recommended for removal",
    )




# ---------------------------------------------------------------------------
# Tool (module-level so PackageAnalyzerBranch can reference it)
# ---------------------------------------------------------------------------


def _clean_package_name(package_name: str) -> str:
    """Reduce a dependency specifier to the bare registry package name.

    Drops version pins/ranges (``requests==2.31.0``, ``axios@1.6.0``), pip
    extras (``requests[security]``) and environment markers
    (``pkg ; python_version < "3.9"``), while keeping the leading ``@`` of a
    scoped npm package (``@types/node@18`` -> ``@types/node``).
    """
    spec = package_name.strip()
    # A leading "@" marks an npm scope, not a version separator, so it must
    # survive the split below; otherwise "@types/node" collapses to "".
    scope = "@" if spec.startswith("@") else ""
    bare = re.split(r"[=<>!@~^\[;\s]", spec[len(scope):], maxsplit=1)[0]
    return scope + bare


@tool
def fetch_package_info(package_name: str, ecosystem: str) -> dict:
    """Fetch README/description for a package from PyPI or the npm registry.

    Args:
        package_name: Package name as it appears in the dependency file
        ecosystem: "pip" for PyPI packages, "npm" for Node packages

    Returns:
        Dict with 'name', 'description', 'readme' (trimmed to 3 000 chars).
        When the lookup fails, 'readme' is empty and 'description' explains
        the failure instead of an exception being raised.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    clean_name = _clean_package_name(package_name)

    # Any ecosystem other than "pip" is treated as npm.
    is_pip = ecosystem == "pip"
    registry = "PyPI" if is_pip else "npm"

    def _failure(message: str) -> dict:
        return {"name": clean_name, "description": message, "readme": ""}

    # Nothing left after stripping the specifier: skip the network round-trip.
    if not clean_name.strip("@"):
        return _failure(f"Not found on {registry}.")

    # Percent-encode the name so it cannot alter the URL path. Scoped npm
    # names are addressed as "@scope%2Fname", hence "@" stays unescaped.
    if is_pip:
        url = f"https://pypi.org/pypi/{quote(clean_name, safe='')}/json"
    else:
        url = f"https://registry.npmjs.org/{quote(clean_name, safe='@')}"

    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        data = resp.json()
    except requests.HTTPError:
        return _failure(f"Not found on {registry}.")
    except (requests.RequestException, ValueError) as exc:
        # Timeouts, connection errors, or a non-JSON body: report the failure
        # in the same dict shape rather than raising out of the tool.
        return _failure(f"Could not fetch from {registry}: {type(exc).__name__}")

    if is_pip:
        # PyPI nests metadata under "info": "summary" is the one-line
        # description, "description" holds the long README text.
        info = data.get("info") or {}
        description = info.get("summary", "") or ""
        readme = info.get("description", "") or ""
    else:
        description = data.get("description", "") or ""
        readme = data.get("readme", "") or ""

    return {
        "name": clean_name,
        "description": description,
        "readme": readme[:3000],
    }


# ---------------------------------------------------------------------------
# Branch module — one instance per dependency
# ---------------------------------------------------------------------------


# System prompt assigned to PackageAnalyzerBranch.system_prompt. It tells the
# model to call fetch_package_info once and then fill in the PackageProfile
# fields (primary_purpose, categories, key_features).
BRANCH_PROMPT = """Analyze a single package and determine its primary purpose.

1. Call fetch_package_info with the package_name and ecosystem.
2. From the description and README, identify: primary_purpose (one clear
   sentence), categories (1-3 functional tags), and key_features
   (3-5 bullet points).
3. Finish immediately after fetching — do not loop.
"""


class PackageAnalyzerBranch(Module):
    """Analyze a single package and extract its purpose and features."""

    # Prompt and model settings for this branch.
    # NOTE(review): how acorn interprets `max_steps` is not visible in this
    # file — presumably a cap on agent iterations; confirm against acorn docs.
    system_prompt = BRANCH_PROMPT
    model = "anthropic/claude-sonnet-4-6"
    temperature = 0.2
    max_steps = 5

    # Schemas: the branch takes a PackageInput and produces a PackageProfile.
    initial_input = PackageInput
    final_output = PackageProfile

    # The only tool listed for this branch; BRANCH_PROMPT instructs the model
    # to call it with the package name and ecosystem.
    tools = [fetch_package_info]


# ---------------------------------------------------------------------------
# Orchestrator module
# ---------------------------------------------------------------------------

# System prompt assigned to DependencyScanner.system_prompt. It describes the
# orchestrator workflow: parse the dependency file, run one
# PackageAnalyzerBranch per dependency, group the resulting profiles, and
# finish with a PruningPlan.
SYSTEM_PROMPT = """You are a dependency analyst specializing in finding redundant libraries.

Workflow:
1. Parse the file_content to extract all top-level dependency names.
   Detect the ecosystem: "npm" if it parses as JSON (package.json),
   "pip" if it looks like requirements.txt.

2. For EACH dependency use the PackageAnalyzerBranch branch to analyze each. You can analyze multiple in parallel.

3. After all branches finish, review the collected PackageProfile results.
   Group packages by overlapping primary_purpose / categories.

4. Identify RedundancyGroups — packages that do the same job.
   Recommend keeping the most popular/maintained one and removing the rest.

5. Call __finish__ with the complete PruningPlan.
"""


class DependencyScanner(Module):
    """Scan a requirements.txt or package.json for redundant dependencies."""

    # Prompt and model settings for the orchestrator.
    # NOTE(review): `max_steps` is much higher here than on the branch (60 vs
    # 5) — presumably to leave room for one branch call per dependency;
    # confirm how acorn counts steps.
    system_prompt = SYSTEM_PROMPT
    model = "anthropic/claude-haiku-4-5"
    temperature = 0.2
    max_steps = 60

    # Schemas: takes the raw dependency file text, produces a PruningPlan.
    initial_input = DependencyFileInput
    final_output = PruningPlan

    # Branch module listed for this orchestrator; SYSTEM_PROMPT instructs
    # the model to use it once per dependency.
    branches = [PackageAnalyzerBranch]