File size: 7,696 Bytes
46ba771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""
Paper to Code
Turn a method description into an implementation plan, PyTorch scaffold, and reproducibility checklist.
"""

import json
import os
import re
import sys
from collections import Counter
from textwrap import dedent
from typing import Dict

import gradio as gr

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from shared.components import create_footer, create_method_panel, create_premium_hero


# `huggingface_hub` is an optional dependency: when it is missing or fails to
# import, `InferenceClient` is left as None and `_hf_plan` returns {} so the
# app falls back to the local plan generator.
try:
    from huggingface_hub import InferenceClient
except Exception:  # pragma: no cover - optional dependency
    InferenceClient = None


# Example method description that pre-populates the input textbox so the demo
# produces a sensible scaffold on first click.
SAMPLE_METHOD = """We introduce a retrieval-augmented classifier for support tickets.
Each ticket is embedded with a sentence-transformer, nearest historical cases are retrieved,
and a lightweight cross-encoder reranks them before the final label is produced.
The model reports confidence, top evidence snippets, and an abstain decision when evidence is weak."""


def _extract_keywords(text: str):
    words = re.findall(r"[A-Za-z][A-Za-z\-]{3,}", text.lower())
    stop = {"with", "that", "this", "from", "before", "after", "when", "model", "method", "using"}
    counts = {}
    for word in words:
        if word not in stop:
            counts[word] = counts.get(word, 0) + 1
    return [word for word, _ in sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:8]]


def _hf_plan(method_text: str, target: str) -> Dict[str, str]:
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not token or InferenceClient is None:
        return {}

    prompt = f"""
You are an ML engineer converting papers into clean implementation plans.
Return JSON with keys: summary, modules, code, checklist.
Target artifact: {target}
Method text:
{method_text}
"""
    client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token)
    response = client.text_generation(prompt, max_new_tokens=900, temperature=0.25)
    match = re.search(r"\{[\s\S]*\}", response)
    if not match:
        return {}
    try:
        parsed = json.loads(match.group(0))
    except json.JSONDecodeError:
        return {}
    return {key: str(parsed.get(key, "")).strip() for key in ["summary", "modules", "code", "checklist"]}


def _fallback_plan(method_text: str, target: str) -> Dict[str, str]:
    """Build a deterministic, fully offline implementation plan.

    Used when no HF token is configured or the hosted model fails. Returns
    the same four-key dict shape as ``_hf_plan``: summary, modules, code,
    checklist — all plain strings ready for the UI panels.
    """
    # Keyword extraction drives both the summary sentence and the
    # "detected method signals" line in the module plan.
    keywords = _extract_keywords(method_text)
    technique = ", ".join(keywords[:5]) or "model pipeline"
    summary = (
        f"This looks like a {technique} workflow. The safest implementation path is to "
        "separate data preparation, model logic, inference, and evaluation so each claim can be tested."
    )
    # Static module breakdown; dedent strips the source indentation so the
    # rendered markdown starts at column 0.
    modules = dedent(f"""
    1. `data.py` - parse examples, labels, and evidence fields.
    2. `model.py` - implement the core method as a small, testable module.
    3. `retrieve.py` - isolate nearest-neighbor or feature lookup logic if retrieval is involved.
    4. `train.py` - keep hyperparameters explicit and serializable.
    5. `evaluate.py` - report task metric, calibration, and failure cases.

    Detected method signals: {", ".join(keywords) if keywords else "none"}
    """).strip()

    # Default scaffold: a minimal PyTorch classifier head over a pooled
    # encoder representation. Emitted as source text, never executed here.
    code = dedent('''
    import torch
    from torch import nn


    class PaperModule(nn.Module):
        """Minimal scaffold generated from the method description."""

        def __init__(self, encoder: nn.Module, hidden_size: int, num_labels: int):
            super().__init__()
            self.encoder = encoder
            self.classifier = nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.GELU(),
                nn.Dropout(0.1),
                nn.Linear(hidden_size, num_labels),
            )

        def forward(self, inputs, evidence=None):
            encoded = self.encoder(**inputs)
            pooled = encoded.last_hidden_state[:, 0]
            logits = self.classifier(pooled)
            confidence = torch.softmax(logits, dim=-1).max(dim=-1).values
            return {"logits": logits, "confidence": confidence, "evidence": evidence}
    ''').strip()

    # Only the "Evaluation Harness" target swaps the scaffold; the other
    # target choices keep the PyTorch module above.
    if target == "Evaluation Harness":
        code = dedent('''
        from sklearn.metrics import accuracy_score, f1_score


        def evaluate(predictions, labels, confidences, abstain_threshold=0.55):
            keep = [score >= abstain_threshold for score in confidences]
            covered_preds = [pred for pred, ok in zip(predictions, keep) if ok]
            covered_labels = [label for label, ok in zip(labels, keep) if ok]
            return {
                "coverage": sum(keep) / max(1, len(keep)),
                "accuracy": accuracy_score(covered_labels, covered_preds) if covered_preds else 0.0,
                "macro_f1": f1_score(covered_labels, covered_preds, average="macro") if covered_preds else 0.0,
            }
        ''').strip()

    checklist = dedent("""
    - Define the exact dataset split and leakage checks.
    - Log every hyperparameter needed to reproduce the run.
    - Add one baseline that is simpler than the proposed method.
    - Report both aggregate metrics and 5-10 qualitative failures.
    - Save model card notes: intended use, limitations, and ethical risks.
    """).strip()
    return {"summary": summary, "modules": modules, "code": code, "checklist": checklist}


def generate_scaffold(method_text: str, target: str):
    """Produce (summary, modules, code, checklist) strings for the UI.

    Inputs shorter than 40 characters (after stripping) yield a prompt to
    paste more text. Otherwise the HF-hosted planner is tried first and the
    offline fallback is used when it is unavailable or unusable.
    """
    if not method_text or len(method_text.strip()) < 40:
        return "Paste at least a paragraph of method text.", "", "", ""

    plan = _hf_plan(method_text, target)
    # Fall back not only when the HF call returns {}, but also when it
    # returns a structurally valid dict whose fields are all empty (the JSON
    # parsed but lacked the expected keys) — previously that rendered four
    # blank panels.
    if not plan or not any(plan.get(key) for key in ("summary", "modules", "code", "checklist")):
        plan = _fallback_plan(method_text, target)
    # .get with a default guards against partial dicts instead of KeyError.
    return (
        plan.get("summary", ""),
        plan.get("modules", ""),
        plan.get("code", ""),
        plan.get("checklist", ""),
    )


# --- Gradio UI -------------------------------------------------------------
# Layout: hero + method panel on top, then a two-column row (input on the
# left, summary/plan on the right), followed by full-width code and
# checklist panels.
with gr.Blocks(title="Paper to Code", theme=gr.themes.Soft()) as app:
    create_premium_hero(
        "Paper to Code",
        "Convert a paper method into a reproducible engineering scaffold: modules, PyTorch code, evaluation harness, and checklist.",
        "🧪",
        badge="Research Engineering",
        highlights=["Method parsing", "PyTorch scaffold", "Reproducibility checklist"],
    )
    create_method_panel({
        "Technique": "Paper implementation planning with optional Hugging Face inference.",
        "What it proves": "You can translate research claims into testable software boundaries.",
        "HF capability": "Use HF-hosted LLMs when `HF_TOKEN` is available; fall back locally otherwise.",
    })

    with gr.Row():
        with gr.Column(scale=1):
            # Input side: method text plus the artifact selector that steers
            # which code scaffold `_fallback_plan` emits.
            method_input = gr.Textbox(
                label="Paper method or abstract",
                value=SAMPLE_METHOD,
                lines=12,
                placeholder="Paste the method section, abstract, or algorithm summary...",
            )
            target = gr.Radio(
                ["PyTorch Module", "Training Loop", "Evaluation Harness", "Experiment Checklist"],
                value="PyTorch Module",
                label="Target artifact",
            )
            generate_btn = gr.Button("Generate Research Scaffold", variant="primary")
        with gr.Column(scale=1):
            # Output side: the first two of the four generate_scaffold outputs.
            summary = gr.Markdown(label="Technique extraction")
            modules = gr.Markdown(label="Implementation plan")

    # Remaining outputs rendered full-width below the row.
    code = gr.Code(label="Code scaffold", language="python", lines=18)
    checklist = gr.Markdown(label="Reproducibility checklist")

    # Wire the button to the planner; output order must match the 4-tuple
    # returned by generate_scaffold.
    generate_btn.click(generate_scaffold, inputs=[method_input, target], outputs=[summary, modules, code, checklist])

    gr.Markdown("""
## Why This Is Useful

Many AI demos stop at summarizing papers. This Space shows the engineering step after reading: identify components, define test boundaries, scaffold code, and make reproducibility explicit.
""")
    create_footer("Paper to Code")


if __name__ == "__main__":
    # Bind to all interfaces on port 7860 — the standard HF Spaces setup.
    app.launch(server_name="0.0.0.0", server_port=7860)