File size: 13,371 Bytes
bc46be8
 
 
 
 
 
 
 
 
 
8971452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc46be8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8971452
 
bc46be8
8971452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc46be8
8971452
 
 
 
 
 
 
 
 
 
 
bc46be8
 
 
 
 
 
 
8971452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc46be8
 
8971452
 
bc46be8
8971452
 
 
 
 
bc46be8
 
 
 
 
 
 
 
 
 
 
 
 
8971452
bc46be8
8971452
 
 
 
 
 
 
 
 
 
 
 
 
 
bc46be8
8971452
 
 
 
 
 
 
 
bc46be8
 
 
 
 
 
 
8971452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc46be8
 
 
 
 
 
 
 
8971452
 
bc46be8
 
 
8971452
 
bc46be8
 
 
542a8a3
bc46be8
 
 
 
 
 
542a8a3
bc46be8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8971452
bc46be8
 
 
 
 
 
 
 
c878863
bc46be8
 
 
542a8a3
 
 
bc46be8
 
c878863
bc46be8
 
 
542a8a3
 
 
bc46be8
 
 
 
 
 
 
 
 
 
 
8971452
 
 
 
 
 
bc46be8
 
 
 
 
8971452
 
bc46be8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
"""Code Review Quality Analyzer (Gradio / HF Spaces)

This app classifies a single code review comment by:
  - Feedback Type: Logic/Bug, Suggestion, Style/Nitpick, Question, Praise
  - Sentiment: Positive, Neutral, Negative

It uses a zero-shot classifier (`facebook/bart-large-mnli`) so it runs on CPU.
You can paste comment text directly, or fetch from a GitHub PR comment URL.
"""

import os
import re
from functools import lru_cache
from typing import Dict, List, Tuple

import gradio as gr
import requests
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier for the "feedback type" axis.
TYPE_LABELS = [
    "Logic/Bug",
    "Suggestion",
    "Style/Nitpick",
    "Question",
    "Praise",
]

# Candidate labels handed to the zero-shot classifier for the "sentiment" axis.
SENTIMENT_LABELS = [
    "Positive",
    "Neutral",
    "Negative",
]

# Matches a GitHub pull request URL and captures owner, repo, PR number and the
# optional "#..." fragment that identifies a specific comment.
GITHUB_REVIEW_URL = re.compile(
    r"https?://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>\d+)(?:/[^#]*)?(?:#(?P<fragment>.*))?",
    re.IGNORECASE,
)

MAX_COMMENT_LENGTH = 4000  # upper bound (in characters) on text sent to the model
REQUEST_TIMEOUT_SECONDS = 10  # timeout for the GitHub API request
APP_USER_AGENT = "CodeReviewQualityAnalyzer/0.1"
PIPELINE_MODEL_ID = "facebook/bart-large-mnli"

# Simple emojis to make results easier to scan at a glance.
# Keys must match TYPE_LABELS / SENTIMENT_LABELS exactly.
TYPE_EMOJI = {
    "Logic/Bug": "🐞",
    "Suggestion": "💡",
    "Style/Nitpick": "✏️",
    "Question": "❓",
    "Praise": "🙌",
}
SENTIMENT_EMOJI = {
    "Positive": "🙂",
    "Neutral": "😐",
    "Negative": "🙁",
}

def _extract_comment_id(fragment: str) -> Tuple[str, str]:
    """Map a PR URL fragment to a ``(comment_type, comment_id)`` pair.

    Recognised fragments, checked in this order (the first one found anywhere
    in ``fragment`` wins):
      - ``discussion_r<ID>``      -> ``"pull_review_comment"``
      - ``issuecomment-<ID>``     -> ``"issue_comment"``
      - ``pullrequestreview-<ID>`` -> ``"pull_review"``

    Raises:
        ValueError: if ``fragment`` is empty/None or matches none of the above.
    """
    if not fragment:
        raise ValueError("URL must include a fragment pointing to a specific comment.")

    known_fragments = (
        ("pull_review_comment", r"discussion_r(\d+)"),
        ("issue_comment", r"issuecomment-(\d+)"),
        ("pull_review", r"pullrequestreview-(\d+)"),
    )
    for kind, pattern in known_fragments:
        found = re.search(pattern, fragment)
        if found is not None:
            return kind, found.group(1)

    raise ValueError(
        "Unsupported GitHub fragment. Supported fragments include '#discussion_r<ID>' and '#issuecomment-<ID>'."
    )

def _github_headers() -> Dict[str, str]:
    """Build the HTTP headers for a GitHub API request.

    Always sets ``Accept`` and ``User-Agent``. When the ``GITHUB_TOKEN``
    environment variable holds a non-blank value, it is added as a bearer
    token in ``Authorization``.

    Returns:
        A new header dict on every call.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": APP_USER_AGENT,
    }
    # Strip *before* the truthiness check so a whitespace-only token does not
    # produce an empty "Bearer " header.
    token = (os.environ.get("GITHUB_TOKEN") or "").strip()
    if token:
        headers["Authorization"] = f"Bearer {token}"
    return headers


def fetch_comment_from_github(url: str) -> str:
    """Fetch a PR review comment body from a public GitHub URL.

    Supported fragments:
      - #discussion_r<ID>
      - #issuecomment-<ID>
      - #pullrequestreview-<ID>

    Args:
        url: A GitHub pull request URL whose fragment identifies one comment.

    Returns:
        The comment body with surrounding whitespace stripped.

    Raises:
        ValueError: if the URL or fragment is not supported, GitHub cannot be
            reached, the API answers 404/403, or the response does not carry a
            textual ``body``.
        requests.HTTPError: for any other non-success HTTP status
            (via ``response.raise_for_status()``).
    """
    match = GITHUB_REVIEW_URL.match(url.strip())
    if not match:
        raise ValueError("Only GitHub pull request comment URLs are supported at the moment.")

    owner = match.group("owner")
    repo = match.group("repo")
    number = match.group("number")
    fragment = match.group("fragment")

    comment_type, comment_id = _extract_comment_id(fragment)

    repo_api = f"https://api.github.com/repos/{owner}/{repo}"
    if comment_type == "pull_review_comment":
        api_url = f"{repo_api}/pulls/comments/{comment_id}"
    elif comment_type == "issue_comment":
        api_url = f"{repo_api}/issues/comments/{comment_id}"
    elif comment_type == "pull_review":
        # Reviews are addressed beneath their pull request, so the PR number
        # from the URL is part of the path.
        api_url = f"{repo_api}/pulls/{number}/reviews/{comment_id}"
    else:
        raise ValueError("Unsupported comment type.")

    try:
        response = requests.get(
            api_url,
            headers=_github_headers(),
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as err:
        raise ValueError("Unable to reach GitHub. Check your network connection or try again later.") from err

    if response.status_code == 404:
        raise ValueError("Comment not found. Double-check that the link points to a public review comment.")
    if response.status_code == 403:
        raise ValueError(
            "GitHub API rate limit exceeded or access forbidden. Try again later or paste the comment text manually."
        )
    response.raise_for_status()

    try:
        payload = response.json()
    except ValueError as err:
        # Non-JSON response: report it like any other unusable payload.
        raise ValueError("Unable to extract comment body from GitHub response.") from err

    # Guard against a non-object payload and a JSON null body, which would
    # otherwise fail on `.strip()` with an unhelpful AttributeError.
    body = payload.get("body") if isinstance(payload, dict) else None
    if not isinstance(body, str):
        raise ValueError("Unable to extract comment body from GitHub response.")

    return body.strip()

@lru_cache(maxsize=1)
def get_zero_shot_pipeline():
    """Return the shared zero-shot classification pipeline.

    The pipeline is built on first call (``device=-1``, i.e. CPU per the
    original docstring) and the same object is returned afterwards thanks to
    the single-entry ``lru_cache``.
    """
    classifier = pipeline(
        "zero-shot-classification",
        model=PIPELINE_MODEL_ID,
        device=-1,
    )
    return classifier

def build_table(labels: List[str], scores: List[float]) -> List[List[str]]:
    """Pair each label with its score rendered as a percentage.

    Returns one ``[label, "NN.NN%"]`` row per ``(label, score)`` pair; if the
    inputs differ in length, the extra items of the longer one are ignored.
    """
    return [[name, f"{value:.2%}"] for name, value in zip(labels, scores)]

def _format_summary(best_type: str, best_type_score: float, best_sentiment: str, best_sentiment_score: float) -> str:
    """Build a professional, emoji-enhanced Markdown summary.

    Args:
        best_type: Winning feedback-type label.
        best_type_score: Its score, rendered as a percentage with one decimal.
        best_sentiment: Winning sentiment label.
        best_sentiment_score: Its score, rendered as a percentage with one decimal.

    Returns:
        A Markdown string with a heading, one bullet per axis and a footer
        line naming the model. Labels without an emoji mapping are shown
        without one.
    """
    type_emoji = TYPE_EMOJI.get(best_type, "")
    sent_emoji = SENTIMENT_EMOJI.get(best_sentiment, "")
    return (
        f"### Result\n"
        f"- Feedback Type: {type_emoji} {best_type} ({best_type_score:.1%})\n"
        f"- Sentiment: {sent_emoji} {best_sentiment} ({best_sentiment_score:.1%})\n"
        f"\n"
        f"Model: `{PIPELINE_MODEL_ID}` · Device: CPU · Method: zero-shot\n"
    )


def classify_comment(comment: str) -> Dict[str, object]:
    """Classify ``comment`` along the feedback-type and sentiment axes.

    The shared zero-shot pipeline is invoked once per axis in single-label
    mode. The first entry of each result's ``labels``/``scores`` lists is
    treated as the top prediction (the pipeline is expected to return them
    ordered by score).

    Returns:
        A dict with keys ``"summary"`` (Markdown string), ``"type_rows"`` and
        ``"sentiment_rows"`` (label/percentage tables).
    """
    zero_shot = get_zero_shot_pipeline()

    by_type = zero_shot(comment, TYPE_LABELS, multi_label=False)
    by_sentiment = zero_shot(comment, SENTIMENT_LABELS, multi_label=False)

    type_labels, type_scores = by_type["labels"], by_type["scores"]
    sentiment_labels, sentiment_scores = by_sentiment["labels"], by_sentiment["scores"]

    type_rows = build_table(type_labels, type_scores)
    sentiment_rows = build_table(sentiment_labels, sentiment_scores)

    return {
        "summary": _format_summary(
            type_labels[0],
            type_scores[0],
            sentiment_labels[0],
            sentiment_scores[0],
        ),
        "type_rows": type_rows,
        "sentiment_rows": sentiment_rows,
    }

def analyze_comment(comment_text: str, review_url: str):
    """UI handler: resolve the comment to analyze, classify it, build outputs.

    Resolution rules:
      - Pasted text always wins. If a URL is also given, it is fetched on a
        best-effort basis purely so the GitHub version can be shown alongside;
        a failed fetch is ignored.
      - With only a URL, the comment body is fetched and any failure is
        surfaced to the user as a ``gr.Error``.
      - With neither, a ``gr.Error`` is raised.

    The resolved text must be non-empty and at most ``MAX_COMMENT_LENGTH``
    characters.

    Returns:
        ``(summary, type_rows, sentiment_rows, preview, fetched_preview)``.
    """
    pasted = (comment_text or "").strip()
    url = (review_url or "").strip()

    if not pasted and not url:
        raise gr.Error("Provide either comment text or a GitHub review URL to analyze.")

    if pasted:
        text_to_analyze = pasted
        github_body = ""
        note = "Using the pasted comment text."
        if url:
            try:
                github_body = fetch_comment_from_github(url)
            except Exception:
                # Reference fetch is optional; fall back to pasted text only.
                github_body = ""
            else:
                note = "Using the pasted comment text. Fetched GitHub comment is shown in the preview for reference."
    else:
        try:
            text_to_analyze = fetch_comment_from_github(url)
        except Exception as err:
            raise gr.Error(str(err))
        github_body = text_to_analyze
        note = "Using the comment fetched from GitHub."

    if not text_to_analyze:
        raise gr.Error("Could not determine any comment text to analyze.")

    if len(text_to_analyze) > MAX_COMMENT_LENGTH:
        raise gr.Error(f"Comment is too long. Please provide text under {MAX_COMMENT_LENGTH:,} characters.")

    result = classify_comment(text_to_analyze)

    # Note, blank separator line, then the analyzed text.
    preview = f"{note}\n\n{text_to_analyze}".strip()

    return (
        result["summary"],
        result["type_rows"],
        result["sentiment_rows"],
        preview,
        github_body or "",
    )

def _clear():
    """Return blank values for the two inputs followed by the five outputs.

    Order: comment text, URL, summary, type table, sentiment table, preview,
    fetched preview.
    """
    blank_inputs = ("", "")
    blank_outputs = ("", [], [], "", "")
    return blank_inputs + blank_outputs


# ---------------------------------------------------------------------------
# UI definition. Left column: inputs, examples and action buttons.
# Right column: summary, per-label confidence tables, previews and tips.
# ---------------------------------------------------------------------------
theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

with gr.Blocks(title="Code Review Quality Analyzer", theme=theme) as demo:
    gr.Markdown(
        "# Code Review Quality Analyzer\n"
        "Classify a code review comment by feedback type and sentiment.\n\n"
        "- Runs on CPU (no GPU needed) using zero-shot classification.\n"
        f"- Model: `{PIPELINE_MODEL_ID}` · Categories are configurable."
    )

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            # Two alternative ways to supply the comment; both values are
            # passed to the handler regardless of which tab is visible.
            with gr.Tabs():
                with gr.TabItem("Paste Comment"):
                    comment_input = gr.Textbox(
                        label="Review Comment Text",
                        placeholder="Paste a single review comment...",
                        lines=8,
                        autofocus=True,
                    )
                with gr.TabItem("GitHub URL"):
                    url_input = gr.Textbox(
                        label="Public GitHub PR Comment URL",
                        placeholder="https://github.com/org/repo/pull/123#discussion_r456",
                        lines=2,
                        info="Works for #discussion_r<ID> and #issuecomment-<ID> on public repos.",
                    )

            gr.Markdown("### Examples")
            # Each example is [comment text, URL]; the URL is left blank.
            gr.Examples(
                examples=[
                    [
                        "This will break when `user` is None. Consider checking for None before calling `get_id()`.",
                        "",
                    ],
                    [
                        "Nice cleanup here — this reads much better now. Thanks!",
                        "",
                    ],
                    [
                        "Nit: rename `x` to something more descriptive like `retry_interval`.",
                        "",
                    ],
                    [
                        "Why do we need this extra flag? Doesn't the existing `bar` already handle that case?",
                        "",
                    ],
                    [
                        "Consider extracting this logic into a helper function to avoid duplication across handlers.",
                        "",
                    ],
                    [
                        "This is a risky approach; I recommend reverting and discussing alternatives.",
                        "",
                    ],
                ],
                inputs=[comment_input, url_input],
                run_on_click=False,
            )

            with gr.Row():
                analyze_button = gr.Button("Analyze Review", variant="primary")
                clear_button = gr.Button("Clear")

        with gr.Column(scale=1):
            summary_output = gr.Markdown(label="Classification Summary")
            with gr.Row():
                type_output = gr.Dataframe(
                    headers=["Label", "Confidence"],
                    label="Feedback Type Confidence",
                    datatype=["str", "str"],
                    interactive=False,
                    row_count=(0, "dynamic"),
                    col_count=(2, "fixed"),
                    value=[],
                )
                sentiment_output = gr.Dataframe(
                    headers=["Label", "Confidence"],
                    label="Sentiment Confidence",
                    datatype=["str", "str"],
                    interactive=False,
                    row_count=(0, "dynamic"),
                    col_count=(2, "fixed"),
                    value=[],
                )
            with gr.Accordion("Preview", open=False):
                preview_output = gr.Textbox(label="Analyzed Comment", lines=6)
                fetched_preview_output = gr.Textbox(label="Fetched GitHub Comment", lines=6)

            with gr.Accordion("Tips", open=False):
                gr.Markdown(
                    "- Use concise, single-comment inputs for best results.\n"
                    "- For organization-wide insights, aggregate predictions across many comments.\n"
                    "- Replace the zero-shot model with a fine-tuned one for higher accuracy on your data."
                )

    # Output order here must match the tuple returned by analyze_comment.
    analyze_button.click(
        analyze_comment,
        inputs=[comment_input, url_input],
        outputs=[summary_output, type_output, sentiment_output, preview_output, fetched_preview_output],
    )
    # Output order here must match the tuple returned by _clear.
    clear_button.click(
        _clear,
        inputs=None,
        outputs=[comment_input, url_input, summary_output, type_output, sentiment_output, preview_output, fetched_preview_output],
    )

if __name__ == "__main__":
    # Enable the request queue (at most 16 pending requests), then start the app.
    queued_app = demo.queue(max_size=16)
    queued_app.launch()