File size: 6,760 Bytes
9ceb38c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
from src.api_clients import openai_client, claude_client, deepseek_client
from src.summary_generator import get_openrouter_models, generate_summary_from_openrouter, is_prompt_valid_for_summary
from src.evaluation import evaluate
from src.comparison import get_last_eval_data, run_comparison, import_model_metrics
from src.utils import preset_vals

from config.css import CSS

# Build the Gradio UI: a tabbed app with a generate+evaluate tab and a
# human-vs-model comparison tab.
with gr.Blocks(css=CSS) as demo:
    with gr.Tabs():

        # β–Έ Tab 1: Unified Summary + Evaluation
        with gr.Tab("🧠 Generate + Evaluate"):
            gr.Markdown("## Unified Summary Generator and Evaluator")
            # Inputs: the source article, plus either a pasted summary or a
            # prompt/model pair for auto-generation. Which of these are visible
            # is driven by auto_gen_toggle (see toggle_ui below).
            article = gr.Textbox(label="Paste article", lines=10)
            auto_gen_toggle = gr.Checkbox(label="Enable Auto-Generation if Summary is Empty", value=True)
            summary = gr.Textbox(label="Paste summary", lines=6, visible=False, interactive=True)
            prompt = gr.Textbox(label="Prompt for summary (only if generating)", lines=2, visible=True, interactive=True)
            model = gr.Dropdown(choices=get_openrouter_models(), label="Model (for generation)", visible=True, interactive=True)

            # Evaluation configuration: evaluation variant, which LLM back-ends
            # to query, and per-metric weight sliders seeded from the
            # "Twin-Lock" preset returned by preset_vals().
            variant = gr.Radio(["Twin-Lock","Judge-Lock","ParallelX-TJ"], value="Twin-Lock", label="Variant", elem_id="variant-group")
            back = gr.CheckboxGroup(["OpenAI","DeepSeek","Claude"], value=["OpenAI","DeepSeek","Claude"], label="Back-ends", elem_id="backend-group")
            p0 = preset_vals("Twin-Lock")  # default weights [coverage, alignment, hallucination, relevance, bias]
            w_cov  = gr.Slider(0,1,p0[0],step=0.01,label="Coverage",      elem_classes=["metric-slider"])
            w_align= gr.Slider(0,1,p0[1],step=0.01,label="Alignment",     elem_classes=["metric-slider"])
            w_hall = gr.Slider(0,1,p0[2],step=0.01,label="Hallucination", elem_classes=["metric-slider"])
            w_rel  = gr.Slider(0,1,p0[3],step=0.01,label="Relevance",     elem_classes=["metric-slider"])
            w_bias = gr.Slider(0,1,p0[4],step=0.01,label="Bias/Toxicity", elem_classes=["metric-slider"])
            temp = gr.Slider(0,1,0,step=0.01,label="temperature")
            show_ev = gr.Checkbox(True,label="Show evidence spans")

            # Outputs populated by unified_run() when the button is clicked.
            run = gr.Button("πŸ” Generate & Evaluate")
            gen_sum = gr.Textbox(label="Generated Summary", lines=6, visible=True)
            table  = gr.DataFrame(label="Metrics")
            comm   = gr.JSON(label="Comments JSON")
            score  = gr.JSON(label="Average score")
            tokbox = gr.JSON(label="Token usage")
            csv_dl = gr.File(label="CSV download")
            zip_dl = gr.File(label="Raw JSON zip")
            def toggle_ui(auto):
                """Reconfigure the form for auto-generation vs. manual-summary mode.

                Returns Gradio updates for, in order: the manual summary box,
                the prompt box, the model dropdown, the run-button label, and
                the generated-summary box.
                """
                manual = not auto
                label = "πŸ” Generate & Evaluate" if auto else "βœ… Only Evaluate"
                return (
                    gr.update(visible=manual, interactive=manual),  # manual summary box
                    gr.update(visible=auto, interactive=auto),      # generation prompt
                    gr.update(visible=auto, interactive=auto),      # model dropdown
                    gr.update(value=label),                         # run-button caption
                    gr.update(visible=auto),                        # generated-summary box
                )

            # Re-wire the form whenever the auto-generation checkbox changes:
            # swaps manual summary input vs. prompt/model inputs and relabels
            # the run button accordingly.
            auto_gen_toggle.change(
                toggle_ui, 
                auto_gen_toggle, 
                [summary, prompt, model, run, gen_sum]
            )

            def unified_run(article, prompt, model, summary, auto_flag, variant, active_back, temp,
                            w_cov, w_align, w_hall, w_rel, w_bias, show_ev):
                """Optionally auto-generate a summary, then evaluate it.

                Returns a 7-tuple: the (possibly generated) summary followed by
                the six evaluation outputs. On a guard failure the first slot
                carries an error message and the remaining six are None.
                """
                blank_outputs = (None, None, None, None, None, None)
                if not summary:
                    if not auto_flag:
                        # Nothing to evaluate and auto-generation is off.
                        return ("⚠️ Please provide a summary or enable auto-generation.", *blank_outputs)
                    if not is_prompt_valid_for_summary(prompt):
                        # Reject prompts unrelated to summarization before spending tokens.
                        return ("β›” Prompt rejected: not summarization-related.", *blank_outputs)
                    summary = generate_summary_from_openrouter(article, prompt, model)
                return (summary,
                        *evaluate(article, summary, variant, active_back, temp,
                                  w_cov, w_align, w_hall, w_rel, w_bias, show_ev))

            # Main action: optionally generate, then evaluate, filling every
            # output component in one call.
            run.click(
                unified_run, 
                [article, prompt, model, summary, auto_gen_toggle, variant, back, temp,
                 w_cov,w_align,w_hall,w_rel,w_bias,show_ev],
                [gen_sum, table, comm, score, tokbox, csv_dl, zip_dl]
            )

        # β–Έ Tab 3: Comparison (manual inputs for both human and model, no import button)
        with gr.Tab("πŸ“Š Comparison"):
            with gr.Column():
                gr.Markdown("## Compare Human vs Model Evaluations")
                # Human-side scores: one 1-10 slider per metric plus free-text comments.
                gr.Markdown("### 🧍 Human Evaluation")
                with gr.Row():
                    hc = gr.Slider(1, 10, 1, step=1, label="Coverage")
                    ha = gr.Slider(1, 10, 1, step=1, label="Alignment")
                    hh = gr.Slider(1, 10, 1, step=1, label="Hallucination")
                    hr = gr.Slider(1, 10, 1, step=1, label="Relevance")
                    hb = gr.Slider(1, 10, 1, step=1, label="Bias/Toxicity")
                human_comments = gr.Textbox(label="Human Comments")

                # Model-side scores: same metric layout as the human panel above.
                gr.Markdown("### πŸ€– Model Evaluation")
                with gr.Row():
                    mc = gr.Slider(1, 10, 1, step=1, label="Coverage")
                    ma = gr.Slider(1, 10, 1, step=1, label="Alignment")
                    mh = gr.Slider(1, 10, 1, step=1, label="Hallucination")
                    mr = gr.Slider(1, 10, 1, step=1, label="Relevance")
                    mb = gr.Slider(1, 10, 1, step=1, label="Bias/Toxicity")
                model_comments = gr.Textbox(label="Model Comments")

                def compare_structured(hc, ha, hh, hr, hb, human_comments, mc, ma, mh, mr, mb, model_comments):
                    """Pack the slider values into metric dicts and run the comparison.

                    Delegates to run_comparison() and returns its result unchanged.
                    """
                    metric_keys = ("coverage", "alignment", "hallucination", "relevance", "bias_toxicity")
                    human_scores = dict(zip(metric_keys, (hc, ha, hh, hr, hb)))
                    model_scores = dict(zip(metric_keys, (mc, ma, mh, mr, mb)))
                    return run_comparison(human_scores, human_comments, model_scores, model_comments)

                # Compare button: feeds both score panels into compare_structured
                # and shows the returned analysis text.
                compare_btn = gr.Button("πŸ” Compare")
                output_analysis = gr.Textbox(label="Analysis", lines=10)
                compare_btn.click(
                    compare_structured,
                    [hc, ha, hh, hr, hb, human_comments, mc, ma, mh, mr, mb, model_comments],
                    output_analysis
                ) 

# NOTE(review): this launches on import as well as when run as a script —
# consider guarding with `if __name__ == "__main__":`. share=True also creates
# a publicly reachable Gradio link; confirm that is intended.
demo.launch(share=True, show_error=True)