"""ECHOscore demo: a Gradio UI that scores a prompt/response pair.

The app combines five user-supplied "subjective" prompt scores with the
automated OVAL and DeepEval scores, shows them as tables and a radar chart,
and enforces a global daily quota plus a minimum interval between requests.
Quota state lives in a single SQLite row (``global_stats.id = 1``).
"""

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import hashlib
import tempfile
import os
import time
from contextlib import closing
from datetime import datetime, timezone
import sqlite3
import random

# These modules are assumed to be defined elsewhere in the project.
from config import CSS, DIMS
from OVAL import oval_scores
from DeepEval import deepeval_scores

# Global configuration
DAILY_LIMIT = 150              # maximum number of evaluations per UTC day (global)
REQUEST_INTERVAL = 9           # minimum number of seconds between two evaluations
DB_FILE = "usage_tracker.db"   # SQLite database file name

# Number of result components wired between ``results_area`` and the captcha
# in ``submit.click``: 3 tables + radar + csv + notes + 5 explanations +
# auto explanation + judge LLM + 5 judge scores + judge remarks.
RESULT_OUTPUT_COUNT = 19

# Advice appended to the automatic explanation of a low score, per dimension.
_ADVICE = {
    # OVAL: 5 extended dimensions
    "Structural Clarity": "The text structure may be unclear; consider adding headings or breaking into paragraphs.",
    "Reasoning Quality": "Argument support is weak; consider adding logical reasoning or evidence.",
    "Factuality": "Information may be inaccurate; please fact-check the facts.",
    "Depth of Analysis": "Analysis seems shallow; add more insights or examples.",
    "Topic Coverage": "Key aspects may be missing; ensure you cover the full scope.",
    # DeepEval: 5 extended dimensions
    "Fluency": "Expression may be disfluent; consider smoothing sentence transitions.",
    "Prompt Relevance": "The response may stray from the prompt; ensure alignment.",
    "Conciseness": "The response may be verbose; consider trimming redundant parts.",
    "Readability": "The text is hard to read; consider simpler wording or shorter sentences.",
    "Engagement": "The response lacks engagement; add examples or a conversational tone.",
}
_DEFAULT_ADVICE = "Low score detected; please review this aspect."


def get_utc_date():
    """Return today's date in UTC as a ``YYYY-MM-DD`` string."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")


def init_db():
    """Create the usage table if needed and make sure the counter row exists."""
    # ``closing`` guarantees the connection is closed even if a statement
    # raises; sqlite3's own context manager only commits / rolls back.
    with closing(sqlite3.connect(DB_FILE)) as conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS global_stats (
                id INTEGER PRIMARY KEY,
                date TEXT NOT NULL,
                count INTEGER NOT NULL,
                last_request REAL NOT NULL
            )
        ''')
        row_count = conn.execute("SELECT COUNT(*) FROM global_stats").fetchone()[0]
        if row_count == 0:
            # Every other query addresses the row as id = 1, so pin it.
            conn.execute(
                "INSERT INTO global_stats (id, date, count, last_request) VALUES (1, ?, ?, ?)",
                (get_utc_date(), 0, time.time()),
            )
        conn.commit()


def check_daily_limit():
    """Check whether today's global quota is exhausted.

    Resets the stored counter when the stored date is not today's UTC date.

    Returns:
        A ``(is_limited, count)`` tuple where ``count`` is today's request
        count and ``is_limited`` is ``count >= DAILY_LIMIT``.
    """
    today = get_utc_date()
    with closing(sqlite3.connect(DB_FILE)) as conn:
        row = conn.execute(
            "SELECT date, count FROM global_stats WHERE id = 1"
        ).fetchone()
        if row is None:
            # No request has been served yet, so last_request starts at 0
            # and the interval check does not reject the first request.
            conn.execute(
                "INSERT INTO global_stats (id, date, count, last_request) VALUES (1, ?, ?, ?)",
                (today, 0, 0.0),
            )
            count = 0
        else:
            db_date, count = row
            if db_date != today:
                # New day: reset the counter only. last_request is left
                # untouched because no request happened here; overwriting it
                # with "now" made the first request of each day fail the
                # interval check that runs right after this function.
                conn.execute(
                    "UPDATE global_stats SET date = ?, count = ? WHERE id = 1",
                    (today, 0),
                )
                count = 0
        conn.commit()
    return count >= DAILY_LIMIT, count


def update_request_count():
    """Record one served request.

    Returns:
        A ``(count, current_time)`` tuple: today's count after the increment
        and the timestamp stored as ``last_request``.
    """
    today = get_utc_date()
    current_time = time.time()
    with closing(sqlite3.connect(DB_FILE)) as conn:
        row = conn.execute(
            "SELECT date, count FROM global_stats WHERE id = 1"
        ).fetchone()
        if row is None:
            conn.execute(
                "INSERT INTO global_stats (id, date, count, last_request) VALUES (1, ?, ?, ?)",
                (today, 1, current_time),
            )
            count = 1
        else:
            db_date, count = row
            if db_date != today:
                # First request of a new day restarts the counter at 1.
                conn.execute(
                    "UPDATE global_stats SET date = ?, count = 1, last_request = ? WHERE id = 1",
                    (today, current_time),
                )
                count = 1
            else:
                conn.execute(
                    "UPDATE global_stats SET count = count + 1, last_request = ? WHERE id = 1",
                    (current_time,),
                )
                count += 1
        conn.commit()
    return count, current_time


def _seconds_until_next_request():
    """Return how many seconds remain before a request is allowed (<= 0 if allowed now)."""
    with closing(sqlite3.connect(DB_FILE)) as conn:
        row = conn.execute(
            "SELECT last_request FROM global_stats WHERE id = 1"
        ).fetchone()
    if row is None:
        return 0.0  # no record yet: nothing to wait for
    return REQUEST_INTERVAL - (time.time() - row[0])


def check_request_interval():
    """Return True when at least ``REQUEST_INTERVAL`` seconds passed since the last request."""
    return _seconds_until_next_request() <= 0


def generate_captcha():
    """Create a random addition captcha.

    Returns:
        A ``(question, answer)`` tuple, e.g. ``("What's 3 + 5?", 8)``.
    """
    num1 = random.randint(2, 8)
    num2 = random.randint(2, 8)
    return f"What's {num1} + {num2}?", num1 + num2


def make_explanation(system: str, dimension: str, score: float) -> str:
    """Build the automatic explanation sentence for a low automated score.

    Args:
        system: Name of the scoring system (e.g. ``"OVAL"``).
        dimension: Name of the scored dimension.
        score: The score that was given.

    Returns:
        ``"<system> scored <dimension> at <score>: <advice>"``; unknown
        dimensions get a generic advice text.
    """
    advice = _ADVICE.get(dimension, _DEFAULT_ADVICE)
    return f"{system} scored {dimension} at {score}: {advice}"


def evaluate(
    prompt_text: str,
    output_text: str,
    # Prompt: 5 subjective dimensions
    s1: float, s2: float, s3: float, s4: float, s5: float,
    # Prompt: explanations for the subjective scores
    e1: str, e2: str, e3: str, e4: str, e5: str,
    # Judge module
    judge_llm: str,
    ja1: float, ja2: float, ja3: float, ja4: float, ja5: float,
    judge_remark: str,
    # Extra notes
    remark: str,
    # Captcha
    captcha_answer: str,
    correct_answer: int,
    # Session state (currently unused)
    session_state: dict
):
    """Handle a click on "Submit": rate-limit, validate, score and render.

    Returns:
        A 26-tuple matching the ``outputs`` list of ``submit.click``:
        limit notice update, results area update, ``RESULT_OUTPUT_COUNT``
        result values, new captcha question, new captcha answer, request
        count, judge section state and footer counter update.

    Raises:
        gr.Error: If the request comes too soon after the previous one, the
            captcha answer is wrong or not a number, or a subjective score
            below 3 has no explanation.
    """
    # 1) Global quota
    is_limited, current_count = check_daily_limit()
    if is_limited:
        # Must return exactly one value per wired output; hand out a fresh
        # captcha so the form stays usable once the quota resets.
        return (
            gr.update(visible=True),    # show the limit notice
            gr.update(visible=False),   # hide the results area
            *([None] * RESULT_OUTPUT_COUNT),
            *generate_captcha(),
            current_count,
            "expanded",
            gr.update(value=f"Today Counts: {current_count}/{DAILY_LIMIT}"),
        )

    # Minimum interval between requests
    remaining_time = _seconds_until_next_request()
    if remaining_time > 0:
        raise gr.Error(f"请等待 {remaining_time:.1f} 秒后再试")

    # Captcha: parse first, compare afterwards, so only the conversion is
    # covered by the try block.
    try:
        answer = int(captcha_answer)
    except (ValueError, TypeError):
        raise gr.Error("Please enter the correct verification code") from None
    if answer != correct_answer:
        raise gr.Error("Verification code error, please try again")

    # 2) Low subjective scores must come with an explanation. Checked before
    #    the counter is touched so an incomplete form does not use up quota.
    for score, exp, label in [
        (s1, e1, "Clarity"),
        (s2, e2, "Scope Definition"),
        (s3, e3, "Intent Alignment"),
        (s4, e4, "Bias / Induction"),
        (s5, e5, "Efficiency"),
    ]:
        if score < 3 and not (exp or "").strip():
            raise gr.Error(f"{label} score < 3: please provide an explanation.")

    # 3) Count the request
    count, _last_request = update_request_count()

    # 4) Build the three score columns
    subj = [s1, s2, s3, s4, s5] + [None] * 10

    # Copy so the scorers' own return values are never mutated.
    # NOTE(review): assumes both scorers return one entry per DIMS item
    # (15 values) - confirm against OVAL / DeepEval.
    oval = list(oval_scores(output_text))
    deep = list(deepeval_scores(prompt_text, output_text))

    # Grey out the dimensions that are not part of the demo (score -> None).
    oval[7] = None    # Factuality
    oval[9] = None    # Topic Coverage
    deep[11] = None   # Prompt Relevance
    deep[12] = None   # Conciseness

    # 5) Automatic explanations for low automated scores
    auto_expls = []
    for system, scores, idxs in [
        ("OVAL", oval, range(5, 10)),
        ("DeepEval", deep, range(10, 15)),
    ]:
        for i in idxs:
            sc = scores[i]
            if sc is not None and sc < 3:
                auto_expls.append(make_explanation(system, DIMS[i], sc))
    auto_text = "\n".join(auto_expls) or "All automated scores ≥ 3; no issues detected."

    # 6) Full DataFrame (including the Judge columns)
    n_dims = len(DIMS)
    full_df = pd.DataFrame({
        "Dimension": DIMS,
        "Subjective (Prompt)": subj,
        "OVAL (Output)": oval,
        "DeepEval (Output)": deep,
        "Judge LLM": [judge_llm] * n_dims,
        "Sensory Accuracy": [ja1] * n_dims,
        "Emotional Engagement": [ja2] * n_dims,
        "Flow & Naturalness": [ja3] * n_dims,
        "Imagery Completeness": [ja4] * n_dims,
        "Simplicity & Accessibility": [ja5] * n_dims,
        "Judge Remarks": [judge_remark] * n_dims,
        "Notes (Slang/Tech Terms)": [remark] * n_dims,
    })

    # 7) Sub-tables shown in the UI
    subj_df = full_df.iloc[0:5][["Dimension", "Subjective (Prompt)"]]
    oval_df = full_df.iloc[5:10][["Dimension", "OVAL (Output)"]]
    deep_df = full_df.iloc[10:15][["Dimension", "DeepEval (Output)"]]

    # 8) Radar chart of the per-dimension maximum over the three sources
    max_scores = [
        max((v for v in vals if v is not None), default=0)
        for vals in zip(subj, oval, deep)
    ]
    # Repeat the first point so the polygon is closed.
    closed_dims = list(DIMS) + [DIMS[0]]
    r = max_scores + [max_scores[0]]
    fig = go.Figure(go.Scatterpolar(r=r, theta=closed_dims, fill='toself'))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])),
        showlegend=False,
        title="Final (Max) Scores Radar"
    )

    return (
        gr.update(visible=False),   # hide the limit notice
        gr.update(visible=True),    # show the results area
        subj_df, oval_df, deep_df,
        fig,
        None,                       # no CSV file is generated in the demo
        remark,
        e1, e2, e3, e4, e5,
        auto_text,
        judge_llm,
        ja1, ja2, ja3, ja4, ja5,
        judge_remark,
        *generate_captcha(),        # new captcha question and answer
        count,                      # current global count
        "expanded",                 # Judge section is expanded by default
        gr.update(value=f"Today Counts: {count}/{DAILY_LIMIT}"),  # footer counter
    )


def toggle_explain(v):
    """Show the explanation box only while the slider value is below 3."""
    return gr.update(visible=(v < 3))


def check_daily_limit_state():
    """Read the global quota state and return the matching UI updates."""
    is_limited, current_count = check_daily_limit()
    counter_text = f"Today Counts: {current_count}/{DAILY_LIMIT}"
    return (
        gr.update(visible=is_limited),       # limit notice
        gr.update(visible=not is_limited),   # submit button
        gr.update(visible=not is_limited),   # results area
        counter_text,                        # counter text
        gr.update(value=counter_text),       # footer counter
    )


def show_personal_version_notice():
    """Tell the user that CSV export is reserved for the personal version."""
    raise gr.Error("Only for coming personal version.")


def toggle_judge_section(visible):
    """Return updates for the Judge section and its toggle button.

    Args:
        visible: ``"expanded"`` to show the section; anything else hides it.
    """
    expanded = visible == "expanded"
    return (
        gr.update(visible=expanded),
        gr.update(value=("Collapse" if expanded else "Expand")),
    )


css = """
#submit-btn {
    background-color: orange !important;
    color: white !important;
    border: none !important;
}
#submit-btn:hover {
    background-color: darkorange !important;
}
.limit-notice {
    background-color: #ffcccc;
    border: 1px solid #ff6666;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.upgrade-notice {
    background-color: #e6f7ff;
    border: 1px solid #91d5ff;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.welcome-notice {
    background-color: #fff7e6;
    border: 1px solid #ffd591;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.disabled-dimension {
    color: #888;
    font-style: italic;
}
.example-label {
    font-weight: bold;
    color: #666;
    margin-top: 10px;
}
.daily-count {
    font-size: 16px;
    font-weight: bold;
    margin-top: 15px;
    text-align: center;
}
.judge-section {
    border: 1px solid #ddd;
    border-radius: 5px;
    margin-top: 10px;
}
.judge-header {
    cursor: pointer;
    padding: 10px;
    background-color: #f5f5f5;
    display: flex;
    justify-content: space-between;
    align-items: center;
}
.judge-content {
    padding: 10px;
}
"""

# Initialise the database
init_db()

# NOTE(review): the nesting of the layout blocks below was reconstructed from
# the component grouping - confirm it against the intended design.
with gr.Blocks(css=css) as iface:
    # Session state
    session_state = gr.State({})
    judge_section_state = gr.State("expanded")  # initially expanded

    # Welcome text and explanation of the demo limits
    gr.Markdown("""

👋 Hey there! You're using the ECHOscore demo.

It's a lighter version with limited features.

For the full power, grab the desktop version(coming soon)!

""")

    # Daily limit notice (initially hidden)
    limit_notice = gr.Markdown("""

⚠️ Oops! Daily limit reached.

Tomorrow’s a new day — or skip the wait with desktop version (coming soon)!

""", visible=False)

    gr.Markdown("# ECHOscore – Prompt vs Output Evaluation")

    # Current usage
    daily_count = gr.Textbox(
        label="Daily Counts",
        value=f"Today Counts: 0/{DAILY_LIMIT}",
        interactive=False,
        visible=False,
    )

    with gr.Row():
        prompt_in = gr.Textbox(lines=4, label="Input (Prompt)")
        output_in = gr.Textbox(lines=4, label="Output (Model Response)")

    # Verification code
    captcha_text = gr.Textbox(label="verification code", interactive=False)
    captcha_answer = gr.Textbox(
        label="Please enter the calculation result",
        placeholder="Verification code answer",
    )
    correct_answer = gr.State(8)  # placeholder, replaced when the page loads

    with gr.Row():
        s1 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Clarity")
        s2 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Scope Definition")
        s3 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Intent Alignment")
        s4 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Bias / Induction")
        s5 = gr.Slider(0, 5, 0, step=0.1, label="Prompt – Efficiency")

    e1 = gr.Textbox(lines=2, label="Explain Clarity (<3)", visible=False)
    e2 = gr.Textbox(lines=2, label="Explain Scope Definition (<3)", visible=False)
    e3 = gr.Textbox(lines=2, label="Explain Intent Alignment (<3)", visible=False)
    e4 = gr.Textbox(lines=2, label="Explain Bias / Induction (<3)", visible=False)
    e5 = gr.Textbox(lines=2, label="Explain Efficiency (<3)", visible=False)

    remark = gr.Textbox(lines=2, label="Internet slang & technical terms notes (optional)")

    # Judge module - collapsible
    with gr.Row():
        with gr.Column(scale=12):
            judge_header = gr.Markdown("""
LLM-as-a-Judge (optional)
""")
        with gr.Column(scale=1, visible=False):
            toggle_judge_btn = gr.Button("Collapse", visible=False)

    with gr.Row(visible=True) as judge_section:
        judge_llm = gr.Textbox(lines=1, label="LLM-as-a-Judge (optional-Place the NAME of LLM)")
        gr.Markdown("**LLM Scoring Examples**", elem_classes="example-label")
        ja1 = gr.Number(label="Sensory Accuracy (only for desktop version)", value=0, precision=1, step=0.1, interactive=False)
        ja2 = gr.Number(label="Emotional Engagement (only for desktop version)", value=0, precision=1, step=0.1, interactive=False)
        ja3 = gr.Number(label="Flow & Naturalness (only for desktop version)", value=0, precision=1, step=0.1, interactive=False)
        ja4 = gr.Number(label="Imagery Completeness (only for desktop version)", value=0, precision=1, step=0.1, interactive=False)
        ja5 = gr.Number(label="Simplicity & Accessibility (only for desktop version)", value=0, precision=1, step=0.1, interactive=False)
        judge_remark = gr.Textbox(lines=2, label="Judge Remarks (only for desktop version)", interactive=True)

    # Upgrade notice
    gr.Markdown("""

🔝 Unlock Full Features

Get access to all dimensions and unlimited evaluations.

Learn more about ECHOscore
""")

    s1.change(toggle_explain, s1, e1)
    s2.change(toggle_explain, s2, e2)
    s3.change(toggle_explain, s3, e3)
    s4.change(toggle_explain, s4, e4)
    s5.change(toggle_explain, s5, e5)

    # Results area (initially hidden)
    with gr.Row(visible=False) as results_area:
        subj_tbl = gr.Dataframe(label="Prompt Subjective Scores")
        oval_tbl = gr.Dataframe(label="OVAL Automated Scores")
        deep_tbl = gr.Dataframe(label="DeepEval Automated Scores")
        radar = gr.Plot(label="Final Radar Chart")
        csv_out = gr.File(label="Export CSV")
        notes_out = gr.Textbox(label="Notes (Slang/Tech Terms)")
        exp1_out = gr.Textbox(label="Clarity Explanation")
        exp2_out = gr.Textbox(label="Scope Definition Explanation")
        exp3_out = gr.Textbox(label="Intent Alignment Explanation")
        exp4_out = gr.Textbox(label="Bias/Induction Explanation")
        exp5_out = gr.Textbox(label="Efficiency Explanation")
        auto_out = gr.Textbox(label="Automatic Explanation")
        judge_llm_out = gr.Textbox(label="LLM-as-a-Judge")
        ja1_out = gr.Number(label="Sensory Accuracy", visible=False)
        ja2_out = gr.Number(label="Emotional Engagement", visible=False)
        ja3_out = gr.Number(label="Flow & Naturalness", visible=False)
        ja4_out = gr.Number(label="Imagery Completeness", visible=False)
        ja5_out = gr.Number(label="Simplicity & Accessibility", visible=False)
        judge_remarks_out = gr.Textbox(label="Judge Remarks")

    submit = gr.Button("Submit", elem_id="submit-btn")

    # Counter shown at the bottom of the page
    footer_count = gr.Textbox(
        label="Today's Usage",
        value=f"Today Counts: 0/{DAILY_LIMIT}",
        interactive=False,
        visible=True,
    )

    gr.Markdown("""
⚠️ This is a **demo version** of ECHOscore. Data contribution, uploads, and edits are **not supported**. To try the full version, please download the desktop release.
""")

    # Initial checks when the page loads
    iface.load(
        check_daily_limit_state,
        None,
        [limit_notice, submit, results_area, daily_count, footer_count]
    )
    iface.load(
        generate_captcha,
        None,
        [captcha_text, correct_answer]
    )

    # The outputs list has 26 entries; evaluate() must return one value each.
    submit.click(
        evaluate,
        [
            prompt_in, output_in,
            s1, s2, s3, s4, s5,
            e1, e2, e3, e4, e5,
            judge_llm, ja1, ja2, ja3, ja4, ja5, judge_remark,
            remark,
            captcha_answer, correct_answer,
            session_state
        ],
        [
            limit_notice, results_area,
            subj_tbl, oval_tbl, deep_tbl,
            radar, csv_out,
            notes_out,
            exp1_out, exp2_out, exp3_out, exp4_out, exp5_out,
            auto_out,
            judge_llm_out, ja1_out, ja2_out, ja3_out, ja4_out, ja5_out, judge_remarks_out,
            captcha_text, correct_answer,
            daily_count,
            judge_section_state,   # Judge section state
            footer_count           # footer counter
        ]
    )

    # Show a notice when the CSV download is clicked
    csv_out.download(show_personal_version_notice)

    # Toggle the Judge section
    toggle_judge_btn.click(
        lambda x: ("expanded" if x == "collapsed" else "collapsed"),
        judge_section_state,
        judge_section_state
    ).then(
        toggle_judge_section,
        judge_section_state,
        [judge_section, toggle_judge_btn]
    )

if __name__ == "__main__":
    iface.launch()