# zhman's picture
# Update app.py
# 4032e2b verified
import streamlit as st
import json
import time
import re
from pathlib import Path
# ==========================================
# Page configuration
# ==========================================
# Browser-tab title/icon, wide layout, and a sidebar that starts collapsed.
_PAGE_SETTINGS = {
    "page_title": "AI-Math Reasoning Demo",
    "page_icon": "⚡",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)
# ==========================================
# Sci-fi themed CSS styles
# ==========================================
# Inject one <style> element into the page. unsafe_allow_html=True is passed
# so that the raw HTML is emitted instead of being escaped. The rules below
# style the classes used by the HTML snippets rendered in main()
# (main-title, subtitle, stat-box, question-container, reasoning-container,
# answer-box, answer-body, progress-bar, badge, ...) as well as Streamlit's
# own button widgets (.stButton).
st.markdown("""
<style>
/* 深色赛博朋克背景 with 粒子效果 */
.stApp {
background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 50%, #0f1419 100%);
color: #e0e0e0;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
position: relative;
overflow: hidden;
}
/* 粒子背景层 */
.stApp::before {
content: '';
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-image:
radial-gradient(2px 2px at 20% 30%, rgba(0, 245, 255, 0.3), transparent),
radial-gradient(2px 2px at 60% 70%, rgba(0, 255, 136, 0.3), transparent),
radial-gradient(1px 1px at 50% 50%, rgba(138, 43, 226, 0.3), transparent),
radial-gradient(1px 1px at 80% 10%, rgba(0, 245, 255, 0.4), transparent),
radial-gradient(2px 2px at 90% 60%, rgba(255, 107, 0, 0.3), transparent),
radial-gradient(1px 1px at 33% 85%, rgba(0, 255, 136, 0.3), transparent),
radial-gradient(1px 1px at 75% 40%, rgba(0, 245, 255, 0.3), transparent);
background-size: 200% 200%, 180% 180%, 220% 220%, 190% 190%, 210% 210%, 195% 195%, 205% 205%;
background-position: 0% 0%, 100% 0%, 50% 50%, 0% 100%, 100% 100%, 25% 25%, 75% 75%;
animation: particleFloat 20s ease-in-out infinite;
pointer-events: none;
z-index: 0;
}
/* 流星效果 */
.stApp::after {
content: '';
position: fixed;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background-image:
linear-gradient(90deg, transparent 0%, rgba(0, 245, 255, 0.8) 50%, transparent 100%);
background-size: 200px 2px;
background-repeat: no-repeat;
animation: meteor 15s linear infinite;
pointer-events: none;
z-index: 1;
opacity: 0.3;
}
@keyframes particleFloat {
0%, 100% {
background-position: 0% 0%, 100% 0%, 50% 50%, 0% 100%, 100% 100%, 25% 25%, 75% 75%;
}
50% {
background-position: 100% 100%, 0% 100%, 75% 75%, 100% 0%, 0% 0%, 75% 75%, 25% 25%;
}
}
@keyframes meteor {
0% {
transform: translateX(-100%) translateY(-100%) rotate(45deg);
opacity: 0;
}
10% {
opacity: 0.3;
}
50% {
transform: translateX(50%) translateY(50%) rotate(45deg);
opacity: 0.3;
}
90% {
opacity: 0;
}
100% {
transform: translateX(200%) translateY(200%) rotate(45deg);
opacity: 0;
}
}
/* 确保内容在粒子层之上 */
.main .block-container {
position: relative;
z-index: 2;
}
/* 修复Streamlit容器高度问题 - 确保页面可见 */
html, body, #root, .stApp {
height: auto !important;
min-height: 100vh !important;
}
.main {
height: auto !important;
min-height: 100vh !important;
}
/* 移除全局 * 选择器,因为它会破坏 KaTeX 公式的字体渲染 */
/* 隐藏默认元素 */
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
/* 标题样式 - 霓虹灯效果 */
.main-title {
font-size: 48px;
font-weight: 900;
text-align: center;
background: linear-gradient(90deg, #00f5ff, #0099ff, #00f5ff);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
text-shadow: 0 0 20px rgba(0, 245, 255, 0.5);
margin-bottom: 10px;
animation: glow 2s ease-in-out infinite alternate;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
@keyframes glow {
from { filter: drop-shadow(0 0 5px #00f5ff); }
to { filter: drop-shadow(0 0 20px #00f5ff); }
}
.subtitle {
text-align: center;
color: #00ff88;
font-size: 18px;
margin-bottom: 30px;
letter-spacing: 2px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
/* 统计面板 */
.stats-container {
display: flex;
justify-content: space-around;
margin: 30px 0;
gap: 20px;
}
.stat-box {
background: linear-gradient(135deg, rgba(0, 245, 255, 0.1), rgba(0, 153, 255, 0.1));
border: 2px solid #00f5ff;
border-radius: 15px;
padding: 25px;
flex: 1;
text-align: center;
box-shadow: 0 0 30px rgba(0, 245, 255, 0.3);
transition: all 0.3s ease;
animation: pulse 3s ease-in-out infinite;
position: relative;
overflow: hidden;
}
.stat-box::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle, rgba(0, 245, 255, 0.1) 0%, transparent 70%);
animation: rotate 10s linear infinite;
}
@keyframes pulse {
0%, 100% {
box-shadow: 0 0 30px rgba(0, 245, 255, 0.3);
}
50% {
box-shadow: 0 0 50px rgba(0, 245, 255, 0.5), 0 0 70px rgba(0, 255, 136, 0.3);
}
}
@keyframes rotate {
0% {
transform: rotate(0deg);
}
100% {
transform: rotate(360deg);
}
}
.stat-box:hover {
transform: translateY(-5px) scale(1.02);
box-shadow: 0 5px 40px rgba(0, 245, 255, 0.5);
animation-play-state: paused;
}
.stat-label {
color: #888;
font-size: 14px;
text-transform: uppercase;
letter-spacing: 2px;
margin-bottom: 10px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
position: relative;
z-index: 1;
}
.stat-value {
color: #00ff88;
font-size: 36px;
font-weight: bold;
text-shadow: 0 0 10px rgba(0, 255, 136, 0.5);
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
position: relative;
z-index: 1;
animation: glow-value 2s ease-in-out infinite alternate;
}
@keyframes glow-value {
from {
text-shadow: 0 0 10px rgba(0, 255, 136, 0.5);
}
to {
text-shadow: 0 0 20px rgba(0, 255, 136, 0.8), 0 0 30px rgba(0, 245, 255, 0.4);
}
}
/* 问题容器 */
.question-container {
background: linear-gradient(135deg, rgba(255, 107, 0, 0.15), rgba(255, 0, 128, 0.1));
border-left: 5px solid #ff6b00;
border-radius: 10px;
padding: 25px;
margin: 20px 0;
box-shadow: 0 4px 20px rgba(255, 107, 0, 0.3);
}
.question-label {
color: #ff6b00;
font-size: 18px;
font-weight: bold;
margin-bottom: 15px;
text-transform: uppercase;
letter-spacing: 2px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
.question-text {
color: #e0e0e0;
font-size: 16px;
line-height: 1.8;
font-family: 'Georgia', serif;
}
/* AI 推理容器 */
.reasoning-container {
background: linear-gradient(135deg, rgba(0, 255, 136, 0.1), rgba(0, 200, 255, 0.1));
border-left: 5px solid #00ff88;
border-radius: 10px;
padding: 25px;
margin: 20px 0;
box-shadow: 0 4px 20px rgba(0, 255, 136, 0.3);
}
.reasoning-label {
color: #00ff88;
font-size: 18px;
font-weight: bold;
margin-bottom: 15px;
text-transform: uppercase;
letter-spacing: 2px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
.reasoning-text {
color: #e0e0e0;
font-size: 15px;
line-height: 1.9;
font-family: 'Consolas', monospace;
}
/* 光标闪烁效果 */
.cursor {
display: inline-block;
width: 10px;
height: 20px;
background-color: #00ff88;
margin-left: 3px;
animation: blink 0.7s infinite;
}
@keyframes blink {
0%, 50% { opacity: 1; }
51%, 100% { opacity: 0; }
}
/* 答案框 */
/* 答案框 */
.answer-box {
background: linear-gradient(135deg, rgba(138, 43, 226, 0.2), rgba(75, 0, 130, 0.2));
border: 3px solid #8a2be2;
border-bottom: none;
border-radius: 15px 15px 0 0;
padding: 20px 30px;
margin: 25px 0 0 0;
text-align: center;
box-shadow: 0 -5px 20px rgba(138, 43, 226, 0.3);
}
.answer-body {
background: linear-gradient(135deg, rgba(138, 43, 226, 0.1), rgba(75, 0, 130, 0.15));
border: 3px solid #8a2be2;
border-top: none;
border-radius: 0 0 15px 15px;
padding: 30px;
margin-bottom: 25px;
box-shadow: 0 15px 30px rgba(138, 43, 226, 0.3);
}
.answer-label {
color: #da70d6;
font-size: 16px;
margin-bottom: 15px;
letter-spacing: 2px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
.answer-value {
color: #ffffff;
font-size: 42px;
font-weight: bold;
text-shadow: 0 0 20px rgba(218, 112, 214, 0.8);
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
/* 进度条 */
.progress-container {
margin: 30px 0;
padding: 20px;
background: rgba(0, 0, 0, 0.3);
border-radius: 10px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
.progress-bar {
height: 8px;
background: linear-gradient(90deg, #00f5ff, #00ff88);
border-radius: 10px;
transition: width 0.3s ease;
box-shadow: 0 0 15px rgba(0, 245, 255, 0.6);
}
/* 按钮样式 */
.stButton > button {
background: linear-gradient(135deg, #00f5ff, #0099ff);
color: #000;
border: none;
border-radius: 25px;
padding: 15px 40px;
font-size: 18px;
font-weight: bold;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 20px rgba(0, 245, 255, 0.4);
text-transform: uppercase;
letter-spacing: 2px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
.stButton > button:hover {
transform: scale(1.05);
box-shadow: 0 6px 30px rgba(0, 245, 255, 0.6);
}
/* 标签 */
.badge {
display: inline-block;
padding: 8px 16px;
border-radius: 20px;
font-size: 14px;
font-weight: bold;
margin: 5px;
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
}
.badge-correct {
background: linear-gradient(135deg, #00ff88, #00cc66);
color: #000;
box-shadow: 0 0 15px rgba(0, 255, 136, 0.5);
}
.badge-incorrect {
background: linear-gradient(135deg, #ff4444, #cc0000);
color: #fff;
box-shadow: 0 0 15px rgba(255, 68, 68, 0.5);
}
</style>
""", unsafe_allow_html=True)
# ==========================================
# Helper functions
# ==========================================
def load_evaluation_data():
    """Load the evaluation results from ``evaluation_results.json``.

    The file is opened relative to the current working directory and decoded
    as UTF-8 JSON.

    Returns:
        The parsed JSON document, or ``None`` when the file cannot be read or
        does not contain valid JSON.
    """
    try:
        with open("evaluation_results.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        # Anything else (KeyboardInterrupt, programming errors) is allowed to
        # propagate instead of being silently reported as "no data".
        return None
def extract_pure_question(question_text):
    """Return only the math problem contained in a ``question`` field.

    Everything from the first ``"Assistant:"`` marker onwards is dropped, and
    so is the step-by-step instruction prompt together with whatever follows
    it. Only the first instruction phrase that is present (in list order) is
    used as the cut point.
    """
    # Keep what precedes the first "Assistant:" marker, when there is one.
    before_marker, marker, _ = question_text.partition("Assistant:")
    problem = before_marker.strip() if marker else question_text

    # Cut at the instruction prompt appended after the actual problem.
    instruction_prompts = (
        "请你一步一步地思考,并给出最终答案",
        "Please reason step by step",
    )
    matched = next((p for p in instruction_prompts if p in problem), None)
    if matched is not None:
        problem = problem.partition(matched)[0].strip()
    return problem
def _first_boxed_content(text):
    r"""Return the body of the first non-empty, brace-balanced ``\boxed{...}``.

    Nested braces are tracked, so ``\boxed{\frac{1}{2}}`` yields
    ``\frac{1}{2}``. A backslash-escaped character (``\{``, ``\}``, ``\\``)
    is skipped and never counted as a brace. Returns ``None`` when *text*
    holds no such group (no marker, empty body, or unbalanced braces).
    """
    marker = "\\boxed{"
    search_from = 0
    while True:
        start = text.find(marker, search_from)
        if start == -1:
            return None
        body_start = start + len(marker)
        depth = 1
        close = -1
        i = body_start
        while i < len(text):
            ch = text[i]
            if ch == "\\":
                i += 2  # skip the escaped character as well
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    close = i
                    break
            i += 1
        if close > body_start:
            return text[body_start:close]
        # Empty or unbalanced group: look for a later \boxed{...}.
        search_from = body_start


def extract_reasoning_and_answer(model_output):
    r"""Extract the reasoning text and the answer from ``model_output``.

    Only the first ``"Assistant:"`` turn is inspected: the text from the first
    ``"Assistant:"`` marker up to (not including) the second one, or to the
    end of the string. When no marker is present the whole string is used.

    Returns:
        A ``(reasoning, answer)`` tuple of strings. ``reasoning`` is the
        stripped body of the first ``<reasoning>...</reasoning>`` tag and
        ``answer`` the stripped body of the first ``<answer>...</answer>``
        tag; either is ``""`` when its tag is missing. If the answer contains
        a non-empty ``\boxed{...}`` group, the answer is reduced to that
        group's content (nested braces are handled).
    """
    marker = "Assistant:"
    first = model_output.find(marker)
    if first == -1:
        first_assistant_output = model_output
    else:
        second = model_output.find(marker, first + len(marker))
        if second == -1:
            first_assistant_output = model_output[first:]
        else:
            first_assistant_output = model_output[first:second]

    # Full reasoning trace.
    reasoning_match = re.search(r'<reasoning>(.*?)</reasoning>', first_assistant_output, re.DOTALL)
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""

    # First answer tag.
    answer_match = re.search(r'<answer>(.*?)</answer>', first_assistant_output, re.DOTALL)
    answer = answer_match.group(1).strip() if answer_match else ""

    # Prefer the content of \boxed{...}. A balanced scan is used because a
    # "[^}]+" regex stops at the first "}" and truncates nested LaTeX.
    boxed = _first_boxed_content(answer)
    if boxed is not None:
        answer = boxed
    return reasoning, answer
def _replace_boxed(text):
    r"""Rewrite every non-empty, brace-balanced ``\boxed{X}`` as ``**[X]**``.

    Braces inside ``X`` are matched, so ``\boxed{\frac{1}{2}}`` becomes
    ``**[\frac{1}{2}]**``. A backslash-escaped character is skipped and never
    counted as a brace. Empty or unbalanced groups are left untouched, which
    also keeps partially streamed text stable until the group closes.
    """
    marker = "\\boxed{"
    pieces = []
    pos = 0
    while True:
        start = text.find(marker, pos)
        if start == -1:
            pieces.append(text[pos:])
            break
        body_start = start + len(marker)
        depth = 1
        close = -1
        i = body_start
        while i < len(text):
            ch = text[i]
            if ch == "\\":
                i += 2  # skip the escaped character as well
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    close = i
                    break
            i += 1
        if close <= body_start:
            # Unbalanced (-1) or empty group: keep the marker verbatim and
            # continue scanning after it.
            pieces.append(text[pos:body_start])
            pos = body_start
            continue
        pieces.append(text[pos:start])
        pieces.append("**[" + text[body_start:close] + "]**")
        pos = close + 1
    return "".join(pieces)


def render_latex(text):
    r"""Convert LaTeX delimiters in *text* to the ``$`` forms Streamlit renders.

    Rewrites applied, in order:

    1. double-escaped inline math ``\\( ... \\)``  ->  ``$ ... $``
    2. double-escaped display math ``\\[ ... \\]`` ->  ``$$ ... $$``
    3. display math ``\[ ... \]``                  ->  ``$$ ... $$``
    4. inline math ``\( ... \)``                   ->  ``$ ... $``
    5. ``\boxed{X}``                               ->  ``**[X]**``
    6. ``\qquad``                                  ->  `` ______ ``

    Returns the converted string.
    """
    # The JSON data contains delimiters with two literal backslashes, so those
    # are handled first. Callables are used as replacements so backslashes in
    # the captured math are never interpreted as regex escapes.
    text = re.sub(r'\\\\\((.*?)\\\\\)', lambda m: f"${m.group(1)}$", text, flags=re.DOTALL)
    text = re.sub(r'\\\\\[(.*?)\\\\\]', lambda m: f"$${m.group(1)}$$", text, flags=re.DOTALL)

    # Standard single-backslash delimiters.
    text = re.sub(r'\\\[(.*?)\\\]', lambda m: f"$${m.group(1)}$$", text, flags=re.DOTALL)
    text = re.sub(r'\\\((.*?)\\\)', lambda m: f"${m.group(1)}$", text, flags=re.DOTALL)

    # Highlight boxed results. A balanced-brace scan is used because a
    # "[^}]+" regex stops at the first "}" and mangles nested LaTeX.
    text = _replace_boxed(text)

    # Show \qquad as a visible blank so fill-in-the-blank problems read well.
    text = text.replace(r'\qquad', ' ______ ')
    return text
def stream_text(text, placeholder, speed=0.01):
    """Reveal *text* in *placeholder* one character at a time (typewriter effect).

    Args:
        text: The raw string to reveal.
        placeholder: Object whose ``markdown(body, unsafe_allow_html=...)``
            method is called for every frame (e.g. the result of ``st.empty()``).
        speed: Seconds to sleep after each character.

    Every frame passes the prefix revealed so far through ``render_latex`` and
    appends a block cursor. Streaming iterates over the *raw* text, so a
    formula shows as LaTeX source until its closing delimiter has been
    revealed; this is an accepted trade-off of per-character streaming. The
    last call renders the full text without the cursor.
    """
    for end in range(1, len(text) + 1):
        placeholder.markdown(
            render_latex(text[:end]) + ' ▊',  # block glyph acts as the cursor
            unsafe_allow_html=False
        )
        time.sleep(speed)
    # Final frame: complete text, no cursor.
    placeholder.markdown(render_latex(text), unsafe_allow_html=False)
# ==========================================
# Main application
# ==========================================
def _stat_box_html(label, value):
    """Return the HTML for one box of the statistics panel."""
    return (
        "\n"
        '<div class="stat-box">\n'
        f'<div class="stat-label"> {label}</div>\n'
        f'<div class="stat-value">{value}</div>\n'
        "</div>\n"
    )


def _format_answer(value):
    """Return *value* as display text for the answer panel.

    ``None`` and the empty string become ``"N/A"``. Non-string values (for
    example a JSON number) are converted with ``str``. Text that looks like
    LaTeX (contains a backslash, ``^``, ``_`` or a brace) has its ``$``
    characters removed and is wrapped in a single pair of ``$``.
    """
    if value is None:
        return "N/A"
    text = str(value)
    if not text:
        return "N/A"
    if any(ch in text for ch in ('\\', '^', '_', '{', '}')):
        return f"${text.replace('$', '')}$"
    return text


def _answer_card_html(title, accent, value, border=""):
    """Return the HTML for one answer card (prediction or ground truth).

    *value* is HTML-escaped because it comes from the data file / model output
    and is emitted with ``unsafe_allow_html=True``; without escaping, a ``<``
    or ``&`` in an answer would corrupt the markup.
    """
    import html

    return (
        "\n"
        f'<div style="text-align: center; padding: 10px; background: rgba(0,0,0,0.2); border-radius: 10px; {border}margin: 0 10px;">\n'
        f'<div style="color: {accent}; font-size: 14px; margin-bottom: 10px; text-transform: uppercase;">{title}</div>\n'
        f'<div style="color: #fff; font-size: 24px; font-weight: bold;">{html.escape(value)}</div>\n'
        "</div>\n"
    )


def main():
    """Render the demo page: header, stats, current problem, controls, reasoning.

    State kept in ``st.session_state``:
        current_index: zero-based index of the problem being shown.
        is_streaming: ``True`` for the single run in which the reasoning is
            streamed; reset to ``False`` once the answer has been displayed.
    """
    # Title
    st.markdown('<div class="main-title">⚡ AI-MATH REASONING Demonstration</div>', unsafe_allow_html=True)
    st.markdown('<div class="subtitle"> Powered by Llama-3.2-3B | Model Optimized via Two-Stage SFT + GRPO</div>', unsafe_allow_html=True)

    # Load data
    data = load_evaluation_data()
    if not data or not isinstance(data, dict):
        st.error("❌ 无法加载 evaluation_results.json 文件")
        return

    # Use the complete list of problems; stop early when there is nothing to
    # show instead of failing on the index lookup further down.
    filtered_results = data.get("detailed_results") or []
    if not filtered_results:
        st.warning("⚠️ evaluation_results.json contains no detailed results.")
        return

    # Initialise session state
    if 'current_index' not in st.session_state:
        st.session_state.current_index = 0
    if 'is_streaming' not in st.session_state:
        st.session_state.is_streaming = False
    # Keep the index valid even if the data shrank since it was stored.
    last_index = len(filtered_results) - 1
    st.session_state.current_index = max(0, min(st.session_state.current_index, last_index))
    question_number = st.session_state.current_index + 1

    # Simplified statistics panel: total number of problems and current one.
    st.markdown('<div class="stats-container">', unsafe_allow_html=True)
    col1, col2 = st.columns(2)
    with col1:
        total_questions = data.get("total_questions", len(filtered_results))
        st.markdown(_stat_box_html("Total Questions", total_questions), unsafe_allow_html=True)
    with col2:
        st.markdown(_stat_box_html("Current Question", f"#{question_number}"), unsafe_allow_html=True)

    # Progress bar
    progress = question_number / len(filtered_results) * 100
    st.markdown(
        "\n"
        '<div class="progress-container">\n'
        '<div style="color: #00f5ff; margin-bottom: 10px; text-align: center;">\n'
        f"Progress: {question_number} / {len(filtered_results)}\n"
        "</div>\n"
        '<div style="background: rgba(255,255,255,0.1); border-radius: 10px; overflow: hidden;">\n'
        f'<div class="progress-bar" style="width: {progress}%"></div>\n'
        "</div>\n"
        "</div>\n",
        unsafe_allow_html=True,
    )

    # Current problem, reduced to the bare math question (no Assistant reply).
    current_item = filtered_results[st.session_state.current_index]
    pure_question = extract_pure_question(current_item.get("question") or "")

    st.markdown(
        "\n"
        '<div class="question-container">\n'
        f'<div class="question-label">📐 Problem Statement (Q{question_number})</div>\n'
        "</div>\n",
        unsafe_allow_html=True,
    )
    # The problem text goes through a plain st.markdown call so that LaTeX is
    # rendered.
    with st.container():
        st.markdown('<div style="padding: 0 25px 25px 25px; background: linear-gradient(135deg, rgba(255, 107, 0, 0.15), rgba(255, 0, 128, 0.1)); border-radius: 0 0 10px 10px; margin-top: -20px;">', unsafe_allow_html=True)
        st.markdown(render_latex(pure_question))
        st.markdown('</div>', unsafe_allow_html=True)

    # Control buttons
    col_btn1, col_btn2, col_btn3, col_btn4 = st.columns([1, 1, 1, 1])
    with col_btn1:
        if st.button("⏮️ PREVIOUS", disabled=st.session_state.current_index == 0):
            st.session_state.current_index -= 1
            st.session_state.is_streaming = False
            st.rerun()
    with col_btn2:
        if st.button("▶️ START REASONING", disabled=st.session_state.is_streaming):
            st.session_state.is_streaming = True
            st.rerun()
    with col_btn3:
        if st.button("⏭️ NEXT", disabled=st.session_state.current_index >= last_index):
            st.session_state.current_index += 1
            st.session_state.is_streaming = False
            st.rerun()
    with col_btn4:
        if st.button("🔄 RESET"):
            st.session_state.current_index = 0
            st.session_state.is_streaming = False
            st.rerun()

    # Model reasoning section
    st.markdown(
        "\n"
        '<div class="reasoning-container">\n'
        '<div class="reasoning-label">⚡ Model Reasoning Process</div>\n'
        "</div>\n",
        unsafe_allow_html=True,
    )
    reasoning_container = st.container()

    # Reasoning and answer are taken from the first "Assistant:" turn.
    reasoning, extracted_answer = extract_reasoning_and_answer(current_item.get("model_output") or "")

    if st.session_state.is_streaming:
        # Stream the reasoning
        with reasoning_container:
            st.markdown('<div style="padding: 0 25px 25px 25px; background: linear-gradient(135deg, rgba(0, 255, 136, 0.1), rgba(0, 200, 255, 0.1)); border-radius: 0 0 10px 10px; margin-top: -20px;">', unsafe_allow_html=True)
            reasoning_placeholder = st.empty()
            if reasoning:
                stream_text(reasoning, reasoning_placeholder, speed=0.05)
            else:
                reasoning_placeholder.markdown("*No reasoning generated by the model.*")
            st.markdown('</div>', unsafe_allow_html=True)

        # Answer header with the correctness badge
        is_correct = current_item.get("is_correct", False)
        badge_class = "badge-correct" if is_correct else "badge-incorrect"
        badge_text = "✅ CORRECT" if is_correct else "❌ INCORRECT"
        time.sleep(0.5)
        st.markdown(
            "\n"
            '<div class="answer-box">\n'
            f'<div class="answer-label"> Final Answer (Evaluation) <span class="badge {badge_class}">{badge_text}</span></div>\n'
            "</div>\n",
            unsafe_allow_html=True,
        )

        model_disp = _format_answer(extracted_answer)
        real_disp = _format_answer(current_item.get("expected_answer"))

        st.markdown(
            "\n"
            '<div class="answer-body">\n'
            "<!-- 占位,内容通过 st.columns 动态填充 -->\n"
            "</div>\n",
            unsafe_allow_html=True,
        )
        # Streamlit cannot nest widgets inside a custom HTML div, so the
        # columns are pulled up with a negative margin to sit visually on top
        # of the empty answer-body box.
        with st.container():
            st.markdown('<div style="margin-top: -120px; position: relative; z-index: 100;">', unsafe_allow_html=True)
            col_pred, col_truth = st.columns(2)
            with col_pred:
                st.markdown(
                    _answer_card_html("Prediction", "#da70d6", model_disp),
                    unsafe_allow_html=True
                )
            with col_truth:
                st.markdown(
                    _answer_card_html(
                        "Ground Truth",
                        "#00ff88",
                        real_disp,
                        border="border: 1px solid rgba(0,255,136,0.3); ",
                    ),
                    unsafe_allow_html=True
                )
            st.markdown('</div>', unsafe_allow_html=True)

        # Mark the run as finished
        st.session_state.is_streaming = False
    else:
        # Reasoning not started yet: show the waiting hint
        with reasoning_container:
            st.markdown('<div style="padding: 0 25px 25px 25px; background: linear-gradient(135deg, rgba(0, 255, 136, 0.1), rgba(0, 200, 255, 0.1)); border-radius: 0 0 10px 10px; margin-top: -20px; color: #666; font-style: italic;">', unsafe_allow_html=True)
            st.markdown("⏸️ Waiting to start. Click **START REASONING** to view the chain of thought...")
            st.markdown('</div>', unsafe_allow_html=True)


if __name__ == "__main__":
    main()