fengkaobiguo / simple_web_enhanced.py
Yaoliang's picture
feat: 更新和新增文件,修复依赖问题
c30406d
#!/usr/bin/env python3
"""
知识库问答系统Web界面 - 增强版(集成图片识别)
"""
import streamlit as st
import requests
import json
import os
from pathlib import Path
from simple_qa import call_llm_api
import pdfplumber
import re
from PIL import Image, ImageFilter
import pytesseract
from enhanced_ocr import image_processor
# Path to the Tesseract executable. The original Windows path stays the
# default; set TESSERACT_CMD to run on a machine with a different install.
pytesseract.pytesseract.tesseract_cmd = os.environ.get('TESSERACT_CMD', r'E:\p\tesseract.exe')
# tessdata directory (must contain chi_sim.traineddata). A TESSDATA_PREFIX
# that is already exported is respected instead of being overwritten.
os.environ.setdefault('TESSDATA_PREFIX', r'E:\p\tessdata')
# Page configuration
st.set_page_config(page_title="逢考必过·AI考试复习助手", page_icon="🎓", layout="wide")
# StepFun API configuration
# SECURITY NOTE(review): an API key is committed in source. It is kept only as
# a fallback so existing deployments keep working; rotate it and supply the
# real key through the STEP_API_KEY environment variable.
STEP_API_KEY = os.environ.get(
    "STEP_API_KEY",
    "5LHfDtyA4XFX5ObOqZtIrz0UlOMcYEn2hvy0FQdhT113enLNiLySnSWndOzz75ir4",
)
BASE_URL = "https://api.stepfun.com/v1"
def upload_file_to_step(file, purpose="file-extract", timeout=30):
    """Upload a file to the StepFun ``/files`` endpoint.

    Args:
        file: Uploaded-file object exposing ``name``, ``type`` and
            ``getvalue()``.
        purpose: Value sent in the ``purpose`` form field.
        timeout: Seconds handed to ``requests.post``. Without a timeout a
            stalled connection would block the app indefinitely.

    Returns:
        The decoded JSON response on HTTP 200, otherwise ``None``. Failures
        are reported to the user through ``st.error``.
    """
    headers = {"Authorization": f"Bearer {STEP_API_KEY}"}
    files = {"file": (file.name, file.getvalue(), file.type)}
    data = {"purpose": purpose}
    try:
        response = requests.post(
            f"{BASE_URL}/files",
            headers=headers,
            files=files,
            data=data,
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: network/timeout problems; ValueError: body is not JSON.
        st.error(f"上传文件时出错: {str(e)}")
        return None
    st.error(f"阶跃API上传失败: {response.status_code} - {response.text}")
    return None
def extract_text_from_pdf(file):
    """Return the text of all pages of a PDF, one page per line group.

    Each page is read with pdfplumber's ``extract_text()``; pages for which it
    returns nothing are skipped. The combined text is stripped of leading and
    trailing whitespace. If anything raises, the error is shown with
    ``st.error`` and an empty string is returned.
    """
    try:
        with pdfplumber.open(file) as document:
            page_texts = (page.extract_text() for page in document.pages)
            # Consumed inside the ``with`` so pages are read while the PDF is open.
            combined = "".join(f"{chunk}\n" for chunk in page_texts if chunk)
    except Exception as e:
        st.error(f"PDF解析失败: {str(e)}")
        return ""
    return combined.strip()
def extract_text_from_image(file):
    """Extract text from an image, preferring the enhanced OCR pipeline.

    The image is first handed to ``image_processor.analyze_image``; its
    ``image_type``, ``confidence`` and ``text`` entries are shown in a
    collapsed expander and the ``text`` entry is returned. If that path raises,
    the error is reported and a basic fallback runs: grayscale conversion,
    fixed-threshold binarization, sharpening, then pytesseract with the
    ``chi_sim+eng`` languages. Returns ``""`` when the fallback fails as well.
    """
    try:
        analysis = image_processor.analyze_image(Image.open(file))
        # Summarize the analysis in three side-by-side metrics.
        with st.expander(f"📊 图片分析结果 - {file.name}", expanded=False):
            type_col, confidence_col, length_col = st.columns(3)
            type_col.metric("图片类型", analysis['image_type'])
            confidence_col.metric("置信度", f"{analysis['confidence']:.2f}")
            length_col.metric("文字长度", len(analysis['text']))
        return analysis['text']
    except Exception as e:
        st.error(f"图片处理失败: {str(e)}")

    # Enhanced path failed: fall back to plain Tesseract OCR.
    try:
        grayscale = Image.open(file).convert('L')
        # Pixels darker than 140 become black, everything else white.
        binarized = grayscale.point(lambda level: 0 if level < 140 else 255, '1')
        sharpened = binarized.filter(ImageFilter.SHARPEN)
        recognized = pytesseract.image_to_string(sharpened, lang='chi_sim+eng')
        return recognized.strip()
    except Exception as e2:
        st.error(f"基础OCR也失败: {str(e2)}")
        return ""
def split_text(text, max_chars=6000):
    """Split *text* into consecutive chunks of at most *max_chars* characters.

    Args:
        text: The string to split.
        max_chars: Maximum length of each chunk; must be positive.

    Returns:
        A list of chunks in original order; empty when *text* is empty.

    Raises:
        ValueError: If *max_chars* is zero or negative. A negative value used
            to return an empty list, silently discarding the whole text.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")
    return [text[start:start + max_chars] for start in range(0, len(text), max_chars)]
# Page title and tagline
_TAGLINE_HTML = """
<div style='font-size:20px; color:#1976d2; font-weight:bold;'>
上传教材/讲义/笔记,AI自动生成高质量考题与答案,帮你高效梳理知识点,助力逢考必过!
</div>
---
"""
st.title("🎓 逢考必过 · AI考试复习助手")
st.markdown(_TAGLINE_HTML, unsafe_allow_html=True)

# Sidebar: model selection. Keys are the values passed on to call_llm_api,
# values are the labels shown in the dropdown.
_MODEL_LABELS = {
    "deepseek": "DeepSeek(更强)",
    "stepfun": "阶跃 StepFun",
}
st.sidebar.header("🛠️ 功能区")
model_type = st.sidebar.selectbox(
    "请选择大模型API(推荐DeepSeek)",
    list(_MODEL_LABELS),
    format_func=_MODEL_LABELS.__getitem__,
)

# Sidebar: image-processing options
_ENHANCEMENT_LABELS = {
    "auto": "自动检测",
    "text": "文本优化",
    "table": "表格优化",
    "formula": "公式优化",
}
st.sidebar.subheader("🖼️ 图片处理选项")
image_enhancement = st.sidebar.selectbox(
    "图片增强模式",
    list(_ENHANCEMENT_LABELS),
    format_func=_ENHANCEMENT_LABELS.__getitem__,
)
show_image_analysis = st.sidebar.checkbox("显示图片分析详情", value=True)
st.sidebar.info("支持PDF/文本/图片上传,智能识别文字、表格、公式,调用大模型API生成高质量考题与答案。")

# Main area: study material input (file upload or pasted text)
_ACCEPTED_EXTENSIONS = ["pdf", "txt", "jpg", "jpeg", "png", "bmp", "tiff", "webp"]
st.subheader("📚 上传你的复习资料")
uploaded_files = st.file_uploader(
    "上传PDF/TXT/图片讲义/教材/笔记(可多选)",
    type=_ACCEPTED_EXTENSIONS,
    accept_multiple_files=True,
)
doc_content = st.text_area(
    "或直接粘贴重点内容:",
    height=200,
    placeholder="粘贴你的复习资料、错题本、重点笔记……",
)
def _generate_qa_segments(text, label):
    """Ask the LLM for questions on *text*, chunking it when it is long.

    Text shorter than 6000 characters is sent as a single request; anything
    longer is cut with ``split_text`` and each chunk is sent separately.
    Whitespace-only chunks are skipped.

    Args:
        text: Source material to generate questions from.
        label: How the material is named in progress messages.

    Returns:
        A list of ``(segment_number, result)`` tuples, where ``result`` is
        whatever ``call_llm_api`` returned, or a ``"生成失败:..."`` string when
        the call raised. Empty when *text* contains no usable content.
    """
    chunked = len(text) >= 6000
    segments = split_text(text, max_chars=6000) if chunked else [text]
    outcomes = []
    for seg_no, segment in enumerate(segments, 1):
        if not segment.strip():
            continue
        if chunked:
            progress = f"AI正在为{label}第{seg_no}段生成考题……"
        else:
            progress = f"AI正在为{label}生成考题……"
        with st.spinner(progress):
            try:
                outcomes.append((seg_no, call_llm_api(segment, model_type=model_type)))
            except Exception as e:
                # Record the failure so the other segments are still shown.
                outcomes.append((seg_no, f"生成失败:{e}"))
    return outcomes


def _is_qa_list(value):
    """Return True when *value* is a non-empty list of dicts that each have a 'question'."""
    return (
        isinstance(value, list)
        and bool(value)
        and all(isinstance(item, dict) and 'question' in item for item in value)
    )


if st.button("🚀 一键生成智能考题"):
    results = []
    if uploaded_files:
        for uploaded_file in uploaded_files:
            file_name = uploaded_file.name
            with st.spinner(f"正在处理文件: {file_name}..."):
                if uploaded_file.type == "application/pdf":
                    text = extract_text_from_pdf(uploaded_file)
                elif uploaded_file.type.startswith("image/"):
                    text = extract_text_from_image(uploaded_file)
                else:
                    # getvalue() does not depend on the current stream position.
                    text = uploaded_file.getvalue().decode("utf-8", errors="ignore")
            file_qa = _generate_qa_segments(text, f"【{file_name}】")
            if file_qa:
                results.append((file_name, file_qa))
            else:
                # Nothing could be extracted; do not send an empty prompt to the LLM.
                st.warning(f"【{file_name}】未提取到可用文本,已跳过。")
    elif doc_content.strip():
        results.append(("粘贴内容", _generate_qa_segments(doc_content.strip(), "粘贴内容")))
    else:
        st.warning("请上传文件或粘贴内容!")
        st.stop()

    # Render every result and collect the well-formed QA pairs.
    generated_pairs = []
    for file_name, file_qa in results:
        # Only announce success when at least one segment produced usable output.
        if any(_is_qa_list(qa_result) for _, qa_result in file_qa):
            st.success(f"🎉 【{file_name}】智能考题生成成功!")
        st.markdown("---")
        st.markdown(f"<div style='font-size:18px; color:#388e3c; font-weight:bold;'>【{file_name} · AI智能考题与答案】</div>", unsafe_allow_html=True)
        for seg_idx, qa_result in file_qa:
            st.markdown(f"<span style='color:#888;'>—— 第{seg_idx}段 ——</span>", unsafe_allow_html=True)
            if _is_qa_list(qa_result):
                generated_pairs.extend(qa_result)
                for idx, qa in enumerate(qa_result, 1):
                    st.markdown(f"**Q{idx}: {qa['question']}**")
                    # A missing answer is rendered as blank instead of raising KeyError.
                    st.markdown(f"<span style='color:#1976d2;'>A{idx}:</span> {qa.get('answer', '')}", unsafe_allow_html=True)
                    st.markdown("---")
            else:
                # Failure strings and any unexpected shape are shown raw.
                st.error("AI输出格式异常,原始内容如下:")
                st.code(qa_result if isinstance(qa_result, str) else str(qa_result))
    # The sidebar's clear/statistics buttons read this key.
    st.session_state["last_qa"] = generated_pairs
# Image-processing preview: each uploaded image next to its enhanced version
if uploaded_files and show_image_analysis:
    st.markdown("---")
    st.subheader("🖼️ 图片处理演示")
    for candidate in uploaded_files:
        if not candidate.type.startswith("image/"):
            continue
        original_col, enhanced_col = st.columns(2)
        # Left column: the upload as-is.
        original_col.subheader("原图")
        original_col.image(candidate, caption=candidate.name, use_column_width=True)
        # Right column: output of the enhancement mode chosen in the sidebar.
        enhanced_col.subheader("处理后")
        try:
            source_image = Image.open(candidate)
            enhanced_image = image_processor.enhance_image(source_image, image_enhancement)
            enhanced_col.image(
                enhanced_image,
                caption=f"增强模式: {image_enhancement}",
                use_column_width=True,
            )
        except Exception as e:
            enhanced_col.error(f"图片处理失败: {str(e)}")
# Footer: usage tips and a note about the image-recognition feature
_FOOTER_HTML = """
<div style='color:#757575; font-size:14px;'>
<b>Tips:</b> 本助手适合期末复习、考研、四六级、各类资格证备考等场景,支持大段资料一键生成高质量考题,助你高效掌握重点难点。
<br><b>新增功能:</b> 智能图片识别,支持文字、表格、公式自动提取,图像质量自动增强!
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)
# Sidebar: system information and quick actions
st.sidebar.header("⚙️ 系统信息")

# Static status badges
st.sidebar.subheader("API状态")
st.sidebar.success("✅ 阶跃API: 已配置")
st.sidebar.success("✅ 图片识别: 已启用")

st.sidebar.subheader("快速操作")
# Reset the stored QA list and rerun the script so the page reflects it.
clear_clicked = st.sidebar.button("🔄 清空当前问答")
if clear_clicked:
    st.session_state["last_qa"] = []
    st.rerun()
# Show how many QA pairs are stored; zero when the key is absent.
stats_clicked = st.sidebar.button("📊 查看统计")
if stats_clicked:
    stored_pairs = st.session_state.get("last_qa", [])
    st.sidebar.metric("当前问答对数", len(stored_pairs))