Spaces:

Yaoliang
/

fengkaobiguo

Sleeping

App Files Files Community

fengkaobiguo / simple_web_enhanced.py

Yaoliang

feat: 更新和新增文件，修复依赖问题

c30406d 10 months ago

raw

history blame contribute delete

10.9 kB

	#!/usr/bin/env python3
	"""
	知识库问答系统Web界面 - 增强版（集成图片识别）
	"""

	import html
	import json
	import os
	import re
	from pathlib import Path

	import pdfplumber
	import pytesseract
	import requests
	import streamlit as st
	from PIL import Image, ImageFilter

	from enhanced_ocr import image_processor
	from simple_qa import call_llm_api

	# Developer-machine install locations for Tesseract. They are applied only
	# when they actually exist, so that on any other host pytesseract keeps its
	# default behaviour of resolving the `tesseract` executable from PATH instead
	# of being pointed at a Windows path that is not there.
	_TESSERACT_CMD = r'E:\p\tesseract.exe'
	# tessdata directory (expected to contain chi_sim.traineddata)
	_TESSDATA_DIR = r'E:\p\tessdata'
	if os.path.isfile(_TESSERACT_CMD):
	    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD
	if os.path.isdir(_TESSDATA_DIR):
	    os.environ['TESSDATA_PREFIX'] = _TESSDATA_DIR

	# Page configuration
	st.set_page_config(page_title="逢考必过·AI考试复习助手", page_icon="🎓", layout="wide")

	# StepFun API configuration.
	# SECURITY: the literal below is a credential committed to source control.
	# Prefer supplying STEP_API_KEY through the environment; the literal is kept
	# only as a backward-compatible fallback and should be rotated and removed.
	STEP_API_KEY = os.environ.get("STEP_API_KEY") or "5LHfDtyA4XFX5ObOqZtIrz0UlOMcYEn2hvy0FQdhT113enLNiLySnSWndOzz75ir4"
	BASE_URL = "https://api.stepfun.com/v1"

	def upload_file_to_step(file, purpose="file-extract", timeout=60):
	    """Upload a file to the StepFun ``/files`` endpoint.

	    Args:
	        file: Uploaded-file object; its ``name``, ``type`` and ``getvalue()``
	            are used to build the multipart payload.
	        purpose: Value sent in the ``purpose`` form field.
	        timeout: Seconds to wait for the server before giving up, so that a
	            stalled connection cannot hang the app indefinitely.

	    Returns:
	        The decoded JSON response on HTTP 200; ``None`` on any other status
	        or on a request/decoding error (the error is shown via ``st.error``).
	    """
	    headers = {"Authorization": f"Bearer {STEP_API_KEY}"}
	    files = {"file": (file.name, file.getvalue(), file.type)}
	    data = {"purpose": purpose}

	    try:
	        response = requests.post(
	            f"{BASE_URL}/files",
	            headers=headers,
	            files=files,
	            data=data,
	            timeout=timeout,
	        )
	        if response.status_code != 200:
	            st.error(f"阶跃API上传失败: {response.status_code} - {response.text}")
	            return None
	        return response.json()
	    except (requests.RequestException, ValueError) as e:
	        # RequestException covers connection, timeout and HTTP-layer errors;
	        # ValueError covers a 200 response whose body is not valid JSON.
	        st.error(f"上传文件时出错: {str(e)}")
	        return None

	def extract_text_from_pdf(file):
	    """Return the text of every page of a PDF, joined by newlines.

	    Pages for which ``extract_text()`` yields nothing are left out. If the
	    PDF cannot be processed, the error is shown with ``st.error`` and an
	    empty string is returned.
	    """
	    try:
	        with pdfplumber.open(file) as document:
	            page_texts = (page.extract_text() for page in document.pages)
	            combined = "\n".join(chunk for chunk in page_texts if chunk)
	        return combined.strip()
	    except Exception as err:
	        st.error(f"PDF解析失败: {str(err)}")
	        return ""

	def _ocr_with_tesseract(file):
	    """Basic OCR path: grayscale, binarise at 140, sharpen, then run Tesseract.

	    Returns the stripped recognised text, or an empty string (after showing
	    the error with ``st.error``) when this path fails as well.
	    """
	    try:
	        grayscale = Image.open(file).convert('L')
	        binarized = grayscale.point(lambda level: 0 if level < 140 else 255, '1')
	        sharpened = binarized.filter(ImageFilter.SHARPEN)
	        recognised = pytesseract.image_to_string(sharpened, lang='chi_sim+eng')
	        return recognised.strip()
	    except Exception as fallback_err:
	        st.error(f"基础OCR也失败: {str(fallback_err)}")
	        return ""


	def extract_text_from_image(file):
	    """Extract text from an image file using the enhanced OCR processor.

	    The image is analysed with ``image_processor.analyze_image`` and the
	    result's type, confidence and text length are shown in a collapsed
	    expander. If anything in that path raises, the error is reported and the
	    basic Tesseract OCR path is used instead.
	    """
	    try:
	        analysis = image_processor.analyze_image(Image.open(file))

	        # Summarise the analysis in three side-by-side metrics
	        with st.expander(f"📊 图片分析结果 - {file.name}", expanded=False):
	            type_col, confidence_col, length_col = st.columns(3)
	            type_col.metric("图片类型", analysis['image_type'])
	            confidence_col.metric("置信度", f"{analysis['confidence']:.2f}")
	            length_col.metric("文字长度", len(analysis['text']))

	        return analysis['text']

	    except Exception as err:
	        st.error(f"图片处理失败: {str(err)}")
	        # Fall back to basic OCR
	        return _ocr_with_tesseract(file)

	def split_text(text, max_chars=6000):
	    """Split *text* into consecutive chunks of at most *max_chars* characters.

	    Args:
	        text: The string to split.
	        max_chars: Maximum length of each chunk; must be positive.

	    Returns:
	        A list of chunks in order; an empty list when *text* is empty.

	    Raises:
	        ValueError: If *max_chars* is zero or negative. A negative step would
	            otherwise produce an empty list and silently drop all the text.
	    """
	    if max_chars <= 0:
	        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")
	    return [text[start:start + max_chars] for start in range(0, len(text), max_chars)]

	# Title and introductory banner
	st.title("🎓 逢考必过 · AI考试复习助手")
	_INTRO_HTML = (
	    "\n"
	    "<div style='font-size:20px; color:#1976d2; font-weight:bold;'>\n"
	    "上传教材/讲义/笔记，AI自动生成高质量考题与答案，帮你高效梳理知识点，助力逢考必过！\n"
	    "</div>\n"
	    "---\n"
	)
	st.markdown(_INTRO_HTML, unsafe_allow_html=True)

	# Sidebar: model selection
	st.sidebar.header("🛠️ 功能区")
	_MODEL_LABELS = {
	    "deepseek": "DeepSeek（更强）",
	    "stepfun": "阶跃 StepFun",
	}
	model_type = st.sidebar.selectbox(
	    "请选择大模型API（推荐DeepSeek）",
	    list(_MODEL_LABELS),
	    format_func=_MODEL_LABELS.__getitem__,
	)

	# Sidebar: image processing options
	st.sidebar.subheader("🖼️ 图片处理选项")
	_ENHANCEMENT_LABELS = {
	    "auto": "自动检测",
	    "text": "文本优化",
	    "table": "表格优化",
	    "formula": "公式优化",
	}
	image_enhancement = st.sidebar.selectbox(
	    "图片增强模式",
	    list(_ENHANCEMENT_LABELS),
	    format_func=_ENHANCEMENT_LABELS.__getitem__,
	)

	show_image_analysis = st.sidebar.checkbox("显示图片分析详情", value=True)

	st.sidebar.info("支持PDF/文本/图片上传，智能识别文字、表格、公式，调用大模型API生成高质量考题与答案。")

	# Main area: study-material inputs (file upload or pasted text)
	st.subheader("📚 上传你的复习资料")
	_ACCEPTED_TYPES = ["pdf", "txt", "jpg", "jpeg", "png", "bmp", "tiff", "webp"]
	uploaded_files = st.file_uploader(
	    "上传PDF/TXT/图片讲义/教材/笔记（可多选）",
	    type=_ACCEPTED_TYPES,
	    accept_multiple_files=True,
	)
	doc_content = st.text_area(
	    "或直接粘贴重点内容：",
	    height=200,
	    placeholder="粘贴你的复习资料、错题本、重点笔记……",
	)

	def _generate_qa_segments(text, label, model_type):
	    """Generate questions for *text*, making one LLM call per segment.

	    Args:
	        text: Source material; split with ``split_text`` into chunks of at
	            most 6000 characters.
	        label: How the source is named in the spinner message.
	        model_type: Passed through to ``call_llm_api``.

	    Returns:
	        A list of ``(segment_number, outcome)`` tuples with 1-based segment
	        numbers. Whitespace-only segments are skipped. ``outcome`` is either
	        the value returned by ``call_llm_api`` or the exception it raised, so
	        that failures can be told apart from model output when rendering.
	    """
	    max_chars = 6000
	    # Only texts that reach the segment size get "segment N" in the message
	    is_segmented = len(text) >= max_chars
	    segment_results = []
	    for seg_no, segment in enumerate(split_text(text, max_chars=max_chars), 1):
	        if not segment.strip():
	            continue
	        part = f"第{seg_no}段" if is_segmented else ""
	        with st.spinner(f"AI正在为{label}{part}生成考题……"):
	            try:
	                outcome = call_llm_api(segment, model_type=model_type)
	            except Exception as e:
	                # UI boundary: keep the failure so the remaining segments
	                # still run and the error is reported next to its segment.
	                outcome = e
	        segment_results.append((seg_no, outcome))
	    return segment_results


	def _render_qa_result(qa_result):
	    """Render one segment outcome: a failure, a Q&A list, or raw output."""
	    if isinstance(qa_result, Exception):
	        st.error(f"AI生成失败: {qa_result}")
	        return

	    # Validate every item, not just the first, before indexing into them
	    is_qa_list = (
	        isinstance(qa_result, list)
	        and bool(qa_result)
	        and all(isinstance(qa, dict) and 'question' in qa for qa in qa_result)
	    )
	    if not is_qa_list:
	        st.error("AI输出格式异常，原始内容如下：")
	        st.code(str(qa_result))
	        return

	    for idx, qa in enumerate(qa_result, 1):
	        st.markdown(f"Q{idx}：{qa['question']}")
	        # NOTE(review): the answer is model output rendered with
	        # unsafe_allow_html=True; consider escaping it if the model may be
	        # fed untrusted documents.
	        st.markdown(f"<span style='color:#1976d2;'>A{idx}：</span> {qa.get('answer', '')}", unsafe_allow_html=True)
	        st.markdown("---")


	if st.button("🚀 一键生成智能考题"):
	    results = []
	    if uploaded_files:
	        for uploaded_file in uploaded_files:
	            file_name = uploaded_file.name

	            with st.spinner(f"正在处理文件: {file_name}..."):
	                if uploaded_file.type == "application/pdf":
	                    text = extract_text_from_pdf(uploaded_file)
	                elif uploaded_file.type.startswith("image/"):
	                    text = extract_text_from_image(uploaded_file)
	                else:
	                    text = uploaded_file.read().decode("utf-8", errors="ignore")

	            # Nothing was extracted: do not send an empty prompt to the LLM
	            if not text.strip():
	                st.warning(f"【{file_name}】未提取到可用文本，已跳过。")
	                continue

	            file_qa = _generate_qa_segments(text, f"【{file_name}】", model_type)
	            results.append((file_name, file_qa))
	    elif doc_content.strip():
	        file_qa = _generate_qa_segments(doc_content.strip(), "粘贴内容", model_type)
	        results.append(("粘贴内容", file_qa))
	    else:
	        st.warning("请上传文件或粘贴内容！")
	        st.stop()

	    # Show all results
	    for file_name, file_qa in results:
	        # Announce success only if at least one segment actually succeeded
	        if any(not isinstance(outcome, Exception) for _, outcome in file_qa):
	            st.success(f"🎉 【{file_name}】智能考题生成成功！")
	        st.markdown("---")
	        # The file name is user-supplied and this block is rendered as raw HTML
	        st.markdown(f"<div style='font-size:18px; color:#388e3c; font-weight:bold;'>【{html.escape(file_name)} · AI智能考题与答案】</div>", unsafe_allow_html=True)
	        for seg_idx, qa_result in file_qa:
	            st.markdown(f"<span style='color:#888;'>—— 第{seg_idx}段 ——</span>", unsafe_allow_html=True)
	            _render_qa_result(qa_result)

	# Image processing demo: original next to the enhanced version
	if uploaded_files and show_image_analysis:
	    st.markdown("---")
	    st.subheader("🖼️ 图片处理演示")

	    for uploaded_file in uploaded_files:
	        # Only image uploads are demonstrated
	        if not uploaded_file.type.startswith("image/"):
	            continue

	        original_col, enhanced_col = st.columns(2)

	        original_col.subheader("原图")
	        original_col.image(uploaded_file, caption=uploaded_file.name, use_column_width=True)

	        enhanced_col.subheader("处理后")
	        try:
	            source_image = Image.open(uploaded_file)
	            enhanced_image = image_processor.enhance_image(source_image, image_enhancement)
	            enhanced_col.image(enhanced_image, caption=f"增强模式: {image_enhancement}", use_column_width=True)
	        except Exception as err:
	            enhanced_col.error(f"图片处理失败: {str(err)}")

	# Footer: usage tips and feature note
	_FOOTER_HTML = (
	    "\n"
	    "<div style='color:#757575; font-size:14px;'>\n"
	    "<b>Tips：</b> 本助手适合期末复习、考研、四六级、各类资格证备考等场景，支持大段资料一键生成高质量考题，助你高效掌握重点难点。\n"
	    "<br><b>新增功能：</b> 智能图片识别，支持文字、表格、公式自动提取，图像质量自动增强！\n"
	    "</div>\n"
	)
	st.markdown(_FOOTER_HTML, unsafe_allow_html=True)

	# Sidebar: system information and quick actions
	st.sidebar.header("⚙️ 系统信息")

	st.sidebar.subheader("API状态")
	st.sidebar.success("✅ 阶跃API: 已配置")
	st.sidebar.success("✅ 图片识别: 已启用")

	st.sidebar.subheader("快速操作")
	if st.sidebar.button("🔄 清空当前问答"):
	    # Reset the stored Q&A list and rerun the script immediately
	    st.session_state["last_qa"] = []
	    st.rerun()

	if st.sidebar.button("📊 查看统计"):
	    stored_qa = st.session_state.get("last_qa", [])
	    st.sidebar.metric("当前问答对数", len(stored_qa))