Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,143 +15,125 @@ OUTPUT_DIR = os.path.join(BASE_DIR, "mineru_outputs")
|
|
| 15 |
os.makedirs(PAPERS_DIR, exist_ok=True)
|
| 16 |
|
| 17 |
def get_debug_info():
|
| 18 |
-
"""读取服务器文件系统状态
|
| 19 |
now = datetime.now().strftime("%H:%M:%S")
|
| 20 |
files = os.listdir(PAPERS_DIR) if os.path.exists(PAPERS_DIR) else "Directory missing"
|
| 21 |
|
| 22 |
-
# 检查输出目录
|
| 23 |
-
|
| 24 |
if os.path.exists(OUTPUT_DIR):
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
if os.path.exists(CONFIG_PATH):
|
| 30 |
-
try:
|
| 31 |
-
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
|
| 32 |
-
config_content = f.read()
|
| 33 |
-
except Exception:
|
| 34 |
-
config_content = "Error reading config"
|
| 35 |
-
|
| 36 |
-
return f"[{now}] 📁 papers/ 文件夹:\n{files}\n\n[{now}] 📂 mineru_outputs:\n{output_status}\n\n[{now}] 📄 config.yaml 内容:\n{config_content}"
|
| 37 |
|
| 38 |
def save_pdf(file):
|
| 39 |
-
if file is None:
|
| 40 |
-
return "❌ 请先选择一个 PDF 文件", get_debug_info()
|
| 41 |
-
|
| 42 |
try:
|
| 43 |
-
|
| 44 |
-
file_path = os.path.join(PAPERS_DIR, file_name)
|
| 45 |
shutil.copy(file.name, file_path)
|
| 46 |
-
return f"✅
|
| 47 |
except Exception as e:
|
| 48 |
-
return f"❌
|
| 49 |
|
| 50 |
def save_api_key(api_key):
|
| 51 |
-
if not api_key:
|
| 52 |
-
return "❌ API Key 不能为空", get_debug_info()
|
| 53 |
-
|
| 54 |
try:
|
| 55 |
config = {}
|
| 56 |
if os.path.exists(CONFIG_PATH):
|
| 57 |
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
|
| 58 |
config = yaml.safe_load(f) or {}
|
| 59 |
-
|
| 60 |
-
if "api_keys" not in config:
|
| 61 |
-
config["api_keys"] = {}
|
| 62 |
-
config["api_keys"]["gemini_api_key"] = api_key
|
| 63 |
-
|
| 64 |
with open(CONFIG_PATH, "w", encoding="utf-8") as f:
|
| 65 |
-
yaml.dump(config, f, allow_unicode=True
|
| 66 |
-
|
| 67 |
-
return "✅ API Key 已成功写入 config.yaml", get_debug_info()
|
| 68 |
except Exception as e:
|
| 69 |
-
return f"❌
|
| 70 |
|
| 71 |
def run_mineru_parsing():
|
| 72 |
-
"""执行 PDF 解析
|
| 73 |
-
# 1. 判断是否上传了 PDF
|
| 74 |
if not os.path.exists(PAPERS_DIR) or not any(f.endswith('.pdf') for f in os.listdir(PAPERS_DIR)):
|
| 75 |
-
return "❌ 未发现
|
| 76 |
|
| 77 |
try:
|
| 78 |
-
# 2. 设置环境变量 (包含你发现的虚拟显存设置)
|
| 79 |
env = os.environ.copy()
|
| 80 |
env["MINERU_FORMULA_ENABLE"] = "false"
|
| 81 |
env["MINERU_TABLE_ENABLE"] = "false"
|
| 82 |
env["MINERU_DEVICE_MODE"] = "cpu"
|
| 83 |
-
# 核心修改:添加虚拟显存大小设置,规避 torch 检测错误
|
| 84 |
env["MINERU_VIRTUAL_VRAM_SIZE"] = "8"
|
| 85 |
|
| 86 |
-
#
|
| 87 |
-
command = ["mineru", "-p",
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
# 执行子进程
|
| 93 |
-
# capture_output=True 将捕获输出,text=True 将结果作为字符串处理
|
| 94 |
-
result = subprocess.run(command, env=env, capture_output=True, text=True)
|
| 95 |
-
|
| 96 |
if result.returncode == 0:
|
| 97 |
-
|
| 98 |
-
return "✅ PDF解析完成", get_debug_info()
|
| 99 |
else:
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
|
| 104 |
except Exception as e:
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
-
# ---
|
| 108 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 109 |
-
gr.Markdown("# 📑 PDF
|
| 110 |
|
| 111 |
with gr.Row():
|
| 112 |
-
|
| 113 |
-
with gr.Column(scale=2):
|
| 114 |
-
# 第一部分:API 配置
|
| 115 |
with gr.Group():
|
| 116 |
-
gr.Markdown("### 1.
|
| 117 |
-
key_input = gr.Textbox(label="
|
| 118 |
-
key_btn = gr.Button("保存
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
# 第二部分:PDF 上传
|
| 124 |
with gr.Group():
|
| 125 |
-
gr.Markdown("### 2.
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
pdf_status = gr.Textbox(label="上传状态", interactive=False)
|
| 129 |
|
| 130 |
-
gr.Markdown("---")
|
| 131 |
-
|
| 132 |
-
# 第三部分:解析步骤
|
| 133 |
-
with gr.Group():
|
| 134 |
-
gr.Markdown("### 3. 解析 PDF")
|
| 135 |
-
gr.Markdown("<small>已应用环境变量:MINERU_VIRTUAL_VRAM_SIZE=8</small>")
|
| 136 |
-
parse_btn = gr.Button("🚀 开始解析 (Run Mineru)", variant="secondary")
|
| 137 |
-
parse_status = gr.Textbox(label="解析进度/结果", interactive=False)
|
| 138 |
-
|
| 139 |
-
# 右侧调试监控区
|
| 140 |
with gr.Column(scale=1):
|
| 141 |
gr.Markdown("### 🔍 实时系统监控")
|
| 142 |
-
debug_view = gr.Textbox(
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
| 146 |
interactive=False
|
| 147 |
)
|
| 148 |
-
refresh_btn = gr.Button("🔄
|
| 149 |
|
| 150 |
-
#
|
| 151 |
-
key_btn.click(
|
| 152 |
-
pdf_btn.click(
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
if __name__ == "__main__":
|
| 157 |
demo.launch()
|
|
|
|
| 15 |
os.makedirs(PAPERS_DIR, exist_ok=True)
|
| 16 |
|
| 17 |
def get_debug_info():
    """Return a timestamped, human-readable snapshot of the working folders.

    The snapshot has two sections separated by a blank line:

    * the entries of ``PAPERS_DIR`` (or a short note when it cannot be listed);
    * a summary of every file found recursively under ``OUTPUT_DIR``.

    This function is called from the error paths of the UI handlers, so it
    never raises on a missing or unreadable directory.

    Returns:
        str: The formatted snapshot, each section prefixed with ``[HH:MM:SS]``.
    """
    now = datetime.now().strftime("%H:%M:%S")

    # List the upload folder; sorted so repeated refreshes are comparable.
    try:
        files = sorted(os.listdir(PAPERS_DIR))
    except FileNotFoundError:
        files = "Directory missing"
    except OSError as exc:
        files = f"Directory unreadable: {exc}"

    # Walk the output folder recursively to show what was actually generated.
    output_detail = "Not generated"
    if os.path.isdir(OUTPUT_DIR):
        all_output_items = sorted(
            # normpath drops the "./" prefix relpath yields for top-level files.
            os.path.normpath(os.path.join(os.path.relpath(root, OUTPUT_DIR), name))
            for root, _dirs, names in os.walk(OUTPUT_DIR)
            for name in names
        )
        if all_output_items:
            preview = all_output_items[:5]
            # Only hint at more entries when the preview is really truncated.
            more = "..." if len(all_output_items) > len(preview) else ""
            output_detail = f"Found {len(all_output_items)} files: {preview}{more}"
        else:
            output_detail = "Directory exists but is EMPTY"

    return f"[{now}] 📁 papers/ 内容: {files}\n\n[{now}] 📂 mineru_outputs 状态: {output_detail}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def save_pdf(file):
    """Copy an uploaded PDF into ``PAPERS_DIR`` under its original base name.

    Args:
        file: The value delivered by the upload widget. Either a filesystem
            path (``str`` / ``os.PathLike``) or an object exposing the path of
            the uploaded file as ``.name``. ``None`` means nothing was chosen.

    Returns:
        tuple[str, str]: A status message and a fresh ``get_debug_info()``
        snapshot. Failures are reported in the status message, not raised.
    """
    if file is None:
        return "❌ 请先选择 PDF", get_debug_info()

    try:
        # Accept a bare path as well as a wrapper object carrying ``.name``.
        if isinstance(file, (str, os.PathLike)):
            source = os.fspath(file)
        else:
            source = file.name
        file_name = os.path.basename(source)
        shutil.copy(source, os.path.join(PAPERS_DIR, file_name))
        return f"✅ 已保存: {file_name}", get_debug_info()
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing.
        return f"❌ 出错: {str(e)}", get_debug_info()
|
| 41 |
|
| 42 |
def save_api_key(api_key):
    """Store the API key under ``api_keys.gemini_api_key`` in the config file.

    The existing YAML at ``CONFIG_PATH`` (if any) is loaded, updated and
    written back, so other settings in the file are kept.

    Args:
        api_key: The key entered by the user. Surrounding whitespace is
            stripped; an empty or whitespace-only value is rejected.

    Returns:
        tuple[str, str]: A status message and a fresh ``get_debug_info()``
        snapshot. Failures are reported in the status message, not raised.
    """
    # Pasted keys often carry a trailing newline or spaces.
    if isinstance(api_key, str):
        api_key = api_key.strip()
    if not api_key:
        return "❌ Key 不能为空", get_debug_info()

    try:
        config = {}
        if os.path.exists(CONFIG_PATH):
            with open(CONFIG_PATH, "r", encoding="utf-8") as f:
                # An empty file loads as None; treat it as an empty mapping.
                config = yaml.safe_load(f) or {}
        if not isinstance(config, dict):
            raise ValueError("config.yaml top level must be a mapping")

        # An empty "api_keys:" entry loads as None, which setdefault would
        # hand back unchanged; replace it with a real mapping first.
        if config.get("api_keys") is None:
            config["api_keys"] = {}
        config["api_keys"]["gemini_api_key"] = api_key

        with open(CONFIG_PATH, "w", encoding="utf-8") as f:
            yaml.dump(config, f, allow_unicode=True)
        return "✅ Key 已保存", get_debug_info()
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing.
        return f"❌ 出错: {str(e)}", get_debug_info()
|
| 55 |
|
| 56 |
def run_mineru_parsing():
    """Run the ``mineru`` CLI over ``PAPERS_DIR`` and capture its full log.

    The command is ``mineru -p PAPERS_DIR -o OUTPUT_DIR``, executed with a
    copy of the current environment plus the ``MINERU_*`` overrides set
    below, and a five-minute timeout so a hung run cannot block the UI.

    Returns:
        tuple[str, str, str]: A status message, a fresh ``get_debug_info()``
        snapshot, and the execution log (stdout and stderr, or a description
        of the failure). Failures are reported in these values, not raised.
    """

    def _as_text(stream):
        # TimeoutExpired may carry None, bytes or str for captured output.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream

    # Match the extension case-insensitively so an upload named "X.PDF" is
    # not reported as missing while it is visibly present in the folder.
    try:
        has_pdf = any(
            name.lower().endswith(".pdf") for name in os.listdir(PAPERS_DIR)
        )
    except OSError:
        has_pdf = False
    if not has_pdf:
        return "❌ 未发现 PDF 文件", get_debug_info(), "No execution logs."

    env = os.environ.copy()
    env["MINERU_FORMULA_ENABLE"] = "false"
    env["MINERU_TABLE_ENABLE"] = "false"
    env["MINERU_DEVICE_MODE"] = "cpu"
    # NOTE(review): presumably needed so mineru's VRAM detection does not
    # fail on a CPU-only host; confirm against the mineru documentation.
    env["MINERU_VIRTUAL_VRAM_SIZE"] = "8"

    command = ["mineru", "-p", PAPERS_DIR, "-o", OUTPUT_DIR]
    timeout_seconds = 300  # five minutes

    try:
        result = subprocess.run(
            command,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as e:
        # Keep whatever the process printed before it was killed; that is
        # usually the most useful clue about where it got stuck.
        error_log = (
            f"Exception occurred during execution:\n{str(e)}\n\n"
            f"--- STDOUT ---\n{_as_text(e.stdout)}\n\n"
            f"--- STDERR ---\n{_as_text(e.stderr)}"
        )
        return "❌ 运行异常", get_debug_info(), error_log
    except Exception as e:
        # E.g. the mineru executable is missing; report it in the log pane.
        error_log = f"Exception occurred during execution:\n{str(e)}"
        return "❌ 运行异常", get_debug_info(), error_log

    # Combine both streams into one log for the UI.
    full_log = f"--- STDOUT ---\n{result.stdout}\n\n--- STDERR ---\n{result.stderr}"

    if result.returncode == 0:
        status = "✅ PDF解析完成"
    else:
        status = f"❌ 解析失败 (Exit Code: {result.returncode})"

    return status, get_debug_info(), full_log
|
| 93 |
|
| 94 |
+
# --- UI ---
|
| 95 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📑 Mineru PDF 解析调试器")

    with gr.Row():
        # Left column: configuration, upload and the parse trigger.
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 1. 配置 & 上传")
                key_input = gr.Textbox(label="API Key", type="password")
                key_btn = gr.Button("保存 Key")
                gr.Markdown("---")
                pdf_input = gr.File(label="选择 PDF", file_types=[".pdf"])
                pdf_btn = gr.Button("保存 PDF")

            with gr.Group():
                gr.Markdown("### 2. 执行解析")
                parse_btn = gr.Button("🚀 Run Mineru (CPU Mode)", variant="primary")
                parse_status = gr.Textbox(label="运行状态")

        # Right column: filesystem snapshot and the captured command log.
        with gr.Column(scale=1):
            gr.Markdown("### 🔍 实时系统监控")
            debug_view = gr.Textbox(
                label="文件系统快照",
                value=get_debug_info(),
                lines=8,
                interactive=False,
            )

            gr.Markdown("### 📜 Mineru 终端输出日志")
            cmd_logs = gr.Textbox(
                label="Command Output (Stdout/Stderr)",
                placeholder="等待解析任务开始...",
                lines=15,
                interactive=False,
            )
            refresh_btn = gr.Button("🔄 刷新状态")

    # Event wiring. Saving the key or a PDF reports into the shared status
    # box and refreshes the filesystem snapshot.
    key_btn.click(
        fn=save_api_key,
        inputs=key_input,
        outputs=[parse_status, debug_view],
    )
    pdf_btn.click(
        fn=save_pdf,
        inputs=pdf_input,
        outputs=[parse_status, debug_view],
    )

    # The parse button updates three places: status, snapshot and full log.
    parse_btn.click(fn=run_mineru_parsing, outputs=[parse_status, debug_view, cmd_logs])

    refresh_btn.click(fn=get_debug_info, outputs=debug_view)

if __name__ == "__main__":
    demo.launch()
|