snym04 committed on
Commit
50455ba
·
verified ·
1 Parent(s): 8a562d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -89
app.py CHANGED
@@ -15,143 +15,125 @@ OUTPUT_DIR = os.path.join(BASE_DIR, "mineru_outputs")
15
  os.makedirs(PAPERS_DIR, exist_ok=True)
16
 
17
  def get_debug_info():
18
- """读取服务器文件系统状态,用于实时监控"""
19
  now = datetime.now().strftime("%H:%M:%S")
20
  files = os.listdir(PAPERS_DIR) if os.path.exists(PAPERS_DIR) else "Directory missing"
21
 
22
- # 检查输出目录状态
23
- output_status = "Not generated"
24
  if os.path.exists(OUTPUT_DIR):
25
- out_files = os.listdir(OUTPUT_DIR)
26
- output_status = f"Exists ({len(out_files)} items)"
 
 
 
27
 
28
- config_content = "Not found"
29
- if os.path.exists(CONFIG_PATH):
30
- try:
31
- with open(CONFIG_PATH, "r", encoding="utf-8") as f:
32
- config_content = f.read()
33
- except Exception:
34
- config_content = "Error reading config"
35
-
36
- return f"[{now}] 📁 papers/ 文件夹:\n{files}\n\n[{now}] 📂 mineru_outputs:\n{output_status}\n\n[{now}] 📄 config.yaml 内容:\n{config_content}"
37
 
38
  def save_pdf(file):
39
- if file is None:
40
- return "❌ 请先选择一个 PDF 文件", get_debug_info()
41
-
42
  try:
43
- file_name = os.path.basename(file.name)
44
- file_path = os.path.join(PAPERS_DIR, file_name)
45
  shutil.copy(file.name, file_path)
46
- return f"✅ 成功保存文件: {file_name}", get_debug_info()
47
  except Exception as e:
48
- return f"❌ 保存 PDF 出错: {str(e)}", get_debug_info()
49
 
50
  def save_api_key(api_key):
51
- if not api_key:
52
- return "❌ API Key 不能为空", get_debug_info()
53
-
54
  try:
55
  config = {}
56
  if os.path.exists(CONFIG_PATH):
57
  with open(CONFIG_PATH, "r", encoding="utf-8") as f:
58
  config = yaml.safe_load(f) or {}
59
-
60
- if "api_keys" not in config:
61
- config["api_keys"] = {}
62
- config["api_keys"]["gemini_api_key"] = api_key
63
-
64
  with open(CONFIG_PATH, "w", encoding="utf-8") as f:
65
- yaml.dump(config, f, allow_unicode=True, default_flow_style=False)
66
-
67
- return "✅ API Key 已成功写入 config.yaml", get_debug_info()
68
  except Exception as e:
69
- return f"❌ 保存 Key 出错: {str(e)}", get_debug_info()
70
 
71
  def run_mineru_parsing():
72
- """执行 PDF 解析逻辑"""
73
- # 1. 判断是否上传了 PDF
74
  if not os.path.exists(PAPERS_DIR) or not any(f.endswith('.pdf') for f in os.listdir(PAPERS_DIR)):
75
- return "❌ 未发现已上传的 PDF 文件,请先执行步骤 2。", get_debug_info()
76
 
77
  try:
78
- # 2. 设置环境变量 (包含你发现的虚拟显存设置)
79
  env = os.environ.copy()
80
  env["MINERU_FORMULA_ENABLE"] = "false"
81
  env["MINERU_TABLE_ENABLE"] = "false"
82
  env["MINERU_DEVICE_MODE"] = "cpu"
83
- # 核心修改:添加虚拟显存大小设置,规避 torch 检测错误
84
  env["MINERU_VIRTUAL_VRAM_SIZE"] = "8"
85
 
86
- # 3. 执行 Mineru 命令
87
- command = ["mineru", "-p", "papers", "-o", "mineru_outputs"]
88
 
89
- # 将启动信息打印到 stderr 以便在 HF Log 中查看
90
- print(f">>> 开始解析任务: {' '.join(command)}", file=sys.stderr, flush=True)
 
 
 
 
 
 
 
 
 
91
 
92
- # 执行子进程
93
- # capture_output=True 将捕获输出,text=True 将结果作为字符串处理
94
- result = subprocess.run(command, env=env, capture_output=True, text=True)
95
-
96
  if result.returncode == 0:
97
- print(">>> 解析任务成功结束", file=sys.stderr, flush=True)
98
- return "✅ PDF解析完成", get_debug_info()
99
  else:
100
- # 打印详细错误到后台日志
101
- print(f">>> 解析任务失败: {result.stderr}", file=sys.stderr, flush=True)
102
- return f"❌ 解析过程中出错: {result.stderr}", get_debug_info()
103
 
104
  except Exception as e:
105
- return f" 执行命令时发生异常: {str(e)}", get_debug_info()
 
106
 
107
- # --- 构建单页 UI ---
108
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
109
- gr.Markdown("# 📑 PDF 助手管理后台")
110
 
111
  with gr.Row():
112
- # 左侧操作区
113
- with gr.Column(scale=2):
114
- # 第一部分:API 配置
115
  with gr.Group():
116
- gr.Markdown("### 1. 密钥配置")
117
- key_input = gr.Textbox(label="Gemini API Key", type="password")
118
- key_btn = gr.Button("保存配置", variant="primary")
119
- key_status = gr.Textbox(label="配置状态", interactive=False)
120
-
121
- gr.Markdown("---")
122
-
123
- # 第二部分:PDF 上传
124
  with gr.Group():
125
- gr.Markdown("### 2. 论文上传")
126
- pdf_input = gr.File(label="选择 PDF 文件", file_types=[".pdf"])
127
- pdf_btn = gr.Button("保存到 papers 文件夹", variant="primary")
128
- pdf_status = gr.Textbox(label="上传状态", interactive=False)
129
 
130
- gr.Markdown("---")
131
-
132
- # 第三部分:解析步骤
133
- with gr.Group():
134
- gr.Markdown("### 3. 解析 PDF")
135
- gr.Markdown("<small>已应用环境变量:MINERU_VIRTUAL_VRAM_SIZE=8</small>")
136
- parse_btn = gr.Button("🚀 开始解析 (Run Mineru)", variant="secondary")
137
- parse_status = gr.Textbox(label="解析进度/结果", interactive=False)
138
-
139
- # 右侧调试监控区
140
  with gr.Column(scale=1):
141
  gr.Markdown("### 🔍 实时系统监控")
142
- debug_view = gr.Textbox(
143
- label="服务器文件状态",
144
- value=get_debug_info(),
145
- lines=25,
 
 
 
146
  interactive=False
147
  )
148
- refresh_btn = gr.Button("🔄 手动刷新状态")
149
 
150
- # 绑定事件逻辑
151
- key_btn.click(fn=save_api_key, inputs=key_input, outputs=[key_status, debug_view])
152
- pdf_btn.click(fn=save_pdf, inputs=pdf_input, outputs=[pdf_status, debug_view])
153
- parse_btn.click(fn=run_mineru_parsing, outputs=[parse_status, debug_view])
154
- refresh_btn.click(fn=get_debug_info, outputs=debug_view)
 
 
 
 
 
 
155
 
156
  if __name__ == "__main__":
157
  demo.launch()
 
15
  os.makedirs(PAPERS_DIR, exist_ok=True)
16
 
17
  def get_debug_info():
18
+ """读取服务器文件系统状态"""
19
  now = datetime.now().strftime("%H:%M:%S")
20
  files = os.listdir(PAPERS_DIR) if os.path.exists(PAPERS_DIR) else "Directory missing"
21
 
22
+ # 递归检查输出目录下的内容,看看到底生成了什么
23
+ output_detail = "Not generated"
24
  if os.path.exists(OUTPUT_DIR):
25
+ all_output_items = []
26
+ for root, dirs, files_in_out in os.walk(OUTPUT_DIR):
27
+ for name in files_in_out:
28
+ all_output_items.append(os.path.join(os.path.relpath(root, OUTPUT_DIR), name))
29
+ output_detail = f"Found {len(all_output_items)} files: {all_output_items[:5]}..." if all_output_items else "Directory exists but is EMPTY"
30
 
31
+ return f"[{now}] 📁 papers/ 内容: {files}\n\n[{now}] 📂 mineru_outputs 状态: {output_detail}"
 
 
 
 
 
 
 
 
32
 
33
  def save_pdf(file):
34
+ if file is None: return "❌ 请先选择 PDF", get_debug_info()
 
 
35
  try:
36
+ file_path = os.path.join(PAPERS_DIR, os.path.basename(file.name))
 
37
  shutil.copy(file.name, file_path)
38
+ return f"✅ 保存: {os.path.basename(file.name)}", get_debug_info()
39
  except Exception as e:
40
+ return f"❌ 出错: {str(e)}", get_debug_info()
41
 
42
  def save_api_key(api_key):
43
+ if not api_key: return "❌ Key 不能为空", get_debug_info()
 
 
44
  try:
45
  config = {}
46
  if os.path.exists(CONFIG_PATH):
47
  with open(CONFIG_PATH, "r", encoding="utf-8") as f:
48
  config = yaml.safe_load(f) or {}
49
+ config.setdefault("api_keys", {})["gemini_api_key"] = api_key
 
 
 
 
50
  with open(CONFIG_PATH, "w", encoding="utf-8") as f:
51
+ yaml.dump(config, f, allow_unicode=True)
52
+ return "✅ Key 已保存", get_debug_info()
 
53
  except Exception as e:
54
+ return f"❌ 出错: {str(e)}", get_debug_info()
55
 
56
  def run_mineru_parsing():
57
+ """执行 PDF 解析并捕获完整日志"""
 
58
  if not os.path.exists(PAPERS_DIR) or not any(f.endswith('.pdf') for f in os.listdir(PAPERS_DIR)):
59
+ return "❌ 未发现 PDF 文件", get_debug_info(), "No execution logs."
60
 
61
  try:
 
62
  env = os.environ.copy()
63
  env["MINERU_FORMULA_ENABLE"] = "false"
64
  env["MINERU_TABLE_ENABLE"] = "false"
65
  env["MINERU_DEVICE_MODE"] = "cpu"
 
66
  env["MINERU_VIRTUAL_VRAM_SIZE"] = "8"
67
 
68
+ # 尝试使用完整的 mineru 命令
69
+ command = ["mineru", "-p", PAPERS_DIR, "-o", OUTPUT_DIR]
70
 
71
+ # 使用 subprocess.run 捕获所有输出
72
+ result = subprocess.run(
73
+ command,
74
+ env=env,
75
+ capture_output=True,
76
+ text=True,
77
+ timeout=300 # 设置5分钟超时防止卡死
78
+ )
79
+
80
+ # 组合 stdout 和 stderr 作为完整的运行日志
81
+ full_log = f"--- STDOUT ---\n{result.stdout}\n\n--- STDERR ---\n{result.stderr}"
82
 
 
 
 
 
83
  if result.returncode == 0:
84
+ status = "✅ PDF解析成功"
 
85
  else:
86
+ status = f"❌ 解析失败 (Exit Code: {result.returncode})"
87
+
88
+ return status, get_debug_info(), full_log
89
 
90
  except Exception as e:
91
+ error_log = f"Exception occurred during execution:\n{str(e)}"
92
+ return "❌ 运行异常", get_debug_info(), error_log
93
 
94
+ # --- UI ---
95
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
96
+ gr.Markdown("# 📑 Mineru PDF 解析调试器")
97
 
98
  with gr.Row():
99
+ with gr.Column(scale=1):
 
 
100
  with gr.Group():
101
+ gr.Markdown("### 1. 配置 & 上传")
102
+ key_input = gr.Textbox(label="API Key", type="password")
103
+ key_btn = gr.Button("保存 Key")
104
+ gr.Markdown("---")
105
+ pdf_input = gr.File(label="选择 PDF", file_types=[".pdf"])
106
+ pdf_btn = gr.Button("保存 PDF")
107
+
 
108
  with gr.Group():
109
+ gr.Markdown("### 2. 执行解析")
110
+ parse_btn = gr.Button("🚀 Run Mineru (CPU Mode)", variant="primary")
111
+ parse_status = gr.Textbox(label="运行状态")
 
112
 
 
 
 
 
 
 
 
 
 
 
113
  with gr.Column(scale=1):
114
  gr.Markdown("### 🔍 实时系统监控")
115
+ debug_view = gr.Textbox(label="文件系统快照", value=get_debug_info(), lines=8, interactive=False)
116
+
117
+ gr.Markdown("### 📜 Mineru 终端输出日志")
118
+ cmd_logs = gr.Textbox(
119
+ label="Command Output (Stdout/Stderr)",
120
+ placeholder="等待解析任务开始...",
121
+ lines=15,
122
  interactive=False
123
  )
124
+ refresh_btn = gr.Button("🔄 刷新状态")
125
 
126
+ # 逻辑绑定
127
+ key_btn.click(save_api_key, inputs=key_input, outputs=[parse_status, debug_view])
128
+ pdf_btn.click(save_pdf, inputs=pdf_input, outputs=[parse_status, debug_view])
129
+
130
+ # 解析按钮会更新三个地方:状态、文件监控、详细日志
131
+ parse_btn.click(
132
+ fn=run_mineru_parsing,
133
+ outputs=[parse_status, debug_view, cmd_logs]
134
+ )
135
+
136
+ refresh_btn.click(get_debug_info, outputs=debug_view)
137
 
138
  if __name__ == "__main__":
139
  demo.launch()