linxinhua committed on
Commit
950a28d
·
verified ·
1 Parent(s): 85ddf09

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +460 -0
app.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import csv
4
+ import os
5
+ import re
6
+ from typing import Dict, List, Any, Optional
7
+
8
class JournalPaperFilter:
    """Select target journals from a journal list and filter paper exports by them.

    Workflow: load a journal list, choose a title column and an index column,
    tick index values (this fills ``selected_journals``), then load paper
    files, map their columns onto ``REQUIRED_FIELDS`` and export the papers
    whose journal is in the selected set.
    """

    # Output columns, in the order they appear in the exported CSV and in the
    # per-file mapping dropdowns.
    REQUIRED_FIELDS = ["Author", "Journal title", "Article title", "Abstract", "DOI"]
    # Only this many uploaded paper files get mapping dropdowns / are filtered.
    MAX_PAPER_FILES = 5
    # Dropdown value meaning "this field is not mapped to any column".
    UNMAPPED = "无"

    def __init__(self):
        self.target_journals = {}  # target journal information
        self.selected_journals = set()  # normalized titles of the selected journals
        self.paper_files_info = {}  # paper file information

    @staticmethod
    def _normalize_journal_title(title) -> str:
        """Return a comparison key for a journal title.

        Lower-cases, removes punctuation and collapses whitespace runs, so
        that "Nature: Reviews" and "nature  reviews" compare equal. Both the
        journal list and the paper files go through this one helper, which
        keeps the two sides consistent.
        """
        without_punctuation = re.sub(r'[^\w\s]', '', str(title).lower())
        # Removing punctuation can leave double or trailing spaces behind.
        return ' '.join(without_punctuation.split())

    @staticmethod
    def _cell_text(value) -> str:
        """Return a cell as stripped text, with missing values as ''.

        Without the missing-value check, ``str(NaN)`` would put the literal
        text "nan" into the exported records.
        """
        try:
            missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar cell: pd.isna returned an array, treat it as present.
            missing = False
        return "" if missing else str(value).strip()

    @staticmethod
    def _resolve_path(file) -> str:
        """Return the filesystem path of an uploaded file.

        Accepts a plain path (``str`` / ``os.PathLike``) as well as an object
        exposing the path through a ``name`` attribute.
        """
        if isinstance(file, (str, os.PathLike)):
            return os.fspath(file)
        return file.name

    def _read_table(self, path: str) -> Optional[pd.DataFrame]:
        """Read a CSV or Excel file; return None for an unsupported extension.

        The extension check is case-insensitive so ``DATA.CSV`` is accepted.
        """
        lowered = path.lower()
        if lowered.endswith('.csv'):
            delimiter = self.detect_delimiter(path)
            return pd.read_csv(path, delimiter=delimiter, encoding='utf-8')
        if lowered.endswith(('.xls', '.xlsx')):
            return pd.read_excel(path)
        return None

    def detect_delimiter(self, file_path):
        """Detect the delimiter of a CSV file, falling back to ','.

        Only the first 1024 characters are sniffed. An unreadable file,
        undecodable text or an undeterminable delimiter all yield ','.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sample = file.read(1024)
            return csv.Sniffer().sniff(sample).delimiter
        except (OSError, UnicodeError, csv.Error):
            return ','

    def load_journal_file(self, file):
        """Load the journal list file.

        Returns a 5-tuple ``(journal_info, status, title_choices,
        index_choices, info_text)``. ``journal_info`` is ``{}`` on any
        failure, otherwise a dict with keys ``dataframe``, ``columns`` and
        ``filename``. Errors are reported through ``status`` instead of being
        raised, because the result is shown directly in the UI.
        """
        if not file:
            return {}, "请上传期刊文件", [], [], ""

        try:
            path = self._resolve_path(file)
            filename = os.path.basename(path)

            df = self._read_table(path)
            if df is None:
                return {}, "不支持的文件格式", [], [], ""

            column_choices = list(df.columns)
            journal_info = {
                'dataframe': df,
                'columns': list(df.columns),
                'filename': filename,
            }

            # Column labels may be non-strings (e.g. numeric Excel headers).
            info_text = f"文件: {filename}\n"
            info_text += f"列名: {', '.join(map(str, df.columns))}\n"
            info_text += f"行数: {len(df)}\n"

            return (journal_info, f"成功加载期刊文件: {filename}",
                    column_choices, column_choices, info_text)

        except Exception as e:
            return {}, f"文件读取失败: {str(e)}", [], [], ""

    def update_index_options(self, journal_info, title_col, index_col):
        """List the distinct values of the index column as checkbox choices.

        Returns ``(choices, status_text, 0)`` where ``choices`` is a sorted
        list of ``(label, value)`` pairs. Values are compared as stripped
        text, so "A" and "A " give a single choice. ``title_col`` is accepted
        for signature compatibility and is not used.
        """
        if not journal_info or not index_col or index_col == self.UNMAPPED:
            return [], "", 0

        try:
            df = journal_info['dataframe']

            labels = {str(val).strip() for val in df[index_col].dropna().unique()}
            labels.discard("")
            unique_values = sorted(labels)

            choices = [(val, val) for val in unique_values]
            status_text = f"找到 {len(unique_values)} 个不重复的 {index_col} 值"

            return choices, status_text, 0

        except Exception as e:
            return [], f"处理索引列时出错: {str(e)}", 0

    def count_selected_journals(self, journal_info, title_col, index_col, selected_indices):
        """Rebuild ``selected_journals`` from the ticked index values.

        Returns a status text for the UI. ``selected_journals`` always
        reflects the latest call: it is emptied when the configuration is
        incomplete or nothing is ticked, so a later filter run cannot use a
        stale selection.
        """
        self.selected_journals = set()

        if not journal_info or not title_col or not index_col or not selected_indices:
            return "请完成配置并选择索引值"

        try:
            df = journal_info['dataframe']

            # The checkbox values are stripped strings (see update_index_options),
            # so the column must be compared in the same form; comparing raw
            # values would never match numeric or padded cells.
            wanted = {str(val).strip() for val in selected_indices}
            index_values = df[index_col]
            matches = index_values.notna() & index_values.astype(str).str.strip().isin(wanted)

            journal_titles = df.loc[matches, title_col].dropna().unique()
            journal_count = len(journal_titles)

            normalized = {self._normalize_journal_title(title) for title in journal_titles}
            # An empty key would match every paper that has no journal title.
            normalized.discard("")
            self.selected_journals = normalized

            status_text = f"选中了 {len(selected_indices)} 个索引值\n"
            status_text += f"对应 {journal_count} 个期刊\n"
            status_text += f"期刊列表已更新,可在 Paper Filter 中使用"

            return status_text

        except Exception as e:
            return f"统计时出错: {str(e)}"

    def load_paper_files(self, files):
        """Load the uploaded paper files.

        Returns ``(files_info, status, info_text)``. ``files_info`` maps a
        display name to ``{'dataframe', 'columns'}``. Files with an
        unsupported extension are skipped; a file that fails to load is
        reported in ``info_text`` and does not stop the others. Two uploads
        with the same base name are kept apart by a " (2)", " (3)", ...
        suffix instead of overwriting each other.
        """
        if not files:
            return {}, "请上传文献文件", ""

        files_info = {}
        info_parts = []

        for file in files:
            # Fallback name so the error message works even if resolving
            # the path itself is what failed.
            filename = str(file)
            try:
                path = self._resolve_path(file)
                filename = os.path.basename(path)

                df = self._read_table(path)
                if df is None:
                    continue

                key = filename
                suffix = 2
                while key in files_info:
                    key = f"{filename} ({suffix})"
                    suffix += 1

                files_info[key] = {
                    'dataframe': df,
                    'columns': list(df.columns),
                }

                info_parts.append(f"文件: {key}\n")
                info_parts.append(f"列名: {', '.join(map(str, df.columns))}\n")
                info_parts.append(f"行数: {len(df)}\n\n")

            except Exception as e:
                info_parts.append(f"文件 {filename} 读取失败: {str(e)}\n\n")

        return files_info, f"成功加载 {len(files_info)} 个文献文件", "".join(info_parts)

    def create_paper_mapping_interface(self, files_info):
        """Render an HTML summary (name, columns, row count) of the paper files.

        File names and column labels come from user uploads, so they are
        HTML-escaped before being embedded.
        """
        # Function-scope import; the name must not collide with local HTML text.
        from html import escape

        if not files_info:
            return "<p>请先上传文献文件</p>"

        parts = [
            "<div style='font-family: Arial, sans-serif; margin: 10px 0;'>",
            "<h4 style='color: #333; margin-bottom: 15px;'>文献文件信息</h4>",
        ]

        for filename, file_info in files_info.items():
            safe_name = escape(str(filename))
            safe_columns = ', '.join(escape(str(col)) for col in file_info['columns'])
            parts.append("<div style='margin: 15px 0; padding: 15px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9;'>")
            parts.append(f"<h5 style='margin: 0 0 10px 0; color: #333;'>📄 {safe_name}</h5>")
            parts.append("<p style='margin: 5px 0; font-size: 13px; color: #666; line-height: 1.4;'>")
            parts.append(f"<strong>可用列:</strong><br>{safe_columns}</p>")
            parts.append(f"<p style='margin: 5px 0; font-size: 12px; color: #888;'>行数: {len(file_info['dataframe'])}</p>")
            parts.append("</div>")

        parts.append("</div>")
        return "".join(parts)

    def update_paper_mapping_dropdowns(self, files_info):
        """Build the updates for the field-mapping dropdowns.

        Returns ``(row_visible, html, *updates)`` with exactly
        ``MAX_PAPER_FILES * len(REQUIRED_FIELDS)`` dropdown updates: one
        visible dropdown per (file, field) pair for the first
        ``MAX_PAPER_FILES`` files, and hidden updates for the unused slots.
        """
        total_slots = self.MAX_PAPER_FILES * len(self.REQUIRED_FIELDS)

        if not files_info:
            hidden = [gr.update(visible=False) for _ in range(total_slots)]
            return False, "<p>请先上传文献文件</p>", *hidden

        updates = []
        files_list = list(files_info.items())[:self.MAX_PAPER_FILES]

        for file_idx, (filename, file_info) in enumerate(files_list):
            choices = [self.UNMAPPED] + file_info['columns']
            for field_name in self.REQUIRED_FIELDS:
                updates.append(gr.update(
                    choices=choices,
                    value=self.UNMAPPED,
                    visible=True,
                    label=f"文件{file_idx+1} → {field_name}"
                ))

        # Hide the dropdown slots that no file uses.
        updates.extend(gr.update(visible=False) for _ in range(total_slots - len(updates)))

        return True, self.create_paper_mapping_interface(files_info), *updates

    def filter_papers_by_journals(self, files_info, *mapping_values):
        """Keep the papers whose journal is in ``selected_journals``.

        ``mapping_values`` holds ``len(REQUIRED_FIELDS)`` dropdown values per
        file, in file order; a value of "无" (or an empty value) means the
        field is unmapped. Files without a "Journal title" mapping are
        skipped. The de-duplicated result is written to
        ``filtered_papers.csv`` in the current working directory.

        Returns ``(status_message, output_path)``; ``output_path`` is None
        when nothing was written. Errors are reported in the status message
        rather than raised.
        """
        try:
            if not files_info:
                return "错误:没有加载文献文件", None

            if not self.selected_journals:
                return "错误:请先在 Journal Processing 中选择期刊", None

            fields = list(self.REQUIRED_FIELDS)
            journal_pos = fields.index("Journal title")
            all_papers_data = []  # every paper from the mapped files
            filtered_papers_data = []  # papers from a selected journal
            files_list = list(files_info.items())[:self.MAX_PAPER_FILES]

            total_papers_count = sum(len(file_info['dataframe']) for file_info in files_info.values())

            for file_idx, (filename, file_info) in enumerate(files_list):
                df = file_info['dataframe']

                # Each file owns a fixed run of mapping slots, whether or not
                # the file ends up being processed.
                first_slot = file_idx * len(fields)
                slot_values = mapping_values[first_slot:first_slot + len(fields)]
                file_mapping = {
                    field_name: column
                    for field_name, column in zip(fields, slot_values)
                    if column is not None and column != "" and column != self.UNMAPPED
                }

                if "Journal title" not in file_mapping:
                    continue

                # Read column by column: unlike iterrows this does not upcast
                # values to a common row dtype, and it is much faster.
                row_count = len(df)
                field_columns = []
                for field_name in fields:
                    if field_name in file_mapping:
                        raw_values = df[file_mapping[field_name]].tolist()
                        field_columns.append([self._cell_text(value) for value in raw_values])
                    else:
                        field_columns.append([""] * row_count)

                for record in zip(*field_columns):
                    paper_row = list(record)
                    all_papers_data.append(paper_row)

                    clean_journal = self._normalize_journal_title(paper_row[journal_pos])
                    # A paper without a journal title never matches.
                    if clean_journal and clean_journal in self.selected_journals:
                        filtered_papers_data.append(paper_row)

            if not filtered_papers_data:
                return f"没有找到匹配的文献\n总文献数: {total_papers_count} 篇", None

            all_papers_df = pd.DataFrame(all_papers_data, columns=fields)
            filtered_papers_df = pd.DataFrame(filtered_papers_data, columns=fields)

            after_dedup_count = len(all_papers_df.drop_duplicates())

            # Explicit copy so the column assignment below targets an
            # independent frame.
            filtered_papers_after_dedup = filtered_papers_df.drop_duplicates().copy()
            filtered_papers_after_dedup['Journal title'] = (
                filtered_papers_after_dedup['Journal title'].astype(str).str.lower().str.strip()
            )

            final_count = len(filtered_papers_after_dedup)

            output_filename = "filtered_papers.csv"
            filtered_papers_after_dedup.to_csv(output_filename, index=False, encoding='utf-8')

            status_msg = f"过滤完成!\n"
            status_msg += f"使用了 {len(self.selected_journals)} 个选中期刊进行过滤\n\n"
            status_msg += f"📊 处理统计:\n"
            status_msg += f"总文献条数: {total_papers_count} 篇\n"
            status_msg += f"去重后: {after_dedup_count} 篇\n"
            status_msg += f"筛选后: {final_count} 篇\n\n"
            status_msg += f"✅ 已保存到: {output_filename}"

            return status_msg, output_filename

        except Exception as e:
            return f"过滤文献时出错: {str(e)}", None
288
+
289
def create_ui():
    """Build the two-tab Gradio interface and return the ``gr.Blocks`` app.

    Tab "Journal Processing" loads a journal list and lets the user pick the
    title column, the index column and the index values to include. Tab
    "Paper Filter" loads paper files, maps their columns and exports the
    papers published in the selected journals. All work is delegated to a
    single ``JournalPaperFilter`` instance created here.
    """
    filter_tool = JournalPaperFilter()

    # Number of mapping dropdowns created below: 5 files x 5 fields. It must
    # equal the number of updates update_paper_mapping_dropdowns returns.
    mapping_slot_count = 25

    with gr.Blocks(title="期刊和文献筛选工具") as app:
        gr.Markdown("# 期刊和文献筛选工具")
        gr.Markdown("第一步:在 Journal Processing 中选择目标期刊;第二步:在 Paper Filter 中过滤文献")

        with gr.Tabs():
            # First tab: journal processing
            with gr.TabItem("Journal Processing"):
                gr.Markdown("### 上传期刊列表并选择筛选条件")

                # Holds the loaded journal table between events
                journal_info_state = gr.State({})

                # File upload
                with gr.Row():
                    with gr.Column(scale=1):
                        journal_upload = gr.File(
                            label="上传期刊列表文件",
                            file_types=[".csv", ".xls", ".xlsx"]
                        )
                    with gr.Column(scale=1):
                        journal_status = gr.Textbox(label="加载状态", interactive=False)
                    with gr.Column(scale=2):
                        journal_info = gr.Textbox(label="文件信息", lines=6, interactive=False)

                # Column selection
                with gr.Row():
                    with gr.Column():
                        title_col_dropdown = gr.Dropdown(
                            choices=[],
                            label="Journal Title 列",
                            interactive=True
                        )
                        index_col_dropdown = gr.Dropdown(
                            choices=[],
                            label="Index 列(用于筛选)",
                            interactive=True
                        )
                    with gr.Column():
                        index_status = gr.Textbox(
                            label="索引信息",
                            interactive=False,
                            lines=3
                        )

                # Index value selection
                with gr.Row():
                    with gr.Column(scale=2):
                        index_checkboxes = gr.CheckboxGroup(
                            choices=[],
                            label="选择要包含的索引值",
                            interactive=True
                        )
                    with gr.Column(scale=1):
                        journal_count_display = gr.Textbox(
                            label="选中期刊统计",
                            interactive=False,
                            lines=5
                        )

                # Event bindings - Journal Processing
                def on_journal_upload(file):
                    loaded, status, title_choices, index_choices, info = filter_tool.load_journal_file(file)
                    return (loaded, status, info,
                            gr.update(choices=title_choices),
                            gr.update(choices=index_choices))

                journal_upload.change(
                    fn=on_journal_upload,
                    inputs=[journal_upload],
                    outputs=[journal_info_state, journal_status, journal_info,
                             title_col_dropdown, index_col_dropdown]
                )

                def on_index_col_change(loaded, title_col, index_col):
                    choices, status, _count = filter_tool.update_index_options(loaded, title_col, index_col)
                    # Clear the ticked values too: values ticked for the
                    # previous index column do not belong to the new choices.
                    return gr.update(choices=choices, value=[]), status

                index_col_dropdown.change(
                    fn=on_index_col_change,
                    inputs=[journal_info_state, title_col_dropdown, index_col_dropdown],
                    outputs=[index_checkboxes, index_status]
                )

                def on_checkbox_change(loaded, title_col, index_col, selected):
                    return filter_tool.count_selected_journals(loaded, title_col, index_col, selected)

                selection_inputs = [journal_info_state, title_col_dropdown,
                                    index_col_dropdown, index_checkboxes]

                index_checkboxes.change(
                    fn=on_checkbox_change,
                    inputs=selection_inputs,
                    outputs=[journal_count_display]
                )

                # The selected journal set is read from the title column, so
                # it has to be recomputed when that column changes as well;
                # otherwise it keeps the titles of the previous column.
                title_col_dropdown.change(
                    fn=on_checkbox_change,
                    inputs=selection_inputs,
                    outputs=[journal_count_display]
                )

            # Second tab: paper filtering
            with gr.TabItem("Paper Filter"):
                gr.Markdown("### 上传文献文件并配置字段映射")

                # Holds the loaded paper tables between events
                paper_files_state = gr.State({})

                # File upload
                with gr.Row():
                    with gr.Column(scale=1):
                        paper_upload = gr.File(
                            label="上传文献文件",
                            file_count="multiple",
                            file_types=[".csv", ".xls", ".xlsx"]
                        )
                    with gr.Column(scale=1):
                        paper_status = gr.Textbox(label="加载状态", interactive=False)
                    with gr.Column(scale=2):
                        paper_info = gr.Textbox(label="文件信息", lines=6, interactive=False)

                # Field mapping
                with gr.Row():
                    with gr.Column(scale=2):
                        paper_mapping_interface = gr.HTML(value="<p>请先上传文献文件</p>")

                    with gr.Column(scale=3):
                        # Pre-create every mapping dropdown hidden; uploads
                        # only toggle visibility and choices afterwards.
                        paper_mapping_dropdowns = []
                        with gr.Row(visible=False) as paper_mapping_row:
                            for _slot in range(mapping_slot_count):
                                dropdown = gr.Dropdown(
                                    choices=["无"],
                                    value="无",
                                    visible=False,
                                    interactive=True,
                                    scale=1
                                )
                                paper_mapping_dropdowns.append(dropdown)

                # Action button
                with gr.Row():
                    filter_btn = gr.Button("Focus on Selected Journals", variant="primary", size="lg")

                # Result display
                with gr.Row():
                    filter_result = gr.Textbox(label="过滤结果", lines=5, interactive=False)
                    result_file = gr.File(label="下载过滤后的文献", interactive=False)

                # Event bindings - Paper Filter
                def on_paper_upload(files):
                    files_info, status, info = filter_tool.load_paper_files(files)
                    row_visible, html, *dropdown_updates = filter_tool.update_paper_mapping_dropdowns(files_info)
                    return files_info, status, info, html, gr.update(visible=row_visible), *dropdown_updates

                paper_upload.change(
                    fn=on_paper_upload,
                    inputs=[paper_upload],
                    outputs=[paper_files_state, paper_status, paper_info,
                             paper_mapping_interface, paper_mapping_row] + paper_mapping_dropdowns
                )

                def on_filter_papers(files_info, *mapping_values):
                    return filter_tool.filter_papers_by_journals(files_info, *mapping_values)

                filter_btn.click(
                    fn=on_filter_papers,
                    inputs=[paper_files_state] + paper_mapping_dropdowns,
                    outputs=[filter_result, result_file]
                )

    return app
457
+
458
if __name__ == "__main__":
    # Script entry point: build the interface and serve it with debug output
    # enabled and without requesting a public share link.
    launch_options = {"debug": True, "share": False}
    app = create_ui()
    app.launch(**launch_options)