Spaces:

linxinhua
/

Paper_filter

Sleeping

App Files Files Community

linxinhua committed on Jun 23, 2025

Commit

950a28d

verified ·

1 Parent(s): 85ddf09

Create app.py

Browse files

Files changed (1) hide show

app.py +460 -0

app.py ADDED Viewed

	@@ -0,0 +1,460 @@

import csv
import html
import os
import re
from typing import Dict, List, Any, Optional

import gradio as gr
import pandas as pd
class JournalPaperFilter:
    """Pick journals from a journal list and keep only papers published in them.

    Workflow: load a journal table, choose a title column and an index
    column, tick the index values of interest (which fills
    ``selected_journals``), then load paper tables, map their columns onto
    ``REQUIRED_FIELDS`` and call ``filter_papers_by_journals``.
    """

    # Output schema of the filtered CSV; one mapping dropdown exists per
    # (file, field) pair.
    REQUIRED_FIELDS = ["Author", "Journal title", "Article title", "Abstract", "DOI"]
    # Only the first MAX_FILES uploaded paper files get mapping dropdowns.
    MAX_FILES = 5
    # Dropdown sentinel meaning "this field is not mapped to any column".
    NO_MAPPING = "无"

    def __init__(self):
        self.target_journals = {}  # journal metadata (not filled by the methods below)
        self.selected_journals = set()  # normalised titles of the chosen journals
        self.paper_files_info = {}  # paper file metadata (not filled by the methods below)

    @staticmethod
    def _normalize_title(title):
        """Return a comparison key: lower-cased, punctuation-free, single-spaced."""
        without_punct = re.sub(r'[^\w\s]', '', str(title).lower())
        # Collapse whitespace AFTER removing punctuation so that "Nature ."
        # and "Nature" produce the same key.
        return " ".join(without_punct.split())

    @staticmethod
    def _cell_text(value):
        """Return a cell as stripped text; missing values become ''."""
        return "" if pd.isna(value) else str(value).strip()

    @staticmethod
    def _path_of(file):
        """Return the filesystem path of an upload (object with ``.name`` or plain path)."""
        return str(getattr(file, 'name', file))

    def _read_table(self, path):
        """Read a CSV/XLS/XLSX file into a DataFrame; return None if unsupported."""
        lowered = path.lower()  # accept upper-case extensions such as ".CSV"
        if lowered.endswith('.csv'):
            delimiter = self.detect_delimiter(path)
            return pd.read_csv(path, delimiter=delimiter, encoding='utf-8')
        if lowered.endswith(('.xls', '.xlsx')):
            return pd.read_excel(path)
        return None

    def detect_delimiter(self, file_path):
        """Sniff the delimiter of a CSV file from its first 1024 characters.

        Falls back to ',' when the file cannot be read/decoded or the
        sniffer cannot decide.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                sample = file.read(1024)
            return csv.Sniffer().sniff(sample).delimiter
        except (OSError, ValueError, csv.Error):
            # ValueError also covers UnicodeDecodeError.
            return ','

    def load_journal_file(self, file):
        """Load the journal list.

        Returns a 5-tuple ``(journal_info, status, title_choices,
        index_choices, info_text)``. ``journal_info`` is ``{}`` on failure,
        otherwise a dict with keys ``dataframe``, ``columns``, ``filename``.
        """
        if not file:
            return {}, "请上传期刊文件", [], [], ""
        try:
            path = self._path_of(file)
            filename = os.path.basename(path)
            df = self._read_table(path)
            if df is None:
                return {}, "不支持的文件格式", [], [], ""
            column_choices = list(df.columns)
            journal_info = {
                'dataframe': df,
                'columns': column_choices,
                'filename': filename
            }
            # Column labels are not guaranteed to be strings (e.g. numeric
            # Excel headers), so stringify them before joining.
            info_text = (
                f"文件: {filename}\n"
                f"列名: {', '.join(map(str, column_choices))}\n"
                f"行数: {len(df)}\n"
            )
            return journal_info, f"成功加载期刊文件: {filename}", column_choices, column_choices, info_text
        except Exception as e:
            # UI boundary: report the problem instead of crashing the app.
            return {}, f"文件读取失败: {str(e)}", [], [], ""

    def update_index_options(self, journal_info, title_col, index_col):
        """List the distinct values of the index column as checkbox choices.

        Returns ``(choices, status_text, 0)`` where ``choices`` is a sorted
        list of ``(label, value)`` string pairs. ``title_col`` is accepted
        for interface symmetry and is not used.
        """
        if not journal_info or not index_col or index_col == self.NO_MAPPING:
            return [], "", 0
        try:
            df = journal_info['dataframe']
            # A set removes values that only differ by surrounding whitespace.
            texts = {str(val).strip() for val in df[index_col].dropna().unique()}
            unique_values = sorted(text for text in texts if text)
            choices = [(val, val) for val in unique_values]
            status_text = f"找到 {len(unique_values)} 个不重复的 {index_col} 值"
            return choices, status_text, 0
        except Exception as e:
            return [], f"处理索引列时出错: {str(e)}", 0

    def count_selected_journals(self, journal_info, title_col, index_col, selected_indices):
        """Record the journals whose index value is ticked and report how many.

        Side effect: replaces ``self.selected_journals`` with the normalised
        titles of the matching journals. Returns a status string.
        """
        if not journal_info or not title_col or not index_col or not selected_indices:
            return "请完成配置并选择索引值"
        try:
            df = journal_info['dataframe']
            # The checkbox values are stripped strings (see
            # update_index_options), so compare against the same textual
            # form; matching raw values would miss numeric columns entirely.
            wanted = {str(val).strip() for val in selected_indices}
            index_text = df[index_col].map(lambda val: str(val).strip())
            matches = df[index_col].notna() & index_text.isin(wanted)
            journal_titles = df[matches][title_col].dropna().unique()
            journal_count = len(journal_titles)
            self.selected_journals = {self._normalize_title(title) for title in journal_titles}
            # A title made only of punctuation would otherwise match every
            # paper whose journal cell is blank.
            self.selected_journals.discard("")
            status_text = f"选中了 {len(selected_indices)} 个索引值\n"
            status_text += f"对应 {journal_count} 个期刊\n"
            status_text += "期刊列表已更新，可在 Paper Filter 中使用"
            return status_text
        except Exception as e:
            return f"统计时出错: {str(e)}"

    def load_paper_files(self, files):
        """Load the uploaded paper files.

        Returns ``(files_info, status, info_text)``; ``files_info`` maps a
        file name to ``{'dataframe': ..., 'columns': [...]}``. Unsupported
        extensions are skipped; read failures are reported in ``info_text``.
        """
        if not files:
            return {}, "请上传文献文件", ""
        files_info = {}
        report = []
        for file in files:
            # Resolve the name outside the try so the error message below
            # can always refer to it.
            path = self._path_of(file)
            filename = os.path.basename(path)
            try:
                df = self._read_table(path)
                if df is None:
                    continue
                columns = list(df.columns)
                files_info[filename] = {
                    'dataframe': df,
                    'columns': columns
                }
                report.append(f"文件: {filename}\n")
                report.append(f"列名: {', '.join(map(str, columns))}\n")
                report.append(f"行数: {len(df)}\n\n")
            except Exception as e:
                report.append(f"文件 {filename} 读取失败: {str(e)}\n\n")
        return files_info, f"成功加载 {len(files_info)} 个文献文件", "".join(report)

    def create_paper_mapping_interface(self, files_info):
        """Render an HTML summary (name, columns, row count) of the paper files."""
        if not files_info:
            return "<p>请先上传文献文件</p>"
        parts = [
            "<div style='font-family: Arial, sans-serif; margin: 10px 0;'>",
            "<h4 style='color: #333; margin-bottom: 15px;'>文献文件信息</h4>",
        ]
        for filename, file_info in files_info.items():
            # File names and column labels come from user uploads; escape
            # them so they cannot inject markup into the page.
            safe_name = html.escape(str(filename))
            safe_columns = ', '.join(html.escape(str(col)) for col in file_info['columns'])
            parts.append("<div style='margin: 15px 0; padding: 15px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9;'>")
            parts.append(f"<h5 style='margin: 0 0 10px 0; color: #333;'>📄 {safe_name}</h5>")
            parts.append("<p style='margin: 5px 0; font-size: 13px; color: #666; line-height: 1.4;'>")
            parts.append(f"<strong>可用列:</strong><br>{safe_columns}</p>")
            parts.append(f"<p style='margin: 5px 0; font-size: 12px; color: #888;'>行数: {len(file_info['dataframe'])}</p>")
            parts.append("</div>")
        parts.append("</div>")
        return "".join(parts)

    def update_paper_mapping_dropdowns(self, files_info):
        """Build updates for the MAX_FILES x len(REQUIRED_FIELDS) mapping dropdowns.

        Returns ``(row_visible, summary_html, *dropdown_updates)``. Dropdowns
        are laid out file by file, one per required field; unused ones are
        hidden.
        """
        slot_count = self.MAX_FILES * len(self.REQUIRED_FIELDS)
        if not files_info:
            hidden = [gr.update(visible=False) for _ in range(slot_count)]
            return (False, "<p>请先上传文献文件</p>", *hidden)
        updates = []
        visible_files = list(files_info.items())[:self.MAX_FILES]
        for file_idx, (filename, file_info) in enumerate(visible_files):
            choices = [self.NO_MAPPING] + file_info['columns']
            for field_name in self.REQUIRED_FIELDS:
                updates.append(gr.update(
                    choices=choices,
                    value=self.NO_MAPPING,
                    visible=True,
                    label=f"文件{file_idx+1} → {field_name}"
                ))
        # Hide the dropdowns that belong to files that were not uploaded.
        updates.extend(gr.update(visible=False) for _ in range(slot_count - len(updates)))
        return (True, self.create_paper_mapping_interface(files_info), *updates)

    def filter_papers_by_journals(self, files_info, *mapping_values):
        """Keep the papers whose journal is in ``selected_journals``.

        ``mapping_values`` holds the dropdown values in the order produced
        by ``update_paper_mapping_dropdowns`` (file by file, one value per
        required field). Files without a "Journal title" mapping are
        ignored. Returns ``(status_message, csv_path_or_None)``; on success
        the de-duplicated result is written to ``filtered_papers.csv``.
        """
        try:
            if not files_info:
                return "错误：没有加载文献文件", None
            if not self.selected_journals:
                return "错误：请先在 Journal Processing 中选择期刊", None
            required_fields = self.REQUIRED_FIELDS
            per_file = len(required_fields)
            total_papers_count = sum(len(file_info['dataframe']) for file_info in files_info.values())
            all_frames = []
            matched_frames = []
            mapped_files = list(files_info.items())[:self.MAX_FILES]
            for file_idx, (filename, file_info) in enumerate(mapped_files):
                df = file_info['dataframe']
                slots = mapping_values[file_idx * per_file:(file_idx + 1) * per_file]
                # A cleared dropdown yields None/""; treat it like the
                # explicit "not mapped" sentinel.
                file_mapping = {
                    field: column
                    for field, column in zip(required_fields, slots)
                    if column is not None and column != "" and column != self.NO_MAPPING
                }
                if "Journal title" not in file_mapping:
                    continue
                blank = [""] * len(df)
                records = pd.DataFrame(
                    {
                        field: (
                            [self._cell_text(cell) for cell in df[file_mapping[field]]]
                            if field in file_mapping else blank
                        )
                        for field in required_fields
                    },
                    columns=required_fields,
                )
                if records.empty:
                    continue
                all_frames.append(records)
                keys = records["Journal title"].map(self._normalize_title)
                hits = keys.isin(self.selected_journals) & keys.ne("")
                if hits.any():
                    matched_frames.append(records[hits])
            if not matched_frames:
                return f"没有找到匹配的文献\n总文献数: {total_papers_count} 篇", None
            all_papers_df = pd.concat(all_frames, ignore_index=True)
            filtered_papers_df = pd.concat(matched_frames, ignore_index=True)
            after_dedup_count = len(all_papers_df.drop_duplicates())
            # Copy so the column assignment below writes to an independent
            # frame rather than a possible view of filtered_papers_df.
            filtered_papers_after_dedup = filtered_papers_df.drop_duplicates().copy()
            filtered_papers_after_dedup['Journal title'] = (
                filtered_papers_after_dedup['Journal title'].astype(str).str.lower().str.strip()
            )
            final_count = len(filtered_papers_after_dedup)
            output_filename = "filtered_papers.csv"
            filtered_papers_after_dedup.to_csv(output_filename, index=False, encoding='utf-8')
            status_msg = "过滤完成！\n"
            status_msg += f"使用了 {len(self.selected_journals)} 个选中期刊进行过滤\n\n"
            status_msg += "📊 处理统计:\n"
            status_msg += f"总文献条数: {total_papers_count} 篇\n"
            status_msg += f"去重后: {after_dedup_count} 篇\n"
            status_msg += f"筛选后: {final_count} 篇\n\n"
            status_msg += f"✅ 已保存到: {output_filename}"
            return status_msg, output_filename
        except Exception as e:
            # UI boundary: surface the error text in the result box.
            return f"过滤文献时出错: {str(e)}", None
def create_ui():
    """Build and return the Gradio Blocks app.

    The app has two tabs: "Journal Processing" (upload a journal list and
    choose which journals to keep) and "Paper Filter" (upload paper files,
    map their columns and export the papers from the chosen journals).
    A single JournalPaperFilter instance backs both tabs, so the journal
    selection made in the first tab is what the second tab filters by.
    """
    filter_tool = JournalPaperFilter()
    with gr.Blocks(title="期刊和文献筛选工具") as app:
        gr.Markdown("# 期刊和文献筛选工具")
        gr.Markdown("第一步：在 Journal Processing 中选择目标期刊；第二步：在 Paper Filter 中过滤文献")
        with gr.Tabs():
            # Tab 1: journal processing
            with gr.TabItem("Journal Processing"):
                gr.Markdown("### 上传期刊列表并选择筛选条件")
                # Holds the dict returned by load_journal_file.
                journal_info_state = gr.State({})
                # File upload
                with gr.Row():
                    with gr.Column(scale=1):
                        journal_upload = gr.File(
                            label="上传期刊列表文件",
                            file_types=[".csv", ".xls", ".xlsx"]
                        )
                    with gr.Column(scale=1):
                        journal_status = gr.Textbox(label="加载状态", interactive=False)
                    with gr.Column(scale=2):
                        journal_info = gr.Textbox(label="文件信息", lines=6, interactive=False)
                # Column selection
                with gr.Row():
                    with gr.Column():
                        title_col_dropdown = gr.Dropdown(
                            choices=[],
                            label="Journal Title 列",
                            interactive=True
                        )
                        index_col_dropdown = gr.Dropdown(
                            choices=[],
                            label="Index 列（用于筛选）",
                            interactive=True
                        )
                    with gr.Column():
                        index_status = gr.Textbox(
                            label="索引信息",
                            interactive=False,
                            lines=3
                        )
                # Index value selection
                with gr.Row():
                    with gr.Column(scale=2):
                        index_checkboxes = gr.CheckboxGroup(
                            choices=[],
                            label="选择要包含的索引值",
                            interactive=True
                        )
                    with gr.Column(scale=1):
                        journal_count_display = gr.Textbox(
                            label="选中期刊统计",
                            interactive=False,
                            lines=5
                        )

                # Event wiring - Journal Processing
                def on_journal_upload(file):
                    """Load the journal file and refresh both column dropdowns."""
                    journal_info, status, title_choices, index_choices, info = filter_tool.load_journal_file(file)
                    # Clear the previous selections: a column picked for an
                    # earlier file may not exist in the new one.
                    return (journal_info, status, info,
                            gr.update(choices=title_choices, value=None),
                            gr.update(choices=index_choices, value=None))

                journal_upload.change(
                    fn=on_journal_upload,
                    inputs=[journal_upload],
                    outputs=[journal_info_state, journal_status, journal_info,
                             title_col_dropdown, index_col_dropdown]
                )

                def on_index_col_change(journal_info, title_col, index_col):
                    """Offer the distinct values of the chosen index column."""
                    choices, status, count = filter_tool.update_index_options(journal_info, title_col, index_col)
                    # Drop ticks that belonged to the previous index column.
                    return gr.update(choices=choices, value=[]), status

                index_col_dropdown.change(
                    fn=on_index_col_change,
                    inputs=[journal_info_state, title_col_dropdown, index_col_dropdown],
                    outputs=[index_checkboxes, index_status]
                )

                def on_checkbox_change(journal_info, title_col, index_col, selected):
                    """Recompute the selected journal set and its summary."""
                    status = filter_tool.count_selected_journals(journal_info, title_col, index_col, selected)
                    return status

                selection_inputs = [journal_info_state, title_col_dropdown,
                                    index_col_dropdown, index_checkboxes]
                index_checkboxes.change(
                    fn=on_checkbox_change,
                    inputs=selection_inputs,
                    outputs=[journal_count_display]
                )
                # The journal set also depends on the title column, so
                # recompute when it changes; otherwise ticking index values
                # before choosing the title column leaves the set empty.
                title_col_dropdown.change(
                    fn=on_checkbox_change,
                    inputs=selection_inputs,
                    outputs=[journal_count_display]
                )
            # Tab 2: paper filtering
            with gr.TabItem("Paper Filter"):
                gr.Markdown("### 上传文献文件并配置字段映射")
                # Holds the dict returned by load_paper_files.
                paper_files_state = gr.State({})
                # File upload
                with gr.Row():
                    with gr.Column(scale=1):
                        paper_upload = gr.File(
                            label="上传文献文件",
                            file_count="multiple",
                            file_types=[".csv", ".xls", ".xlsx"]
                        )
                    with gr.Column(scale=1):
                        paper_status = gr.Textbox(label="加载状态", interactive=False)
                    with gr.Column(scale=2):
                        paper_info = gr.Textbox(label="文件信息", lines=6, interactive=False)
                # Field mapping
                with gr.Row():
                    with gr.Column(scale=2):
                        paper_mapping_interface = gr.HTML(value="<p>请先上传文献文件</p>")
                    with gr.Column(scale=3):
                        # 25 dropdowns: 5 files x 5 fields, pre-created
                        # hidden and revealed per upload.
                        paper_mapping_dropdowns = []
                        with gr.Row(visible=False) as paper_mapping_row:
                            for i in range(25):
                                dropdown = gr.Dropdown(
                                    choices=["无"],
                                    value="无",
                                    visible=False,
                                    interactive=True,
                                    scale=1
                                )
                                paper_mapping_dropdowns.append(dropdown)
                # Action button
                with gr.Row():
                    filter_btn = gr.Button("Focus on Selected Journals", variant="primary", size="lg")
                # Results
                with gr.Row():
                    filter_result = gr.Textbox(label="过滤结果", lines=5, interactive=False)
                    result_file = gr.File(label="下载过滤后的文献", interactive=False)

                # Event wiring - Paper Filter
                def on_paper_upload(files):
                    """Load the paper files and reconfigure the mapping dropdowns."""
                    files_info, status, info = filter_tool.load_paper_files(files)
                    row_visible, html, *dropdown_updates = filter_tool.update_paper_mapping_dropdowns(files_info)
                    return (files_info, status, info, html,
                            gr.update(visible=row_visible), *dropdown_updates)

                paper_upload.change(
                    fn=on_paper_upload,
                    inputs=[paper_upload],
                    outputs=[paper_files_state, paper_status, paper_info,
                             paper_mapping_interface, paper_mapping_row] + paper_mapping_dropdowns
                )

                def on_filter_papers(files_info, *mapping_values):
                    """Run the filter with the current dropdown mapping."""
                    return filter_tool.filter_papers_by_journals(files_info, *mapping_values)

                filter_btn.click(
                    fn=on_filter_papers,
                    inputs=[paper_files_state] + paper_mapping_dropdowns,
                    outputs=[filter_result, result_file]
                )
    return app
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it locally with
    # debug output enabled and no public share link.
    launch_options = {"debug": True, "share": False}
    app = create_ui()
    app.launch(**launch_options)