Medical-QA-Data-Sets

Sleeping

App Files Files Community

Malikeh Ehghaghi committed on Oct 23, 2023

Commit

c02fbf1

unverified ·

1 Parent(s): bd757bc

Add files via upload

Browse files

Files changed (4) hide show

app.py +109 -0
dataset_list.py +101 -0
requirements.txt +4 -0
style.css +18 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+#!/usr/bin/env python
+from __future__ import annotations
+import gradio as gr
+from dataset_list import DatasetList
+# Markdown heading shown at the top of the page.
+DESCRIPTION = '# Explore Medical Question Answering Datasets 🏥'
+# Markdown rendered below the results table; currently only a newline.
+NOTES = '''
+'''
+# Markdown rendered after NOTES; currently empty.
+FOOTER = ''''''
+def main():
+    """Build and launch the Gradio UI for browsing medical QA datasets.
+
+    The page offers a search box (regular expression over dataset names),
+    a case-sensitivity toggle and a group of "must have" filters.  The
+    results table is rendered by ``DatasetList.render`` on page load, when
+    Enter is pressed in the search box, and when the Search button is
+    clicked.
+    """
+    dataset_list = DatasetList()
+    with gr.Blocks(css='style.css') as demo:
+        gr.Markdown(DESCRIPTION)
+        search_box = gr.Textbox(
+            label='Search Dataset Name',
+            placeholder=
+            'You can search for titles with regular expressions. e.g. (?<!sur)face',
+            max_lines=1)
+        case_sensitive = gr.Checkbox(label='Case Sensitive')
+        filter_names = gr.CheckboxGroup(choices=[
+            'Dataset',
+            'Data Link',
+            'Paper',
+        ], label='Filter')
+        search_button = gr.Button('Search')
+        number_of_datasets = gr.Textbox(label='Number of Datasets Found')
+        table = gr.HTML(show_label=False)
+        gr.Markdown(NOTES)
+        gr.Markdown(FOOTER)
+
+        render_inputs = [search_box, case_sensitive, filter_names]
+        render_outputs = [number_of_datasets, table]
+        # All three triggers refresh the table the same way, so they share
+        # one wiring instead of three copies that could drift apart.
+        for register in (demo.load, search_box.submit, search_button.click):
+            register(fn=dataset_list.render,
+                     inputs=render_inputs,
+                     outputs=render_outputs)
+
+    # `launch(enable_queue=...)` is not accepted by current Gradio releases
+    # (requirements.txt does not pin a version); `queue()` is the supported
+    # way to enable request queuing.
+    demo.queue().launch(share=False)
+# Start the app only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()

dataset_list.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from __future__ import annotations
+
+import html
+import re
+
+import numpy as np
+import pandas as pd
+import requests
+from huggingface_hub.hf_api import SpaceInfo
+# Browser ("edit") link of the Google Sheet that holds the dataset catalogue.
+url = 'https://docs.google.com/spreadsheets/d/1RoM2DgzaYJg6Ias1YNC2kQN01xSWJb1KEER9efb0X7A/edit#gid=0'
+# Same sheet and tab, rewritten to the CSV export endpoint that
+# `pd.read_csv` reads.
+csv_url = '/export?format=csv&gid='.join(url.split('/edit#gid='))
+class DatasetList:
+    """Searchable, filterable HTML table of medical QA datasets.
+
+    The catalogue is read from a CSV source (by default the module-level
+    ``csv_url``).  Every row is rendered once to an HTML ``<tr>`` fragment
+    and cached in the ``html_table_content`` column, so ``render`` only has
+    to filter rows and join the cached fragments.
+
+    The CSV must provide these columns: ``dataset_name``,
+    ``question_type``, ``used_in_paper``, ``reference_paper``,
+    ``brief_description``, ``count``, ``original_link``,
+    ``publicly_available`` and ``huggingface_link``.
+    """
+
+    def __init__(self, source=None):
+        """Load the catalogue and pre-render its rows.
+
+        Args:
+            source: Path, URL or file-like object accepted by
+                ``pandas.read_csv``.  When ``None`` the module-level
+                ``csv_url`` is used.
+        """
+        self.table = pd.read_csv(csv_url if source is None else source)
+        self._preprocess_table()
+        self.table_header = '''
+            <tr>
+                <td width="15%">Dataset Name</td>
+                <td width="10%">Question Type</td>
+                <td width="10%">Applied In Paper</td>
+                <td width="10%">Reference Paper</td>
+                <td width="20%">Brief Description</td>
+                <td width="5%">Count</td>
+                <td width="10%">Original Access Link</td>
+                <td width="10%">Publicly Available?</td>
+                <td width="10%">Access link on 🤗</td>
+            </tr>'''
+
+    @staticmethod
+    def _format_count(value) -> str:
+        """Return a count cell as text: '' when missing, no trailing '.0'."""
+        if pd.isna(value):
+            return ''
+        # A numeric column containing blanks is read as float, so 1200
+        # arrives as 1200.0; show it as the integer it is.
+        if isinstance(value, float) and value.is_integer():
+            return str(int(value))
+        return str(value)
+
+    @staticmethod
+    def _text(value) -> str:
+        """Return an HTML-escaped text cell, or '' for a missing value."""
+        return html.escape(value) if isinstance(value, str) else ''
+
+    @staticmethod
+    def _link(target, label: str) -> str:
+        """Return an anchor to ``target`` labelled ``label``, or '' if missing."""
+        if not isinstance(target, str):
+            return ''
+        return f'<a href="{html.escape(target)}" target="_blank">{label}</a>'
+
+    def _row_to_html(self, record: dict) -> str:
+        """Render one catalogue record (column name -> value) as a ``<tr>``."""
+        # Cell values come from an externally edited sheet, so text and
+        # attribute values are escaped before they reach the page.
+        return f'''
+                <tr>
+                    <td>{self._text(record['dataset_name'])}</td>
+                    <td>{self._text(record['question_type'])}</td>
+                    <td>{self._text(record['used_in_paper'])}</td>
+                    <td>{self._link(record['reference_paper'], 'Paper')}</td>
+                    <td>{self._text(record['brief_description'])}</td>
+                    <td>{self._text(record['count'])}</td>
+                    <td>{self._link(record['original_link'], 'Access Link')}</td>
+                    <td>{self._link(record['publicly_available'], 'License')}</td>
+                    <td>{self._link(record['huggingface_link'], 'HF Link')}</td>
+                </tr>'''
+
+    def _preprocess_table(self) -> None:
+        """Add the derived columns used for searching and rendering.
+
+        Adds ``dataset_name_lowercase``, normalises ``count`` to display
+        strings and stores each row's HTML in ``html_table_content``.
+        """
+        self.table['dataset_name_lowercase'] = self.table.dataset_name.str.lower()
+        self.table['count'] = self.table['count'].map(self._format_count)
+        self.table['html_table_content'] = [
+            self._row_to_html(record)
+            for record in self.table.to_dict('records')
+        ]
+
+    def render(self, search_query: str,
+               case_sensitive: bool,
+               filter_names: list[str]
+               ) -> tuple[int, str]:
+        """Return the number of matching datasets and their HTML table.
+
+        Args:
+            search_query: Regular expression searched for in the dataset
+                name.  An empty query matches everything; a query that is
+                not a valid pattern is searched for literally.
+            case_sensitive: Whether the search distinguishes case.
+            filter_names: Any of ``'Dataset'``, ``'Data Link'`` and
+                ``'Paper'``; each selected name drops rows that lack the
+                corresponding field.  ``None`` is treated as no filters.
+
+        Returns:
+            ``(row_count, html)`` for the rows that pass search and filters.
+        """
+        df = self.table
+        if search_query:
+            flags = 0 if case_sensitive else re.IGNORECASE
+            try:
+                pattern = re.compile(search_query, flags)
+            except re.error:
+                # Half-typed or invalid patterns should not crash the UI.
+                pattern = re.compile(re.escape(search_query), flags)
+            # IGNORECASE is used instead of lower-casing the query, which
+            # would turn escapes such as \S into \s.  Rows without a name
+            # never match.
+            matches = df.dataset_name.map(
+                lambda name: isinstance(name, str)
+                and pattern.search(name) is not None)
+            df = df[matches.astype(bool)]
+        selected = set(filter_names or ())
+        df = self.filter_table(df,
+                               'Dataset' in selected,
+                               'Data Link' in selected,
+                               'Paper' in selected)
+        return len(df), self.to_html(df, self.table_header)
+
+    @staticmethod
+    def filter_table(df: pd.DataFrame,
+                     has_dataset: bool,
+                     has_datalink: bool,
+                     has_paper: bool
+                    ) -> pd.DataFrame:
+        """Keep only the rows that have every requested field.
+
+        Args:
+            df: Catalogue rows to filter.
+            has_dataset: Require a dataset name.
+            has_datalink: Require a Hugging Face link or an original link.
+            has_paper: Require a reference paper.
+
+        Returns:
+            The filtered frame; ``df`` itself is not modified.
+        """
+        if has_dataset:
+            df = df[df.dataset_name.notna()]
+        if has_datalink:
+            df = df[df.huggingface_link.notna() | df.original_link.notna()]
+        if has_paper:
+            df = df[df.reference_paper.notna()]
+        return df
+
+    @staticmethod
+    def to_html(df: pd.DataFrame, table_header: str) -> str:
+        """Wrap ``table_header`` and the cached row fragments of ``df`` in a ``<table>``."""
+        table_data = ''.join(df.html_table_content)
+        markup = f'''
+        <table>
+            {table_header}
+            {table_data}
+        </table>'''
+        return markup

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+streamlit
+gradio
+numpy
+pandas

style.css ADDED Viewed

	@@ -0,0 +1,18 @@

+/* Center level-1 headings. */
+h1 {
+    text-align: center;
+  }
+  /* Links inside tables: light blue on a transparent background, no underline. */
+  table a {
+    background-color: transparent;
+    color: #58a6ff;
+    text-decoration: none;
+  }
+  /* Suppress the outline while a link is pressed or hovered. */
+  a:active,
+  a:hover {
+    outline-width: 0;
+  }
+  /* Underline links on hover. */
+  a:hover {
+    text-decoration: underline;
+  }
+  /* Draw a 1px solid border around tables and their cells. */
+  table, th, td {
+    border: 1px solid;
+  }