Spaces:

m7mdal7aj
/

KB-VQA

Sleeping

App Files Files Community

m7mdal7aj committed on May 14, 2024

Commit

cddc6a6

verified ·

1 Parent(s): 020595f

Update my_model/tabs/dataset_analysis.py

Browse files

Files changed (1) hide show

my_model/tabs/dataset_analysis.py +160 -14

my_model/tabs/dataset_analysis.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import Tuple, List, Optional
 from my_model.dataset.dataset_processor import process_okvqa_dataset
 from my_model.config import dataset_config as config
 class OKVQADatasetAnalyzer:
     """
     Provides tools for analyzing and visualizing distributions of question types within given question datasets.
@@ -29,22 +30,22 @@ class OKVQADatasetAnalyzer:
         Parameters:
             train_file_path (str): Path to the training dataset JSON file. This file should contain a list of questions.
-            test_file_path (str): Path to the testing dataset JSON file. This file should also contain a list of
                                   questions.
-            data_choice (str): Specifies which dataset(s) to load and analyze. Valid options are 'train', 'test', or
                                'train_test', indicating whether to load training data, testing data, or both.
-        The constructor initializes the paths, selects the dataset based on the choice, and loads the initial data by
         calling the `load_data` method.
         It also prepares structures for categorizing questions and storing the results.
         """
         self.train_file_path = train_file_path
         self.test_file_path = test_file_path
         self.data_choice = data_choice
         self.questions = []
         self.question_types = Counter()
-        self.Qs = {keyword: [] for keyword in config.QUESTION_KEYWORDS}
         self.load_data()
     def load_data(self) -> None:
@@ -71,7 +72,7 @@ class OKVQADatasetAnalyzer:
         questions.
         """
-        question_keywords = config.QUESTION_KEYWORDS
         for question in self.questions:
             question = contractions.fix(question)
@@ -98,7 +99,7 @@ class OKVQADatasetAnalyzer:
         The chart sorts question types by count in descending order and includes detailed tooltips for interaction.
         This method is intended for visualization in a Streamlit application.
         """
         # Prepare data
         total_questions = sum(self.question_types.values())
         items = [(key, value, (value / total_questions) * 100) for key, value in self.question_types.items()]
@@ -118,7 +119,7 @@ class OKVQADatasetAnalyzer:
         # Create the bar chart
         bars = alt.Chart(df).mark_bar().encode(
             x=alt.X('Question Keyword:N', sort=order, title='Question Keyword', axis=alt.Axis(labelAngle=-45)),
-            y=alt.Y('Count:Q', title='Frequency'),
             color=alt.Color('Question Keyword:N', scale=alt.Scale(scheme='category20'), legend=None),
             tooltip=[alt.Tooltip('Question Keyword:N', title='Type'),
                      alt.Tooltip('Count:Q', title='Count'),
@@ -138,17 +139,83 @@ class OKVQADatasetAnalyzer:
         # Combine the bar and text layers
         chart = (bars + text).properties(
-            width=700,
-            height=400,
-            title='Distribution of Question Keywords'
-        ).configure_title(fontSize=20).configure_axis(
             labelFontSize=12,
-            titleFontSize=14
         )
         # Display the chart in Streamlit
         st.altair_chart(chart, use_container_width=True)
     def export_to_csv(self, qs_filename: str, question_types_filename: str) -> None:
         """
         Exports the categorized questions and their counts to two separate CSV files.
@@ -174,4 +241,83 @@ class OKVQADatasetAnalyzer:
             writer = csv.writer(file)
             writer.writerow(['Question Type', 'Count'])
             for q_type, count in self.question_types.items():
-                writer.writerow([q_type, count])

 from my_model.dataset.dataset_processor import process_okvqa_dataset
 from my_model.config import dataset_config as config
 class OKVQADatasetAnalyzer:
     """
     Provides tools for analyzing and visualizing distributions of question types within given question datasets.
         Parameters:
             train_file_path (str): Path to the training dataset JSON file. This file should contain a list of questions.
+            test_file_path (str): Path to the testing dataset JSON file. This file should also contain a list of
                                   questions.
+            data_choice (str): Specifies which dataset(s) to load and analyze. Valid options are 'train', 'test', or
                                'train_test', indicating whether to load training data, testing data, or both.
+        The constructor initializes the paths, selects the dataset based on the choice, and loads the initial data by
         calling the `load_data` method.
         It also prepares structures for categorizing questions and storing the results.
         """
         self.train_file_path = train_file_path
         self.test_file_path = test_file_path
         self.data_choice = data_choice
         self.questions = []
         self.question_types = Counter()
+        self.Qs = {keyword: [] for keyword in config.QUESTION_KEYWORDS + ['others']}
         self.load_data()
     def load_data(self) -> None:
         questions.
         """
+        question_keywords = self.QUESTION_KEYWORDS
         for question in self.questions:
             question = contractions.fix(question)
         The chart sorts question types by count in descending order and includes detailed tooltips for interaction.
         This method is intended for visualization in a Streamlit application.
         """
         # Prepare data
         total_questions = sum(self.question_types.values())
         items = [(key, value, (value / total_questions) * 100) for key, value in self.question_types.items()]
         # Create the bar chart
         bars = alt.Chart(df).mark_bar().encode(
             x=alt.X('Question Keyword:N', sort=order, title='Question Keyword', axis=alt.Axis(labelAngle=-45)),
+            y=alt.Y('Count:Q', title='Question Count'),
             color=alt.Color('Question Keyword:N', scale=alt.Scale(scheme='category20'), legend=None),
             tooltip=[alt.Tooltip('Question Keyword:N', title='Type'),
                      alt.Tooltip('Count:Q', title='Count'),
         # Combine the bar and text layers
         chart = (bars + text).properties(
+            width=800,
+            height=600,
+        ).configure_axis(
             labelFontSize=12,
+            titleFontSize=16,
+            labelFontWeight='bold',
+            titleFontWeight='bold',
+            grid=False
+        ).configure_text(
+            fontWeight='bold'
+        ).configure_title(
+        fontSize=20,
+        font='bold',
+        anchor='middle'
+        )
+        # Display the chart in Streamlit
+        st.altair_chart(chart, use_container_width=True)
+    def plot_bar_chart(self, df: pd.DataFrame, category_col: str, value_col: str, chart_title: str) -> None:
+        """
+        Plots an interactive bar chart using Altair and Streamlit.
+        Args:
+            df (pd.DataFrame): DataFrame containing the data for the bar chart.
+            category_col (str): Name of the column containing the categories.
+            value_col (str): Name of the column containing the values.
+            chart_title (str): Title of the chart.
+        Returns:
+            None
+        """
+        # Calculate percentage for each category
+        df['Percentage'] = (df[value_col] / df[value_col].sum()) * 100
+        df['PercentageText'] = df['Percentage'].round(1).astype(str) + '%'
+        # Create the bar chart
+        bars = alt.Chart(df).mark_bar().encode(
+            x=alt.X(field=category_col, title='Category', sort='-y', axis=alt.Axis(labelAngle=-45)),
+            y=alt.Y(field=value_col, type='quantitative', title='Percentage'),
+            color=alt.Color(field=category_col, type='nominal', legend=None),
+            tooltip=[
+                alt.Tooltip(field=category_col, type='nominal', title='Category'),
+                alt.Tooltip(field=value_col, type='quantitative', title='Percentage'),
+                alt.Tooltip(field='Percentage', type='quantitative', title='Percentage', format='.1f')
+            ]
+        ).properties(
+            width=800,
+            height=600
+        )
+        # Add text labels to the bars
+        text = bars.mark_text(
+            align='center',
+            baseline='bottom',
+            dy=-10  # Nudges text up so it appears above the bar
+        ).encode(
+            text=alt.Text('PercentageText:N')
         )
+        # Combine the bar chart and text labels
+        chart = (bars + text).configure_title(
+            fontSize=20
+        ).configure_axis(
+            labelFontSize=12,
+            titleFontSize=16,
+            labelFontWeight='bold',
+            titleFontWeight='bold',
+            grid=False
+        ).configure_text(
+            fontWeight='bold')
         # Display the chart in Streamlit
         st.altair_chart(chart, use_container_width=True)
     def export_to_csv(self, qs_filename: str, question_types_filename: str) -> None:
         """
         Exports the categorized questions and their counts to two separate CSV files.
             writer = csv.writer(file)
             writer.writerow(['Question Type', 'Count'])
             for q_type, count in self.question_types.items():
+                writer.writerow([q_type, count])
+def run_dataset_analyzer():
+    datasets_comparison_table = pd.read_excel("dataset_analyses.xlsx", sheet_name="VQA Datasets Comparison")
+    okvqa_dataset_characteristics = pd.read_excel("dataset_analyses.xlsx", sheet_name="OK-VQA Dataset Characteristics")
+    val_data = process_okvqa_dataset('OpenEnded_mscoco_val2014_questions.json', 'mscoco_val2014_annotations.json',
+                                     save_to_csv=False)
+    train_data = process_okvqa_dataset('OpenEnded_mscoco_train2014_questions.json', 'mscoco_train2014_annotations.json',
+                                       save_to_csv=False)
+    dataset_analyzer = OKVQADatasetAnalyzer('OpenEnded_mscoco_train2014_questions.json',
+                                            'OpenEnded_mscoco_val2014_questions.json', 'train_test')
+    with st.container():
+        st.markdown("## Overview of KB-VQA Datasets")
+        col1, col2 = st.columns([2, 1])
+        with col1:
+            st.write(" ")
+            with st.expander("1 - Knowledge-Based VQA (KB-VQA)"):
+                st.markdown(""" [Knowledge-Based VQA (KB-VQA)](https://arxiv.org/abs/1511.02570): One of the earliest
+                                datasets in this domain, KB-VQA comprises 700 images and 2,402 questions, with each
+                                question associated with both an image and a knowledge base (KB). The KB encapsulates
+                                facts about the world, including object names, properties, and relationships, aiming to
+                                 foster models capable of answering questions through reasoning over both the image
+                                 and the KB.\n""")
+            with st.expander("2 - Factual VQA (FVQA)"):
+                st.markdown(""" [Factual VQA (FVQA)](https://arxiv.org/abs/1606.05433): This dataset includes 2,190
+                                images and 5,826 questions, accompanied by a knowledge base containing 193,449 facts.
+                                The FVQA's questions are predominantly factual and less open-ended compared to those
+                                in KB-VQA, offering a different challenge in knowledge-based reasoning.\n""")
+            with st.expander("3 - Outside-Knowledge VQA (OK-VQA)"):
+                st.markdown(""" [Outside-Knowledge VQA (OK-VQA)](https://arxiv.org/abs/1906.00067): OK-VQA poses a more
+                                demanding challenge than KB-VQA, featuring an open-ended knowledge base that can be
+                                updated during model training. This dataset contains 14,055 questions and 14,031 images.
+                                Questions are carefully curated to ensure they require reasoning beyond the image
+                                content alone.\n""")
+            with st.expander("4 - Augmented OK-VQA (A-OKVQA)"):
+                st.markdown(""" [Augmented OK-VQA (A-OKVQA)](https://arxiv.org/abs/2206.01718): Augmented successor of
+                                OK-VQA dataset, focused on common-sense knowledge and reasoning rather than purely
+                                factual knowledge, A-OKVQA offers approximately 24,903 questions across 23,692 images.
+                                Questions in this dataset demand commonsense reasoning about the scenes depicted in the
+                                images, moving beyond straightforward knowledge base queries. It also provides
+                                rationales for answers, aiming to be a significant testbed for the development of AI
+                                models that integrate visual and natural language reasoning.\n""")
+        with col2:
+            st.markdown("#### KB-VQA Datasets Comparison")
+            st.write(datasets_comparison_table, use_column_width=True)
+    st.write("-----------------------")
+    with st.container():
+        st.write("\n" * 10)
+        st.markdown("## OK-VQA Dataset")
+        st.write("This model was fine-tuned and evaluated using OK-VQA dataset.\n")
+        col1, col2, col3 = st.columns([2, 5, 5])
+        with col1:
+            st.markdown("#### OK-VQA Dataset Characteristics")
+            st.write(okvqa_dataset_characteristics)
+        with col2:
+            df = pd.read_excel("dataset_analyses.xlsx", sheet_name="Question Category Dist")
+            st.markdown("#### Questions Distribution over Knowledge Category")
+            dataset_analyzer.plot_bar_chart(df, "Knowledge Category", "Percentage", "Questions Distribution over "
+                                                                                      "Knowledge Category")
+        with col3:
+            #with st.expander("Distribution of Question Keywords"):
+            dataset_analyzer.categorize_questions()
+            st.markdown("#### Distribution of Question Keywords")
+            dataset_analyzer.plot_question_distribution()
+    with st.container():
+        with st.expander("Show Dataset Samples"):
+            st.write(train_data[:10])