Spaces:

m7mdal7aj
/

KB-VQA

Sleeping

App Files Files Community

m7mdal7aj committed on May 13, 2024

Commit

c996cf4

verified ·

1 Parent(s): 67b9883

Create dataset_analysis.py

Browse files

Files changed (1) hide show

my_model/tabs/dataset_analysis.py +177 -0

my_model/tabs/dataset_analysis.py ADDED Viewed

	@@ -0,0 +1,177 @@

import csv
import json
from collections import Counter
from typing import Dict, List, Optional, Tuple

import altair as alt
import contractions
import pandas as pd
import streamlit as st

from my_model.config import dataset_config as config
from my_model.dataset.dataset_processor import process_okvqa_dataset


class OKVQADatasetAnalyzer:
    """
    Provides tools for analyzing and visualizing distributions of question types within given question datasets.

    It supports operations such as data loading, categorization of questions based on keywords, visualization of
    question distribution, and exporting data to CSV files.

    Attributes:
        train_file_path (str): Path to the training dataset file.
        test_file_path (str): Path to the testing dataset file.
        data_choice (str): Choice of dataset(s) to analyze; options include 'train', 'test', or 'train_test'.
        questions (List[str]): List of questions aggregated based on the dataset choice.
        question_types (Counter): Counter object tracking the frequency of each question type.
        Qs (Dict[str, List[str]]): Dictionary mapping question types to lists of corresponding questions.
    """

    def __init__(self, train_file_path: str, test_file_path: str, data_choice: str):
        """
        Initializes the OKVQADatasetAnalyzer with paths to dataset files and a choice of which datasets to analyze.

        Parameters:
            train_file_path (str): Path to the training dataset JSON file. The file must hold an object with a
                                   'questions' list whose items each carry a 'question' string.
            test_file_path (str): Path to the testing dataset JSON file, with the same layout.
            data_choice (str): Specifies which dataset(s) to load and analyze. Valid options are 'train', 'test', or
                               'train_test', indicating whether to load training data, testing data, or both.
                               Any other value loads no questions.

        The constructor stores the paths and the choice, prepares the structures used for categorizing questions,
        and loads the initial data by calling the `load_data` method.
        """
        self.train_file_path = train_file_path
        self.test_file_path = test_file_path
        self.data_choice = data_choice
        self.questions: List[str] = []
        self.question_types: Counter = Counter()
        # One (initially empty) bucket per configured keyword; extra buckets such as 'others' are created on demand.
        self.Qs: Dict[str, List[str]] = {keyword: [] for keyword in config.QUESTION_KEYWORDS}
        self.load_data()

    @staticmethod
    def _read_questions(file_path: str) -> List[str]:
        """Returns the 'question' string of every entry in the 'questions' list of one JSON file."""
        # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return [entry['question'] for entry in data['questions']]

    def load_data(self) -> None:
        """
        Loads the dataset(s) from the specified JSON file(s) based on the user's choice of 'train', 'test', or
        'train_test'.

        The internal list of questions is rebuilt from scratch on every call (training questions first, then
        testing questions), so calling this method again does not duplicate questions.

        Raises:
            OSError: If a selected file cannot be opened.
            json.JSONDecodeError: If a selected file is not valid JSON.
            KeyError: If a selected file lacks the 'questions' list or an entry lacks 'question'.
        """
        loaded: List[str] = []
        if self.data_choice in ('train', 'train_test'):
            loaded += self._read_questions(self.train_file_path)
        if self.data_choice in ('test', 'train_test'):
            loaded += self._read_questions(self.test_file_path)
        # Assign only after every selected file was read successfully.
        self.questions = loaded

    def categorize_questions(self) -> None:
        """
        Categorizes each question in the loaded data into predefined categories based on keywords.

        A question whose first two words are 'name the' goes to the 'name the' category; otherwise the first word
        that is a configured keyword decides the category, and questions without any keyword go to 'others'.
        Contractions are expanded before matching, and the expanded question is what gets stored.

        This method updates the internal dictionary `self.Qs` and the Counter `self.question_types`. Previous
        results are cleared first, so repeated calls do not inflate the counts.
        """
        keywords = frozenset(config.QUESTION_KEYWORDS)

        # Start from a clean slate while keeping the configured (possibly empty) buckets in place.
        self.question_types.clear()
        for bucket in self.Qs.values():
            bucket.clear()

        for question in self.questions:
            question = contractions.fix(question)
            words = question.lower().split()
            if words[:2] == ['name', 'the']:
                category = 'name the'
            else:
                category = next((word for word in words if word in keywords), 'others')
            self.question_types[category] += 1
            # 'others' / 'name the' are not guaranteed to be configured keywords, so create the bucket if needed.
            self.Qs.setdefault(category, []).append(question)

    def plot_question_distribution(self) -> None:
        """
        Plots an interactive bar chart of question types using Altair and Streamlit, displaying the count and
        percentage of each type.

        The chart sorts question types by count in descending order, always placing the 'others' category last,
        and includes detailed tooltips for interaction. If no questions have been categorized yet, a Streamlit
        warning is shown instead of a chart.

        This method is intended for visualization in a Streamlit application.
        """
        counts = self.question_types
        total_questions = sum(counts.values())
        if total_questions == 0:
            # Nothing to plot; also avoids dividing by zero when computing percentages.
            st.warning('No categorized questions to plot; run categorize_questions() first.')
            return

        # Keywords by descending count, with 'others' pinned to the end when present.
        ordered_keywords = sorted(
            (keyword for keyword in counts if keyword != 'others'),
            key=lambda keyword: counts[keyword],
            reverse=True,
        )
        if 'others' in counts:
            ordered_keywords.append('others')

        df = pd.DataFrame({
            'Question Keyword': ordered_keywords,
            'Count': [counts[keyword] for keyword in ordered_keywords],
            'Percentage': [(counts[keyword] / total_questions) * 100 for keyword in ordered_keywords],
        })

        # Create the bar chart; the explicit sort keeps the x-axis in the order computed above.
        bars = alt.Chart(df).mark_bar().encode(
            x=alt.X('Question Keyword:N', sort=ordered_keywords, title='Question Keyword',
                    axis=alt.Axis(labelAngle=-45)),
            y=alt.Y('Count:Q', title='Frequency'),
            color=alt.Color('Question Keyword:N', scale=alt.Scale(scheme='category20'), legend=None),
            tooltip=[alt.Tooltip('Question Keyword:N', title='Type'),
                     alt.Tooltip('Count:Q', title='Count'),
                     alt.Tooltip('Percentage:Q', title='Percentage', format='.1f')]
        )

        # Create text labels for the bars with count and percentage
        text = bars.mark_text(
            align='center',
            baseline='bottom',
            dy=-5  # Nudges text up so it appears above the bar
        ).encode(
            text=alt.Text('PercentageText:N')
        ).transform_calculate(
            PercentageText="datum.Count + ' (' + format(datum.Percentage, '.1f') + '%)'"
        )

        # Combine the bar and text layers
        chart = (bars + text).properties(
            width=700,
            height=400,
            title='Distribution of Question Keywords'
        ).configure_title(fontSize=20).configure_axis(
            labelFontSize=12,
            titleFontSize=14
        )

        # Display the chart in Streamlit
        st.altair_chart(chart, use_container_width=True)

    def export_to_csv(self, qs_filename: str, question_types_filename: str) -> None:
        """
        Exports the categorized questions and their counts to two separate CSV files.

        Parameters:
            qs_filename (str): The filename or path for exporting the `self.Qs` dictionary data.
            question_types_filename (str): The filename or path for exporting the `self.question_types` Counter data.

        The first file gets one row per (question type, question) pair; the second gets one row per
        (question type, count) pair. Both files are written as UTF-8 and start with a header row.
        """
        # Export self.Qs dictionary, flattened to one row per question.
        with open(qs_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Question Type', 'Questions'])
            writer.writerows(
                [q_type, question]
                for q_type, questions in self.Qs.items()
                for question in questions
            )

        # Export self.question_types Counter
        with open(question_types_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Question Type', 'Count'])
            writer.writerows(self.question_types.items())