# Hugging Face Space: ai4data/monitoring_of_datause (status: Running)
# File size: 27,437 bytes

import html
import json
import os
from collections import Counter, defaultdict
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr

# ── Local CONFIG ──────────────────────────────────────────────────────────────
# JSON file holding the records; a relative path, so it is resolved against
# the current working directory.
DATA_FILE = "consolidated_data_optimized.json"


def load_initial_data() -> List[Dict]:
    """Load the records from ``DATA_FILE`` and order them for browsing.

    Records are sorted in place by the number of entries in their
    ``ner_text`` list, largest first. The sort is stable, so records with
    the same count keep their order from the file.

    Returns:
        The parsed records.

    Raises:
        FileNotFoundError: If ``DATA_FILE`` does not exist.
    """
    if not os.path.exists(DATA_FILE):
        raise FileNotFoundError(f"{DATA_FILE} not found in current directory.")
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ``or []`` treats an explicit JSON null the same as a missing key;
    # ``len(None)`` would otherwise abort the whole load.
    data.sort(key=lambda rec: len(rec.get("ner_text") or []), reverse=True)

    return data


class DynamicDataset:
    """List of records plus a cursor remembering the last record served."""

    def __init__(self, data: List[Dict]):
        # Cursor starts on the first record.
        self.current = 0
        self.data = data
        self.len = len(data)

    def example(self, idx: int) -> Dict:
        """Return the record at ``idx``, clamped into the valid index range.

        The clamped index is stored in ``self.current`` before the record is
        looked up.
        """
        last_index = self.len - 1
        capped = min(last_index, idx)
        self.current = max(0, capped)
        record = self.data[self.current]
        return record


class ComparisonManager:
    """Index borderline records so validated and rejected examples can be compared.

    Only records tagged ``"borderline"`` are indexed. They are grouped two ways:

    * ``type_groups``: by the record's ``type`` (when it has one);
    * ``term_groups``: by the text covered by the record's first ``ner_text``
      span, for records that also carry the ``"confusing_term"`` tag and
      whose first span is labelled ``"named"``.

    Each group is a dict with a ``'validated'`` and a ``'not_validated'`` list.
    ``mixed_types`` and ``confusing_terms`` list the group keys that have at
    least one record in *both* lists, ordered by total group size (largest
    first).
    """

    def __init__(self, data: List[Dict]):
        self.data = data
        self.type_groups = defaultdict(self._empty_group)
        self.term_groups = defaultdict(self._empty_group)

        for rec in data:
            # ``or []`` tolerates an explicit ``"tags": null`` in the data.
            tags = rec.get("tags") or []

            # Only include borderline cases
            if "borderline" not in tags:
                continue

            bucket = self._bucket(rec.get("validated", False))

            dtype = rec.get("type")
            if dtype:
                self.type_groups[dtype][bucket].append(rec)

            if "confusing_term" in tags:
                term = self._first_named_term(rec)
                if term:
                    self.term_groups[term][bucket].append(rec)

        self.mixed_types = self._mixed_keys(self.type_groups)
        self.confusing_terms = self._mixed_keys(self.term_groups)

    @staticmethod
    def _empty_group() -> Dict[str, List[Dict]]:
        """Return a fresh group with empty validated / not-validated lists."""
        return {'validated': [], 'not_validated': []}

    @staticmethod
    def _bucket(is_validated: bool) -> str:
        """Map a validation flag to the group list it belongs in."""
        return 'validated' if is_validated else 'not_validated'

    @staticmethod
    def _first_named_term(rec: Dict) -> Optional[str]:
        """Return the text under the record's first span if it is labelled ``"named"``.

        Returns ``None`` when the record has no text, no spans, a first span
        that is not a ``(start, end, label)`` triple with integer-like
        offsets, a different label, or an empty slice. Malformed spans are
        skipped rather than raised so one bad record cannot break indexing.
        """
        spans = rec.get("ner_text") or []
        text = rec.get("text")
        if not spans or not text:
            return None
        try:
            start, end, label = spans[0]
            start, end = int(start), int(end)
        except (TypeError, ValueError):
            return None
        if label != 'named':
            return None
        return text[start:end] or None

    @staticmethod
    def _mixed_keys(groups: Dict) -> List:
        """Return keys with both validated and not-validated records, largest group first."""
        sized = [
            (key, len(group['validated']) + len(group['not_validated']))
            for key, group in groups.items()
            if group['validated'] and group['not_validated']
        ]
        # Stable sort: equally sized groups keep their insertion order.
        sized.sort(key=lambda item: item[1], reverse=True)
        return [key for key, _ in sized]

    def _pick(self, groups: Dict, key, is_validated: bool, idx: int) -> Dict:
        """Return the ``idx``-th record (wrapping around) of a group list, or ``{}``."""
        # Membership test first: indexing a defaultdict would create the key.
        if key not in groups:
            return {}
        records = groups[key][self._bucket(is_validated)]
        if not records:
            return {}
        return records[idx % len(records)]

    def _count(self, groups: Dict, key, is_validated: bool) -> int:
        """Return how many records a group list holds (0 for an unknown key)."""
        if key not in groups:
            return 0
        return len(groups[key][self._bucket(is_validated)])

    def get_example_by_type(self, dtype: str, is_validated: bool, idx: int) -> Dict:
        """Return an example of ``dtype`` with the given validation outcome.

        ``idx`` wraps around the number of available examples. Returns ``{}``
        when the type is unknown or has no example with that outcome.
        """
        return self._pick(self.type_groups, dtype, is_validated, idx)

    def get_count_by_type(self, dtype: str, is_validated: bool) -> int:
        """Return the number of examples of ``dtype`` with the given outcome."""
        return self._count(self.type_groups, dtype, is_validated)

    def get_example_by_term(self, term: str, is_validated: bool, idx: int) -> Dict:
        """Return an example mentioning ``term`` with the given validation outcome.

        ``idx`` wraps around the number of available examples. Returns ``{}``
        when the term is unknown or has no example with that outcome.
        """
        return self._pick(self.term_groups, term, is_validated, idx)

    def get_count_by_term(self, term: str, is_validated: bool) -> int:
        """Return the number of examples mentioning ``term`` with the given outcome."""
        return self._count(self.term_groups, term, is_validated)


# ── Highlight utils ──────────────────────────────────────────────────────────
def prepare_for_highlight(rec: Dict) -> List[Tuple[str, Optional[str]]]:
    """Split a record's text into ``(segment, label)`` pairs for highlighting.

    Spans come from ``rec["ner_text"]`` as ``(start, end, label)`` triples.
    Text outside any span is emitted with label ``None``; text inside a span
    is emitted with ``str(label)``. Concatenating the returned segments
    always reproduces the record's text exactly.

    Spans are skipped when they are not a 3-item sequence, have offsets that
    cannot be converted to ``int``, start before 0, or cover no text after
    clipping. A span that overlaps an earlier one is clipped to the part
    that has not been emitted yet, so no text is duplicated.

    Args:
        rec: Record with optional ``text`` and ``ner_text`` entries; missing
            or ``None`` values are treated as empty.

    Returns:
        The ordered list of segments (empty for empty text).
    """
    text = rec.get("text", "") or ""
    text_len = len(text)

    # Coerce offsets *before* sorting so that string offsets ("10" vs "9")
    # are ordered numerically, and drop malformed spans up front.
    spans = []
    for span in rec.get("ner_text", []) or []:
        try:
            start, end, label = span
            spans.append((int(start), int(end), label))
        except (TypeError, ValueError):
            continue
    spans.sort(key=lambda item: item[0])

    segments: List[Tuple[str, Optional[str]]] = []
    last_idx = 0

    for start, end, label in spans:
        if start < 0:
            continue
        # Clip to the not-yet-emitted part of the text.
        start = max(start, last_idx)
        end = min(end, text_len)
        if end <= start:
            continue

        if start > last_idx:
            segments.append((text[last_idx:start], None))

        segments.append((text[start:end], str(label)))
        last_idx = end

    if last_idx < text_len:
        segments.append((text[last_idx:], None))

    return segments


# ── Filtering helpers ─────────────────────────────────────────────────────────
def record_matches_filters(rec: Dict, dataset_filter: str, type_filter: str) -> bool:
    """Tell whether a record passes both the status filter and the type filter.

    Args:
        rec: The record to test.
        dataset_filter: ``"Datasets only"`` keeps validated records,
            ``"Non-datasets only"`` keeps non-validated ones,
            ``"Borderline Cases Only"`` keeps records tagged ``"borderline"``;
            any other value (e.g. ``"All"``) keeps everything.
        type_filter: ``"All types"`` keeps everything; any other value must
            equal the record's ``type``.

    Returns:
        ``True`` only when the record satisfies both filters.
    """
    is_validated = rec.get("validated", False)
    # ``or []`` tolerates an explicit ``"tags": null`` in the data.
    tags = rec.get("tags") or []

    if dataset_filter == "Datasets only" and not is_validated:
        return False
    if dataset_filter == "Non-datasets only" and is_validated:
        return False
    # Reject here instead of returning the tag test directly, so the type
    # filter below still applies to borderline records.
    if dataset_filter == "Borderline Cases Only" and "borderline" not in tags:
        return False

    if type_filter != "All types":
        return rec.get("type") == type_filter

    return True


# ── Documentation ─────────────────────────────────────────────────────────────
# Markdown user guide; passed verbatim to gr.Markdown in the "How to Use" tab
# built by create_demo. The text is user-facing, so edits here change what the
# app displays.
DOCUMENTATION = """
# 📊 Monitoring of Data Use - User Guide

## What is this tool?

This application helps you **review and explore dataset mentions** extracted from documents. 
It displays text excerpts where potential datasets have been identified, along with metadata about each mention.

## What you'll see

Each record shows:
- **📄 Source Document**: The filename and page number where the text was found
- **🔍 Highlighted Text**: The original text with dataset mentions highlighted
- **📋 Data Type**: The category of the dataset (e.g., census, survey, database)
- **✅ Dataset Status**: Whether this mention actually refers to a dataset
- **💡 Context**: The surrounding text that provides context
- **📝 Explanation**: Why this was classified as a dataset (or not)
- **🏷️ Tags**: Borderline, mixed type, or confusing term indicators

## How to use this tool

### 🎯 Navigation
- **Browse Records**: Use the slider to jump to any record by number
- **Previous/Next Buttons**: Navigate through records one at a time
- **Filters**: The Previous/Next buttons respect your active filters

### 🔍 Filtering Options

1. **Dataset Status Filter**
   - **All**: Show all records
   - **Datasets only**: Show only records that contain actual dataset references
   - **Non-datasets only**: Show records that were identified but don't actually refer to datasets
   - **🔥 Borderline Cases Only**: Show only confusing/mixed cases

2. **Data Type Filter**
   - Filter by specific data types (census, survey, database, etc.)
   - Types are sorted by frequency (most common first)

### ⚖️ Comparison Tab

The Comparison tab helps you understand **why the same type or term** can be validated differently:

1. **By Type**: Compare examples of the same data type (e.g., "system") with different validation outcomes
2. **By Term**: Compare the exact same term (e.g., "Project MIS") appearing in different contexts

This helps identify:
- What contextual signals distinguish valid from invalid datasets
- Why borderline cases are confusing
- Patterns in validation decisions

### 💡 Tips
- Use filters to focus on specific types of data mentions
- The "Validated" field tells you if the mention is a true dataset reference
- Review the "Explanation" to understand the classification reasoning
- Highlighted text shows exactly where the dataset mention appears in context
- Check tags to identify borderline/confusing cases

## Data Source

This viewer uses data from World Bank project documents with revalidation analysis.
"""


# ── Gradio App ───────────────────────────────────────────────────────────────
def create_demo() -> gr.Blocks:
    """Build the Gradio app: a usage guide, a record viewer and a comparison tab.

    Records are loaded once with ``load_initial_data``. The returned
    ``gr.Blocks`` is assembled but not launched.

    A visitor's position is carried by the slider value and by ``gr.State``
    components rather than by Python objects created here: everything in this
    closure is shared by all sessions the process serves, so a shared cursor
    would let one visitor's Previous/Next clicks move another visitor.

    Raises:
        FileNotFoundError: Propagated from ``load_initial_data`` when the data
            file is missing.
    """
    data = load_initial_data()
    dynamic_dataset = DynamicDataset(data)
    comparison_manager = ComparisonManager(data)
    total = dynamic_dataset.len

    # Count types and sort by frequency (most common first)
    type_counter = Counter(rec.get("type") for rec in data if rec.get("type"))
    type_values = [t for t, _ in type_counter.most_common()]
    type_choices = ["All types"] + type_values

    no_match_msg = "⚠️ No matching records found with the selected filters."

    def esc(value: Any) -> str:
        """Render a record field as HTML-safe text."""
        # Record fields are free text and may contain '<' or '&', which would
        # otherwise be parsed as markup inside the HTML panels.
        return html.escape(str(value))

    def highlight_context(rec, context, is_validated) -> str:
        """Return the escaped context with the record's first mention highlighted.

        Green marks a validated mention, red a rejected one. Falls back to the
        plain escaped context whenever the mention cannot be located.
        """
        plain = esc(context)
        if not (rec.get("ner_text") and rec.get("text") and is_validated is not None):
            return plain
        if not isinstance(context, str):
            return plain
        try:
            first_span = rec["ner_text"][0]
            term = rec["text"][int(first_span[0]):int(first_span[1])]
        except (TypeError, ValueError, IndexError, KeyError):
            # Malformed span: showing the context without highlight is enough.
            return plain
        if not isinstance(term, str) or not term or term not in context:
            return plain

        if is_validated:
            highlight_style = 'background-color: #90ee90; color: black; padding: 2px 4px; border-radius: 4px; font-weight: bold; border: 1px solid #5cb85c;'
        else:
            highlight_style = 'background-color: #ff7f7f; color: black; padding: 2px 4px; border-radius: 4px; font-weight: bold; border: 1px solid #d9534f;'
        marked = f'<span style="{highlight_style}">{esc(term)}</span>'
        # Split on the raw term and escape the pieces separately, so matching
        # happens on the original text and only our own <span> stays markup.
        return marked.join(esc(piece) for piece in context.split(term))

    def make_info(rec):
        """Format record metadata for display as an HTML fragment."""
        fn = rec.get("filename", "—")
        pg = rec.get("page", "—")
        v_type = rec.get("type", "—")
        tags = rec.get("tags") or []
        is_validated = rec.get("validated", False)
        contextual_signal = rec.get("contextual_signal", "—")
        contextual_reason_model = rec.get("contextual_reason_model", "—")
        contextual_reason_agent = rec.get("contextual_reason_agent", "—")

        context_html = highlight_context(rec, rec.get("empirical_context", "—"), is_validated)

        type_html = f"<code>{esc(v_type)}</code>"

        # Add type stats if available
        type_stats = rec.get("type_stats")
        if type_stats:
            type_html += f" <small>(Type: {esc(type_stats['validated'])} ✅ / {esc(type_stats['not_validated'])} ❌)</small>"

        badge_for_tag = (
            ("borderline", "⚠️ <b>Borderline</b>"),
            ("mixed_type", "🔍 <b>Mixed Type</b>"),
            ("confusing_term", "🤔 <b>Confusing Term</b>"),
        )
        tags_html = " ".join(badge for tag, badge in badge_for_tag if tag in tags)

        sections = [f"""
        <h3>📄 Document Information</h3>
        <p><b>File:</b> <code>{esc(fn)}</code><br>
        <b>Page:</b> <code>{esc(pg)}</code></p>

        <h3>🏷️ Type</h3>
        <p>{type_html}</p>
        """]

        if tags_html:
            sections.append(f"""
            <h3>🚩 Tags</h3>
            <p>{tags_html}</p>
            """)

        sections.append(f"""
        <h3>📝 Surrounding Text</h3>
        <p>{context_html}</p>
        """)

        # Add validation analysis
        status_icon = '✅' if is_validated else '❌'
        status_text = 'Is a dataset' if is_validated else 'Not a dataset'
        sections.append(f"""
        <h3>🤖 Validation Analysis</h3>
        <p><b>Assessment:</b> {status_icon} {status_text}</p>
        <p><b>Contextual Signal:</b> <code>{esc(contextual_signal)}</code></p>
        """)

        if contextual_reason_agent:
            sections.append(f"""
            <p><b>Agent Reasoning:</b></p>
            <blockquote style="border-left: 3px solid #ccc; padding-left: 10px; color: #666;">
            {esc(contextual_reason_agent)}
            </blockquote>
            """)

        if contextual_reason_model:
            sections.append(f"""
            <p><b>Model Reasoning:</b></p>
            <blockquote style="border-left: 3px solid #999; padding-left: 10px; color: #888;">
            {esc(contextual_reason_model)}
            </blockquote>
            """)

        return "".join(sections)

    def clamp(idx) -> int:
        """Force a slider value into the valid record index range."""
        return max(0, min(total - 1, int(idx)))

    def render(i: int):
        """Return the viewer outputs (highlight segments, slider value, info HTML) for record ``i``."""
        rec = data[i]
        return prepare_for_highlight(rec), i, make_info(rec)

    # Basic load by slider index (ignores filters)
    def load_example(idx: int):
        if not data:
            return [], 0, no_match_msg
        return render(clamp(idx))

    # When filters change → jump to first matching record
    def jump_on_filters(dataset_filter, type_filter):
        for i, rec in enumerate(data):
            if record_matches_filters(rec, dataset_filter, type_filter):
                return render(i)

        # No match → return blank
        return [], 0, no_match_msg

    # Navigation respecting filters. The starting point is the caller's own
    # slider value, not a cursor shared between sessions.
    def nav_next(current_idx, dataset_filter, type_filter):
        if not data:
            return [], 0, no_match_msg
        here = clamp(current_idx)
        for i in range(here + 1, total):
            if record_matches_filters(data[i], dataset_filter, type_filter):
                return render(i)
        # Nothing further matches: stay where we are.
        return render(here)

    def nav_prev(current_idx, dataset_filter, type_filter):
        if not data:
            return [], 0, no_match_msg
        here = clamp(current_idx)
        for i in range(here - 1, -1, -1):
            if record_matches_filters(data[i], dataset_filter, type_filter):
                return render(i)
        # Nothing earlier matches: stay where we are.
        return render(here)

    def build_comparison(key, pos_idx, neg_idx, get_example, get_count, prompt):
        """Return the six comparison-pane outputs for a selected type or term.

        Output order: validated highlight, validated info, rejected highlight,
        rejected info, validated header, rejected header.
        """
        if not key:
            return [], prompt, [], prompt, "### ✅ IS Dataset", "### ❌ NOT Dataset"

        def side(is_validated, idx):
            rec = get_example(key, is_validated, idx)
            count = get_count(key, is_validated)
            # The stored index only ever grows; the manager wraps it, so the
            # header shows the wrapped 1-based position.
            position = (idx % count) + 1 if count > 0 else 0
            if not rec:
                return [], "No examples", position, count
            return prepare_for_highlight(rec), make_info(rec), position, count

        pos_hl, pos_info, pos_at, pos_total = side(True, pos_idx)
        neg_hl, neg_info, neg_at, neg_total = side(False, neg_idx)

        pos_header = f"### ✅ IS Dataset ({pos_at}/{pos_total})"
        neg_header = f"### ❌ NOT Dataset ({neg_at}/{neg_total})"

        return pos_hl, pos_info, neg_hl, neg_info, pos_header, neg_header

    # Comparison Logic - By Type
    def load_type_comparison(dtype, pos_idx, neg_idx):
        return build_comparison(
            dtype, pos_idx, neg_idx,
            comparison_manager.get_example_by_type,
            comparison_manager.get_count_by_type,
            "Select a type",
        )

    # Comparison Logic - By Term
    def load_term_comparison(term, pos_idx, neg_idx):
        return build_comparison(
            term, pos_idx, neg_idx,
            comparison_manager.get_example_by_term,
            comparison_manager.get_count_by_term,
            "Select a term",
        )

    def next_example(current_idx):
        return current_idx + 1

    # ---- UI ----
    with gr.Blocks(title="Monitoring of Data Use") as demo:
        gr.Markdown("# 📊 Monitoring of Data Use")

        with gr.Tabs():
            with gr.Tab("📖 How to Use"):
                gr.Markdown(DOCUMENTATION)

            with gr.Tab("🔍 Viewer"):
                with gr.Row():
                    prog = gr.Slider(
                        minimum=0,
                        # Never below the minimum, even for an empty dataset.
                        maximum=max(total - 1, 0),
                        value=0,
                        step=1,
                        # The slider is 0-based; the label states the real range.
                        label=f"📑 Browse Records (0 to {max(total - 1, 0):,})",
                        interactive=True,
                    )

                with gr.Row():
                    dataset_filter = gr.Dropdown(
                        choices=["All", "Datasets only", "Non-datasets only", "Borderline Cases Only"],
                        value="Datasets only",
                        label="🎯 Filter by Validation Status",
                    )

                    type_filter = gr.Dropdown(
                        choices=type_choices,
                        value="All types",
                        label="📂 Filter by Data Type",
                    )

                inp_box = gr.HighlightedText(
                    label="📄 Document Text (with highlighted dataset mentions)",
                    interactive=False,
                    show_legend=False,
                    value=""
                )

                info_md = gr.HTML(label="ℹ️ Record Details")

                with gr.Row():
                    prev_btn = gr.Button("⬅️ Previous", variant="secondary", size="lg")
                    next_btn = gr.Button("Next ➡️", variant="primary", size="lg")

                viewer_outputs = [inp_box, prog, info_md]
                filter_inputs = [dataset_filter, type_filter]
                nav_inputs = [prog, dataset_filter, type_filter]

                # Initial load: show the first record matching the default
                # filters, so the record agrees with what the dropdowns say.
                demo.load(
                    fn=jump_on_filters,
                    inputs=filter_inputs,
                    outputs=viewer_outputs,
                )

                # Slider navigation
                prog.release(
                    fn=load_example,
                    inputs=prog,
                    outputs=viewer_outputs,
                )

                # Filters
                dataset_filter.change(
                    fn=jump_on_filters,
                    inputs=filter_inputs,
                    outputs=viewer_outputs,
                )
                type_filter.change(
                    fn=jump_on_filters,
                    inputs=filter_inputs,
                    outputs=viewer_outputs,
                )

                # Prev / Next navigation respecting filters
                prev_btn.click(
                    fn=nav_prev,
                    inputs=nav_inputs,
                    outputs=viewer_outputs,
                )
                next_btn.click(
                    fn=nav_next,
                    inputs=nav_inputs,
                    outputs=viewer_outputs,
                )

            with gr.Tab("⚖️ Comparison"):
                gr.Markdown("### Side-by-Side Comparison of Borderline Cases")
                gr.Markdown("Compare examples to understand **why the same type or term** is validated differently based on context.")

                comparison_mode = gr.Radio(
                    choices=["By Type", "By Term"],
                    value="By Type",
                    label="Comparison Mode"
                )

                # Type comparison
                with gr.Group(visible=True) as type_comparison_group:
                    gr.Markdown("**Compare by Data Type**: See how the same type (e.g., 'system') can be valid or invalid")
                    comp_type_selector = gr.Dropdown(
                        choices=comparison_manager.mixed_types,
                        value=comparison_manager.mixed_types[0] if comparison_manager.mixed_types else None,
                        label="Select Mixed Type to Compare",
                    )

                    type_pos_idx_state = gr.State(0)
                    type_neg_idx_state = gr.State(0)

                    with gr.Row():
                        with gr.Column():
                            type_pos_header = gr.Markdown("### ✅ IS Dataset")
                            type_pos_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
                            type_pos_info_box = gr.HTML()
                            type_pos_next_btn = gr.Button("Next Example ➡️")

                        with gr.Column():
                            type_neg_header = gr.Markdown("### ❌ NOT Dataset")
                            type_neg_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
                            type_neg_info_box = gr.HTML()
                            type_neg_next_btn = gr.Button("Next Example ➡️")

                # Term comparison
                with gr.Group(visible=False) as term_comparison_group:
                    gr.Markdown("**Compare by Term**: See how the exact same term appears in different validation contexts")
                    comp_term_selector = gr.Dropdown(
                        choices=comparison_manager.confusing_terms,
                        value=comparison_manager.confusing_terms[0] if comparison_manager.confusing_terms else None,
                        label="Select Confusing Term to Compare",
                    )

                    term_pos_idx_state = gr.State(0)
                    term_neg_idx_state = gr.State(0)

                    with gr.Row():
                        with gr.Column():
                            term_pos_header = gr.Markdown("### ✅ IS Dataset")
                            term_pos_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
                            term_pos_info_box = gr.HTML()
                            term_pos_next_btn = gr.Button("Next Example ➡️")

                        with gr.Column():
                            term_neg_header = gr.Markdown("### ❌ NOT Dataset")
                            term_neg_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
                            term_neg_info_box = gr.HTML()
                            term_neg_next_btn = gr.Button("Next Example ➡️")

                # Toggle visibility based on mode
                def toggle_comparison_mode(mode):
                    return gr.update(visible=mode == "By Type"), gr.update(visible=mode == "By Term")

                comparison_mode.change(
                    fn=toggle_comparison_mode,
                    inputs=[comparison_mode],
                    outputs=[type_comparison_group, term_comparison_group]
                )

                def wire_comparison(selector, pos_state, neg_state, pos_btn, neg_btn, loader, outputs):
                    """Attach selector / next-button / initial-load events for one comparison pane."""
                    inputs = [selector, pos_state, neg_state]

                    # A new selection restarts both sides at their first example.
                    selector.change(
                        fn=lambda: (0, 0),
                        outputs=[pos_state, neg_state]
                    ).then(fn=loader, inputs=inputs, outputs=outputs)

                    pos_btn.click(
                        fn=next_example,
                        inputs=[pos_state],
                        outputs=[pos_state]
                    ).then(fn=loader, inputs=inputs, outputs=outputs)

                    neg_btn.click(
                        fn=next_example,
                        inputs=[neg_state],
                        outputs=[neg_state]
                    ).then(fn=loader, inputs=inputs, outputs=outputs)

                    # The selector starts with a value but emits no change
                    # event for it, so fill the panes once on page load.
                    demo.load(fn=loader, inputs=inputs, outputs=outputs)

                # Type comparison events
                wire_comparison(
                    comp_type_selector,
                    type_pos_idx_state,
                    type_neg_idx_state,
                    type_pos_next_btn,
                    type_neg_next_btn,
                    load_type_comparison,
                    [type_pos_hl_box, type_pos_info_box, type_neg_hl_box, type_neg_info_box, type_pos_header, type_neg_header],
                )

                # Term comparison events
                wire_comparison(
                    comp_term_selector,
                    term_pos_idx_state,
                    term_neg_idx_state,
                    term_pos_next_btn,
                    term_neg_next_btn,
                    load_term_comparison,
                    [term_pos_hl_box, term_pos_info_box, term_neg_hl_box, term_neg_info_box, term_pos_header, term_neg_header],
                )

    return demo


if __name__ == "__main__":
    create_demo().launch(share=False, debug=False)