"""Gradio app for reviewing dataset mentions extracted from documents.

Records are loaded from ``DATA_FILE``. The app offers a filterable record
viewer plus a side-by-side comparison of "mixed" data types, i.e. types
that the LLM assessed as a dataset in some records and not in others.
"""

import json
import os
from collections import Counter, defaultdict
from html import escape
from typing import Any, Dict, List, Optional, Tuple

try:
    import gradio as gr
except ImportError:
    # Keep the pure helpers importable (and unit-testable) without the UI
    # package; create_demo() raises a clear error if it is really missing.
    gr = None

# ── Local CONFIG ──────────────────────────────────────────────────────────────
DATA_FILE = "gradio_ner_data.json"

# Dropdown labels, shared by the UI and the filtering helper.
FILTER_MIXED = "🔥 Show Confusion/Mixed Cases"
FILTER_ALL = "All"
FILTER_DATASETS = "LLM: Datasets only"
FILTER_NON_DATASETS = "LLM: Non-datasets only"
ALL_TYPES = "All types"
NO_MATCH_MESSAGE = "⚠️ No matching records found with the selected filters."

_HIGHLIGHT_STYLE = (
    "background-color: #ffd700; color: black; padding: 2px 4px; "
    "border-radius: 4px; font-weight: bold; border: 1px solid #e6c200;"
)


def load_initial_data() -> List[Dict]:
    """Load the records from ``DATA_FILE`` and flag mixed-type records.

    A type is "mixed" when its records carry both a truthy and a falsy
    ``llm_is_dataset_contextual`` value. Every record gets a boolean
    ``is_mixed_type`` key.

    Raises:
        FileNotFoundError: if ``DATA_FILE`` does not exist.
    """
    if not os.path.exists(DATA_FILE):
        raise FileNotFoundError(f"{DATA_FILE} not found in current directory.")
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    verdicts_by_type = defaultdict(set)
    for rec in data:
        verdict = rec.get("llm_is_dataset_contextual")
        if rec.get("type") and verdict is not None:
            # bool() keeps this in line with MixedTypeManager and
            # record_matches_filters, which both rely on truthiness.
            verdicts_by_type[rec["type"]].add(bool(verdict))
    mixed_types = {
        dtype for dtype, verdicts in verdicts_by_type.items() if len(verdicts) == 2
    }

    for rec in data:
        rec["is_mixed_type"] = rec.get("type") in mixed_types
    return data


class DynamicDataset:
    """List of records with a remembered "current" position."""

    def __init__(self, data: List[Dict]):
        self.data = data
        self.len = len(data)
        self.current = 0

    def example(self, idx: int) -> Dict:
        """Return the record at ``idx`` (clamped to the valid range).

        The clamped index is stored in ``self.current``. An empty dataset
        yields ``{}`` instead of raising ``IndexError``.
        """
        if not self.data:
            self.current = 0
            return {}
        self.current = max(0, min(self.len - 1, int(idx)))
        return self.data[self.current]


class MixedTypeManager:
    """Group records by type and LLM verdict to compare mixed types."""

    def __init__(self, data: List[Dict]):
        self.grouped_data = defaultdict(lambda: {'true': [], 'false': []})
        self.mixed_types = []

        # Group records by type, then by verdict.
        for rec in data:
            dtype = rec.get("type")
            is_ds = rec.get("llm_is_dataset_contextual")
            if dtype and is_ds is not None:
                self.grouped_data[dtype]['true' if is_ds else 'false'].append(rec)

        # A type is mixed when it has examples on both sides.
        self.mixed_types = [
            dtype
            for dtype, groups in self.grouped_data.items()
            if groups['true'] and groups['false']
        ]

        # Most frequent types first.
        self.mixed_types.sort(
            key=lambda t: len(self.grouped_data[t]['true'])
            + len(self.grouped_data[t]['false']),
            reverse=True,
        )

    def _group(self, dtype: str, is_dataset: bool) -> List[Dict]:
        """Return the records for ``dtype`` with the given verdict."""
        # Membership test first: indexing the defaultdict would create an
        # empty entry for unknown types.
        if dtype not in self.grouped_data:
            return []
        return self.grouped_data[dtype]['true' if is_dataset else 'false']

    def get_example(self, dtype: str, is_dataset: bool, idx: int) -> Dict:
        """Return example ``idx`` of the group, cycling past the end.

        Returns ``{}`` when the type is unknown or the group is empty.
        """
        group = self._group(dtype, is_dataset)
        if not group:
            return {}
        return group[idx % len(group)]

    def get_count(self, dtype: str, is_dataset: bool) -> int:
        """Return how many records the group holds (0 for unknown types)."""
        return len(self._group(dtype, is_dataset))


# ── Highlight utils ──────────────────────────────────────────────────────────
def prepare_for_highlight(rec: Dict) -> List[Tuple[str, Optional[str]]]:
    """Split ``rec["text"]`` into ``(segment, label)`` pairs.

    Spans come from ``ner_annotated`` (falling back to ``ner_text``) as
    ``(start, end, label)`` entries. Unlabelled stretches get label ``None``.
    Malformed or out-of-range spans are skipped, and overlapping spans are
    clipped so that the segments always concatenate back to the text.
    """
    text = rec.get("text", "") or ""
    raw_spans = rec.get("ner_annotated", rec.get("ner_text", [])) or []

    spans = []
    for span in raw_spans:
        try:
            start, end, label = int(span[0]), int(span[1]), span[2]
        except (TypeError, ValueError, IndexError, KeyError):
            continue
        if start < 0 or end <= start or start >= len(text):
            continue
        spans.append((start, min(end, len(text)), label))
    # Sort after coercion so "12"-style offsets order numerically.
    spans.sort(key=lambda s: (s[0], s[1]))

    segments = []
    cursor = 0
    for start, end, label in spans:
        # Clip overlap with the previous span; otherwise the shared
        # characters would be emitted twice.
        start = max(start, cursor)
        if end <= start:
            continue
        if start > cursor:
            segments.append((text[cursor:start], None))
        segments.append((text[start:end], str(label)))
        cursor = end
    if cursor < len(text):
        segments.append((text[cursor:], None))
    return segments


# ── Filtering helpers ─────────────────────────────────────────────────────────
def record_matches_filters(rec: Dict, llm_dataset_filter: str, type_filter: str):
    """Return True when ``rec`` passes both the assessment and type filter.

    Records without an LLM assessment never match. The mixed-cases filter
    is combined with the type filter rather than overriding it.
    """
    llm_is_ds = rec.get("llm_is_dataset_contextual")
    if llm_is_ds is None:
        return False

    if llm_dataset_filter == FILTER_DATASETS and not llm_is_ds:
        return False
    if llm_dataset_filter == FILTER_NON_DATASETS and llm_is_ds:
        return False
    if llm_dataset_filter == FILTER_MIXED and not rec.get("is_mixed_type", False):
        return False

    if type_filter != ALL_TYPES and rec.get("type") != type_filter:
        return False
    return True


# ── Documentation ─────────────────────────────────────────────────────────────
DOCUMENTATION = """
# 📊 Monitoring of Data Use - User Guide

## What is this tool?

This application helps you **review and explore dataset mentions** extracted from documents.
It displays text excerpts where potential datasets have been identified, along with metadata about each mention.

## What you'll see

Each record shows:

- **📄 Source Document**: The filename and page number where the text was found
- **🔍 Highlighted Text**: The original text with dataset mentions highlighted
- **📋 Data Type**: The category of the dataset (e.g., census, survey, database)
- **✅ Dataset Status**: Whether this mention actually refers to a dataset
- **💡 Context**: The surrounding text that provides context
- **📝 Explanation**: Why this was classified as a dataset (or not)

## How to use this tool

### 🎯 Navigation

- **Browse Records**: Use the slider to jump to any record by number
- **Previous/Next Buttons**: Navigate through records one at a time
- **Filters**: The Previous/Next buttons respect your active filters

### 🔍 Filtering Options

1. **Dataset Status Filter**
   - **🔥 Show Confusion/Mixed Cases**: Show only records whose data type is classified both as a dataset and as a non-dataset elsewhere
   - **All**: Show all records
   - **LLM: Datasets only**: Show only records that contain actual dataset references
   - **LLM: Non-datasets only**: Show records that were identified but don't actually refer to datasets

2. **Data Type Filter**
   - Filter by specific data types (census, survey, database, etc.)
   - Types are sorted by frequency (most common first)

### 💡 Tips

- Use filters to focus on specific types of data mentions
- The "Assessment" field tells you if the mention is a true dataset reference
- Review the "Explanation" to understand the classification reasoning
- Highlighted text shows exactly where the dataset mention appears in context

## 🚀 Try It Yourself!

Want to extract datasets from your own text?
Try our **Dataset Extraction Tool**:

👉 **[Launch Dataset Extraction Tool](https://huggingface.co/spaces/ai4data/datause-extraction)**

This interactive tool allows you to:

- ✨ **Extract datasets** from your own text or documents
- 📝 **Use predefined samples** to see how it works
- 🔬 **Explore the extraction process** in real-time

Perfect for testing the extraction capabilities on new documents or experimenting with different types of text!

## Data Source

This viewer uses data from World Bank project documents.
"""


# ── Record details ────────────────────────────────────────────────────────────
def _display(value: Any, placeholder: str = "—") -> str:
    """Return ``value`` as HTML-escaped text, or a placeholder if missing."""
    if value is None or value == "":
        return placeholder
    return escape(str(value))


def _first_mention(rec: Dict) -> str:
    """Return the text covered by the first ``ner_text`` span, or ``""``."""
    spans = rec.get("ner_text")
    text = rec.get("text")
    if not spans or not isinstance(text, str):
        return ""
    try:
        start, end = int(spans[0][0]), int(spans[0][1])
    except (TypeError, ValueError, IndexError, KeyError):
        return ""
    return text[start:end]


def make_info(rec: Dict) -> str:
    """Format record metadata and the LLM assessment as an HTML fragment.

    All values taken from the record are HTML-escaped, since they are raw
    document text. The first mention is highlighted inside the surrounding
    text when it occurs there.
    """
    context = _display(rec.get("empirical_context"))
    # Escape the term too, so it is matched against the escaped context.
    term = escape(_first_mention(rec))
    if term and term in context:
        context = context.replace(
            term, f'<span style="{_HIGHLIGHT_STYLE}">{term}</span>'
        )

    type_html = f"<strong>{_display(rec.get('type'))}</strong>"
    if rec.get("is_mixed_type", False):
        type_html += (
            ' <span style="color: #d97706; font-weight: bold;">'
            "⚠️ Mixed/Confusing Type</span>"
        )

    parts = [
        "<div>",
        "<h3>📄 Document Information</h3>",
        f"<p><strong>File:</strong> {_display(rec.get('filename'))}<br>",
        f"<strong>Page:</strong> {_display(rec.get('page'))}</p>",
        "<h3>🏷️ Type</h3>",
        f"<p>{type_html}</p>",
        "<h3>📝 Surrounding Text</h3>",
        f"<p>{context}</p>",
        "</div>",
    ]

    # LLM contextual analysis section, only when an assessment exists.
    llm_is_dataset = rec.get("llm_is_dataset_contextual")
    if llm_is_dataset is not None:
        status_icon = '✅' if llm_is_dataset else '❌'
        status_text = 'Is a dataset' if llm_is_dataset else 'Not a dataset'
        parts += [
            "<div>",
            "<h3>🤖 Contextual Analysis</h3>",
            f"<p><strong>Assessment:</strong> {status_icon} {status_text}</p>",
        ]

        llm_reasons = rec.get("llm_contextual_reason", []) or []
        if isinstance(llm_reasons, str):
            llm_reasons = [llm_reasons]
        if llm_reasons:
            items = "".join(f"<li>{escape(str(r))}</li>" for r in llm_reasons)
            parts.append(f"<p><strong>Reasoning:</strong></p><ul>{items}</ul>")

        llm_thinking = rec.get("llm_thinking_contextual", "")
        if llm_thinking:
            parts.append(
                "<p><strong>Detailed Analysis:</strong></p>"
                f"<p>{escape(str(llm_thinking))}</p>"
            )
        parts.append("</div>")

    return "\n".join(parts)


# ── Gradio App ───────────────────────────────────────────────────────────────
def create_demo() -> "gr.Blocks":
    """Build the Gradio Blocks UI (guide, viewer and comparison tabs).

    Raises:
        ImportError: if gradio is not installed.
        FileNotFoundError: if ``DATA_FILE`` is missing.
    """
    if gr is None:
        raise ImportError("gradio is required to build the demo UI.")

    data = load_initial_data()
    dynamic_dataset = DynamicDataset(data)
    mixed_manager = MixedTypeManager(data)

    # Types sorted by frequency (most common first).
    type_counter = Counter(rec.get("type") for rec in data if rec.get("type"))
    type_choices = [ALL_TYPES] + [t for t, _ in type_counter.most_common()]

    # NOTE: dynamic_dataset.current is one cursor shared by the whole app.
    def render(idx: int):
        """Select record ``idx`` and return the viewer outputs for it."""
        rec = dynamic_dataset.example(idx)
        return prepare_for_highlight(rec), dynamic_dataset.current, make_info(rec)

    # Basic load by slider index (ignores filters).
    def load_example(idx: int):
        return render(idx)

    # When filters change, jump to the first matching record.
    def jump_on_filters(llm_dataset_filter, type_filter):
        for i, rec in enumerate(data):
            if record_matches_filters(rec, llm_dataset_filter, type_filter):
                return render(i)
        return [], 0, NO_MATCH_MESSAGE

    # Navigation respecting filters; stays put when nothing further matches.
    def nav_next(llm_dataset_filter, type_filter):
        for i in range(dynamic_dataset.current + 1, len(data)):
            if record_matches_filters(data[i], llm_dataset_filter, type_filter):
                return render(i)
        return render(dynamic_dataset.current)

    def nav_prev(llm_dataset_filter, type_filter):
        for i in range(dynamic_dataset.current - 1, -1, -1):
            if record_matches_filters(data[i], llm_dataset_filter, type_filter):
                return render(i)
        return render(dynamic_dataset.current)

    # Comparison logic.
    def comparison_header(title: str, idx: int, total: int) -> str:
        if total == 0:
            return title
        return f"{title} ({idx % total + 1}/{total})"

    def load_comparison(dtype, pos_idx, neg_idx):
        pos_title = "### ✅ IS Dataset"
        neg_title = "### ❌ NOT Dataset"
        if not dtype:
            # Six values: this function is wired to six output components.
            return [], "Select a type", [], "Select a type", pos_title, neg_title

        pos_rec = mixed_manager.get_example(dtype, True, pos_idx)
        neg_rec = mixed_manager.get_example(dtype, False, neg_idx)
        pos_total = mixed_manager.get_count(dtype, True)
        neg_total = mixed_manager.get_count(dtype, False)
        return (
            prepare_for_highlight(pos_rec),
            make_info(pos_rec),
            prepare_for_highlight(neg_rec),
            make_info(neg_rec),
            comparison_header(pos_title, pos_idx, pos_total),
            comparison_header(neg_title, neg_idx, neg_total),
        )

    def next_pos(dtype, current_idx):
        return current_idx + 1

    def next_neg(dtype, current_idx):
        return current_idx + 1

    # ---- UI ----
    with gr.Blocks(title="Monitoring of Data Use") as demo:
        gr.Markdown("# 📊 Monitoring of Data Use")

        with gr.Tabs():
            with gr.Tab("📖 How to Use"):
                gr.Markdown(DOCUMENTATION)

            with gr.Tab("🔍 Viewer"):
                with gr.Row():
                    prog = gr.Slider(
                        minimum=0,
                        # Never below the minimum, even for an empty dataset.
                        maximum=max(dynamic_dataset.len - 1, 0),
                        value=0,
                        step=1,
                        label=f"📑 Browse Records (1 to {dynamic_dataset.len:,})",
                        interactive=True,
                    )

                with gr.Row():
                    llm_dataset_filter = gr.Dropdown(
                        choices=[
                            FILTER_MIXED,
                            FILTER_ALL,
                            FILTER_DATASETS,
                            FILTER_NON_DATASETS,
                        ],
                        value=FILTER_MIXED,
                        label="🤖 Filter by Assessment",
                    )
                    type_filter = gr.Dropdown(
                        choices=type_choices,
                        value=ALL_TYPES,
                        label="📂 Filter by Data Type",
                    )

                inp_box = gr.HighlightedText(
                    label="📄 Document Text (with highlighted dataset mentions)",
                    interactive=False,
                    show_legend=False,
                )
                info_md = gr.HTML(label="ℹ️ Record Details")

                with gr.Row():
                    prev_btn = gr.Button("⬅️ Previous", variant="secondary", size="lg")
                    next_btn = gr.Button("Next ➡️", variant="primary", size="lg")

                viewer_outputs = [inp_box, prog, info_md]
                filter_inputs = [llm_dataset_filter, type_filter]

                # Initial load
                demo.load(fn=load_example, inputs=prog, outputs=viewer_outputs)

                # Slider navigation
                prog.release(fn=load_example, inputs=prog, outputs=viewer_outputs)

                # Filters
                llm_dataset_filter.change(
                    fn=jump_on_filters, inputs=filter_inputs, outputs=viewer_outputs
                )
                type_filter.change(
                    fn=jump_on_filters, inputs=filter_inputs, outputs=viewer_outputs
                )

                # Prev / Next navigation respecting filters
                prev_btn.click(fn=nav_prev, inputs=filter_inputs, outputs=viewer_outputs)
                next_btn.click(fn=nav_next, inputs=filter_inputs, outputs=viewer_outputs)

            with gr.Tab("⚖️ Comparison"):
                gr.Markdown("### Side-by-Side Comparison of Mixed Types")
                gr.Markdown(
                    "Compare examples where the **same type** is classified "
                    "differently based on context."
                )

                with gr.Row():
                    comp_type_selector = gr.Dropdown(
                        choices=mixed_manager.mixed_types,
                        value=(
                            mixed_manager.mixed_types[0]
                            if mixed_manager.mixed_types
                            else None
                        ),
                        label="Select Mixed Type to Compare",
                    )

                # State for indices
                pos_idx_state = gr.State(0)
                neg_idx_state = gr.State(0)

                with gr.Row():
                    # Left column: positive examples
                    with gr.Column():
                        pos_header = gr.Markdown("### ✅ IS Dataset")
                        pos_hl_box = gr.HighlightedText(
                            label="Context", interactive=False, show_legend=False
                        )
                        pos_info_box = gr.HTML()
                        pos_next_btn = gr.Button("Next Example ➡️")

                    # Right column: negative examples
                    with gr.Column():
                        neg_header = gr.Markdown("### ❌ NOT Dataset")
                        neg_hl_box = gr.HighlightedText(
                            label="Context", interactive=False, show_legend=False
                        )
                        neg_info_box = gr.HTML()
                        neg_next_btn = gr.Button("Next Example ➡️")

                comparison_inputs = [comp_type_selector, pos_idx_state, neg_idx_state]
                comparison_outputs = [
                    pos_hl_box,
                    pos_info_box,
                    neg_hl_box,
                    neg_info_box,
                    pos_header,
                    neg_header,
                ]

                # Events
                comp_type_selector.change(
                    fn=lambda: (0, 0),  # Reset indices
                    outputs=[pos_idx_state, neg_idx_state],
                ).then(
                    fn=load_comparison,
                    inputs=comparison_inputs,
                    outputs=comparison_outputs,
                )

                pos_next_btn.click(
                    fn=next_pos,
                    inputs=[comp_type_selector, pos_idx_state],
                    outputs=[pos_idx_state],
                ).then(
                    fn=load_comparison,
                    inputs=comparison_inputs,
                    outputs=comparison_outputs,
                )

                neg_next_btn.click(
                    fn=next_neg,
                    inputs=[comp_type_selector, neg_idx_state],
                    outputs=[neg_idx_state],
                ).then(
                    fn=load_comparison,
                    inputs=comparison_inputs,
                    outputs=comparison_outputs,
                )

                # Initial load
                demo.load(
                    fn=load_comparison,
                    inputs=comparison_inputs,
                    outputs=comparison_outputs,
                )

    return demo


if __name__ == "__main__":
    create_demo().launch(share=False, debug=False)