Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| Food Mapper - Semantic Matching Application | |
| USDA Agricultural Research Service, Western Human Nutrition Research Center | |
| Matches food descriptions between databases using neural embeddings (GTE-Large model). | |
| API-first with automatic CPU fallback for reliability. | |
| """ | |
| import os | |
| import io | |
| import json | |
| import re | |
| import time | |
| import asyncio | |
| import math | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from datetime import datetime | |
| from typing import Optional, Dict, List, Tuple | |
| from shiny import App, Inputs, Outputs, Session, reactive, render, ui | |
| from shiny.types import FileInfo | |
| import shinyswatch | |
| from shinywidgets import render_widget, output_widget | |
| from functools import lru_cache | |
| # ============================================================================ | |
| # IMPORTS | |
| # ============================================================================ | |
| # Matching algorithms | |
| from rapidfuzz import fuzz, process | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| # API client for embeddings | |
| from openai import OpenAI, AsyncOpenAI | |
| import httpx | |
# Probe once, at import time, for the optional ``h2`` package and record the
# outcome in HTTP2_AVAILABLE.
# NOTE(review): presumably consulted later to decide whether HTTP/2 can be
# enabled on the httpx client -- confirm against the code that reads the flag.
try:
    import h2  # type: ignore
except Exception:
    # Any failure while importing h2 (not just a missing package) is treated
    # as "HTTP/2 not available" rather than aborting application start-up.
    HTTP2_AVAILABLE = False
else:
    HTTP2_AVAILABLE = True
| # ============================================================================ | |
| # STYLE CONFIGURATION | |
| # ============================================================================ | |
# Full custom stylesheet for the app, kept as one CSS string.
#
# Order of the sheet, top to bottom: palette variables in :root, page
# container and typography, header, cards, buttons, file-upload area, tables
# (including the Step 1 preview tables), matching-configuration card, progress
# bars, alert boxes, results container, spinner, score bar, badges/chips,
# nav tabs, responsive breakpoints, and finally a group of late overrides.
#
# Every var(--name) without a fallback must be declared in the :root block at
# the top; only the Bootstrap --bs-* variables are expected to come from the
# active theme, and those are always used with a fallback value.
#
# Fix: the first .file-upload-area rule used var(--background-light), which is
# not declared anywhere in this sheet. It now uses var(--surface), the same
# value the later .file-upload-area rule applies.
#
# NOTE(review): .file-upload-area, .alert-animated and @keyframes fadeIn are
# each declared twice. The later declaration takes precedence in the browser,
# so the earlier copies have no visible effect; they are kept as-is here so
# the rendered result does not change. Where (or whether) this string is
# injected into the page is not visible from this part of the file.
custom_css = """
/* Professional neutral color scheme */
:root {
--primary-color: #475569;
--primary-dark: #334155;
--secondary-color: #64748b;
--success-color: #059669;
--warning-color: #d97706;
--danger-color: #dc2626;
--background: #ffffff;
--surface: #f8fafc;
--text-primary: #1e293b;
--text-secondary: #64748b;
--border-color: #e2e8f0;
--shadow-sm: 0 1px 2px 0 rgb(0 0 0 / 0.05);
--shadow-md: 0 4px 6px -1px rgb(0 0 0 / 0.08);
--shadow-lg: 0 10px 15px -3px rgb(0 0 0 / 0.1);
--gradient-primary: linear-gradient(135deg, #64748b 0%, #475569 100%);
}
/* Main container with responsive design */
.container-fluid {
max-width: 1400px;
margin: 0 auto;
padding: 20px;
background: var(--background);
}
/* NEW: Typography with system font stack */
body, .container-fluid, .card, .btn, .table {
font-family: -apple-system, BlinkMacSystemFont, "Inter", "Segoe UI", "Helvetica Neue", Arial, sans-serif;
color: var(--text-primary);
line-height: 1.6;
}
/* Clean header styling */
.app-header {
background: var(--gradient-primary);
color: white;
padding: 1.25rem 1.25rem; /* NEW: thinner header */
border-radius: 12px;
margin-bottom: 1.25rem;
text-align: center;
box-shadow: var(--shadow-md);
}
.app-header h1 {
color: white;
margin: 0;
font-size: 2rem; /* NEW: smaller title */
font-weight: 700;
letter-spacing: 0.2px;
}
.app-header p {
color: rgba(255, 255, 255, 0.95);
font-size: 0.95rem; /* NEW: smaller subtitle */
margin-top: 0.35rem;
margin-bottom: 0;
}
/* Clean card styling */
.card {
background: white;
border: 1px solid var(--border-color);
box-shadow: var(--shadow-sm);
border-radius: 12px;
margin-bottom: 1.5rem;
transition: box-shadow 0.2s ease;
}
.card:hover {
box-shadow: var(--shadow-md);
}
.card-header {
background: linear-gradient(135deg, var(--secondary-color), var(--primary-color));
color: white;
font-weight: 600;
border-radius: 12px 12px 0 0 !important;
padding: 1rem 1.5rem;
border-bottom: none;
}
/* Simple button styling */
.btn {
font-weight: 600;
border-radius: 8px;
padding: 0.75rem 1.5rem;
transition: all 0.2s ease;
border: none;
}
.btn-primary {
background: var(--primary-color);
color: white;
}
.btn-primary:hover {
background: var(--primary-dark);
transform: translateY(-1px);
box-shadow: var(--shadow-md);
}
.btn-success {
background: var(--success-color);
color: white;
}
.btn-success:hover {
filter: brightness(0.9);
transform: translateY(-1px);
}
.btn-warning {
background: var(--warning-color);
color: white;
}
.btn-warning:hover {
filter: brightness(0.9);
transform: translateY(-1px);
}
.btn-outline-warning {
background: transparent;
border: 2px solid var(--warning-color);
color: var(--warning-color);
}
.btn-outline-warning:hover {
background: var(--warning-color);
color: white;
}
/* Export button grouping */
.btn-group-export {
display: flex;
gap: 1rem;
justify-content: center;
flex-wrap: wrap;
}
/* File upload area */
.file-upload-area {
border: 2px dashed var(--border-color);
border-radius: 12px;
padding: 2rem;
background: var(--surface);
text-align: center;
transition: all 0.3s ease;
}
.file-upload-area:hover {
border-color: var(--primary-color);
background: white;
}
/* Fix table header alignment */
.shiny-table table {
width: 100%;
table-layout: fixed;
}
.shiny-table th {
text-align: left !important;
padding-left: 8px !important;
}
.shiny-table td {
text-align: left !important;
padding-left: 8px !important;
word-wrap: break-word;
overflow-wrap: break-word;
}
/* Special styling for preview tables in Step 1 - data_frame outputs */
/* Force left alignment for the entire data_frame container */
#input_col_preview,
#target_col_preview {
text-align: left !important;
display: block !important;
margin: 0 !important;
padding: 0 !important;
}
#input_col_preview .shiny-data-frame,
#target_col_preview .shiny-data-frame {
display: block !important;
text-align: left !important;
margin: 0 !important;
padding: 0 !important;
width: 100% !important;
}
#input_col_preview .shiny-data-frame > div,
#target_col_preview .shiny-data-frame > div {
display: block !important;
text-align: left !important;
margin: 0 auto 0 0 !important; /* This forces left alignment by removing auto centering */
padding: 0 !important;
}
#input_col_preview .shiny-data-frame table,
#target_col_preview .shiny-data-frame table {
width: 100% !important;
table-layout: fixed !important;
margin: 0 !important;
margin-left: 0 !important;
margin-right: auto !important;
border-collapse: collapse !important;
}
/* Headers - force left alignment and dark mode support */
#input_col_preview .shiny-data-frame thead th,
#target_col_preview .shiny-data-frame thead th {
text-align: left !important;
padding: 8px !important;
vertical-align: middle !important;
font-weight: bold !important;
color: var(--bs-body-color, #212529) !important;
background-color: var(--bs-gray-200, #e9ecef) !important;
border-bottom: 2px solid var(--bs-border-color, #dee2e6) !important;
}
/* First column header (Row) - centered and narrow */
#input_col_preview .shiny-data-frame thead th:first-child,
#target_col_preview .shiny-data-frame thead th:first-child {
width: 60px !important;
min-width: 60px !important;
max-width: 60px !important;
text-align: center !important;
}
/* Second column header (Sample Values) - left aligned, takes remaining space */
#input_col_preview .shiny-data-frame thead th:nth-child(2),
#target_col_preview .shiny-data-frame thead th:nth-child(2) {
text-align: left !important;
padding-left: 12px !important;
width: auto !important;
}
/* Data cells - with dark mode support */
#input_col_preview .shiny-data-frame tbody td,
#target_col_preview .shiny-data-frame tbody td {
text-align: left !important;
padding: 8px !important;
vertical-align: top !important;
word-wrap: break-word !important;
overflow-wrap: break-word !important;
white-space: normal !important;
color: var(--bs-body-color, #212529) !important;
background-color: var(--bs-body-bg, white) !important;
}
/* First column data (Row numbers) - centered */
#input_col_preview .shiny-data-frame tbody td:first-child,
#target_col_preview .shiny-data-frame tbody td:first-child {
width: 60px !important;
text-align: center !important;
}
/* Second column data - left aligned with wrapping */
#input_col_preview .shiny-data-frame tbody td:nth-child(2),
#target_col_preview .shiny-data-frame tbody td:nth-child(2) {
text-align: left !important;
white-space: normal !important;
word-break: break-word !important;
}
/* When there are 3 columns (cleaning preview) */
#input_col_preview .shiny-data-frame thead th:nth-child(3),
#target_col_preview .shiny-data-frame thead th:nth-child(3) {
text-align: left !important;
width: auto !important;
}
/* Matching Configuration - Polished Production Styling (theme-friendly) */
.matching-config-card {
background-color: var(--bs-tertiary-bg, var(--bs-body-bg));
border-radius: 12px;
padding: 1.25rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
border: 1px solid var(--bs-border-color, rgba(0,0,0,0.05));
}
.matching-config-card h5 {
color: var(--primary-dark);
font-weight: 600;
letter-spacing: 0.5px;
}
.matching-config-card h6 {
color: var(--primary-color);
font-weight: 500;
}
.matching-config-card .border-end {
border-color: var(--border-color) !important;
}
.matching-config-card .border-start {
border-color: var(--border-color) !important;
}
/* Slider styling for production look - centered */
/* Ensure slider stays centered and doesn't break on resize */
.matching-config-card .d-flex.justify-content-center {
display: flex !important;
justify-content: center !important;
}
.matching-config-card .mx-auto {
margin-left: auto !important;
margin-right: auto !important;
width: 100% !important;
}
.matching-config-card .irs-bar {
background: var(--primary-color);
}
.matching-config-card .irs-handle {
border: 3px solid var(--primary-color);
}
/* Centered description text */
.threshold-description {
max-width: 600px;
margin: 0 auto;
line-height: 1.5;
color: var(--text-secondary);
}
/* Start button hover effect */
.matching-config-card .btn-primary:hover { transform: translateY(-2px); box-shadow: 0 4px 12px rgba(71, 85, 105, 0.3); }
/* Progress indicator */
.progress-container {
background: white;
border-radius: 12px;
padding: 2rem;
box-shadow: 0 2px 10px rgba(0,0,0,0.08);
margin: 1rem 0;
}
.progress {
background-color: #e9ecef;
border-radius: 10px;
overflow: hidden;
}
.progress-bar {
background: linear-gradient(90deg, var(--primary-color), var(--secondary-color));
border-radius: 8px;
transition: width 0.5s ease-in-out;
font-size: 14px;
font-weight: 600;
display: flex;
align-items: center;
justify-content: center;
color: white;
}
.progress-bar-striped {
background-image: linear-gradient(45deg, rgba(255,255,255,.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,.15) 50%, rgba(255,255,255,.15) 75%, transparent 75%, transparent);
background-size: 1rem 1rem;
}
.progress-bar-animated {
animation: progress-bar-stripes 1s linear infinite;
}
@keyframes progress-bar-stripes {
from { background-position: 1rem 0; }
to { background-position: 0 0; }
}
/* Status messages */
.alert-custom {
border-radius: 8px;
padding: 1rem 1.5rem;
margin: 1rem 0;
border-left: 4px solid;
}
.alert-success-custom {
background: #f0fdf4;
border-color: var(--success-color);
color: #166534;
}
.alert-warning-custom {
background: #fffbeb;
border-color: var(--warning-color);
color: #92400e;
}
.alert-info-custom {
background: #f0f9ff;
border-color: var(--primary-color);
color: var(--primary-dark);
}
/* NEW: Enhanced table with proper alignment */
.table {
width: 100%;
table-layout: auto;
border-collapse: separate;
border-spacing: 0;
background: var(--surface);
color: var(--text-primary);
}
.table thead {
background: var(--surface);
color: var(--text-primary);
position: sticky;
top: 0;
z-index: 10;
border-bottom: 2px solid var(--primary-color);
}
.table thead th {
padding: 12px;
font-weight: 700;
text-align: left;
white-space: nowrap;
border-bottom: 2px solid var(--primary-color);
}
/* NEW: Numeric column alignment */
.table th.num,
.table td.num {
text-align: right;
font-variant-numeric: tabular-nums;
}
.table th.text,
.table td.text {
text-align: left;
}
.table tbody td {
padding: 12px;
vertical-align: middle;
border-bottom: 1px solid var(--border-color);
}
/* NEW: Compact density mode */
#results_container.compact .table tbody td {
padding: 6px 12px;
font-size: 0.875rem;
line-height: 1.25;
}
#results_container.compact .table thead th {
padding: 8px 12px;
}
.table-striped tbody tr:nth-child(odd) {
background: rgba(0, 0, 0, 0.02);
}
.table tbody tr:hover {
background: rgba(71, 85, 105, 0.05) !important;
cursor: pointer;
}
/* Footer */
.footer {
margin-top: 3rem;
padding: 2rem;
border-top: 2px solid var(--border-color);
text-align: center;
color: #64748b;
}
/* Animated alerts */
.alert-animated { animation: fadeIn 0.25s ease-out both; }
@keyframes fadeIn { from { opacity: 0; transform: translateY(4px);} to { opacity: 1; transform: none; } }
/* NEW: Results container with better scrolling */
.results-container {
height: 520px;
overflow: auto;
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 0;
background: var(--surface);
box-shadow: var(--shadow-sm);
position: relative;
}
.results-container::-webkit-scrollbar {
width: 10px;
height: 10px;
}
.results-container::-webkit-scrollbar-track {
background: var(--surface);
border-radius: 10px;
}
.results-container::-webkit-scrollbar-thumb {
background: var(--primary-color);
border-radius: 10px;
}
.results-container::-webkit-scrollbar-thumb:hover {
background: var(--primary-dark);
}
/* NEW: Make Shiny progress panel wider so text doesn't wrap */
.shiny-progress-panel { min-width: 520px !important; width: 520px !important; }
.shiny-progress-panel .progress { height: 0.75rem; }
.shiny-progress-panel p { white-space: normal; }
/* Loading spinner */
.spinner-container {
display: flex;
justify-content: center;
align-items: center;
padding: 2rem;
}
.spinner {
width: 40px;
height: 40px;
border: 3px solid var(--border-color);
border-top-color: var(--primary-color);
border-radius: 50%;
animation: spin 1s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
/* Score bar visualization */
.scorebar {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
letter-spacing: 0.5px;
white-space: nowrap;
color: var(--secondary-color);
}
/* NEW: Status badges */
.status-badge {
display: inline-block;
font-weight: 600;
font-size: 0.75rem;
padding: 0.25rem 0.75rem;
border-radius: 9999px;
text-transform: uppercase;
letter-spacing: 0.025em;
}
.status-ok {
background: rgba(16, 185, 129, 0.1);
color: var(--success-color);
border: 1px solid var(--success-color);
}
.status-warn {
background: rgba(245, 158, 11, 0.1);
color: var(--warning-color);
border: 1px solid var(--warning-color);
}
/* Method chips */
.method-chip {
display: inline-block;
background: var(--secondary-color);
color: white;
padding: 0.25rem 0.75rem;
border-radius: 16px;
font-size: 0.875rem;
font-weight: 500;
margin: 0.25rem;
}
/* NEW: Nav tabs styling */
.nav-tabs {
border-bottom: 2px solid var(--border-color);
margin-bottom: 1.5rem;
}
.nav-tabs .nav-link {
color: var(--primary-color);
border: none;
padding: 0.75rem 1.5rem;
font-weight: 500;
transition: all 0.3s ease;
position: relative;
}
.nav-tabs .nav-link:hover {
color: var(--primary-color);
background: transparent;
}
.nav-tabs .nav-link.active {
color: var(--primary-dark);
background: transparent;
border: none;
font-weight: 600;
}
.nav-tabs .nav-link.active::after {
content: '';
position: absolute;
bottom: -2px;
left: 0;
right: 0;
height: 2px;
background: var(--primary-color);
}
/* NEW: Tooltip styles */
.tooltip {
font-size: 0.875rem;
}
/* NEW: Mobile responsive improvements */
@media (max-width: 768px) {
.container-fluid {
padding: 12px;
}
.app-header {
padding: 2rem 1rem;
border-radius: 12px;
}
.card {
margin-bottom: 1rem;
border-radius: 12px;
}
.btn {
padding: 0.625rem 1.25rem;
font-size: 0.875rem;
}
.btn-group-export { justify-content: center; }
.btn-group-export {
flex-direction: column;
}
.results-container {
height: 400px;
}
.nav-tabs .nav-link {
padding: 0.5rem 0.75rem;
font-size: 0.875rem;
}
}
@media (max-width: 480px) {
.app-header h1 {
font-size: 1.75rem;
}
.app-header p {
font-size: 0.875rem;
}
.table {
font-size: 0.75rem;
}
.table thead th,
.table tbody td {
padding: 8px 6px;
}
}
/* Simple transitions */
.fade-in {
animation: fadeIn 0.2s ease;
}
@keyframes fadeIn {
from { opacity: 0; }
to { opacity: 1; }
}
/* NEW: File input styling */
.file-upload-area {
border: 2px dashed var(--border-color);
border-radius: 12px;
padding: 2rem;
background: var(--surface);
text-align: center;
transition: all 0.3s ease;
cursor: pointer;
}
.file-upload-area:hover {
border-color: var(--primary-color);
background: var(--background);
box-shadow: var(--shadow-sm);
}
/* Alert animations */
.alert-animated {
animation: fadeIn 0.2s ease;
}
/* Fix Shiny's file input progress bar - center text vertically */
.shiny-input-container .progress {
min-height: 1.5rem !important;
height: 1.5rem !important;
display: flex !important;
align-items: center !important;
}
.shiny-input-container .progress-bar {
min-height: 1.5rem !important;
height: 1.5rem !important;
line-height: 1 !important; /* Reset line-height */
font-size: 0.875rem;
display: flex !important;
align-items: center !important;
justify-content: center !important;
padding: 0 0.5rem;
}
/* Make the 'Loaded X rows' status alerts thinner */
#input_file_status .alert,
#target_file_status .alert {
padding: 0.4rem 0.75rem !important;
margin-bottom: 0.25rem;
font-size: 0.875rem;
}
#input_file_status p,
#target_file_status p {
margin: 0;
line-height: 1.2;
}
"""
| # Minimal custom CSS to preserve app behaviors while letting themes show | |
| custom_css_min = """ | |
| /* Fix Shiny's file input progress bar - thinner with centered text */ | |
| .sidebar { min-height: 100vh; } | |
| .page-sidebar .sidebar { min-height: 100vh; } | |
| /* Small, theme-friendly footer shown on all pages */ | |
| .footer { | |
| margin-top: 1rem !important; | |
| padding: 0.75rem 1rem !important; | |
| border-top: 1px solid var(--bs-border-color, #dee2e6) !important; | |
| text-align: center !important; | |
| color: var(--bs-secondary-color, inherit) !important; | |
| font-size: 0.9rem !important; | |
| } | |
| .shiny-input-container .progress { | |
| min-height: 1.4rem !important; | |
| height: 1.4rem !important; | |
| margin-bottom: 0 !important; /* Remove bottom margin */ | |
| } | |
| /* Let Shiny's default animation work - only adjust height and text position */ | |
| .shiny-input-container .progress-bar { | |
| min-height: 1.4rem !important; | |
| height: 1.4rem !important; | |
| line-height: 1.4rem !important; | |
| font-size: 0.8rem !important; | |
| padding-top: 0.1rem !important; /* Small padding to center text */ | |
| /* Allow Shiny's default transition animation */ | |
| transition: width 0.6s ease !important; | |
| } | |
| /* Make the 'Loaded X rows' status alerts much thinner and even closer to upload bar */ | |
| #input_file_status .alert, | |
| #target_file_status .alert { | |
| padding: 0.25rem 0.5rem !important; | |
| margin-bottom: 0.5rem !important; | |
| margin-top: -0.75rem !important; /* Even more negative margin to bring closer */ | |
| font-size: 0.85rem !important; | |
| line-height: 1.1 !important; | |
| min-height: auto !important; | |
| position: relative !important; | |
| top: -0.25rem !important; /* Additional upward shift */ | |
| } | |
| /* Also adjust the container divs to reduce spacing */ | |
| #input_status, | |
| #target_status { | |
| margin-top: -0.25rem !important; | |
| margin-bottom: 0 !important; | |
| padding-top: 0 !important; | |
| } | |
| #input_file_status p, | |
| #target_file_status p { | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| line-height: 1.1 !important; | |
| } | |
| /* Results container: bounded height and scrolling */ | |
| .results-container { | |
| height: 520px; | |
| overflow: auto; | |
| } | |
| /* Compact density mode for results table */ | |
| #results_container.compact table tbody td { | |
| padding: 6px 12px; | |
| font-size: 0.875rem; | |
| line-height: 1.25; | |
| } | |
| #results_container.compact table thead th { | |
| padding: 8px 12px; | |
| } | |
| /* Numeric/text alignment used by JS alignment helper */ | |
| .table th.num, .table td.num { text-align: right; font-variant-numeric: tabular-nums; } | |
| .table th.text, .table td.text { text-align: left; } | |
| /* Score bar visualization */ | |
| .scorebar { | |
| font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; | |
| white-space: nowrap; | |
| letter-spacing: 0.5px; | |
| } | |
| /* Simple spinner for long-running tasks */ | |
| .spinner-container { display: flex; justify-content: center; align-items: center; padding: 2rem; } | |
| .spinner { width: 40px; height: 40px; border: 3px solid rgba(0,0,0,0.1); border-top-color: currentColor; border-radius: 50%; animation: spin 1s linear infinite; } | |
| @keyframes spin { to { transform: rotate(360deg); } } | |
| /* Export button group layout */ | |
| .btn-group-export { display: flex; gap: 1rem; justify-content: center; flex-wrap: wrap; } | |
| /* Optional small fade-in animation class used by notifications */ | |
| .alert-animated { animation: fadeIn 0.25s ease-out both; } | |
| @keyframes fadeIn { from { opacity: 0; transform: translateY(4px);} to { opacity: 1; transform: none; } } | |
| /* Shiny progress panel: polished and theme-adaptive */ | |
| .shiny-progress-panel { | |
| width: 420px !important; | |
| min-width: 360px !important; | |
| max-width: 90vw; | |
| padding: 1.25rem; | |
| border-radius: 0.5rem; | |
| background-color: var(--bs-body-bg, white) !important; | |
| border: 1px solid var(--bs-border-color, rgba(0,0,0,0.125)) !important; | |
| box-shadow: 0 0.5rem 1rem rgba(0,0,0,0.15), 0 0.125rem 0.25rem rgba(0,0,0,0.075); | |
| animation: slide-in-bottom 0.3s cubic-bezier(0.25, 0.46, 0.45, 0.94) both; | |
| /* Position at bottom right with proper spacing */ | |
| position: fixed !important; | |
| right: 20px !important; | |
| bottom: 20px !important; | |
| left: auto !important; | |
| top: auto !important; | |
| transform: none !important; | |
| z-index: 9999 !important; | |
| } | |
| .shiny-progress-panel .progress-text, | |
| .shiny-progress-panel p { | |
| color: var(--bs-body-color, #212529) !important; | |
| font-size: 0.875rem !important; | |
| font-weight: 500 !important; | |
| margin-bottom: 0.75rem !important; | |
| line-height: 1.4 !important; | |
| display: block !important; | |
| text-align: left !important; | |
| } | |
| .shiny-progress-panel .progress { | |
| height: 1.25rem !important; | |
| background-color: var(--bs-gray-200, #e9ecef) !important; | |
| border-radius: 0.375rem !important; | |
| overflow: hidden !important; | |
| margin-bottom: 0.5rem !important; | |
| box-shadow: inset 0 1px 2px rgba(0,0,0,0.075) !important; | |
| } | |
| .shiny-progress-panel .progress-bar { | |
| background: linear-gradient(90deg, var(--bs-primary, #0d6efd), var(--bs-info, #0dcaf0)) !important; | |
| transition: width 0.6s cubic-bezier(0.25, 0.46, 0.45, 0.94) !important; | |
| font-size: 0.75rem !important; | |
| font-weight: 600 !important; | |
| color: white !important; | |
| display: flex !important; | |
| align-items: center !important; | |
| justify-content: center !important; | |
| position: relative !important; | |
| overflow: hidden !important; | |
| } | |
| .shiny-progress-panel .progress-bar::after { | |
| content: ''; | |
| position: absolute; | |
| top: 0; | |
| left: 0; | |
| bottom: 0; | |
| right: 0; | |
| background: linear-gradient( | |
| 90deg, | |
| transparent, | |
| rgba(255, 255, 255, 0.2), | |
| transparent | |
| ); | |
| animation: shimmer 2s infinite; | |
| } | |
| @keyframes shimmer { | |
| 0% { transform: translateX(-100%); } | |
| 100% { transform: translateX(100%); } | |
| } | |
| .shiny-progress-panel .progress-bar-animated { | |
| background-image: linear-gradient( | |
| 45deg, | |
| rgba(255,255,255,.15) 25%, | |
| transparent 25%, | |
| transparent 50%, | |
| rgba(255,255,255,.15) 50%, | |
| rgba(255,255,255,.15) 75%, | |
| transparent 75%, | |
| transparent | |
| ) !important; | |
| background-size: 1rem 1rem !important; | |
| animation: progress-bar-stripes 1s linear infinite !important; | |
| } | |
| @keyframes slide-in-bottom { | |
| 0% { | |
| transform: translateY(50px); | |
| opacity: 0; | |
| } | |
| 100% { | |
| transform: translateY(0); | |
| opacity: 1; | |
| } | |
| } | |
| @keyframes progress-bar-stripes { | |
| from { background-position: 1rem 0; } | |
| to { background-position: 0 0; } | |
| } | |
| /* Enhanced navbar tab styling */ | |
| .navbar-nav .nav-item .nav-link { | |
| border-radius: 0.375rem; | |
| padding: 0.5rem 1rem !important; | |
| margin: 0 0.25rem; | |
| transition: all 0.2s ease; | |
| position: relative; | |
| color: var(--bs-nav-link-color, #495057) !important; | |
| } | |
| .navbar-nav .nav-item .nav-link.active { | |
| background-color: var(--bs-primary, #0d6efd) !important; | |
| color: white !important; | |
| box-shadow: 0 2px 4px rgba(0,0,0,0.1); | |
| } | |
| .navbar-nav .nav-item .nav-link:hover:not(.active) { | |
| background-color: var(--bs-gray-200, #e9ecef); | |
| color: var(--bs-body-color, #212529) !important; | |
| } | |
| /* Remove default underline */ | |
| .navbar-nav .nav-item .nav-link.active::after { | |
| display: none !important; | |
| } | |
| /* Matching configuration panel - theme friendly */ | |
| .config-card { | |
| background-color: var(--bs-tertiary-bg, var(--bs-body-bg)) !important; | |
| border: 1px solid var(--bs-border-color, #dee2e6) !important; | |
| border-radius: .5rem !important; | |
| padding: 1rem !important; | |
| } | |
| /* threshold-badge removed (using slider only) */ | |
| .config-row .btn { width: 100%; } | |
| /* Step 1 preview tables: keep left-aligned and full width at all sizes */ | |
| #input_col_preview, #target_col_preview { | |
| text-align: left !important; | |
| display: block !important; | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| width: 100% !important; | |
| } | |
| #input_preview_wrap, #target_preview_wrap { | |
| display: flex !important; | |
| width: 100% !important; | |
| } | |
| #input_preview_wrap > *, #target_preview_wrap > * { | |
| flex: 1 1 auto !important; | |
| width: 100% !important; | |
| } | |
| #input_col_preview .shiny-data-frame, #target_col_preview .shiny-data-frame { | |
| display: block !important; | |
| text-align: left !important; | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| width: 100% !important; | |
| } | |
| #input_col_preview .gridjs-container, #target_col_preview .gridjs-container, | |
| #input_col_preview .gridjs-wrapper, #target_col_preview .gridjs-wrapper { | |
| width: 100% !important; | |
| max-width: none !important; | |
| margin-left: 0 !important; | |
| margin-right: 0 !important; | |
| } | |
| /* Improve text density so more characters fit on two lines */ | |
| #input_col_preview .gridjs-table td.gridjs-td, | |
| #target_col_preview .gridjs-table td.gridjs-td { | |
| padding-left: 6px !important; | |
| padding-right: 6px !important; | |
| font-size: 0.95rem !important; | |
| line-height: 1.2 !important; | |
| white-space: normal !important; | |
| word-break: break-word !important; | |
| overflow-wrap: anywhere !important; | |
| hyphens: auto !important; | |
| } | |
| #input_col_preview .gridjs-table th.gridjs-th, | |
| #target_col_preview .gridjs-table th.gridjs-th { | |
| padding-left: 8px !important; | |
| padding-right: 8px !important; | |
| } | |
| #input_col_preview .shiny-data-frame > div, #target_col_preview .shiny-data-frame > div { | |
| display: block !important; | |
| text-align: left !important; | |
| margin: 0 auto 0 0 !important; /* prevent auto-centering */ | |
| padding: 0 !important; | |
| width: 100% !important; | |
| max-width: none !important; | |
| } | |
| #input_col_preview .shiny-data-frame table, #target_col_preview .shiny-data-frame table, | |
| #input_col_preview .gridjs-table, #target_col_preview .gridjs-table { | |
| width: 100% !important; | |
| /* Allow content to determine width so text columns expand and 'Row' stays narrow */ | |
| table-layout: auto !important; | |
| margin: 0 !important; | |
| margin-left: 0 !important; | |
| margin-right: auto !important; | |
| border-collapse: collapse !important; | |
| } | |
| #input_preview_wrap .table-responsive, #target_preview_wrap .table-responsive { | |
| width: 100% !important; | |
| margin: 0 !important; | |
| } | |
| /* Make any Shiny fill wrappers align to the left and fill width */ | |
| #input_col_preview .html-fill-container, #target_col_preview .html-fill-container, | |
| #input_col_preview .html-fill-item, #target_col_preview .html-fill-item { | |
| display: block !important; | |
| justify-content: flex-start !important; | |
| align-items: stretch !important; | |
| width: 100% !important; | |
| max-width: none !important; | |
| margin-left: 0 !important; | |
| margin-right: 0 !important; | |
| } | |
| /* Catch-all for inline auto-centering styles */ | |
| #input_col_preview div[style*="margin: 0 auto"], | |
| #target_col_preview div[style*="margin: 0 auto"], | |
| #input_col_preview div[style*="margin-left: auto"], | |
| #input_col_preview div[style*="margin-right: auto"], | |
| #target_col_preview div[style*="margin-left: auto"], | |
| #target_col_preview div[style*="margin-right: auto"] { | |
| margin-left: 0 !important; | |
| margin-right: 0 !important; | |
| width: 100% !important; | |
| max-width: none !important; | |
| } | |
| /* Shiny fill layout elements sometimes center children with auto margins */ | |
| #input_col_preview .html-fill-item, #target_col_preview .html-fill-item, | |
| #input_col_preview .html-fill-container, #target_col_preview .html-fill-container { | |
| margin-left: 0 !important; | |
| margin-right: 0 !important; | |
| width: 100% !important; | |
| max-width: none !important; | |
| } | |
| /* In case an inner wrapper uses inline style widths, let it stretch */ | |
| #input_col_preview div[style*="margin-left: auto"], | |
| #input_col_preview div[style*="margin-right: auto"], | |
| #target_col_preview div[style*="margin-left: auto"], | |
| #target_col_preview div[style*="margin-right: auto"] { | |
| margin-left: 0 !important; | |
| margin-right: 0 !important; | |
| width: 100% !important; | |
| } | |
| /* Header/data alignment for consistency */ | |
| #input_col_preview .shiny-data-frame thead th, | |
| #target_col_preview .shiny-data-frame thead th { text-align: left !important; } | |
| #input_col_preview .shiny-data-frame thead th:first-child, | |
| #target_col_preview .shiny-data-frame thead th:first-child { width: 36px !important; text-align: center !important; } | |
| #input_col_preview .shiny-data-frame tbody td, | |
| #target_col_preview .shiny-data-frame tbody td { text-align: left !important; vertical-align: top !important; } | |
| #input_col_preview .shiny-data-frame tbody td:first-child, | |
| #target_col_preview .shiny-data-frame tbody td:first-child { width: 36px !important; text-align: center !important; white-space: nowrap !important; } | |
| /* Also constrain first column via colgroup to override library sizing */ | |
| #input_col_preview .shiny-data-frame colgroup col:first-child, | |
| #target_col_preview .shiny-data-frame colgroup col:first-child { | |
| width: 36px !important; | |
| min-width: 36px !important; | |
| max-width: 36px !important; | |
| } | |
| /* Tighten padding on the small first column */ | |
| #input_col_preview .shiny-data-frame thead th:first-child, | |
| #input_col_preview .shiny-data-frame tbody td:first-child, | |
| #target_col_preview .shiny-data-frame thead th:first-child, | |
| #target_col_preview .shiny-data-frame tbody td:first-child { | |
| padding-left: 6px !important; | |
| padding-right: 6px !important; | |
| } | |
| /* Grid.js (Shiny DataGrid) often enforces a min column width (~120px). | |
| Explicitly override only for the first column within these two previews. */ | |
| #input_col_preview .gridjs-table thead th:first-child, | |
| #input_col_preview .gridjs-table tbody td:first-child, | |
| #input_col_preview .gridjs-header .gridjs-th:first-child, | |
| #input_col_preview .gridjs-body .gridjs-td:first-child, | |
| #target_col_preview .gridjs-table thead th:first-child, | |
| #target_col_preview .gridjs-table tbody td:first-child, | |
| #target_col_preview .gridjs-header .gridjs-th:first-child, | |
| #target_col_preview .gridjs-body .gridjs-td:first-child { | |
| width: 36px !important; | |
| min-width: 36px !important; | |
| max-width: 40px !important; | |
| text-align: center !important; | |
| white-space: nowrap !important; | |
| } | |
| /* Make sure the table can use the freed space for text columns */ | |
| #input_col_preview .gridjs-table, | |
| #target_col_preview .gridjs-table { width: 100% !important; } | |
| /* Center overlay progress (theme-friendly) */ | |
| .center-progress-backdrop { position: fixed; inset: 0; background: rgba(0,0,0,0.25); display: flex; align-items: center; justify-content: center; z-index: 2000; } | |
| .center-progress-panel { width: 720px; max-width: 90vw; } | |
| /* App title size */ | |
| .app-title { font-size: 2rem; font-weight: 700; margin: 0; } | |
| /* Highlight NO MATCH rows with light red background */ | |
| .no-match-row { | |
| background-color: rgba(220, 53, 69, 0.08) !important; | |
| } | |
| .no-match-row:hover { | |
| background-color: rgba(220, 53, 69, 0.15) !important; | |
| } | |
| """ | |
| # ============================================================================ | |
| # EMBEDDING CONFIGURATION | |
| # ============================================================================ | |
# Embedding model requested from the DeepInfra OpenAI-compatible endpoint
DEEPINFRA_MODEL = "thenlper/gte-large"

# Concurrency knobs; each one can be overridden through its environment variable
MAX_CONCURRENCY = int(os.getenv("EMBEDDING_CONCURRENCY", "100"))
EMBED_BATCH_SIZE = int(os.getenv("EMBEDDING_BATCH_SIZE", "200"))
USE_PRIORITY_TIER = os.getenv("DEEPINFRA_PRIORITY", "false").lower() in {"1", "true", "yes", "on"}
USE_ASYNC = os.getenv("EMBEDDING_ASYNC", "true").lower() in {"1", "true", "yes", "on"}

# Fallback behaviour
API_EMBED_TIMEOUT_SECS = int(os.getenv("API_EMBED_TIMEOUT_SECS", "45"))  # overall call timeout
API_MAX_FAILURES = int(os.getenv("API_EMBED_MAX_FAILURES", "3"))  # consecutive failures before CPU fallback
# Accepted values: "auto" (try API then fallback), "api" (force API only),
# "local" (force CPU), "off" (no fallback)
MODEL_FALLBACK_MODE = os.getenv("MODEL_FALLBACK_MODE", "auto").lower()

# Mutable runtime state shared by the embedding helpers
FALLBACK_ACTIVE: bool = False
_API_FAILURES: int = 0
_LOCAL_ST_MODEL: Optional["SentenceTransformer"] = None  # lazily loaded and then cached
def get_api_key():
    """Return the DeepInfra API key taken from the environment.

    ``DEEPINFRA_API_KEY`` is consulted first; when it is unset or empty the
    value of ``DEEPINFRA_TOKEN`` is returned instead (which may be ``None``).
    """
    return os.environ.get("DEEPINFRA_API_KEY") or os.environ.get("DEEPINFRA_TOKEN")
# Clients are cached per API key so repeated calls reuse the same client object
_CLIENT_CACHE: Dict[str, OpenAI] = {}
_ASYNC_CLIENT_CACHE: Dict[str, AsyncOpenAI] = {}


def get_openai_client(api_key: str) -> OpenAI:
    """Return the synchronous OpenAI client for ``api_key``, creating it on first use.

    The client targets DeepInfra's OpenAI-compatible base URL and is stored in
    ``_CLIENT_CACHE`` keyed by the API key.
    """
    cached = _CLIENT_CACHE.get(api_key)
    if cached is None:
        cached = OpenAI(
            api_key=api_key,
            base_url="https://api.deepinfra.com/v1/openai",
        )
        _CLIENT_CACHE[api_key] = cached
    return cached
def get_async_openai_client(api_key: str) -> AsyncOpenAI:
    """Return the AsyncOpenAI client for ``api_key``, creating it on first use.

    A new client gets its own ``httpx.AsyncClient`` whose connection limits are
    sized from ``MAX_CONCURRENCY`` (at least 10). HTTP/2 is enabled only when
    the ``h2`` package could be imported.
    """
    existing = _ASYNC_CLIENT_CACHE.get(api_key)
    if existing is not None:
        return existing

    # Size the connection pool so it does not cap the configured concurrency
    pool_size = max(10, MAX_CONCURRENCY)
    transport = httpx.AsyncClient(
        limits=httpx.Limits(
            max_connections=pool_size,
            max_keepalive_connections=pool_size,
        ),
        timeout=httpx.Timeout(60.0, connect=20.0, read=60.0, write=60.0),
        http2=HTTP2_AVAILABLE,
    )
    if not HTTP2_AVAILABLE:
        print("[async] HTTP/2 not available (h2 not installed). Falling back to HTTP/1.1")

    fresh = AsyncOpenAI(
        api_key=api_key,
        base_url="https://api.deepinfra.com/v1/openai",
        http_client=transport,
    )
    _ASYNC_CLIENT_CACHE[api_key] = fresh
    return fresh
def compute_embeddings_deepinfra(texts: List[str], api_key: str) -> np.ndarray:
    """Compute embeddings for ``texts`` with one synchronous DeepInfra request.

    Args:
        texts: Strings to embed; sent as a single request.
        api_key: DeepInfra API key used to obtain the cached client.

    Returns:
        A float32 array with one row per returned embedding.

    Raises:
        Exception: Any client/API failure, re-raised with a
            ``"DeepInfra API error: ..."`` message and the original error
            attached as ``__cause__``.
    """
    client = get_openai_client(api_key)
    try:
        # ``normalize`` is passed through to the API in the request body
        extra_body = {"normalize": True}
        if USE_PRIORITY_TIER:
            extra_body["service_tier"] = "priority"
        # NOTE: no trailing comma after this call. A trailing comma would bind
        # ``response`` to a 1-tuple and ``response.data`` would always fail.
        response = client.embeddings.create(
            model=DEEPINFRA_MODEL,
            input=texts,
            encoding_format="float",
            extra_body=extra_body,
        )
        return np.array([item.embedding for item in response.data], dtype=np.float32)
    except Exception as e:
        raise Exception(f"DeepInfra API error: {str(e)}") from e
async def compute_embeddings_deepinfra_async(texts: List[str], api_key: str) -> np.ndarray:
    """Compute embeddings for ``texts`` with the async DeepInfra client.

    The request is attempted up to five times. Between attempts the coroutine
    sleeps with exponential backoff (0.25s doubling, capped at 4s); there is no
    sleep after the final attempt because nothing follows it.

    Args:
        texts: Strings to embed; sent as a single request per attempt.
        api_key: DeepInfra API key used to obtain the cached async client.

    Returns:
        A float32 array with one row per returned embedding.

    Raises:
        Exception: When every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    client = get_async_openai_client(api_key)
    extra_body = {"normalize": True}
    if USE_PRIORITY_TIER:
        extra_body["service_tier"] = "priority"

    max_attempts = 5
    last_err = None
    for attempt in range(max_attempts):
        try:
            t0 = time.perf_counter()
            response = await client.embeddings.create(
                model=DEEPINFRA_MODEL,
                input=texts,
                encoding_format="float",
                extra_body=extra_body,
            )
            dt = time.perf_counter() - t0
            embeddings = np.array([item.embedding for item in response.data], dtype=np.float32)
            print(f"[async] embeddings.create batch_size={len(texts)} took {dt:.2f}s")
            return embeddings
        except Exception as e:
            last_err = e
            # Back off only when another attempt will actually follow
            if attempt < max_attempts - 1:
                await asyncio.sleep(min(4.0, 0.25 * (2 ** attempt)))
    # Raised once, outside any wrapping try, so the message is not prefixed twice
    raise Exception(f"DeepInfra API error after retries: {str(last_err)}") from last_err
def _chunk_indices(total: int, chunk_size: int) -> List[Tuple[int, int]]:
    """Split ``range(total)`` into consecutive half-open ``(start, end)`` spans.

    Every span covers at most ``chunk_size`` positions; the last one is
    truncated at ``total``.
    """
    spans: List[Tuple[int, int]] = []
    for begin in range(0, total, chunk_size):
        stop = min(begin + chunk_size, total)
        spans.append((begin, stop))
    return spans
def _embed_batch_slice(args: Tuple[int, int, List[str], str]) -> Tuple[int, np.ndarray]:
    """Embed ``texts[start:end]`` and return ``(start, vectors)``.

    ``args`` is the packed tuple ``(start, end, texts, api_key)`` so the
    function can be submitted to an executor with a single argument. The start
    offset is returned so the caller can reorder results.
    """
    lo, hi, all_texts, key = args
    return lo, compute_embeddings_deepinfra(all_texts[lo:hi], key)
def compute_embeddings_parallel(
    texts: List[str],
    api_key: str,
    batch_size: int = EMBED_BATCH_SIZE,
    max_concurrency: int = MAX_CONCURRENCY,
    progress_callback=None,
) -> np.ndarray:
    """Embed ``texts`` in batches on a thread pool, returning rows in input order.

    Args:
        texts: Strings to embed.
        api_key: DeepInfra API key.
        batch_size: Maximum number of texts per request.
        max_concurrency: Number of worker threads in the pool.
        progress_callback: Optional callable receiving a progress string after
            each completed batch.

    Returns:
        The stacked embedding matrix, or an empty ``(0, 0)`` float32 array when
        ``texts`` is empty.
    """
    if len(texts) == 0:
        return np.empty((0, 0), dtype=np.float32)

    spans = _chunk_indices(len(texts), batch_size)
    total_batches = len(spans)
    by_offset: Dict[int, np.ndarray] = {}

    with ThreadPoolExecutor(max_workers=max_concurrency) as pool:
        pending = [
            pool.submit(_embed_batch_slice, (lo, hi, texts, api_key))
            for lo, hi in spans
        ]
        # Batches finish in arbitrary order; key each result by its start offset
        for completed, finished in enumerate(as_completed(pending), start=1):
            offset, vectors = finished.result()
            by_offset[offset] = vectors
            if progress_callback:
                pct = int((completed / total_batches) * 100)
                progress_callback(f"Embedding batches: {pct}% ({completed}/{total_batches})")

    # Restore the original ordering before stacking
    return np.vstack([by_offset[offset] for offset in sorted(by_offset)])
async def compute_embeddings_parallel_async(
    texts: List[str],
    api_key: str,
    batch_size: int = EMBED_BATCH_SIZE,
    max_concurrency: int = MAX_CONCURRENCY,
    progress_callback=None,
) -> np.ndarray:
    """Embed ``texts`` in batches with asyncio, returning rows in input order.

    Args:
        texts: Strings to embed.
        api_key: DeepInfra API key.
        batch_size: Maximum number of texts per request.
        max_concurrency: Maximum number of requests in flight at once.
        progress_callback: Optional callable receiving a progress string after
            each completed batch.

    Returns:
        The stacked embedding matrix, or an empty ``(0, 0)`` float32 array when
        ``texts`` is empty.
    """
    if not len(texts):
        return np.empty((0, 0), dtype=np.float32)

    spans = _chunk_indices(len(texts), batch_size)
    total_batches = len(spans)
    gate = asyncio.Semaphore(max_concurrency)
    completed = 0

    async def embed_span(lo: int, hi: int) -> np.ndarray:
        nonlocal completed
        print(f"[async] launch target slice {lo}:{hi}")
        async with gate:
            vectors = await compute_embeddings_deepinfra_async(texts[lo:hi], api_key)
            print(f"[async] done target slice {lo}:{hi}")
            completed += 1
            if progress_callback:
                pct = int((completed / total_batches) * 100)
                progress_callback(f"Embedding batches: {pct}% ({completed}/{total_batches})")
            return vectors

    # gather() yields results in the order the awaitables were passed, i.e. by
    # ascending start offset, so no explicit re-sorting is required.
    parts = await asyncio.gather(*(embed_span(lo, hi) for lo, hi in spans))
    return np.vstack(list(parts))
| # Local CPU Embedding Backend (async-compatible) | |
async def _load_local_model() -> "SentenceTransformer":
    """Return the local SentenceTransformer model, loading it on first use.

    The loaded model is cached in the module-level ``_LOCAL_ST_MODEL``. While
    loading, a sticky Shiny notification (id ``"model_loading"``) is shown; it
    is always dismissed afterwards, including when the import or the model
    load raises, so a failed load no longer leaves the notification on screen.
    All UI calls are best-effort: failures to notify never abort the load.

    Raises:
        Whatever the ``sentence_transformers`` import or the model
        construction raises.
    """
    global _LOCAL_ST_MODEL
    if _LOCAL_ST_MODEL is not None:
        return _LOCAL_ST_MODEL

    # Tell the user the model is loading (the first run may need a download)
    try:
        from shiny import ui
        ui.notification_show(
            "Loading local embedding model (thenlper/gte-large). "
            "First-time loading may take a few minutes to download the model (~670MB).",
            type="info",
            duration=None,  # stays visible until removed below
            id="model_loading"
        )
    except Exception:
        pass  # UI may be unavailable in this context; notification is optional

    try:
        # Lazy import to avoid the heavy import when the API is healthy
        from sentence_transformers import SentenceTransformer
        # Construct the model in a worker thread so the event loop stays responsive
        model = await asyncio.to_thread(SentenceTransformer, "thenlper/gte-large")
    finally:
        # Dismiss the sticky notification on success and on failure alike
        try:
            from shiny import ui
            ui.notification_remove("model_loading")
        except Exception:
            pass

    _LOCAL_ST_MODEL = model

    try:
        from shiny import ui
        ui.notification_show(
            "Local embedding model loaded successfully.",
            type="success",
            duration=3
        )
    except Exception:
        pass
    return model
async def compute_embeddings_local_async(texts: List[str]) -> np.ndarray:
    """Embed ``texts`` with the locally loaded SentenceTransformer model.

    Texts are first passed through ``clean_text_for_embedding``. Encoding is
    split into ``EMBED_BATCH_SIZE`` chunks, each encoded in a worker thread
    with ``normalize_embeddings=True``.

    Returns:
        A float32 matrix with rows in input order, or an empty ``(0, 0)``
        array when there is nothing to embed.
    """
    cleaned = clean_text_for_embedding(texts)
    if not cleaned:
        return np.empty((0, 0), dtype=np.float32)

    model = await _load_local_model()

    async def encode_span(lo: int, hi: int) -> np.ndarray:
        # encode() is CPU-bound; run it off the event loop thread
        raw = await asyncio.to_thread(model.encode, cleaned[lo:hi], normalize_embeddings=True)
        return raw.astype(np.float32, copy=False)

    spans = _chunk_indices(len(cleaned), EMBED_BATCH_SIZE)
    # gather() preserves argument order, so chunks come back by ascending offset
    chunks = list(await asyncio.gather(*(encode_span(lo, hi) for lo, hi in spans)))
    if not chunks:
        return np.empty((0, 0), dtype=np.float32)
    return np.vstack(chunks)
| # Resilient Wrapper (API first, CPU fallback) | |
async def _try_api_embeddings(texts: List[str], api_key: str, progress_callback=None) -> np.ndarray:
    """Run one async API embedding call bounded by ``API_EMBED_TIMEOUT_SECS``.

    ``progress_callback`` is accepted for signature parity with the other
    embedding helpers but is not used here. ``asyncio.wait_for`` raises a
    timeout error when the limit is exceeded.
    """
    return await asyncio.wait_for(
        compute_embeddings_deepinfra_async(texts, api_key),
        timeout=API_EMBED_TIMEOUT_SECS,
    )
async def compute_embeddings_resilient_async(
    texts: List[str],
    api_key: str,
    progress_callback=None,
) -> np.ndarray:
    """Embed ``texts`` via the API, optionally falling back to the local CPU model.

    Behaviour is selected by ``MODEL_FALLBACK_MODE``:

    * ``"local"`` - always use the local model.
    * ``"api"`` / ``"off"`` - always use the API; errors propagate.
    * anything else (``"auto"``) - use the API; every failure increments
      ``_API_FAILURES`` and, once ``API_MAX_FAILURES`` is reached,
      ``FALLBACK_ACTIVE`` is set and this and later calls use the local model.
      Failures below the threshold are re-raised to the caller.

    Args:
        texts: Strings to embed.
        api_key: DeepInfra API key.
        progress_callback: Optional callable receiving status strings.

    Returns:
        The embedding matrix produced by whichever backend handled the call.

    Raises:
        Exception: API errors in forced-API modes, or in auto mode before the
            failure threshold is reached.
    """
    global FALLBACK_ACTIVE, _API_FAILURES
    mode = MODEL_FALLBACK_MODE  # "auto" | "api" | "local" | "off"

    # Force-local mode
    if mode == "local":
        FALLBACK_ACTIVE = True
        if progress_callback:
            progress_callback("Local CPU embeddings (forced).")
        return await compute_embeddings_local_async(texts)

    # Force-API mode
    if mode == "api":
        FALLBACK_ACTIVE = False
        if progress_callback:
            progress_callback("Using API embeddings (forced).")
        return await _try_api_embeddings(texts, api_key, progress_callback)

    # Fallback disabled entirely
    if mode == "off":
        FALLBACK_ACTIVE = False
        return await _try_api_embeddings(texts, api_key, progress_callback)

    # Auto mode with the circuit already open: stay on the local model
    if FALLBACK_ACTIVE:
        if progress_callback:
            progress_callback("Local CPU embeddings (fallback active).")
        return await compute_embeddings_local_async(texts)

    try:
        vecs = await _try_api_embeddings(texts, api_key, progress_callback)
    except Exception:
        _API_FAILURES += 1
        if _API_FAILURES < API_MAX_FAILURES:
            # Below the threshold: let upstream decide (show an error, retry, ...)
            raise
        FALLBACK_ACTIVE = True
        # Best-effort notification. Only ordinary exceptions are suppressed so
        # task cancellation and interpreter-exit signals are never swallowed.
        try:
            from shiny import ui
            ui.notification_show(
                f"API failed after {_API_FAILURES} attempts. Switching to LOCAL CPU processing. "
                f"This will be much slower. Processing {len(texts)} items may take several minutes.",
                type="warning",
                duration=10,
            )
        except Exception:
            pass  # ui might not be available in all contexts
        if progress_callback:
            progress_callback("API unavailable. Falling back to local CPU.")
        return await compute_embeddings_local_async(texts)

    # Success: reset the failure counter
    _API_FAILURES = 0
    FALLBACK_ACTIVE = False
    return vecs
def clean_text_simple(text_list: List[str]) -> List[str]:
    """Normalise texts for lexical matching.

    Each item is converted to ``str``, stripped, has whitespace runs collapsed
    to a single space, loses every character that is not a word character,
    whitespace, comma, period or hyphen, and is lower-cased.
    """
    def _normalise(raw) -> str:
        collapsed = re.sub(r'\s+', ' ', str(raw).strip())
        return re.sub(r'[^\w\s,.-]', '', collapsed).lower()

    return [_normalise(item) for item in text_list]
def clean_text_for_embedding(text_list: List[str]) -> List[str]:
    """Lightly normalise texts before embedding.

    Each item is converted to ``str``, stripped, and has every run of
    whitespace collapsed to one space. Case and punctuation are left as-is.
    """
    return [re.sub(r'\s+', ' ', str(item).strip()) for item in text_list]
def run_fuzzy_match(input_list: List[str], target_list: List[str], clean: bool = True) -> Dict:
    """Match every input to its closest target using ``fuzz.ratio``.

    Args:
        input_list: Descriptions to match.
        target_list: Candidate descriptions.
        clean: When true, both lists are passed through ``clean_text_simple``
            first; the returned matches are then the cleaned target strings.

    Returns:
        ``{"match": [...], "score": [...]}`` with scores rescaled from the
        0-100 range to 0-1.
    """
    queries = clean_text_simple(input_list) if clean else input_list
    candidates = clean_text_simple(target_list) if clean else target_list

    best_hits = [
        process.extractOne(query, candidates, scorer=fuzz.ratio)
        for query in queries
    ]
    return {
        "match": [hit[0] for hit in best_hits],
        "score": [hit[1] / 100.0 for hit in best_hits],  # Normalize to 0-1
    }
def run_tfidf_match(input_list: List[str], target_list: List[str], clean: bool = True) -> Dict:
    """Match every input to the target with the highest TF-IDF cosine similarity.

    Args:
        input_list: Descriptions to match.
        target_list: Candidate descriptions.
        clean: When true, both lists are passed through ``clean_text_simple``
            first; the returned matches are then the cleaned target strings.

    Returns:
        ``{"match": [...], "score": [...]}`` with one entry per input.
    """
    if clean:
        input_list = clean_text_simple(input_list)
        target_list = clean_text_simple(target_list)

    # Fit the vocabulary on both sides so inputs and targets share one feature space
    vectorizer = TfidfVectorizer().fit(input_list + target_list)
    similarity = cosine_similarity(
        vectorizer.transform(input_list),
        vectorizer.transform(target_list),
    )

    # First maximum per row, same tie-breaking as np.argmax on each row
    winners = [int(np.argmax(scores_row)) for scores_row in similarity]
    return {
        "match": [target_list[col] for col in winners],
        "score": [float(similarity[row, col]) for row, col in enumerate(winners)],
    }
def run_embed_match(
    input_list: List[str],
    target_list: List[str],
    api_key: str,
    batch_size: int = EMBED_BATCH_SIZE,
    progress_callback=None,
    max_concurrency: int = MAX_CONCURRENCY,
    clean_input: bool = False,
    clean_target: bool = False,
) -> Dict:
    """Semantic matching through the DeepInfra API using a thread pool.

    Batches are capped at 1024 items (the limit this module assumes for
    DeepInfra) and up to ``max_concurrency`` requests run at once. Target
    embeddings are computed once; each input batch is then embedded and
    compared against them with cosine similarity.

    Args:
        input_list: Descriptions to match.
        target_list: Candidate descriptions; returned matches are taken from
            this original list, not from the cleaned copy.
        api_key: DeepInfra API key.
        batch_size: Requested batch size (capped at 1024).
        progress_callback: Optional callable receiving status strings.
        max_concurrency: Worker thread count.
        clean_input: Apply ``clean_text_for_embedding`` to the inputs.
        clean_target: Apply ``clean_text_for_embedding`` to the targets.

    Returns:
        ``{"match": [...], "score": [...]}`` in input order.
    """
    def _report(message: str) -> None:
        if progress_callback:
            progress_callback(message)

    queries = clean_text_for_embedding(input_list) if clean_input else input_list
    candidates = clean_text_for_embedding(target_list) if clean_target else target_list
    capped_batch = min(batch_size, 1024)

    # 1) Target embeddings, computed once per run
    _report("Computing target embeddings (concurrent)...")
    target_embeddings = compute_embeddings_parallel(
        candidates,
        api_key,
        batch_size=capped_batch,
        max_concurrency=max_concurrency,
        progress_callback=progress_callback,
    )

    # 2) Embed input batches concurrently and score each against the targets
    _report("Computing input embeddings (concurrent)...")

    def _match_span(span: Tuple[int, int]) -> Tuple[int, List[str], List[float]]:
        lo, hi = span
        vectors = compute_embeddings_deepinfra(queries[lo:hi], api_key)
        similarity = cosine_similarity(vectors, target_embeddings)
        winners = [int(np.argmax(scores_row)) for scores_row in similarity]
        names = [target_list[col] for col in winners]
        values = [float(similarity[row, col]) for row, col in enumerate(winners)]
        return lo, names, values

    spans = _chunk_indices(len(queries), capped_batch)
    total_batches = len(spans)
    per_span: Dict[int, Tuple[List[str], List[float]]] = {}

    with ThreadPoolExecutor(max_workers=max_concurrency) as pool:
        pending = [pool.submit(_match_span, span) for span in spans]
        for completed, finished in enumerate(as_completed(pending), start=1):
            offset, names, values = finished.result()
            per_span[offset] = (names, values)
            pct = int((completed / total_batches) * 100)
            _report(f"Matching: {pct}% ({completed}/{total_batches})")

    # 3) Stitch the batches back together in input order
    matches: List[str] = []
    scores: List[float] = []
    for offset in sorted(per_span):
        names, values = per_span[offset]
        matches += names
        scores += values

    _report("Finalizing results...")
    return {"match": matches, "score": scores}
async def run_embed_match_async(
    input_list: List[str],
    target_list: List[str],
    api_key: str,
    batch_size: int = EMBED_BATCH_SIZE,
    progress_callback=None,
    max_concurrency: int = MAX_CONCURRENCY,
    clean_input: bool = False,
    clean_target: bool = False,
) -> Dict:
    """Semantic matching using asyncio and the resilient embedding wrapper.

    Both targets and inputs are embedded in batches of at most
    ``min(batch_size, 1024)`` items with up to ``max_concurrency`` requests in
    flight. Target lists that fit in one batch are still sent as a single
    call; larger ones are split so they respect the same batch cap as the
    inputs instead of going out as one oversized request.

    Args:
        input_list: Descriptions to match.
        target_list: Candidate descriptions; returned matches are taken from
            this original list, not from the cleaned copy.
        api_key: DeepInfra API key.
        batch_size: Requested batch size (capped at 1024).
        progress_callback: Optional callable receiving status strings.
        max_concurrency: Maximum concurrent embedding requests.
        clean_input: Apply ``clean_text_for_embedding`` to the inputs.
        clean_target: Apply ``clean_text_for_embedding`` to the targets.

    Returns:
        ``{"match": [...], "score": [...]}`` in input order.
    """
    input_list_clean = clean_text_for_embedding(input_list) if clean_input else input_list
    target_list_clean = clean_text_for_embedding(target_list) if clean_target else target_list

    capped_batch = min(batch_size, 1024)
    sem = asyncio.Semaphore(max_concurrency)

    # 1) Targets once, batched when they do not fit in a single request
    if progress_callback:
        progress_callback("Computing target embeddings (async concurrent)...")
    target_slices = _chunk_indices(len(target_list_clean), capped_batch)
    if len(target_slices) <= 1:
        target_embeddings = await compute_embeddings_resilient_async(
            target_list_clean,
            api_key,
            progress_callback=progress_callback,
        )
    else:
        async def embed_targets(start: int, end: int) -> np.ndarray:
            async with sem:
                return await compute_embeddings_resilient_async(
                    target_list_clean[start:end], api_key, progress_callback
                )

        # gather() returns results in argument order, i.e. by ascending offset
        target_parts = await asyncio.gather(
            *(embed_targets(s, e) for (s, e) in target_slices)
        )
        target_embeddings = np.vstack(list(target_parts))

    # 2) Inputs concurrently, scoring each batch against the targets
    if progress_callback:
        progress_callback("Computing input embeddings (async concurrent)...")
    input_slices = _chunk_indices(len(input_list_clean), capped_batch)
    results_match: Dict[int, List[str]] = {}
    results_score: Dict[int, List[float]] = {}
    completed = 0
    total_batches = len(input_slices)

    async def worker(start: int, end: int):
        nonlocal completed
        print(f"[async] launch input slice {start}:{end}")
        async with sem:
            emb = await compute_embeddings_resilient_async(
                input_list_clean[start:end], api_key, progress_callback
            )
            print(f"[async] done input slice {start}:{end}")
            sim = cosine_similarity(emb, target_embeddings)
            batch_matches: List[str] = []
            batch_scores: List[float] = []
            for row in sim:
                idx = int(np.argmax(row))
                batch_matches.append(target_list[idx])
                batch_scores.append(float(row[idx]))
            results_match[start] = batch_matches
            results_score[start] = batch_scores
            completed += 1
            if progress_callback:
                pct = int((completed / total_batches) * 100)
                progress_callback(f"Matching: {pct}% ({completed}/{total_batches})")

    await asyncio.gather(*(worker(s, e) for (s, e) in input_slices))

    # 3) Assemble results in input order
    matches: List[str] = []
    scores: List[float] = []
    for start in sorted(results_match.keys()):
        matches.extend(results_match[start])
        scores.extend(results_score[start])

    if progress_callback:
        progress_callback("Finalizing results...")
    return {"match": matches, "score": scores}
def get_sample_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the demo datasets as ``(input_data, target_data)``.

    ``input_data`` has columns ``id`` (1-25) and ``description``;
    ``target_data`` has columns ``code`` (``A001``-``A025``) and ``reference``.
    """
    descriptions = [
        "apple juice", "chicken breast grilled", "whole milk",
        "orange juice fresh", "bread whole wheat", "cheddar cheese", "scrambled eggs",
        "pasta with tomato sauce", "beef steak medium rare", "yogurt plain",
        "brown rice cooked", "salmon fillet baked",
        "vegetable soup", "fruit salad mixed", "pizza slice pepperoni",
        "ice cream vanilla", "coffee with cream",
        "energy drink", "protein bar chocolate", "trail mix nuts", "smoothie berry",
        "xyz123 test item", "random text here", "unknown food item 999", "synthetic compound ABC",
    ]
    references = [
        "Apple juice, unsweetened, bottled, without added ascorbic acid",
        "Chicken, broilers or fryers, breast, meat only, cooked, grilled",
        "Milk, whole, 3.25% milkfat, with added vitamin D",
        "Orange juice, raw, includes from concentrate, fortified with calcium",
        "Bread, whole-wheat, commercially prepared",
        "Cheese, cheddar, sharp, sliced",
        "Egg, whole, cooked, scrambled",
        "Pasta with tomato-based sauce",
        "Beef, short loin, t-bone steak, separable lean and fat, trimmed to 1/8\" fat, all grades, cooked, grilled",
        "Yogurt, plain, whole milk",
        "Rice, brown, medium-grain, cooked",
        "Fish, salmon, Atlantic, farmed, cooked, dry heat",
        "Soup, vegetable with beef broth, canned, prepared with equal volume water",
        "Fruit salad, (pineapple and papaya and banana and guava), tropical, canned, heavy syrup, solids and liquids",
        "Pizza, meat and vegetable topping, regular crust, frozen, cooked",
        "Ice creams, vanilla",
        "Coffee, brewed from grounds, prepared with tap water, decaffeinated",
        "Beverages, Energy drink, RED BULL",
        "Snacks, granola bar, chocolate chip",
        "Snacks, trail mix, regular, unsalted",
        "Beverages, Smoothie, strawberry",
        "Water, tap, municipal",
        "Crackers, standard snack-type, regular",
        "Cookies, chocolate chip, commercially prepared, regular",
        "Candies, milk chocolate",
    ]

    # Sample input data
    input_data = pd.DataFrame({"id": range(1, 26), "description": descriptions})
    # Sample target data
    codes = [f"A{number:03d}" for number in range(1, 26)]
    target_data = pd.DataFrame({"code": codes, "reference": references})
    return input_data, target_data
| # Create Shiny app with modern theme | |
| ''' LEGACY LAYOUT (disabled) | |
| app_ui = ui.page_sidebar( | |
| # Sidebar must be the first positional argument | |
| ui.sidebar( | |
| ui.h5("Quick Start"), | |
| ui.input_action_button( | |
| "load_sample", | |
| "Load Sample Dataset", | |
| class_="btn btn-success w-100" | |
| ), | |
| ui.div(id="sample_status", class_="mt-2"), | |
| ui.hr(), | |
| ui.h5("Upload Data"), | |
| ui.input_file("input_file", "Input CSV", accept=[".csv"], multiple=False), | |
| ui.div(id="input_status", class_="mt-2"), | |
| ui.input_file("target_file", "Target CSV", accept=[".csv"], multiple=False), | |
| ui.div(id="target_status", class_="mt-2"), | |
| ui.hr(), | |
| ui.output_ui("sidebar_results_summary_block"), | |
| open="open", | |
| ), | |
| # Then page contents (positional) | |
| ui.tags.head( | |
| ui.tags.link(rel="stylesheet", href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap"), | |
| ui.tags.link(rel="stylesheet", href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.10.0/font/bootstrap-icons.css"), | |
| ui.tags.link(rel="stylesheet", href="https://unpkg.com/tabulator-tables@5.5.2/dist/css/tabulator.min.css"), | |
| ui.tags.style(custom_css_min), | |
| ui.tags.script(src="https://unpkg.com/tabulator-tables@5.5.2/dist/js/tabulator.min.js"), | |
| # JavaScript for tooltips and table features | |
| ui.tags.script(""" | |
| // Initialize tooltips | |
| document.addEventListener('DOMContentLoaded', () => { | |
| const tooltipTriggerList = [].slice.call(document.querySelectorAll('[data-bs-toggle="tooltip"]')); | |
| tooltipTriggerList.map(function (tooltipTriggerEl) { | |
| return new bootstrap.Tooltip(tooltipTriggerEl); | |
| }); | |
| }); | |
| // Toggle compact mode for results table | |
| document.addEventListener('change', function(e){ | |
| if(e.target && e.target.id === 'compact_density'){ | |
| const c = document.getElementById('results_container'); | |
| if(c){ e.target.checked ? c.classList.add('compact') : c.classList.remove('compact'); } | |
| } | |
| }); | |
| // NEW: Apply column alignment classes after table render | |
| document.addEventListener('shiny:value', function(ev) { | |
| if (ev.detail && ev.detail.name === 'results_table') { | |
| setTimeout(() => { | |
| const table = document.querySelector('#results_container table'); | |
| if (!table) return; | |
| // Apply compact mode if needed | |
| const compact = window.Shiny && Shiny.shinyapp && Shiny.shinyapp.$inputValues['compact_density']; | |
| const container = document.querySelector('#results_container'); | |
| if (container) container.classList.toggle('compact', !!compact); | |
| // Fix column alignment | |
| const headerCells = Array.from(table.querySelectorAll('thead th')); | |
| const bodyRows = Array.from(table.querySelectorAll('tbody tr')); | |
| // Find the status column index | |
| let statusColIdx = -1; | |
| headerCells.forEach((th, idx) => { | |
| const header = th.innerText || ''; | |
| if (header.toLowerCase() === 'status') { | |
| statusColIdx = idx; | |
| } | |
| }); | |
| headerCells.forEach((th, idx) => { | |
| const header = th.innerText || ''; | |
| const isNum = /score|similarity|count|percent|^\\d+/.test(header.toLowerCase()); | |
| th.classList.toggle('num', isNum); | |
| th.classList.toggle('text', !isNum); | |
| bodyRows.forEach(tr => { | |
| const td = tr.children[idx]; | |
| if (!td) return; | |
| const txt = td.innerText || ''; | |
| const looksNum = isNum || /^[\\d\\s\\.\\,\\-]+%?$/.test(txt); | |
| td.classList.toggle('num', looksNum); | |
| td.classList.toggle('text', !looksNum); | |
| // Add scorebar class for bar columns | |
| if (/bar$/i.test(header)) { | |
| td.classList.add('scorebar'); | |
| } | |
| }); | |
| }); | |
| // Apply no-match-row class to rows with NO MATCH status | |
| if (statusColIdx >= 0) { | |
| bodyRows.forEach(tr => { | |
| const statusCell = tr.children[statusColIdx]; | |
| if (statusCell && statusCell.innerText.trim() === 'NO MATCH') { | |
| tr.classList.add('no-match-row'); | |
| } | |
| }); | |
| } | |
| }, 100); | |
| } | |
| }); | |
| // Ensure Step 1 preview tables stick left and first column stays narrow | |
| function adjustPreview(id){ | |
| var root = document.getElementById(id); | |
| if(!root) return; | |
| // Remove auto-centering and allow full width on wrappers | |
| var nodes = [root].concat(Array.from(root.querySelectorAll('.shiny-data-frame, .html-fill-container, .html-fill-item, .table-responsive, table, div'))); | |
| nodes.forEach(function(el){ | |
| try { | |
| el.style.marginLeft = '0'; | |
| el.style.marginRight = '0'; | |
| el.style.width = '100%'; | |
| el.style.maxWidth = 'none'; | |
| } catch(e){} | |
| }); | |
| var table = root.querySelector('table'); | |
| if(!table) return; | |
| table.style.width = '100%'; | |
| table.style.tableLayout = 'auto'; | |
| // Add/adjust colgroup for first column | |
| var colgroup = table.querySelector('colgroup'); | |
| var colCount = (table.querySelectorAll('thead th').length) || (table.querySelectorAll('tbody tr:first-child td').length); | |
| if(!colgroup && colCount){ | |
| colgroup = document.createElement('colgroup'); | |
| for (var i=0;i<colCount;i++){ colgroup.appendChild(document.createElement('col')); } | |
| table.insertBefore(colgroup, table.firstChild); | |
| } | |
| if(colgroup && colgroup.firstElementChild){ | |
| var c0 = colgroup.firstElementChild; | |
| c0.style.width = '36px'; | |
| c0.style.minWidth = '36px'; | |
| c0.style.maxWidth = '40px'; | |
| } | |
| // Set widths on first TH/TDs as well | |
| var th0 = table.querySelector('thead th:first-child'); | |
| if(th0){ th0.style.width = '36px'; th0.style.minWidth = '36px'; th0.style.whiteSpace = 'nowrap'; th0.style.textAlign = 'center'; } | |
| table.querySelectorAll('tbody td:first-child').forEach(function(td){ | |
| td.style.width = '36px'; | |
| td.style.minWidth = '36px'; | |
| td.style.whiteSpace = 'nowrap'; | |
| td.style.textAlign = 'center'; | |
| }); | |
| } | |
| document.addEventListener('shiny:value', function(ev){ | |
| if (ev.detail && (ev.detail.name === 'input_col_preview' || ev.detail.name === 'target_col_preview')){ | |
| setTimeout(function(){ adjustPreview(ev.detail.name); }, 0); | |
| } | |
| }); | |
| document.addEventListener('DOMContentLoaded', function(){ | |
| adjustPreview('input_col_preview'); | |
| adjustPreview('target_col_preview'); | |
| }); | |
| """) | |
| , | |
| ui.tags.script(""" | |
| (function(){ | |
| // Debounced search mirror -> search_debounced | |
| var _t; | |
| document.addEventListener('input', function(e){ | |
| if (e.target && e.target.id === 'search_filter'){ | |
| clearTimeout(_t); | |
| var v = e.target.value; | |
| _t = setTimeout(function(){ | |
| if (window.Shiny && Shiny.setInputValue){ | |
| Shiny.setInputValue('search_debounced', v, {priority:'event'}); | |
| } | |
| }, 200); | |
| } | |
| }); | |
| })(); | |
| """) | |
| ), | |
| ui.navset_bar( | |
| ui.nav_panel( | |
| "Data & Configure", | |
| ui.card( | |
| ui.card_header("Matching Setup"), | |
| ui.card_body( | |
| ui.row( | |
| ui.column(6, | |
| ui.input_select("input_column", "Input Column:", choices=[], selected=None), | |
| ui.output_table("input_col_preview") | |
| ), | |
| ui.column(6, | |
| ui.input_select("target_column", "Target Column:", choices=[], selected=None), | |
| ui.output_table("target_col_preview") | |
| ) | |
| ), | |
| ui.hr(), | |
| ui.row( | |
| ui.column(6, | |
| ui.p(ui.strong("Method:"), " Semantic Embedding (", ui.a("thenlper/gte-large", href="https://huggingface.co/thenlper/gte-large", target="_blank"), ")") | |
| ), | |
| ui.column(3, | |
| ui.div( | |
| ui.span( | |
| "Similarity Threshold ", | |
| ui.tags.i( | |
| class_="bi bi-question-circle text-muted", | |
| **{"data-bs-toggle": "tooltip", "data-bs-placement": "top", "title": "Items below this score are marked as NO MATCH"} | |
| ) | |
| ), | |
| ui.input_slider("threshold", "", min=0.0, max=1.0, value=0.85, step=0.05) | |
| ) | |
| ), | |
| ui.column(3, | |
| ui.input_checkbox("clean_text", "Apply text cleaning", value=False) | |
| ) | |
| ), | |
| ui.br(), | |
| ui.input_action_button("run_matching", "Start Mapping", class_="btn btn-primary", style="padding: 0.4rem 1rem;") | |
| ) | |
| ), | |
| ui.br(), | |
| ui.output_ui("center_progress_overlay") | |
| ), | |
| ui.nav_panel( | |
| "Results", | |
| ui.div( | |
| ui.div(id="process_status"), | |
| ui.div(id="process_progress"), | |
| ui.div(id="process_summary"), | |
| ui.div(id="method_chips"), | |
| style="margin-bottom: 12px;" | |
| ), | |
| ui.card( | |
| ui.card_header("Filter & View"), | |
| ui.card_body( | |
| ui.row( | |
| ui.column(6, ui.input_text("search_filter", "Search:", placeholder="Type to filter...")), | |
| ui.column(6, | |
| ui.div( | |
| ui.input_checkbox("show_no_match", "Only NO MATCH", value=False), | |
| ui.input_checkbox("sort_by_score", "Sort by score", value=True), | |
| ui.input_checkbox("compact_density", "Compact", value=False), | |
| ui.input_checkbox("use_grid", "Interactive grid", value=True), | |
| class_="d-flex gap-3 flex-wrap justify-content-end" | |
| ) | |
| ), | |
| ) | |
| ) | |
| ), | |
| ui.output_ui("results_tabulator"), | |
| ui.div(ui.output_table("results_table"), class_="results-container", id="results_container"), | |
| ui.br(), | |
| ui.card( | |
| ui.card_header("Export"), | |
| ui.card_body( | |
| ui.div( | |
| ui.download_button("download_results", ui.span(ui.tags.i(class_="bi bi-download me-2"), "Export All Results"), class_="btn btn-success"), | |
| ui.download_button("download_no_match", ui.span(ui.tags.i(class_="bi bi-exclamation-triangle me-2"), "Export NO MATCH Only"), class_="btn btn-outline-warning"), | |
| ui.input_action_button("reset_analysis", ui.span(ui.tags.i(class_="bi bi-arrow-repeat me-2"), "Start New Matching"), class_="btn btn-primary"), | |
| class_="btn-group-export" | |
| ) | |
| ) | |
| ) | |
| ), | |
| ui.nav_panel( | |
| "Visualizations", | |
| ui.card( | |
| ui.card_header("Similarity Score Distribution"), | |
| ui.card_body( | |
| ui.row( | |
| ui.column(6, ui.input_select("viz_type", "Chart", choices={"hist":"Histogram","cdf":"Cumulative","threshold":"Threshold curve","match_pie":"Matches vs No Matches"}, selected="hist")), | |
| ), | |
| ui.output_plot("score_hist") | |
| ) | |
| ) | |
| ), | |
| ui.nav_panel( | |
| "About", | |
| ui.card( | |
| ui.card_header("About Food Mapper"), | |
| ui.card_body( | |
| ui.h4("Food Mapper", class_="mb-3"), | |
| ui.p("A state-of-the-art tool for matching food descriptions across different databases using advanced natural language processing techniques.", class_="lead"), | |
| ui.hr(), | |
| ui.h5(ui.tags.i(class_="bi bi-stars me-2"), "Key Features"), | |
| ui.tags.ul( | |
| ui.tags.li(ui.HTML("Semantic embedding via <a href=\"https://huggingface.co/thenlper/gte-large\" target=\"_blank\">thenlper/gte-large</a>")), | |
| ui.tags.li("Batch processing with progress tracking"), | |
| ui.tags.li("Clear results table with spreadsheet-like view"), | |
| ui.tags.li("Export matched results and NO MATCH items") | |
| ) | |
| ) | |
| ) | |
| ), | |
| id="main_tabs", | |
| title="Food Mapper" | |
| ), | |
| ui.br(), | |
| ui.div( | |
| ui.strong("Western Human Nutrition Research Center"), | |
| " | Davis, CA", | |
| ui.br(), | |
| "Diet, Microbiome and Immunity Research Unit", | |
| ui.br(), | |
| "United States Department of Agriculture | Agricultural Research Service", | |
| ), | |
| theme=shinyswatch.theme.yeti() | |
| ) | |
| ''' # end legacy layout | |
def make_sidebar():
    """Build the workflow sidebar.

    The sidebar holds, top to bottom: a heading with a help link, the two CSV
    upload inputs (each followed by an empty status placeholder), a
    sample-data shortcut button, a server-rendered results summary, and a
    server-rendered navigation button.

    Returns:
        The ``ui.sidebar`` component, initially open, with id ``main_sidebar``.
    """
    # Small "?" icon link placed next to the heading text.
    help_link = ui.input_action_link(
        "show_upload_help",
        ui.tags.i(class_="bi bi-question-circle ms-2", style="font-size: 0.8rem;"),
        class_="text-muted",
        style="text-decoration: none;",
    )

    children = [
        ui.div(
            ui.h5(
                "Upload Your Data",
                help_link,
                class_="d-flex justify-content-between align-items-center",
            )
        ),
        ui.input_file("input_file", "Input CSV", accept=[".csv"], multiple=False),
        ui.div(id="input_status", class_="mt-2"),
        ui.input_file("target_file", "Target CSV", accept=[".csv"], multiple=False),
        ui.div(id="target_status", class_="mt-2"),
        # Sample data button for easier mobile access
        ui.div(
            ui.input_action_button(
                "sidebar_sample_data",
                "Or use sample data",
                class_="btn btn-outline-primary btn-sm w-100 mt-3",
            ),
            class_="mb-3",
        ),
        ui.hr(),
        ui.output_ui("sidebar_results_summary_block"),
        ui.hr(),
        # Dynamic navigation button that changes based on current step
        ui.output_ui("sidebar_navigation_button"),
    ]
    return ui.sidebar(*children, open="open", id="main_sidebar")
def make_footer():
    """Return the institutional footer block (a ``div`` with class ``footer``).

    The footer is three lines of text separated by ``<br>`` tags; the first
    line starts with the research center name in bold.
    """
    footer_lines = (
        (ui.strong("Western Human Nutrition Research Center"), " | Davis, CA"),
        ("Diet, Microbiome and Immunity Research Unit",),
        ("United States Department of Agriculture | Agricultural Research Service",),
    )

    children = []
    for position, parts in enumerate(footer_lines):
        if position > 0:
            # Line break between consecutive lines, none before the first.
            children.append(ui.br())
        children.extend(parts)

    return ui.div(*children, class_="footer")
# Rebuild app UI with top navbar and nested tabs structure.
# The tree is assembled from small private builders (one per tab/page) so the
# final ``app_ui`` expression reads as an outline of the application.

# Inline style shared by the two Step 1 preview wrappers.
_PREVIEW_WRAP_STYLE = (
    "margin: 0 !important; padding: 0 !important; text-align: left !important; width: 100% !important;"
)

# JS helpers for Step 1 preview layout and narrow first column
_WORKFLOW_PAGE_JS = """
(function(){
  // Initialize Bootstrap tooltips on demand
  function initTooltips(){
    if (window.bootstrap && bootstrap.Tooltip) {
      document.querySelectorAll('[data-bs-toggle="tooltip"]').forEach(function(el){
        try { new bootstrap.Tooltip(el, {container:'body'}); } catch(e){}
      });
    }
  }
  document.addEventListener('DOMContentLoaded', initTooltips);
  document.addEventListener('shiny:value', initTooltips);
  // Threshold value badge removed; rely on slider only
  function adjustPreview(id){
    var root = document.getElementById(id);
    if(!root) return;
    // Ensure containers don't center and take full width
    var containers = root.querySelectorAll('.shiny-data-frame, .html-fill-container, .html-fill-item, .gridjs-container, .gridjs-wrapper');
    containers.forEach(function(el){
      try {
        el.style.marginLeft = '0';
        el.style.marginRight = '0';
        el.style.width = '100%';
        el.style.maxWidth = 'none';
      } catch(e){}
    });
    var table = root.querySelector('.gridjs-table');
    if(!table) return;
    table.style.width = '100%';
    table.style.tableLayout = 'auto';
    // Force first column narrow across header and body (Grid.js)
    var th0 = table.querySelector('thead.gridjs-thead th.gridjs-th:first-child');
    if (th0) {
      th0.style.setProperty('width','36px','important');
      th0.style.setProperty('min-width','36px','important');
      th0.style.setProperty('max-width','40px','important');
      th0.style.setProperty('white-space','nowrap','important');
      th0.style.setProperty('text-align','center','important');
    }
    table.querySelectorAll('tbody.gridjs-tbody td.gridjs-td:first-child').forEach(function(td){
      td.style.setProperty('width','36px','important');
      td.style.setProperty('min-width','36px','important');
      td.style.setProperty('max-width','40px','important');
      td.style.setProperty('white-space','nowrap','important');
      td.style.setProperty('text-align','center','important');
    });
  }
  document.addEventListener('shiny:value', function(ev){
    if (ev.detail && (ev.detail.name === 'input_col_preview' || ev.detail.name === 'target_col_preview')){
      setTimeout(function(){ adjustPreview(ev.detail.name); }, 0);
    }
  });
  document.addEventListener('DOMContentLoaded', function(){
    adjustPreview('input_col_preview');
    adjustPreview('target_col_preview');
  });
})();
"""


def _icon_label(icon_class, text):
    """Return a ``span`` holding a Bootstrap icon followed by ``text``."""
    return ui.span(ui.tags.i(class_=icon_class), text)


def _build_tutorial_panel():
    """Build the "Tutorial" tab: quick start, data requirements, tool overview."""
    quick_start_column = ui.column(
        6,
        ui.h6("Quick Start", class_="mb-2"),
        ui.tags.ul(
            ui.tags.li("Upload Input CSV"),
            ui.tags.li("Upload Target CSV"),
            ui.tags.li("Pick columns → Start"),
            class_="mb-2",
        ),
        ui.tags.small(
            ui.tags.i(class_="bi bi-upload me-1"),
            "Use the sidebar on the left to add your files.",
            class_="text-muted d-block mb-2",
        ),
        ui.div(
            ui.tags.small("No data? ", class_="text-muted"),
            ui.input_action_button(
                "load_sample",
                "Try with sample data",
                class_="btn btn-outline-primary btn-sm",
            ),
            class_="mt-2 mb-3",
        ),
    )

    requirements_column = ui.column(
        6,
        ui.h6(
            ui.span("Data Requirements"),
            ui.input_action_link(
                "show_requirements",
                ui.tags.i(class_="bi bi-info-circle ms-2"),
                class_="text-primary",
            ),
            class_="mb-2",
        ),
        ui.tags.ul(
            ui.tags.li("CSV files with headers"),
            ui.tags.li("Input: items to match"),
            ui.tags.li("Target: reference list"),
            class_="mb-2",
        ),
    )

    feature_bullets = (
        "Semantic matching using state-of-the-art embeddings",
        "Adjustable similarity threshold for fine-tuning",
        "Interactive visualizations and data export",
        "Text cleaning options for better matches",
    )
    overview_block = ui.div(
        ui.h6("What This Tool Does", class_="mb-2"),
        ui.p(
            "This application matches text descriptions between two datasets using AI-powered semantic analysis. "
            "Upload your input items and target reference list, select the columns to match, "
            "and the tool will find the best semantic matches based on meaning rather than exact text.",
            class_="text-muted small",
        ),
        ui.h6("Key Features", class_="mb-2 mt-3"),
        ui.tags.ul(
            *[ui.tags.li(bullet, class_="small text-muted") for bullet in feature_bullets]
        ),
        ui.div(id="sample_status", class_="mt-2"),
    )

    return ui.nav_panel(
        "Tutorial",
        ui.card(
            ui.card_header("Get Started"),
            ui.card_body(
                ui.row(
                    quick_start_column,
                    requirements_column,
                    class_="g-4 align-items-start",
                ),
                ui.hr(class_="my-3"),
                overview_block,
            ),
        ),
    )


def _build_column_picker(*, select_id, select_label, switch_id, switch_label, preview_id, wrapper_id):
    """Build one half-width Step 1 column: column selector, cleaning switch, preview grid."""
    return ui.column(
        6,
        ui.input_select(select_id, select_label, choices=[], selected=None),
        ui.input_switch(switch_id, switch_label, value=False),
        ui.div(
            ui.output_data_frame(preview_id),
            id=wrapper_id,
            style=_PREVIEW_WRAP_STYLE,
        ),
    )


def _build_threshold_section():
    """Build the Step 1 threshold area: method info, slider + run button, and a note."""
    # Left column - Method info
    method_info = ui.column(
        3,
        ui.div(
            ui.div(
                ui.tags.i(class_="bi bi-cpu text-primary me-1"),
                ui.strong("Method"),
                class_="mb-1 text-center small",
            ),
            ui.div("Semantic Embedding", class_="text-muted mb-2 text-center small"),
            ui.div(
                ui.tags.i(class_="bi bi-diagram-3 text-primary me-1"),
                ui.strong("Model"),
                class_="mb-1 text-center small",
            ),
            ui.div(
                ui.a(
                    "thenlper/gte-large",
                    href="https://huggingface.co/thenlper/gte-large",
                    target="_blank",
                    class_="text-primary small",
                ),
                class_="text-center small",
            ),
            class_="border-end pe-3 py-2",
        ),
    )

    # Center column - Threshold slider and button stacked and centered
    slider_and_button = ui.column(
        6,
        ui.div(
            ui.div(
                ui.div(
                    ui.input_slider("threshold", "", min=0.0, max=1.0, value=0.85, step=0.01),
                    class_="mx-auto",
                    style="max-width: 400px;",
                ),
                class_="d-flex justify-content-center mb-1",
            ),
            ui.div(
                ui.input_action_button(
                    "run_matching",
                    "Start Mapping",
                    class_="btn btn-primary px-5 shadow-sm",
                    style="padding: 0.5rem 2rem;",
                ),
                class_="text-center mt-2",
            ),
        ),
    )

    # Right column - Threshold note
    note_lines = (
        ("Items below threshold", "text-muted lh-sm"),
        ("marked as NO MATCH.", "text-muted lh-sm"),
        ("Adjust for performance", "text-muted lh-sm mt-1"),
        ("for your dataset.", "text-muted lh-sm"),
    )
    threshold_note = ui.column(
        3,
        ui.div(
            ui.div(
                ui.tags.i(class_="bi bi-info-circle text-muted me-1"),
                ui.tags.small(ui.strong("Note"), class_="text-muted"),
                class_="mb-1",
            ),
            ui.tags.small(
                *[ui.div(text, class_=css) for text, css in note_lines]
            ),
            class_="border-start ps-3 py-2",
        ),
    )

    return ui.div(
        ui.h5("Similarity Threshold", class_="text-center mb-2"),
        # Three-column layout for professional appearance (centered slider and button)
        ui.row(method_info, slider_and_button, threshold_note),
        class_="matching-config-card",
    )


def _build_configure_panel():
    """Build the "Step 1: Data & Configure" tab."""
    input_picker = _build_column_picker(
        select_id="input_column",
        select_label="Input Column:",
        switch_id="clean_input",
        switch_label="Apply text cleaning to input",
        preview_id="input_col_preview",
        wrapper_id="input_preview_wrap",
    )
    target_picker = _build_column_picker(
        select_id="target_column",
        select_label="Target Column:",
        switch_id="clean_target",
        switch_label="Apply text cleaning to target",
        preview_id="target_col_preview",
        wrapper_id="target_preview_wrap",
    )
    return ui.nav_panel(
        "Step 1: Data & Configure",
        ui.card(
            ui.card_header("Matching Setup"),
            ui.card_body(
                ui.row(input_picker, target_picker),
                ui.hr(),
                _build_threshold_section(),
            ),
        ),
        ui.br(),
        ui.output_ui("center_progress_overlay"),
    )


def _build_results_panel():
    """Build the "Step 2: Results" tab with its two pill sub-tabs."""
    # Empty placeholders addressed by id from elsewhere in the app.
    status_ids = ("process_status", "process_progress", "process_summary", "method_chips")

    mappings_tab = ui.nav_panel(
        "View Mappings",
        ui.div(
            *[ui.div(id=placeholder_id) for placeholder_id in status_ids],
            style="margin-bottom: 12px;",
        ),
        ui.output_ui("results_tabulator"),
        # Add tip below the results table with clickable link
        ui.div(
            ui.tags.small(
                ui.tags.i(class_="bi bi-lightbulb me-1"),
                "Tip: Return to ",
                ui.input_action_link(
                    "goto_step1_from_tip",
                    "Step 1: Data & Configure",
                    class_="text-primary",
                ),
                " to adjust threshold or column selections, then re-run mapping.",
                class_="text-muted",
            ),
            class_="mt-3 mb-3 text-center",
        ),
        ui.div(
            ui.download_button(
                "download_all_data",
                _icon_label("bi bi-download me-2", "Export All Data"),
                class_="btn btn-success",
            ),
            ui.download_button(
                "download_matches",
                _icon_label("bi bi-file-earmark-check me-2", "Export Matches"),
                class_="btn btn-info",
            ),
            ui.input_action_button(
                "reset_analysis",
                _icon_label("bi bi-arrow-repeat me-2", "Start New Mapping"),
                class_="btn btn-primary",
            ),
            class_="btn-group-export",
        ),
    )

    chart_choices = {
        "density": "Density Plot - Score distribution shape",
        "histogram": "Histogram - Frequency of score ranges",
        "threshold": "Threshold Analysis - Match rate at different cutoffs"
        # HIDDEN VISUALIZATIONS - Uncomment lines below to restore
        # "box": "Box Plot - Quartiles & outliers",
        # "violin": "Violin Plot - Match vs No-match comparison",
        # "scatter": "Scatter Plot - Sequential patterns",
        # "ecdf": "Cumulative Distribution - Probability curve",
        # "sunburst": "Match Breakdown - Hierarchical match statistics"
    }
    visualizations_tab = ui.nav_panel(
        "Visualizations",
        ui.card(
            ui.card_header("Interactive Visualizations"),
            ui.card_body(
                ui.row(
                    ui.column(
                        7,
                        ui.input_select(
                            "plotly_viz_type",
                            "Chart Type",
                            choices=chart_choices,
                            selected="density",
                            width="100%",
                        ),
                    ),
                    ui.column(
                        5,
                        ui.input_checkbox("show_threshold_line", "Show threshold line", value=True),
                    ),
                ),
                ui.output_ui("chart_description"),
                output_widget("plotly_viz"),
            ),
        ),
    )

    return ui.nav_panel(
        "Step 2: Results",
        ui.navset_pill(mappings_tab, visualizations_tab, id="results_subtabs"),
    )


def _build_workflow_page():
    """Build the "Semantic Embedder" navbar page: sidebar plus the three workflow tabs."""
    return ui.nav_panel(
        "Semantic Embedder",
        ui.page_sidebar(
            make_sidebar(),
            ui.tags.style(custom_css_min),
            ui.navset_tab(
                _build_tutorial_panel(),
                _build_configure_panel(),
                _build_results_panel(),
                id="workflow_tabs",
            ),
            ui.tags.script(_WORKFLOW_PAGE_JS),
            make_footer(),
        ),
    )


def _about_heading(icon_class, text, heading_class):
    """Return an About-page section heading (``h5``) with a leading icon."""
    return ui.h5(ui.tags.i(class_=icon_class), text, class_=heading_class)


def _about_tile(icon_class, heading, headline, caption):
    """Return one bordered, centered highlight tile for the "Our Solution" grid."""
    return ui.div(
        ui.tags.i(class_=icon_class),
        ui.h6(heading, class_="mb-2"),
        ui.p(
            headline,
            ui.br(),
            ui.tags.small(caption, class_="text-muted"),
            class_="small",
        ),
        class_="text-center p-3 border rounded mb-3",
    )


def _build_about_page():
    """Build the "About" navbar page."""
    # Hero Section
    hero = ui.div(
        ui.h2("Food Mapper", class_="text-center mb-3"),
        ui.p(
            "Advanced semantic matching tool for aligning food descriptions across nutritional databases",
            class_="lead text-center text-muted mb-4",
        ),
        ui.hr(class_="my-4"),
    )

    # Overview Section
    overview = ui.div(
        _about_heading("bi bi-info-circle me-2", "Overview", "mb-3"),
        ui.p(
            "Food Mapper solves a major problem in nutritional research: accurately matching "
            "food items between different databases that use varying naming conventions and descriptions. "
            "This tool uses neural language processing to find semantic matches "
            "based on meaning rather than exact text matching.",
            class_="mb-4",
        ),
    )

    # Problem Statement
    naming_examples = (
        '"2% milk" vs "Milk, reduced fat, 2% milkfat"',
        '"OJ" vs "Orange juice, raw"',
        '"Whole wheat bread" vs "Bread, whole-wheat, commercially prepared"',
    )
    challenge = ui.div(
        _about_heading("bi bi-question-circle me-2", "The Challenge", "mb-3 mt-4"),
        ui.p(
            "Nutritional databases often describe the same foods differently:",
            class_="mb-2",
        ),
        ui.tags.ul(
            *[ui.tags.li(example) for example in naming_examples],
            class_="mb-3",
        ),
        ui.p(
            "Traditional text matching fails to recognize these as the same items, leading to "
            "incomplete or inaccurate nutritional analyses.",
            class_="text-muted mb-4",
        ),
    )

    # Solution Section
    solution = ui.div(
        _about_heading("bi bi-lightbulb me-2", "Our Solution", "mb-3 mt-4"),
        ui.p(
            "Food Mapper uses semantic embeddings to understand the meaning behind food descriptions, "
            "enabling accurate matches even when the exact wording differs.",
            class_="mb-3",
        ),
        ui.div(
            ui.row(
                ui.column(
                    6,
                    _about_tile(
                        "bi bi-cpu text-primary fs-3 mb-2 d-block",
                        "AI Model",
                        ui.HTML('Powered by <a href="https://huggingface.co/thenlper/gte-large" target="_blank" class="text-decoration-none">GTE-Large</a>'),
                        "Neural embedding model",
                    ),
                ),
                ui.column(
                    6,
                    _about_tile(
                        "bi bi-speedometer2 text-success fs-3 mb-2 d-block",
                        "Performance",
                        "Process thousands of items/minute",
                        "Batch processing system",
                    ),
                ),
            ),
            ui.row(
                ui.column(
                    6,
                    _about_tile(
                        "bi bi-bullseye text-info fs-3 mb-2 d-block",
                        "Accuracy",
                        "Semantic understanding",
                        "Matches based on meaning",
                    ),
                ),
                ui.column(
                    6,
                    _about_tile(
                        "bi bi-sliders text-warning fs-3 mb-2 d-block",
                        "Control",
                        "Adjustable thresholds",
                        "Fine-tune match sensitivity",
                    ),
                ),
            ),
        ),
    )

    # Key Features
    # NOTE(review): the text below advertises "8 chart types", while the
    # "plotly_viz_type" selector currently lists 3 active choices (5 are
    # commented out) - confirm which number is intended.
    feature_items = (
        ("Semantic Matching:", " Understands food descriptions using neural embeddings"),
        ("Batch Processing:", " Handle thousands of items efficiently with concurrent processing"),
        ("Interactive Visualizations:", " Explore match distributions and patterns with 8 chart types"),
        ("Data Export:", " Download results as CSV with all original data preserved"),
        ("Text Cleaning:", " Optional preprocessing to potentially improve match quality"),
        ("Real-time Preview:", " See data transformations before processing"),
    )
    key_features = ui.div(
        _about_heading("bi bi-star me-2", "Key Features", "mb-3 mt-4"),
        ui.tags.ul(
            *[ui.tags.li(ui.strong(label), detail) for label, detail in feature_items],
            class_="mb-4",
        ),
    )

    # Use Cases
    use_case_items = (
        "Harmonizing dietary intake data with nutrient databases",
        "Linking research datasets to food composition tables",
        "Standardizing food nomenclature across studies",
        "Quality control for nutritional data entry",
        "Cross-referencing international food databases",
    )
    use_cases = ui.div(
        _about_heading("bi bi-diagram-3 me-2", "Use Cases", "mb-3 mt-4"),
        ui.tags.ul(
            *[ui.tags.li(item) for item in use_case_items],
            class_="mb-4",
        ),
    )

    # Credits Section
    team_members = (
        ("Principal Investigator:", " Dr. Danielle G. Lemay", "Research Molecular Biologist"),
        ("Developer:", " Richard Stoker", "IT Specialist (Scientific)"),
    )
    credits = ui.div(
        ui.hr(class_="my-4"),
        _about_heading("bi bi-people me-2", "Development Team", "mb-3"),
        ui.div(
            *[
                ui.p(
                    ui.strong(role),
                    person,
                    ui.br(),
                    ui.tags.small(job_title, class_="text-muted"),
                    class_="mb-2",
                )
                for role, person, job_title in team_members
            ],
            ui.p(
                ui.strong("Organization:"),
                ui.br(),
                "USDA Agricultural Research Service",
                ui.br(),
                "Western Human Nutrition Research Center",
                ui.br(),
                ui.tags.small("Davis, California", class_="text-muted"),
                class_="mb-3",
            ),
        ),
    )

    # Version and Contact
    version_and_contact = ui.div(
        ui.hr(class_="my-4"),
        ui.row(
            ui.column(
                6,
                ui.p(
                    ui.tags.i(class_="bi bi-tag me-1"),
                    ui.strong("Version:"),
                    " 1.0.0",
                    class_="text-muted small mb-0",
                ),
            ),
            ui.column(
                6,
                ui.p(
                    ui.tags.i(class_="bi bi-envelope me-1"),
                    ui.strong("Contact:"),
                    " richard.stoker@usda.gov",
                    ui.br(),
                    ui.HTML('<a href="https://github.com/RichardStoker-USDA/Food-Mapper" target="_blank" class="text-decoration-none"><i class="bi bi-github me-1"></i>GitHub</a>'),
                    class_="text-muted small mb-0 text-end",
                ),
            ),
        ),
    )

    return ui.nav_panel(
        "About",
        ui.div(
            ui.card(
                ui.card_header(
                    ui.h4("About Food Mapper", class_="mb-0")
                ),
                ui.card_body(
                    hero,
                    overview,
                    challenge,
                    solution,
                    key_features,
                    use_cases,
                    credits,
                    version_and_contact,
                ),
            ),
            make_footer(),
            class_="container-fluid",
            style="max-width: 1200px; margin: 0 auto; padding: 20px;",
        ),
    )


def _build_navbar_title():
    """Return the navbar title: the app name followed by the dark-mode toggle."""
    return ui.tags.div(
        ui.tags.h2(
            "Food Mapper",
            style="margin: 0 1rem 0 0; font-size: 1.9rem; font-weight: 700; letter-spacing: 0.5px; padding-right: 1rem; border-right: 2px solid var(--bs-gray-400, #ced4da);",
        ),
        ui.tags.div(
            ui.input_dark_mode(id="dark_mode", mode="light"),
            style="margin-left: 1rem;",
        ),
        style="display: flex; align-items: center;",
    )


app_ui = ui.page_navbar(
    # Top navigation bar panels
    _build_workflow_page(),
    _build_about_page(),
    title=_build_navbar_title(),
    id="main_navbar",
    # Remove hardcoded bg color to let theme control it
    theme=shinyswatch.theme.yeti(),
)
| # UI utility functions | |
def create_score_bar(score: float, width: int = 12, min_scale: float = 0.5) -> str:
    """Create a text-based progress bar for score visualization.

    For semantic embeddings the bar is scaled relative to a minimum value,
    since scores rarely go below 0.5; this makes differences between scores
    more visible.

    Args:
        score: Similarity score. ``None``, NaN (including values that only
            become NaN after conversion, such as the string ``"nan"``) and
            values that cannot be converted to ``float`` are treated as 0.0.
        width: Total number of characters in the bar.
        min_scale: Score that maps to an empty bar; 1.0 maps to a full bar.

    Returns:
        ``width`` characters: filled blocks followed by light-shade blocks.
    """
    try:
        value = 0.0 if score is None or pd.isna(score) else float(score)
    except (TypeError, ValueError, OverflowError):
        # Unconvertible input (e.g. arbitrary text, containers, huge ints).
        value = 0.0

    # float("nan")-like text passes pd.isna() as a string but converts to NaN;
    # without this guard NaN slips through the clamp below as a full bar.
    if math.isnan(value):
        value = 0.0

    span = 1.0 - min_scale
    if value < min_scale:
        # Below the visible range: show an empty bar.
        fraction = 0.0
    elif span <= 0.0:
        # Degenerate scale (min_scale >= 1.0): avoid dividing by zero and
        # treat any score that reaches min_scale as a full bar.
        fraction = 1.0
    else:
        # Scale from min_scale to 1.0 across the full bar width.
        fraction = max(0.0, min(1.0, (value - min_scale) / span))

    filled = int(round(fraction * width))
    return "█" * filled + "░" * (width - filled)
def create_status_badge(value: str) -> str:
    """Return the HTML badge for a match status.

    The comparison is case-insensitive: any value whose string form equals
    "NO MATCH" yields the warning badge; everything else yields the "Match"
    badge.
    """
    unmatched_badge = '<span class="status-badge status-warn">NO MATCH</span>'
    matched_badge = '<span class="status-badge status-ok">Match</span>'
    return unmatched_badge if str(value).upper() == "NO MATCH" else matched_badge
| def server(input: Inputs, output: Outputs, session: Session): | |
| # Reactive values for data storage | |
| input_df = reactive.value(pd.DataFrame()) | |
| target_df = reactive.value(pd.DataFrame()) | |
| results_df = reactive.value(pd.DataFrame()) | |
| current_step = reactive.value(1) | |
| progress_message = reactive.value("Starting...") | |
| # State for optional centered overlay (currently returns None by default) | |
| progress_show = reactive.value(False) | |
| progress_percent = reactive.value(0) | |
| # Show welcome splash screen on app load (controlled by environment variable) | |
| def show_splash(): | |
| # Check environment variable to control splash screen visibility | |
| # HIDE_SPLASH_SCREEN: Set to "true" or "1" to hide the splash screen | |
| # Default behavior is to show the splash screen if variable is not set | |
| hide_splash = os.environ.get("HIDE_SPLASH_SCREEN", "").lower() in ["true", "1", "yes"] | |
| if not hide_splash: | |
| ui.modal_show( | |
| ui.modal( | |
| ui.div( | |
| # Header | |
| ui.h2("Food Mapper", class_="text-center mb-2"), | |
| ui.p( | |
| "Research Tool for Dietary Data Mapping", | |
| class_="text-center text-muted mb-4" | |
| ), | |
| ui.hr(), | |
| # Research Paper Section | |
| ui.div( | |
| ui.h5( | |
| ui.tags.i(class_="bi bi-journal-text me-2"), | |
| "Research Publication", | |
| class_="mb-3" | |
| ), | |
| ui.div( | |
| ui.p( | |
| "This application was developed as part of ongoing research on automated methods " | |
| "for mapping dietary intake data to food composition databases.", | |
| class_="mb-3" | |
| ), | |
| ui.div( | |
| ui.p( | |
| ui.strong("Paper Title:"), | |
| ui.br(), | |
| ui.tags.small( | |
| "[Title Placeholder - To Be Updated]", | |
| class_="text-muted" | |
| ), | |
| class_="mb-2" | |
| ), | |
| ui.p( | |
| ui.strong("Authors:"), | |
| ui.br(), | |
| ui.tags.small( | |
| "Lemay DG, Strohmeier MP, Stoker RB, Larke JA, Wilson SMG", | |
| class_="text-muted" | |
| ), | |
| class_="mb-2" | |
| ), | |
| ui.p( | |
| ui.strong("Learn More:"), | |
| ui.br(), | |
| ui.tags.small( | |
| ui.HTML('[<a href="#" target="_blank">Link to paper - Coming Soon</a>]'), | |
| class_="text-muted" | |
| ), | |
| class_="mb-2" | |
| ), | |
| ui.p( | |
| ui.HTML('<a href="https://github.com/RichardStoker-USDA/Food-Mapper" target="_blank" class="text-decoration-none text-muted"><i class="bi bi-github"></i> View on GitHub</a>'), | |
| class_="mb-3 small" | |
| ), | |
| class_="ms-3 border-start ps-3" | |
| ) | |
| ), | |
| class_="mb-4" | |
| ), | |
| # How It Works Section | |
| ui.div( | |
| ui.h5( | |
| ui.tags.i(class_="bi bi-info-circle me-2"), | |
| "How It Works", | |
| class_="mb-3" | |
| ), | |
| ui.p( | |
| "Food Mapper uses the GTE-Large neural embedding model to understand the meaning " | |
| "behind food descriptions. This enables accurate matching even when foods are described " | |
| "differently across databases.", | |
| class_="small mb-2" | |
| ), | |
| ui.p( | |
| "Traditional manual mapping takes ~28 minutes per food item. " | |
| "This tool automates the process, handling thousands of items in minutes.", | |
| class_="small text-muted" | |
| ), | |
| class_="mb-4" | |
| ), | |
| # Disclaimer | |
| ui.div( | |
| ui.hr(), | |
| ui.p( | |
| ui.tags.i(class_="bi bi-exclamation-triangle me-1"), | |
| ui.strong("Research Tool Disclaimer"), | |
| class_="text-center mb-2" | |
| ), | |
| ui.p( | |
| "This application is a research tool intended for scientific use in nutritional and dietary studies. " | |
| "Results should be validated by domain experts. For research purposes only.", | |
| class_="small text-muted text-center" | |
| ), | |
| class_="mt-3" | |
| ), | |
| # Get Started Button | |
| ui.div( | |
| ui.input_action_button( | |
| "close_splash", | |
| "Get Started", | |
| class_="btn btn-primary btn-lg" | |
| ), | |
| class_="text-center mt-4" | |
| ), | |
| class_="p-4" | |
| ), | |
| title="", | |
| footer=None, | |
| size="m", | |
| easy_close=True, | |
| fade=True | |
| ) | |
| ) | |
| # Close splash screen handler | |
def close_splash():
    """Dismiss the currently displayed splash modal."""
    ui.modal_remove()
| # Search input debouncing implementation | |
def debounced_search():
    """Return the current search term, preferring the debounced input.

    Reads ``input.search_debounced()``; if that read raises, the value of
    ``input.search_filter()`` is returned instead.
    """
    try:
        term = input.search_debounced()
    except Exception:
        term = input.search_filter()
    return term
| # Threshold badge removed; using slider only | |
| # Dynamic sidebar navigation button | |
def sidebar_navigation_button():
    """Build the sidebar navigation button that fits the active workflow tab.

    Returns:
        * "Tutorial": a "Next: Configure Data" button, enabled only when both
          the input and the target dataframes are non-empty.
        * "Step 1: Data & Configure": a "Next: View Results" button, enabled
          only when results exist.
        * "Step 2: Results": a "Start New Mapping" reset button.
        * any other tab: ``None``.
    """
    # The tab input may not be readable yet; treat that as the first step.
    # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt,
    # SystemExit and task cancellation are never swallowed.
    try:
        current_tab = input.workflow_tabs()
    except Exception:
        current_tab = "Tutorial"  # Default to Step 0

    base_class = "btn btn-primary w-100"

    def next_button(label, enabled, css_class=base_class):
        # Single construction point for every "sidebar_next" variant.
        return ui.input_action_button(
            "sidebar_next",
            ui.span(ui.tags.i(class_="bi bi-arrow-right-circle-fill me-2"), label),
            class_=css_class,
            disabled=not enabled,
        )

    if current_tab == "Tutorial":
        # Read both reactive values up front so the button re-renders when
        # either dataset changes.
        in_df = input_df.get()
        tgt_df = target_df.get()
        files_loaded = (not in_df.empty) and (not tgt_df.empty)
        return next_button("Next: Configure Data", files_loaded)

    if current_tab == "Step 1: Data & Configure":
        has_results = not results_df.get().empty
        # The disabled variant on this tab also carries the Bootstrap
        # "disabled" class (kept exactly as before).
        css_class = base_class if has_results else base_class + " disabled"
        return next_button("Next: View Results", has_results, css_class)

    if current_tab == "Step 2: Results":
        return ui.input_action_button(
            "sidebar_reset",
            ui.span(ui.tags.i(class_="bi bi-arrow-counterclockwise me-2"), "Start New Mapping"),
            class_=base_class,
        )

    return None
| # Sidebar navigation button handler | |
def handle_sidebar_next():
    """Advance the workflow one step when the sidebar "Next" button is used.

    From "Tutorial" the move to Step 1 only happens when both datasets are
    loaded; from Step 1 the workflow moves to the Results tab. Other tabs are
    left untouched.
    """
    # ``except Exception`` instead of a bare ``except`` so interpreter-level
    # signals (KeyboardInterrupt, SystemExit, cancellation) still propagate.
    try:
        current_tab = input.workflow_tabs()
    except Exception:
        current_tab = "Tutorial"

    if current_tab == "Tutorial":
        # Only navigate if files are loaded
        in_df = input_df.get()
        tgt_df = target_df.get()
        if (not in_df.empty) and (not tgt_df.empty):
            ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
    elif current_tab == "Step 1: Data & Configure":
        ui.update_navs("workflow_tabs", selected="Step 2: Results")
| # Sidebar reset button handler | |
def handle_sidebar_reset():
    """Start over from the sidebar: clear state and return to Step 1."""
    # Wipe datasets, results and status messages.
    reset_for_new_analysis()

    # Land on Step 1 (not the tutorial) with the sidebar open for uploads.
    step_one = "Step 1: Data & Configure"
    ui.update_navs("workflow_tabs", selected=step_one)
    ui.update_sidebar("main_sidebar", show=True)

    ui.notification_show(
        "Ready for new mapping. Upload your data files.",
        type="info",
        duration=3,
    )
| # Load sample data from sidebar button (mobile-friendly) | |
def load_sample_from_sidebar():
    """Load the bundled sample datasets from the sidebar button.

    Stores both sample frames, pre-selects their text columns, refreshes the
    run button state, collapses the sidebar and jumps to Step 1.
    """
    sample_input, sample_target = get_sample_data()
    input_df.set(sample_input)
    target_df.set(sample_target)

    # Point each column selector at the sample frame's columns.
    selector_setup = (
        ("input_column", sample_input, "description"),
        ("target_column", sample_target, "reference"),
    )
    for select_id, frame, default_column in selector_setup:
        ui.update_select(
            select_id,
            choices=frame.columns.tolist(),
            selected=default_column,
        )

    # Data is present now, so the run button state can be refreshed.
    check_files_loaded()

    # Collapse the sidebar (important on mobile) and move to Step 1.
    ui.update_sidebar("main_sidebar", show=False)
    ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")

    ui.notification_show(
        "Sample data loaded! Configure your matching settings.",
        type="success",
        duration=3,
    )
| # Load sample data from tutorial page | |
def load_sample_data():
    """Load the bundled sample datasets from the tutorial page.

    Replaces any earlier sample status message, stores both sample frames,
    pre-selects their text columns, shows a success banner next to
    ``#sample_status`` and navigates to Step 1.
    """
    # Best-effort removal of a banner left by a previous click.
    try:
        ui.remove_ui(selector="#sample_status_msg")
    except Exception:
        pass

    sample_input, sample_target = get_sample_data()
    input_df.set(sample_input)
    target_df.set(sample_target)

    # Point each column selector at the sample frame's columns.
    selector_setup = (
        ("input_column", sample_input, "description"),
        ("target_column", sample_target, "reference"),
    )
    for select_id, frame, default_column in selector_setup:
        ui.update_select(
            select_id,
            choices=frame.columns.tolist(),
            selected=default_column,
        )

    status_banner = ui.div(
        ui.p(
            "Sample data loaded successfully! Moving to Data Setup...",
            class_="alert alert-success alert-animated",
        ),
        id="sample_status_msg",
    )
    ui.insert_ui(status_banner, selector="#sample_status", where="afterEnd")

    # Data is present now, so the run button state can be refreshed.
    check_files_loaded()

    # Continue straight to Step 1.
    ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
| # Navigate to Step 1 from tip link | |
def goto_step1_from_tip():
    """Switch the workflow tabs to Step 1 (used by the tip link)."""
    step_one = "Step 1: Data & Configure"
    ui.update_navs("workflow_tabs", selected=step_one)
| # Show Upload Help modal when question icon clicked in sidebar | |
def show_upload_help_modal():
    """Open the short "Upload Requirements" help modal."""
    format_rules = ui.tags.ul(
        ui.tags.li("CSV format (.csv) required"),
        ui.tags.li("Include headers in first row"),
        ui.tags.li("UTF-8 encoding recommended"),
    )

    # Footer line that links back to the tutorial tab.
    tutorial_hint = ui.p(
        ui.tags.small(
            "Need help? ",
            ui.input_action_link("close_help_goto_tutorial", "View tutorial", class_="text-primary"),
            class_="text-muted",
        )
    )

    body = ui.div(
        ui.h4("Upload Requirements", class_="mb-3"),
        ui.hr(),
        ui.h6("File Format"),
        format_rules,
        ui.h6("Input File", class_="mt-3"),
        ui.p("Items you want to match (one per row)", class_="text-muted"),
        ui.h6("Target File", class_="mt-3"),
        ui.p("Reference database to match against", class_="text-muted"),
        ui.hr(),
        tutorial_hint,
        class_="p-2",
    )

    dismiss_button = ui.input_action_button("close_upload_help", "Got it", class_="btn btn-primary")
    ui.modal_show(
        ui.modal(
            body,
            footer=dismiss_button,
            easy_close=True,
            size="m",
            title="",
        )
    )
| # Close upload help modal | |
def close_upload_help():
    """Dismiss the upload help modal."""
    ui.modal_remove()
| # Close help and go to tutorial | |
def close_help_goto_tutorial():
    """Close the help modal, collapse the sidebar and open the Tutorial tab."""
    ui.modal_remove()
    ui.update_sidebar("main_sidebar", show=False)
    tutorial_tab = "Tutorial"
    ui.update_navs("workflow_tabs", selected=tutorial_tab)
| # Show Data Requirements modal when info icon clicked | |
def show_data_requirements_modal():
    """Open the detailed "Data Requirements" modal."""
    format_rules = ui.tags.ul(
        ui.tags.li("Files must be in CSV format (.csv)"),
        ui.tags.li("Must include column headers in first row"),
        ui.tags.li("UTF-8 encoding recommended"),
    )
    best_practices = ui.tags.ul(
        ui.tags.li("Choose columns with descriptive text for best semantic matching"),
        ui.tags.li("Remove or clean special characters if needed"),
        ui.tags.li("Longer descriptions generally produce better matches"),
    )

    body = ui.div(
        ui.h4("Data Requirements", class_="mb-3"),
        ui.hr(),
        ui.h6("File Format"),
        format_rules,
        ui.h6("Input File", class_="mt-3"),
        ui.p("Contains the items you want to match. Each row represents one item to find a match for.", class_="text-muted"),
        ui.h6("Target File", class_="mt-3"),
        ui.p("Contains the reference dataset. The system will find the best match from this list for each input item.", class_="text-muted"),
        ui.h6("Best Practices", class_="mt-3"),
        best_practices,
        class_="p-2",
    )

    dismiss_button = ui.input_action_button("close_req", "Got it", class_="btn btn-primary")
    ui.modal_show(
        ui.modal(
            body,
            footer=dismiss_button,
            easy_close=True,
            size="m",
            title="",
        )
    )
| # Close Data Requirements modal | |
def close_data_requirements_modal():
    """Dismiss the Data Requirements modal."""
    ui.modal_remove()
| # Handle file uploads | |
def handle_input_file():
    """Load the uploaded input CSV and refresh the dependent UI.

    On success the dataframe is stored in ``input_df``, the input column
    selector is repopulated (first column pre-selected), a "Loaded N inputs"
    banner is placed after ``#input_status`` and the run button state is
    re-evaluated. A file that cannot be parsed is reported through an error
    notification instead of raising out of the handler.
    """
    file: list[FileInfo] | None = input.input_file()
    if not file:
        return

    # Uploaded files are untrusted: empty, malformed or non-UTF-8 content
    # must not propagate as an unhandled exception.
    try:
        df = pd.read_csv(file[0]["datapath"])
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError, OSError) as exc:
        ui.notification_show(
            f"Could not read input file: {exc}",
            type="error",
            duration=10,
        )
        return

    input_df.set(df)

    # Update column choices
    ui.update_select(
        "input_column",
        choices=df.columns.tolist(),
        selected=df.columns[0],
    )

    # Replace any previous status banner with a fresh row count.
    try:
        ui.remove_ui(selector="#input_file_status")
    except Exception:
        pass
    ui.insert_ui(
        ui.div(
            ui.p(f"Loaded {len(df):,} inputs",
                 class_="alert alert-success alert-animated"),
            id="input_file_status",
        ),
        selector="#input_status",
        where="afterEnd",
    )

    # Check if both files loaded
    check_files_loaded()
| # NOTE: Clear files handler removed as button was removed from UI | |
def handle_target_file():
    """Load the uploaded target CSV and refresh the dependent UI.

    On success the dataframe is stored in ``target_df``, the target column
    selector is repopulated (first column pre-selected), a "Loaded N targets"
    banner is placed after ``#target_status`` and the run button state is
    re-evaluated. A file that cannot be parsed is reported through an error
    notification instead of raising out of the handler.
    """
    file: list[FileInfo] | None = input.target_file()
    if not file:
        return

    # Uploaded files are untrusted: empty, malformed or non-UTF-8 content
    # must not propagate as an unhandled exception.
    try:
        df = pd.read_csv(file[0]["datapath"])
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError, OSError) as exc:
        ui.notification_show(
            f"Could not read target file: {exc}",
            type="error",
            duration=10,
        )
        return

    target_df.set(df)

    # Update column choices
    ui.update_select(
        "target_column",
        choices=df.columns.tolist(),
        selected=df.columns[0],
    )

    # Replace any previous status banner with a fresh row count.
    try:
        ui.remove_ui(selector="#target_file_status")
    except Exception:
        pass
    ui.insert_ui(
        ui.div(
            ui.p(f"Loaded {len(df):,} targets",
                 class_="alert alert-success alert-animated"),
            id="target_file_status",
        ),
        selector="#target_status",
        where="afterEnd",
    )

    # Check if both files loaded
    check_files_loaded()
| # Preview tables | |
def input_preview():
    """Return the first five rows of the input data, or an empty frame."""
    frame = input_df.get()
    return pd.DataFrame() if frame.empty else frame.head(5)
def target_preview():
    """Return the first five rows of the target data, or an empty frame."""
    frame = target_df.get()
    return pd.DataFrame() if frame.empty else frame.head(5)
def input_col_preview():
    """Preview up to five non-null values of the selected input column.

    With the input-cleaning toggle on, the grid shows each value next to its
    ``clean_text_simple`` result; otherwise only the raw values are listed.
    Returns an empty DataFrame when there is no data or no valid column.
    """
    df = input_df.get()
    col = input.input_column()
    if df.empty or not col or col not in df.columns:
        return pd.DataFrame()

    sample_values = df[col].dropna().head(5).tolist()
    row_numbers = range(1, len(sample_values) + 1)

    if input.clean_input():
        # Copy first so the "Original" column is unaffected by cleaning.
        grid_columns = {
            "Row": row_numbers,
            "Original": sample_values.copy(),
            "After Cleaning": clean_text_simple(sample_values),
        }
    else:
        grid_columns = {
            "Row": row_numbers,
            "Sample Values": sample_values,
        }

    # Use Shiny DataGrid (theme-aware, interactive)
    return render.DataGrid(pd.DataFrame(grid_columns))
def target_col_preview():
    """Preview up to five non-null values of the selected target column.

    With the target-cleaning toggle on, the grid shows each value next to its
    ``clean_text_simple`` result; otherwise only the raw values are listed.
    Returns an empty DataFrame when there is no data or no valid column.
    """
    df = target_df.get()
    col = input.target_column()
    if df.empty or not col or col not in df.columns:
        return pd.DataFrame()

    sample_values = df[col].dropna().head(5).tolist()
    row_numbers = range(1, len(sample_values) + 1)

    if input.clean_target():
        # Copy first so the "Original" column is unaffected by cleaning.
        grid_columns = {
            "Row": row_numbers,
            "Original": sample_values.copy(),
            "After Cleaning": clean_text_simple(sample_values),
        }
    else:
        grid_columns = {
            "Row": row_numbers,
            "Sample Values": sample_values,
        }

    # Use Shiny DataGrid (theme-aware, interactive)
    return render.DataGrid(pd.DataFrame(grid_columns))
| # Helper function to check readiness for running mapping | |
def check_files_loaded():
    """Enable the run button only when data and column choices are present.

    Both dataframes must be non-empty. When the column inputs can be read,
    both must also hold a truthy selection; if reading them raises, the
    decision falls back to the dataframe check alone.
    """
    has_data = not input_df.get().empty and not target_df.get().empty
    ready = has_data
    try:
        in_col = input.input_column()
        tgt_col = input.target_column()
    except Exception:
        pass
    else:
        ready = has_data and bool(in_col) and bool(tgt_col)
    ui.update_action_button("run_matching", disabled=(not ready))
| # Watch column selection to enable/disable run button | |
def _watch_columns_for_run():
    """Re-check run readiness after touching both column selectors."""
    # The values are discarded; the reads exist only so this function
    # touches both inputs before delegating.
    try:
        input.input_column()
        input.target_column()
    except Exception:
        pass
    check_files_loaded()
| # Navigation handlers | |
| # Navigation effects removed in sidebar layout | |
| # Reset analysis handler from Results page button | |
def handle_reset_analysis():
    """Start over from the Results page: clear state and return to Step 1."""
    # Wipe datasets, results and status messages.
    reset_for_new_analysis()

    # Land on Step 1 (not the tutorial) with the sidebar open for uploads.
    step_one = "Step 1: Data & Configure"
    ui.update_navs("workflow_tabs", selected=step_one)
    ui.update_sidebar("main_sidebar", show=True)

    ui.notification_show(
        "Ready for new mapping. Upload your data files.",
        type="info",
        duration=3,
    )
def reset_for_new_analysis():
    """Return the app to a blank state ready for a new mapping run.

    Empties the results, input and target dataframes, clears both column
    selectors, disables the run button and removes leftover status elements.
    """
    # Results first, then the two source datasets.
    for store in (results_df, input_df, target_df):
        store.set(pd.DataFrame())

    for select_id in ("input_column", "target_column"):
        ui.update_select(select_id, choices=[], selected=None)

    # Disable run button until files and columns are (re)selected
    ui.update_action_button("run_matching", disabled=True)

    # Status elements left behind by uploads or a previous run.
    stale_selectors = (
        "#sample_status_msg",
        "#process_status > *",
        "#process_progress > *",
        "#process_summary > *",
        "#results_summary_msg",
        "#input_file_status",
        "#target_file_status",
    )
    for selector in stale_selectors:
        # Best effort: a failing removal must not abort the reset.
        try:
            ui.remove_ui(selector=selector)
        except Exception:
            pass
| # Optional centered overlay (placeholder: disabled by default) | |
def center_progress_overlay():
    """Placeholder for a centered progress overlay; renders nothing.

    Always returns ``None``; the hook is kept for future use.
    """
    overlay = None
    return overlay
| # Sidebar summary block: only show after results exist | |
def sidebar_results_summary_block():
    """Render the sidebar "Results Summary" panel once results exist.

    Returns ``None`` while the results dataframe is empty. Otherwise returns
    an info alert with the total row count, the number of rows whose status
    is "NO MATCH" (case-insensitive), the remaining "successful" count and
    the mean ``similarity_score`` of the successful rows ("N/A" when the
    column is missing or the mean is undefined).
    """
    df = results_df.get()
    if df.empty:
        return None

    total_inputs = len(df)

    # ``df.get('status', '')`` would hand back a plain str when the column
    # is missing, and ``str`` has no ``.astype``; build an explicit mask.
    if 'status' in df.columns:
        is_no_match = df['status'].astype(str).str.upper() == 'NO MATCH'
    else:
        is_no_match = pd.Series(False, index=df.index)

    no_matches = int(is_no_match.sum())
    successful = total_inputs - no_matches

    if 'similarity_score' in df.columns:
        avg_score = df.loc[~is_no_match, 'similarity_score'].mean()
    else:
        avg_score = None
    avg_score_str = f"{avg_score:.3f}" if avg_score is not None and not pd.isna(avg_score) else "N/A"

    return ui.div(
        ui.h5("Results Summary"),
        ui.p(f"Total Inputs: {total_inputs}"),
        ui.p(f"Successful Matches: {successful}"),
        ui.p(f"No Matches: {no_matches}"),
        ui.p(f"Average Score: {avg_score_str}"),
        class_="alert alert-info alert-animated",
    )
| # Run matching process | |
async def run_matching():
    """Run the semantic matching pipeline and publish the results.

    Steps: validate that data, column selections and an API key are present;
    embed and match the selected input column against the de-duplicated
    target column; build the results dataframe (status, best match, rounded
    score, score bar); store it in ``results_df``; navigate to the Results
    tab and insert the summary and method indicator.

    The run button is disabled for the duration of the call and is always
    re-enabled on exit, including early validation returns and failures.

    Raises:
        Exception: any error raised while matching is shown to the user as
            an error notification and then re-raised.
    """
    # ``global`` must precede every use of these names in the function.
    global _API_FAILURES, FALLBACK_ACTIVE

    def _remove_spinner():
        # Best effort: the spinner may not exist.
        try:
            ui.remove_ui(selector="#processing_spinner")
        except Exception:
            pass

    # Disable the button immediately to prevent double-clicks
    ui.update_action_button("run_matching", disabled=True)
    try:
        # Clear any spinner left over from a previous run.
        _remove_spinner()

        # Get data
        in_df = input_df.get()
        tgt_df = target_df.get()
        if in_df.empty or tgt_df.empty:
            ui.notification_show("Please upload both input and target files", type="warning")
            return

        # Get settings
        in_col = input.input_column()
        tgt_col = input.target_column()
        threshold = input.threshold()
        clean_input_text = input.clean_input()
        clean_target_text = input.clean_target()
        if not in_col or not tgt_col:
            ui.modal_show(
                ui.modal(
                    ui.h5("Select Columns First"),
                    ui.p("Please choose the input and target columns to match in the 'Data & Configure' tab before running."),
                    easy_close=True,
                    footer=ui.input_action_button("dismiss_modal", "OK", class_="btn btn-primary"),
                )
            )
            return

        # Get API key for semantic embeddings
        api_key = get_api_key()
        if not api_key:
            ui.notification_show("DeepInfra API key not found. Please set DEEPINFRA_API_KEY", type="error")
            return

        # Reset circuit for this run
        _API_FAILURES = 0
        FALLBACK_ACTIVE = False

        # Clear previous results
        for stale_selector in (
            "#process_status > *",
            "#process_progress > *",
            "#process_summary > *",
            "#method_chips > *",
        ):
            ui.remove_ui(selector=stale_selector)

        # NOTE(review): the return value is discarded here; confirm that
        # calling busy_indicators.use() from server code has any effect.
        ui.busy_indicators.use()

        # Use Shiny's built-in Progress with better formatting
        with ui.Progress(min=0, max=100) as p:
            p.set(5, message="Initializing", detail="Preparing data...")
            try:
                # Show loading spinner during processing
                try:
                    ui.insert_ui(
                        ui.div(ui.div(class_="spinner"), class_="spinner-container", id="processing_spinner"),
                        selector="#process_progress", where="afterBegin",
                    )
                except Exception:
                    pass

                # Prepare data; duplicates are dropped from the target only.
                input_list = in_df[in_col].dropna().tolist()
                target_list = tgt_df[tgt_col].dropna().tolist()
                target_list_unique = list(dict.fromkeys(target_list))

                # A column that is entirely empty leaves nothing to embed.
                if not input_list or not target_list_unique:
                    ui.notification_show(
                        "The selected columns contain no values to match",
                        type="warning",
                    )
                    return

                # Display text is cleaned only when the toggle is on.
                input_list_display = clean_text_simple(input_list) if clean_input_text else input_list

                p.set(10, message="Data Prepared",
                      detail=f"{len(input_list):,} inputs • {len(target_list_unique):,} targets")
                await asyncio.sleep(0.1)

                # Initialize results with potentially cleaned input text for display
                results = pd.DataFrame({
                    'input_description': input_list_display
                })

                # Progress budget: embeddings take the 10% - 90% range.
                total_methods = 1
                progress_per_method = 80 / total_methods
                current_progress = 10

                # Guard against a non-positive configured batch size.
                effective_batch = max(1, min(EMBED_BATCH_SIZE, 1024))

                # Check if we'll be using CPU and notify IMMEDIATELY
                if MODEL_FALLBACK_MODE == "local":
                    ui.notification_show(
                        "Using LOCAL CPU for embeddings. This will take significantly longer than API/GPU processing. "
                        f"Processing {len(input_list)} items may take several minutes.",
                        type="warning",
                        duration=10,
                    )
                    FALLBACK_ACTIVE = True
                elif FALLBACK_ACTIVE:
                    # NOTE(review): FALLBACK_ACTIVE was reset to False above,
                    # so this branch only fires if it is changed elsewhere in
                    # the meantime - confirm it is still needed.
                    ui.notification_show(
                        "API unavailable. Using LOCAL CPU fallback - processing will be much slower.",
                        type="warning",
                        duration=8,
                    )

                # Update progress message if in fallback mode
                progress_msg = "Computing semantic embeddings..."
                if FALLBACK_ACTIVE or MODEL_FALLBACK_MODE == "local":
                    progress_msg = "Computing semantic embeddings (CPU - this will take time)..."
                p.set(current_progress + 5,
                      message=progress_msg,
                      detail=f"Processing {len(input_list):,} items")
                await asyncio.sleep(0.1)

                # Ceil-divide both lists into batches for the progress bar.
                batches_total = max(
                    1,
                    ((len(input_list) + effective_batch - 1) // effective_batch)
                    + ((len(target_list_unique) + effective_batch - 1) // effective_batch),
                )
                batch_num = [0]  # one-element list so the callback can mutate it

                def progress_callback(msg: str):
                    # Only advance when a batch completes (embedding or matching).
                    if msg.startswith(("Embedding batches:", "Matching:")):
                        batch_num[0] = min(batch_num[0] + 1, batches_total)
                    progress_pct = current_progress + (batch_num[0] / batches_total) * progress_per_method
                    p.set(
                        int(progress_pct),
                        message="Computing Embeddings",
                        detail=f"Batch {batch_num[0]:,} of {batches_total:,}",
                    )

                if USE_ASYNC:
                    embed_results = await run_embed_match_async(
                        input_list,
                        target_list_unique,
                        api_key,
                        progress_callback=progress_callback,
                        clean_input=clean_input_text,
                        clean_target=clean_target_text,
                    )
                else:
                    embed_results = run_embed_match(
                        input_list,
                        target_list_unique,
                        api_key,
                        progress_callback=progress_callback,
                        clean_input=clean_input_text,
                        clean_target=clean_target_text,
                    )

                # Apply cleaning to matched target text if toggle is on
                matched_targets = embed_results['match']
                if clean_target_text:
                    matched_targets = clean_text_simple(matched_targets)
                results['best_match'] = matched_targets
                results['similarity_score'] = embed_results['score']

                current_progress += progress_per_method
                p.set(int(current_progress), message="Embeddings Complete", detail="Processing results...")
                await asyncio.sleep(0.1)

                # Round scores for display
                for col in [c for c in results.columns if 'score' in c]:
                    results[col] = results[col].round(4)

                # Generate score visualization columns (snapshot the column
                # list first because new columns are added in the loop).
                for col in list(results.columns):
                    if 'score' in col.lower() or 'similarity' in col.lower():
                        results[f"{col}_bar"] = results[col].apply(create_score_bar)

                # Status is 'NO MATCH' only for a non-missing score below the
                # threshold; everything else is labelled 'Match'.
                results.insert(0, 'status', results['similarity_score'].apply(
                    lambda s: 'NO MATCH' if (pd.notna(s) and float(s) < float(threshold)) else 'Match'
                ))

                p.set(95, message="Finalizing", detail="Preparing visualizations...")

                # Store results
                results_df.set(results)

                # Generate summary statistics
                total_inputs = len(results)
                no_matches = (results['status'] == 'NO MATCH').sum()
                successful_matches = total_inputs - no_matches
                avg_score = results[results['status'] != 'NO MATCH']['similarity_score'].mean()
                avg_score_str = f"{avg_score:.3f}" if not pd.isna(avg_score) else "N/A"

                p.set(100, message="Complete", detail="Ready to view results")
                await asyncio.sleep(0.5)

                # Hide loading spinner
                _remove_spinner()

                # Show results summary using notifications
                ui.notification_show(
                    f"Processing complete! Generated {len(results)} results.",
                    type="success",
                    duration=5,
                )

                # Replace previous summary (if any), then insert a fresh one
                try:
                    ui.remove_ui(selector="#results_summary_msg")
                except Exception:
                    pass
                ui.insert_ui(
                    ui.div(
                        ui.p(f"Total Inputs: {total_inputs}"),
                        ui.p(f"Successful Matches: {successful_matches}"),
                        ui.p(f"No Matches: {no_matches}"),
                        ui.p(f"Average Score: {avg_score_str}"),
                        class_="alert alert-info alert-animated",
                        id="results_summary_msg",
                    ),
                    selector="#sidebar_results_summary",
                    where="afterBegin",
                )

                # Method chip, annotated when the CPU fallback was used.
                method_label = "Semantic Embedding (thenlper/gte-large)"
                if FALLBACK_ACTIVE:
                    method_label += " — CPU fallback"
                ui.insert_ui(
                    ui.div(
                        ui.span(method_label, class_="method-chip"),
                        class_="text-center mt-3",
                    ),
                    selector="#method_chips",
                    where="afterBegin",
                )

                # After processing completes, navigate to Results tab
                ui.update_navs("workflow_tabs", selected="Step 2: Results")
            except Exception as e:
                ui.notification_show(f"Error: {str(e)}", type="error", duration=10)
                raise
    finally:
        # Runs on success, early return and failure alike so the UI can
        # never be left with a stuck spinner or a permanently disabled button.
        _remove_spinner()
        ui.update_action_button("run_matching", disabled=False)
| # Filter and display results | |
def results_table():
    """Return the results dataframe after applying the active table filters.

    An empty DataFrame is returned when the interactive grid is selected
    (``input.use_grid()``) or when there are no results. Otherwise the rows
    are filtered by the search term (case-insensitive, any column),
    optionally restricted to "NO MATCH" rows, and optionally sorted by the
    first score column in descending order.
    """
    df = results_df.get()
    try:
        if input.use_grid():
            return pd.DataFrame()
    except Exception:
        pass
    if df.empty:
        return pd.DataFrame()

    filtered_df = df.copy()

    search_term = debounced_search()
    if search_term and search_term.strip():
        # The term comes straight from a text box. Keep regex support for
        # valid patterns, but fall back to a literal match when the text is
        # not a valid regular expression (e.g. an unbalanced "(").
        try:
            re.compile(search_term)
            use_regex = True
        except re.error:
            use_regex = False

        # Column-wise string matching is equivalent to the per-row check
        # but avoids building a Series for every row.
        as_text = filtered_df.astype(str)
        mask = as_text.apply(
            lambda column: column.str.contains(
                search_term, case=False, na=False, regex=use_regex
            )
        ).any(axis=1)
        filtered_df = filtered_df[mask]

    # NO MATCH filter
    if input.show_no_match() and 'status' in filtered_df.columns:
        filtered_df = filtered_df[filtered_df['status'] == 'NO MATCH']

    # Sort by score
    if input.sort_by_score():
        score_cols = [col for col in filtered_df.columns if 'score' in col.lower()]
        if score_cols:
            filtered_df = filtered_df.sort_values(score_cols[0], ascending=False)

    return filtered_df
| # Export All Data - includes original columns from input and target CSVs | |
def download_all_data():
    """Build the "export all data" CSV as an in-memory byte stream.

    The export starts from the original input rows, adds ``status``,
    ``similarity_score`` and ``matched_target`` from the results, and
    left-joins the full target row for each matched target. UI-only
    ``*_bar`` columns are removed.

    Returns:
        io.BytesIO: the CSV bytes, or a short placeholder message when there
        are no results.
    """
    df = results_df.get()
    in_df = input_df.get()
    tgt_df = target_df.get()
    if df.empty:
        return io.BytesIO(b"No results to export")

    in_col = input.input_column()
    tgt_col = input.target_column()

    # Results are produced from the non-null values of the input column, so
    # drop the same rows here; otherwise the row counts differ and the
    # column assignments below raise a length-mismatch error.
    if in_col and in_col in in_df.columns:
        export_df = in_df[in_df[in_col].notna()]
    else:
        export_df = in_df
    row_count = min(len(export_df), len(df))
    export_df = export_df.iloc[:row_count].reset_index(drop=True).copy()
    matched = df.iloc[:row_count].reset_index(drop=True)

    # If cleaning was applied, export the cleaned input text.
    if input.clean_input() and in_col and 'input_description' in matched.columns:
        export_df[in_col] = matched['input_description'].values

    # Add the status and similarity score columns from results
    if 'status' in matched.columns:
        export_df['status'] = matched['status'].values
    if 'similarity_score' in matched.columns:
        export_df['similarity_score'] = matched['similarity_score'].values

    # Add the matched target data (already cleaned if toggle was on)
    if 'best_match' in matched.columns:
        export_df['matched_target'] = matched['best_match'].values

    # Left-join the full target row for every matched value.
    can_merge = (
        tgt_col
        and not tgt_df.empty
        and tgt_col in tgt_df.columns
        and 'matched_target' in export_df.columns
    )
    if can_merge:
        tgt_lookup = tgt_df.dropna(subset=[tgt_col]).drop_duplicates(subset=[tgt_col])
        join_key = tgt_col
        if input.clean_target():
            # Matched targets were cleaned for display, so the raw target
            # text no longer equals them; join on a cleaned key instead.
            # Assumes clean_text_simple returns one value per input item,
            # as its other call sites in this file rely on - TODO confirm.
            join_key = "__fm_join_key__"
            cleaned_keys = list(clean_text_simple(tgt_lookup[tgt_col].tolist()))
            tgt_lookup = tgt_lookup.assign(**{join_key: cleaned_keys})
            # Distinct raw values can clean to the same text; keep the first
            # so the join cannot multiply rows.
            tgt_lookup = tgt_lookup.drop_duplicates(subset=[join_key])

        export_df = export_df.merge(
            tgt_lookup,
            left_on='matched_target',
            right_on=join_key,
            how='left',
            suffixes=('', '_target'),
        )
        if join_key != tgt_col:
            export_df = export_df.drop(columns=[join_key], errors='ignore')

    # Remove UI-only columns like score bars
    bar_cols = [c for c in export_df.columns if c.endswith('_bar')]
    export_df = export_df.drop(columns=bar_cols, errors='ignore')
    return io.BytesIO(export_df.to_csv(index=False).encode())
| # Export Matches - current functionality (results with mappings) | |
def download_matches():
    """Return the match results as CSV bytes, without UI-only columns.

    When no results exist, a short placeholder message is returned instead.
    """
    results = results_df.get()
    if results.empty:
        return io.BytesIO(b"No results to download")

    # Score-bar columns exist only for on-screen rendering.
    ui_only_columns = [name for name in results.columns if name.endswith('_bar')]
    export_df = results.copy().drop(columns=ui_only_columns, errors='ignore')
    csv_bytes = export_df.to_csv(index=False).encode()
    return io.BytesIO(csv_bytes)
| # Build interactive grid (Tabulator) | |
def results_tabulator():
    """Render the results as an interactive Tabulator grid.

    Returns ``None`` when there are no results. Otherwise returns an HTML
    fragment containing the grid container and an inline script that loads
    the Tabulator assets on demand and initialises the table with the
    current data.
    """
    df = results_df.get()
    if df.empty:
        return None

    def _script_safe_json(value):
        # The JSON is placed inside a <script> element and the cell text
        # comes from user-uploaded files. Escaping <, > and & as \uXXXX
        # keeps the parsed values identical while making it impossible for
        # data such as "</script>" to terminate the script block.
        return (
            json.dumps(value)
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
            .replace("&", "\\u0026")
        )

    # Convert DataFrame to records for Tabulator
    records = df.to_dict(orient='records')

    cols = []
    for c in df.columns:
        col = {"title": c, "field": c}
        lc = c.lower()
        if ("score" in lc) or ("similarity" in lc):
            col["hozAlign"] = "right"
            col["sorter"] = "number"
        # Disable filter/sort for bar columns
        if lc.endswith("_bar"):
            col["headerFilter"] = False
            col["headerSort"] = False
        else:
            col["headerFilter"] = "input"
        cols.append(col)

    data_json = _script_safe_json(records)
    cols_json = _script_safe_json(cols)

    html = (
        "<div style=\"height:600px\" id=\"tabulator_results\"></div>\n"
        "<script>\n"
        "(function(){\n"
        " function ensureTabulatorAssets(cb){\n"
        " var cssId='tabulator-css';\n"
        " if(!document.getElementById(cssId)){\n"
        " var l=document.createElement('link'); l.id=cssId; l.rel='stylesheet'; l.href='https://unpkg.com/tabulator-tables@5.5.2/dist/css/tabulator.min.css'; document.head.appendChild(l);\n"
        " }\n"
        " if(window.Tabulator){ cb(); return; }\n"
        " var s=document.createElement('script'); s.src='https://unpkg.com/tabulator-tables@5.5.2/dist/js/tabulator.min.js'; s.onload=cb; document.body.appendChild(s);\n"
        " }\n"
        " function init(){\n"
        " var el = document.getElementById('tabulator_results');\n"
        " if(!el) return;\n"
        " if (el._tabulator) { el._tabulator.destroy(); }\n"
        " var table = new Tabulator(el, {\n"
        " data: " + data_json + ",\n"
        " reactiveData: false,\n"
        " layout: 'fitDataStretch',\n"
        " height: '600px',\n"
        " movableColumns: true,\n"
        " resizableColumns: true,\n"
        " columnDefaults: {headerSort: true, editable: false},\n"
        " columns: " + cols_json + ",\n"
        " initialSort: [{column: 'similarity_score', dir: 'desc'}],\n"
        " rowFormatter: function(row){\n"
        " var data = row.getData();\n"
        " if(data.status === 'NO MATCH'){\n"
        " row.getElement().style.backgroundColor = 'rgba(220, 53, 69, 0.08)';\n"
        " }\n"
        " }\n"
        " });\n"
        " el._tabulator = table;\n"
        " }\n"
        " ensureTabulatorAssets(init);\n"
        "})();\n"
        "</script>\n"
    )
    return ui.HTML(html)
| # Chart description helper | |
def chart_description():
    """Return a bordered description panel for the selected chart type.

    Looks up the markdown blurb for ``input.plotly_viz_type()``; returns
    ``None`` when the selected type has no description.
    """
    descriptions = {
        "density": "**Density Plot**: Shows probability density of scores. Higher peaks = more concentrated scores. Rug plot below shows individual points.",
        "histogram": "**Histogram**: Frequency of scores in bins. Height = count in each range. Shows mean and median.",
        "threshold": "**Threshold Analysis**: Shows how match percentage changes at different threshold values. Helps find optimal cutoff point."
        # HIDDEN DESCRIPTIONS - Keep in sync with hidden visualizations above
        # "box": "**Box Plot**: Shows quartiles (Q1, median, Q3) and outliers. Box = middle 50% of data.",
        # "violin": "**Violin Plot**: Distribution shape split by match/no-match status. Width = data density.",
        # "scatter": "**Scatter Plot**: Each point is one item (index vs score). Blue = match, red = no match. Shows sequential patterns.",
        # "ecdf": "**Cumulative Distribution**: Shows % of data at or below each score. Answers 'what % scores below X?'",
        # "sunburst": "**Match Breakdown**: Hierarchical view of match statistics. Inner ring shows overall split, outer rings show score ranges."
    }
    selected_type = input.plotly_viz_type()
    blurb = descriptions.get(selected_type, "")
    if not blurb:
        return None

    panel_style = "padding: 0.75rem 1rem; margin-bottom: 1rem; font-size: 0.9rem; background-color: var(--bs-tertiary-bg, transparent); color: inherit;"
    return ui.div(
        ui.markdown(blurb),
        class_="border rounded",
        style=panel_style,
    )
| # Interactive Plotly visualizations | |
def plotly_viz():
    """Build the interactive Plotly figure for the selected visualization.

    Reads ``results_df`` plus the ``plotly_viz_type``, ``show_threshold_line``
    and ``threshold`` inputs (all supplied by the enclosing scope), then draws
    one of: density, histogram, box, violin, scatter, ecdf or threshold chart.

    Returns:
        A ``plotly.graph_objects.Figure``. When nothing can be plotted (no
        results, no ``similarity_score`` column, no non-null scores, a
        degenerate density estimate, or an unrecognised chart type) the
        figure holds only a centred explanatory message instead of raising.
    """
    import plotly.graph_objects as go

    def message_figure(text):
        """Return an axis-less placeholder figure that displays ``text``."""
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=text,
            xref="paper", yref="paper",
            x=0.5, y=0.5,
            showarrow=False,
            font=dict(size=14, color="#666")
        )
        placeholder.update_layout(
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            height=400
        )
        return placeholder

    # Access reactive values to establish dependencies
    df = results_df.get()
    if df.empty:
        # Return empty figure when no data
        return message_figure("Run mapping to view interactive charts")

    # Check for similarity score column
    if "similarity_score" not in df.columns:
        return message_figure("No similarity scores available")

    # Get input values - these trigger reactive updates
    viz_type = input.plotly_viz_type()
    show_threshold = input.show_threshold_line()
    threshold = input.threshold()

    # Drop rows without a score; every chart below works on this filtered frame
    df_clean = df.dropna(subset=['similarity_score']).copy()
    scores = df_clean["similarity_score"]
    if scores.empty:
        # All scores were NaN: KDE would raise and the statistics would be NaN.
        return message_figure("No similarity scores available")

    # Create figure based on visualization type
    if viz_type == "density":
        # Kernel Density Estimation
        from scipy import stats
        try:
            density = stats.gaussian_kde(scores)
        except (ValueError, np.linalg.LinAlgError):
            # gaussian_kde needs at least two points with non-zero variance.
            return message_figure(
                "Not enough variation in scores to estimate a density"
            )
        x_range = np.linspace(0, 1, 200)
        y_density = density(x_range)
        fig = go.Figure()
        # Add density trace
        fig.add_trace(go.Scatter(
            x=x_range,
            y=y_density,
            mode='lines',
            fill='tozeroy',
            name='Density',
            line=dict(color='#4e79a7', width=2),
            fillcolor='rgba(78, 121, 167, 0.3)',
            hovertemplate='Score: %{x:.3f}<br>Density: %{y:.3f}<extra></extra>'
        ))
        # Add rug plot for actual data points, placed just below the x-axis
        fig.add_trace(go.Scatter(
            x=scores,
            y=[-0.01 * max(y_density)] * len(scores),
            mode='markers',
            name='Data points',
            marker=dict(color='#4e79a7', size=2, symbol='line-ns', line=dict(width=1, color='#4e79a7')),
            hovertemplate='Score: %{x:.3f}<extra></extra>'
        ))
        fig.update_layout(
            title="Similarity Score Density Distribution",
            xaxis_title="Similarity Score",
            yaxis_title="Density",
            showlegend=True
        )
    elif viz_type == "histogram":
        fig = go.Figure()
        fig.add_trace(go.Histogram(
            x=scores,
            nbinsx=30,
            name='Scores',
            marker_color='#4e79a7',
            opacity=0.8,
            hovertemplate='Score range: %{x}<br>Count: %{y}<extra></extra>'
        ))
        # Add statistics annotation
        mean_score = scores.mean()
        median_score = scores.median()
        fig.add_annotation(
            text=f"Mean: {mean_score:.3f}<br>Median: {median_score:.3f}",
            xref="paper", yref="paper",
            x=0.98, y=0.98,
            showarrow=False,
            bgcolor="white",
            bordercolor="#4e79a7",
            borderwidth=1
        )
        fig.update_layout(
            title="Interactive Histogram of Similarity Scores",
            xaxis_title="Similarity Score",
            yaxis_title="Count",
            bargap=0.05
        )
    elif viz_type == "box":
        # Box plot with individual points
        fig = go.Figure()
        fig.add_trace(go.Box(
            y=scores,
            name='Scores',
            marker_color='#4e79a7',
            boxpoints='outliers',
            jitter=0.3,
            pointpos=-1.8,
            hovertemplate='Score: %{y:.3f}<extra></extra>'
        ))
        # Add violin for comparison
        fig.add_trace(go.Violin(
            y=scores,
            name='Distribution',
            side='positive',
            opacity=0.3,
            marker_color='#e15759',
            hovertemplate='Score: %{y:.3f}<extra></extra>'
        ))
        fig.update_layout(
            title="Box Plot with Distribution Shape",
            yaxis_title="Similarity Score",
            showlegend=True
        )
    elif viz_type == "violin":
        fig = go.Figure()
        # Split by match status using cleaned dataframe
        matched_scores = df_clean[df_clean['status'] != 'NO MATCH']['similarity_score']
        no_match_scores = df_clean[df_clean['status'] == 'NO MATCH']['similarity_score']
        if len(matched_scores) > 0:
            fig.add_trace(go.Violin(
                y=matched_scores,
                name='Matched',
                side='negative',
                marker_color='#4e79a7',
                line_color='#4e79a7',
                meanline_visible=True,
                hovertemplate='Matched<br>Score: %{y:.3f}<extra></extra>'
            ))
        if len(no_match_scores) > 0:
            fig.add_trace(go.Violin(
                y=no_match_scores,
                name='No Match',
                side='positive',
                marker_color='#e15759',
                line_color='#e15759',
                meanline_visible=True,
                hovertemplate='No Match<br>Score: %{y:.3f}<extra></extra>'
            ))
        fig.update_layout(
            title="Violin Plot: Score Distribution by Match Status",
            yaxis_title="Similarity Score",
            violingap=0.3,
            violinmode='overlay'
        )
    elif viz_type == "scatter":
        # Scatter plot with color by match status
        colors = ['#4e79a7' if status != 'NO MATCH' else '#e15759'
                  for status in df_clean['status']]
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=list(range(len(df_clean))),
            y=df_clean['similarity_score'],
            mode='markers',
            marker=dict(
                color=colors,
                size=8,
                opacity=0.6,
                line=dict(width=1, color='white')
            ),
            text=df_clean['input_description'],
            hovertemplate='Index: %{x}<br>Score: %{y:.3f}<br>Input: %{text}<extra></extra>'
        ))
        fig.update_layout(
            title="Similarity Scores by Index",
            xaxis_title="Item Index",
            yaxis_title="Similarity Score"
        )
    elif viz_type == "ecdf":
        # Empirical Cumulative Distribution Function
        sorted_scores = np.sort(scores)
        ecdf = np.arange(1, len(sorted_scores) + 1) / len(sorted_scores)
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=sorted_scores,
            y=ecdf,
            mode='lines',
            name='ECDF',
            line=dict(color='#4e79a7', width=2),
            hovertemplate='Score: %{x:.3f}<br>Cumulative %: %{y:.1%}<extra></extra>'
        ))
        # Add markers at quartiles
        q25, q50, q75 = np.percentile(scores, [25, 50, 75])
        fig.add_trace(go.Scatter(
            x=[q25, q50, q75],
            y=[0.25, 0.50, 0.75],
            mode='markers+text',
            name='Quartiles',
            marker=dict(color='#e15759', size=10),
            text=['Q1', 'Median', 'Q3'],
            textposition='top center',
            hovertemplate='%{text}<br>Score: %{x:.3f}<extra></extra>'
        ))
        fig.update_layout(
            title="Empirical Cumulative Distribution",
            xaxis_title="Similarity Score",
            yaxis_title="Cumulative Probability",
            yaxis=dict(tickformat='.0%')
        )
    elif viz_type == "threshold":
        # Threshold Analysis - shows match rate at different thresholds
        thresholds = np.linspace(0, 1, 101)
        match_rates = [(scores >= t).mean() for t in thresholds]
        fig = go.Figure()
        # Main threshold curve
        fig.add_trace(go.Scatter(
            x=thresholds,
            y=match_rates,
            mode='lines',
            name='Match Rate',
            line=dict(color='#4e79a7', width=3),
            hovertemplate='Threshold: %{x:.3f}<br>Match Rate: %{y:.1%}<extra></extra>'
        ))
        # Add current threshold marker (a threshold of 0 is a valid value)
        if threshold is not None:
            current_match_rate = (scores >= threshold).mean()
            fig.add_trace(go.Scatter(
                x=[threshold],
                y=[current_match_rate],
                mode='markers+text',
                name='Current Threshold',
                marker=dict(color='#e15759', size=12),
                text=[f'{current_match_rate:.1%}'],
                textposition='top center',
                hovertemplate='Current: %{x:.3f}<br>Rate: %{y:.1%}<extra></extra>'
            ))
            # Add reference lines
            fig.add_vline(x=threshold, line_dash="dash", line_color="#e15759", opacity=0.5)
            fig.add_hline(y=current_match_rate, line_dash="dot", line_color="#e15759", opacity=0.5)
        # Add 50% reference line
        fig.add_hline(y=0.5, line_dash="dash", line_color="gray", opacity=0.3)
        fig.update_layout(
            title="Threshold Analysis - Match Rate vs Cutoff",
            xaxis_title="Threshold Value",
            yaxis_title="Match Rate",
            yaxis=dict(tickformat='.0%', range=[0, 1.05]),
            xaxis=dict(range=[0, 1])
        )
    # Hidden chart - uncomment block to re-enable (keep it above the final else)
    # elif viz_type == "sunburst":
    #     # Sunburst chart - hierarchical match breakdown
    #     # Create hierarchical data for sunburst
    #     total = len(df_clean)
    #
    #     # Calculate match/no-match counts
    #     if 'status' in df_clean.columns:
    #         match_mask = df_clean['status'] != 'NO MATCH'
    #     else:
    #         match_mask = df_clean['similarity_score'] >= threshold if threshold else df_clean['similarity_score'] >= 0.85
    #
    #     matched = match_mask.sum()
    #     no_match = total - matched
    #
    #     # Create score bins for matched items
    #     matched_df = df_clean[match_mask]
    #
    #     # Define score ranges
    #     labels = []
    #     parents = []
    #     values = []
    #     colors = []
    #
    #     # Root level
    #     labels.append("All Items")
    #     parents.append("")
    #     values.append(total)
    #     colors.append("#94a3b8")
    #
    #     # Match/No Match level
    #     if matched > 0:
    #         labels.append("Matched")
    #         parents.append("All Items")
    #         values.append(matched)
    #         colors.append("#4e79a7")
    #
    #         # Score ranges for matched items - dynamic based on threshold
    #         if len(matched_df) > 0:
    #             scores_matched = matched_df['similarity_score']
    #
    #             # Calculate dynamic ranges based on user's threshold
    #             # Use the actual threshold value, defaulting to 0.85 if not set
    #             thresh_val = threshold if threshold else 0.85
    #
    #             # High confidence: threshold + 0.10 or higher (capped at 1.0)
    #             high_threshold = min(thresh_val + 0.10, 1.0)
    #             high_conf = (scores_matched >= high_threshold).sum()
    #             if high_conf > 0:
    #                 labels.append(f"High (≥{high_threshold:.2f}): {high_conf}")
    #                 parents.append("Matched")
    #                 values.append(high_conf)
    #                 colors.append("#059669")
    #
    #             # Good confidence: threshold + 0.05 to threshold + 0.10
    #             good_lower = thresh_val + 0.05
    #             good_upper = high_threshold
    #             if good_lower < 1.0:  # Only show if range is valid
    #                 good_conf = ((scores_matched >= good_lower) & (scores_matched < good_upper)).sum()
    #                 if good_conf > 0:
    #                     labels.append(f"Good ({good_lower:.2f}-{good_upper:.2f}): {good_conf}")
    #                     parents.append("Matched")
    #                     values.append(good_conf)
    #                     colors.append("#0ea5e9")
    #
    #             # Moderate confidence: threshold to threshold + 0.05
    #             mod_lower = thresh_val
    #             mod_upper = min(thresh_val + 0.05, 1.0)
    #             moderate_conf = ((scores_matched >= mod_lower) & (scores_matched < mod_upper)).sum()
    #             if moderate_conf > 0:
    #                 labels.append(f"Moderate ({mod_lower:.2f}-{mod_upper:.2f}): {moderate_conf}")
    #                 parents.append("Matched")
    #                 values.append(moderate_conf)
    #                 colors.append("#8b5cf6")
    #
    #             # Note: There shouldn't be any "Low" matches below threshold
    #             # since we filter by threshold, but check just in case
    #             low_conf = (scores_matched < thresh_val).sum()
    #             if low_conf > 0:
    #                 labels.append(f"Below threshold (<{thresh_val:.2f}): {low_conf}")
    #                 parents.append("Matched")
    #                 values.append(low_conf)
    #                 colors.append("#f59e0b")
    #
    #     if no_match > 0:
    #         labels.append("No Match")
    #         parents.append("All Items")
    #         values.append(no_match)
    #         colors.append("#e15759")
    #
    #     # Create sunburst
    #     fig = go.Figure(go.Sunburst(
    #         labels=labels,
    #         parents=parents,
    #         values=values,
    #         branchvalues="total",
    #         marker=dict(colors=colors),
    #         textinfo="label+percent parent",
    #         hovertemplate='<b>%{label}</b><br>Count: %{value}<br>%{percentParent}<extra></extra>'
    #     ))
    #
    #     fig.update_layout(
    #         title="Match Statistics Breakdown",
    #         height=500
    #     )
    else:
        # Unknown or currently hidden chart type: without this branch `fig`
        # would be unbound and the code below would raise UnboundLocalError.
        return message_figure(f"Unsupported chart type: {viz_type}")

    # Add threshold line if requested (but not for sunburst chart where it doesn't apply)
    if show_threshold and threshold is not None and viz_type != "sunburst":
        threshold_label = f"Threshold: {threshold:.2f}"
        if viz_type in ("box", "violin", "scatter"):
            # These charts plot the similarity score on the y-axis, so the
            # threshold has to be a horizontal line to be meaningful.
            fig.add_hline(
                y=threshold,
                line_dash="dash",
                line_color="red",
                annotation_text=threshold_label,
                annotation_position="top right"
            )
        else:
            fig.add_vline(
                x=threshold,
                line_dash="dash",
                line_color="red",
                annotation_text=threshold_label,
                annotation_position="top"
            )

    # Common layout updates (individual charts already set their axis ranges)
    fig.update_layout(
        template="plotly_white",
        hovermode='closest',
        height=500,
        margin=dict(l=50, r=50, t=50, b=50),
        font=dict(family="system-ui, -apple-system, sans-serif")
    )
    # Return the Plotly figure directly for render_widget
    return fig
| # Create the module-level Shiny App object from the UI definition (app_ui) and the server function | |
| app = App(app_ui, server) | |