food-mapper / app.py
richtext's picture
Upload folder using huggingface_hub
ca76e91 verified
#!/usr/bin/env python3
"""
Food Mapper - Semantic Matching Application
USDA Agricultural Research Service, Western Human Nutrition Research Center
Matches food descriptions between databases using neural embeddings (GTE-Large model).
API-first with automatic CPU fallback for reliability.
"""
import os
import io
import json
import re
import time
import asyncio
import math
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict, List, Tuple
from shiny import App, Inputs, Outputs, Session, reactive, render, ui
from shiny.types import FileInfo
import shinyswatch
from shinywidgets import render_widget, output_widget
from functools import lru_cache
# ============================================================================
# IMPORTS
# ============================================================================
# Matching algorithms
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# API client for embeddings
from openai import OpenAI, AsyncOpenAI
import httpx
try:
import h2 # type: ignore
HTTP2_AVAILABLE = True
except Exception:
HTTP2_AVAILABLE = False
# ============================================================================
# STYLE CONFIGURATION
# ============================================================================
custom_css = """
/* Professional neutral color scheme */
:root {
--primary-color: #475569;
--primary-dark: #334155;
--secondary-color: #64748b;
--success-color: #059669;
--warning-color: #d97706;
--danger-color: #dc2626;
--background: #ffffff;
--surface: #f8fafc;
--text-primary: #1e293b;
--text-secondary: #64748b;
--border-color: #e2e8f0;
--shadow-sm: 0 1px 2px 0 rgb(0 0 0 / 0.05);
--shadow-md: 0 4px 6px -1px rgb(0 0 0 / 0.08);
--shadow-lg: 0 10px 15px -3px rgb(0 0 0 / 0.1);
--gradient-primary: linear-gradient(135deg, #64748b 0%, #475569 100%);
}
/* Main container with responsive design */
.container-fluid {
max-width: 1400px;
margin: 0 auto;
padding: 20px;
background: var(--background);
}
/* NEW: Typography with system font stack */
body, .container-fluid, .card, .btn, .table {
font-family: -apple-system, BlinkMacSystemFont, "Inter", "Segoe UI", "Helvetica Neue", Arial, sans-serif;
color: var(--text-primary);
line-height: 1.6;
}
/* Clean header styling */
.app-header {
background: var(--gradient-primary);
color: white;
padding: 1.25rem 1.25rem; /* NEW: thinner header */
border-radius: 12px;
margin-bottom: 1.25rem;
text-align: center;
box-shadow: var(--shadow-md);
}
.app-header h1 {
color: white;
margin: 0;
font-size: 2rem; /* NEW: smaller title */
font-weight: 700;
letter-spacing: 0.2px;
}
.app-header p {
color: rgba(255, 255, 255, 0.95);
font-size: 0.95rem; /* NEW: smaller subtitle */
margin-top: 0.35rem;
margin-bottom: 0;
}
/* Clean card styling */
.card {
background: white;
border: 1px solid var(--border-color);
box-shadow: var(--shadow-sm);
border-radius: 12px;
margin-bottom: 1.5rem;
transition: box-shadow 0.2s ease;
}
.card:hover {
box-shadow: var(--shadow-md);
}
.card-header {
background: linear-gradient(135deg, var(--secondary-color), var(--primary-color));
color: white;
font-weight: 600;
border-radius: 12px 12px 0 0 !important;
padding: 1rem 1.5rem;
border-bottom: none;
}
/* Simple button styling */
.btn {
font-weight: 600;
border-radius: 8px;
padding: 0.75rem 1.5rem;
transition: all 0.2s ease;
border: none;
}
.btn-primary {
background: var(--primary-color);
color: white;
}
.btn-primary:hover {
background: var(--primary-dark);
transform: translateY(-1px);
box-shadow: var(--shadow-md);
}
.btn-success {
background: var(--success-color);
color: white;
}
.btn-success:hover {
filter: brightness(0.9);
transform: translateY(-1px);
}
.btn-warning {
background: var(--warning-color);
color: white;
}
.btn-warning:hover {
filter: brightness(0.9);
transform: translateY(-1px);
}
.btn-outline-warning {
background: transparent;
border: 2px solid var(--warning-color);
color: var(--warning-color);
}
.btn-outline-warning:hover {
background: var(--warning-color);
color: white;
}
/* Export button grouping */
.btn-group-export {
display: flex;
gap: 1rem;
justify-content: center;
flex-wrap: wrap;
}
/* File upload area */
.file-upload-area {
border: 2px dashed var(--border-color);
border-radius: 12px;
padding: 2rem;
background: var(--background-light);
text-align: center;
transition: all 0.3s ease;
}
.file-upload-area:hover {
border-color: var(--primary-color);
background: white;
}
/* Fix table header alignment */
.shiny-table table {
width: 100%;
table-layout: fixed;
}
.shiny-table th {
text-align: left !important;
padding-left: 8px !important;
}
.shiny-table td {
text-align: left !important;
padding-left: 8px !important;
word-wrap: break-word;
overflow-wrap: break-word;
}
/* Special styling for preview tables in Step 1 - data_frame outputs */
/* Force left alignment for the entire data_frame container */
#input_col_preview,
#target_col_preview {
text-align: left !important;
display: block !important;
margin: 0 !important;
padding: 0 !important;
}
#input_col_preview .shiny-data-frame,
#target_col_preview .shiny-data-frame {
display: block !important;
text-align: left !important;
margin: 0 !important;
padding: 0 !important;
width: 100% !important;
}
#input_col_preview .shiny-data-frame > div,
#target_col_preview .shiny-data-frame > div {
display: block !important;
text-align: left !important;
margin: 0 auto 0 0 !important; /* This forces left alignment by removing auto centering */
padding: 0 !important;
}
#input_col_preview .shiny-data-frame table,
#target_col_preview .shiny-data-frame table {
width: 100% !important;
table-layout: fixed !important;
margin: 0 !important;
margin-left: 0 !important;
margin-right: auto !important;
border-collapse: collapse !important;
}
/* Headers - force left alignment and dark mode support */
#input_col_preview .shiny-data-frame thead th,
#target_col_preview .shiny-data-frame thead th {
text-align: left !important;
padding: 8px !important;
vertical-align: middle !important;
font-weight: bold !important;
color: var(--bs-body-color, #212529) !important;
background-color: var(--bs-gray-200, #e9ecef) !important;
border-bottom: 2px solid var(--bs-border-color, #dee2e6) !important;
}
/* First column header (Row) - centered and narrow */
#input_col_preview .shiny-data-frame thead th:first-child,
#target_col_preview .shiny-data-frame thead th:first-child {
width: 60px !important;
min-width: 60px !important;
max-width: 60px !important;
text-align: center !important;
}
/* Second column header (Sample Values) - left aligned, takes remaining space */
#input_col_preview .shiny-data-frame thead th:nth-child(2),
#target_col_preview .shiny-data-frame thead th:nth-child(2) {
text-align: left !important;
padding-left: 12px !important;
width: auto !important;
}
/* Data cells - with dark mode support */
#input_col_preview .shiny-data-frame tbody td,
#target_col_preview .shiny-data-frame tbody td {
text-align: left !important;
padding: 8px !important;
vertical-align: top !important;
word-wrap: break-word !important;
overflow-wrap: break-word !important;
white-space: normal !important;
color: var(--bs-body-color, #212529) !important;
background-color: var(--bs-body-bg, white) !important;
}
/* First column data (Row numbers) - centered */
#input_col_preview .shiny-data-frame tbody td:first-child,
#target_col_preview .shiny-data-frame tbody td:first-child {
width: 60px !important;
text-align: center !important;
}
/* Second column data - left aligned with wrapping */
#input_col_preview .shiny-data-frame tbody td:nth-child(2),
#target_col_preview .shiny-data-frame tbody td:nth-child(2) {
text-align: left !important;
white-space: normal !important;
word-break: break-word !important;
}
/* When there are 3 columns (cleaning preview) */
#input_col_preview .shiny-data-frame thead th:nth-child(3),
#target_col_preview .shiny-data-frame thead th:nth-child(3) {
text-align: left !important;
width: auto !important;
}
/* Matching Configuration - Polished Production Styling (theme-friendly) */
.matching-config-card {
background-color: var(--bs-tertiary-bg, var(--bs-body-bg));
border-radius: 12px;
padding: 1.25rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
border: 1px solid var(--bs-border-color, rgba(0,0,0,0.05));
}
.matching-config-card h5 {
color: var(--primary-dark);
font-weight: 600;
letter-spacing: 0.5px;
}
.matching-config-card h6 {
color: var(--primary-color);
font-weight: 500;
}
.matching-config-card .border-end {
border-color: var(--border-color) !important;
}
.matching-config-card .border-start {
border-color: var(--border-color) !important;
}
/* Slider styling for production look - centered */
/* Ensure slider stays centered and doesn't break on resize */
.matching-config-card .d-flex.justify-content-center {
display: flex !important;
justify-content: center !important;
}
.matching-config-card .mx-auto {
margin-left: auto !important;
margin-right: auto !important;
width: 100% !important;
}
.matching-config-card .irs-bar {
background: var(--primary-color);
}
.matching-config-card .irs-handle {
border: 3px solid var(--primary-color);
}
/* Centered description text */
.threshold-description {
max-width: 600px;
margin: 0 auto;
line-height: 1.5;
color: var(--text-secondary);
}
/* Start button hover effect */
.matching-config-card .btn-primary:hover { transform: translateY(-2px); box-shadow: 0 4px 12px rgba(71, 85, 105, 0.3); }
/* Progress indicator */
.progress-container {
background: white;
border-radius: 12px;
padding: 2rem;
box-shadow: 0 2px 10px rgba(0,0,0,0.08);
margin: 1rem 0;
}
.progress {
background-color: #e9ecef;
border-radius: 10px;
overflow: hidden;
}
.progress-bar {
background: linear-gradient(90deg, var(--primary-color), var(--secondary-color));
border-radius: 8px;
transition: width 0.5s ease-in-out;
font-size: 14px;
font-weight: 600;
display: flex;
align-items: center;
justify-content: center;
color: white;
}
.progress-bar-striped {
background-image: linear-gradient(45deg, rgba(255,255,255,.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,.15) 50%, rgba(255,255,255,.15) 75%, transparent 75%, transparent);
background-size: 1rem 1rem;
}
.progress-bar-animated {
animation: progress-bar-stripes 1s linear infinite;
}
@keyframes progress-bar-stripes {
from { background-position: 1rem 0; }
to { background-position: 0 0; }
}
/* Status messages */
.alert-custom {
border-radius: 8px;
padding: 1rem 1.5rem;
margin: 1rem 0;
border-left: 4px solid;
}
.alert-success-custom {
background: #f0fdf4;
border-color: var(--success-color);
color: #166534;
}
.alert-warning-custom {
background: #fffbeb;
border-color: var(--warning-color);
color: #92400e;
}
.alert-info-custom {
background: #f0f9ff;
border-color: var(--primary-color);
color: var(--primary-dark);
}
/* NEW: Enhanced table with proper alignment */
.table {
width: 100%;
table-layout: auto;
border-collapse: separate;
border-spacing: 0;
background: var(--surface);
color: var(--text-primary);
}
.table thead {
background: var(--surface);
color: var(--text-primary);
position: sticky;
top: 0;
z-index: 10;
border-bottom: 2px solid var(--primary-color);
}
.table thead th {
padding: 12px;
font-weight: 700;
text-align: left;
white-space: nowrap;
border-bottom: 2px solid var(--primary-color);
}
/* NEW: Numeric column alignment */
.table th.num,
.table td.num {
text-align: right;
font-variant-numeric: tabular-nums;
}
.table th.text,
.table td.text {
text-align: left;
}
.table tbody td {
padding: 12px;
vertical-align: middle;
border-bottom: 1px solid var(--border-color);
}
/* NEW: Compact density mode */
#results_container.compact .table tbody td {
padding: 6px 12px;
font-size: 0.875rem;
line-height: 1.25;
}
#results_container.compact .table thead th {
padding: 8px 12px;
}
.table-striped tbody tr:nth-child(odd) {
background: rgba(0, 0, 0, 0.02);
}
.table tbody tr:hover {
background: rgba(71, 85, 105, 0.05) !important;
cursor: pointer;
}
/* Footer */
.footer {
margin-top: 3rem;
padding: 2rem;
border-top: 2px solid var(--border-color);
text-align: center;
color: #64748b;
}
/* Animated alerts */
.alert-animated { animation: fadeIn 0.25s ease-out both; }
@keyframes fadeIn { from { opacity: 0; transform: translateY(4px);} to { opacity: 1; transform: none; } }
/* NEW: Results container with better scrolling */
.results-container {
height: 520px;
overflow: auto;
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 0;
background: var(--surface);
box-shadow: var(--shadow-sm);
position: relative;
}
.results-container::-webkit-scrollbar {
width: 10px;
height: 10px;
}
.results-container::-webkit-scrollbar-track {
background: var(--surface);
border-radius: 10px;
}
.results-container::-webkit-scrollbar-thumb {
background: var(--primary-color);
border-radius: 10px;
}
.results-container::-webkit-scrollbar-thumb:hover {
background: var(--primary-dark);
}
/* NEW: Make Shiny progress panel wider so text doesn't wrap */
.shiny-progress-panel { min-width: 520px !important; width: 520px !important; }
.shiny-progress-panel .progress { height: 0.75rem; }
.shiny-progress-panel p { white-space: normal; }
/* Loading spinner */
.spinner-container {
display: flex;
justify-content: center;
align-items: center;
padding: 2rem;
}
.spinner {
width: 40px;
height: 40px;
border: 3px solid var(--border-color);
border-top-color: var(--primary-color);
border-radius: 50%;
animation: spin 1s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
/* Score bar visualization */
.scorebar {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
letter-spacing: 0.5px;
white-space: nowrap;
color: var(--secondary-color);
}
/* NEW: Status badges */
.status-badge {
display: inline-block;
font-weight: 600;
font-size: 0.75rem;
padding: 0.25rem 0.75rem;
border-radius: 9999px;
text-transform: uppercase;
letter-spacing: 0.025em;
}
.status-ok {
background: rgba(16, 185, 129, 0.1);
color: var(--success-color);
border: 1px solid var(--success-color);
}
.status-warn {
background: rgba(245, 158, 11, 0.1);
color: var(--warning-color);
border: 1px solid var(--warning-color);
}
/* Method chips */
.method-chip {
display: inline-block;
background: var(--secondary-color);
color: white;
padding: 0.25rem 0.75rem;
border-radius: 16px;
font-size: 0.875rem;
font-weight: 500;
margin: 0.25rem;
}
/* NEW: Nav tabs styling */
.nav-tabs {
border-bottom: 2px solid var(--border-color);
margin-bottom: 1.5rem;
}
.nav-tabs .nav-link {
color: var(--primary-color);
border: none;
padding: 0.75rem 1.5rem;
font-weight: 500;
transition: all 0.3s ease;
position: relative;
}
.nav-tabs .nav-link:hover {
color: var(--primary-color);
background: transparent;
}
.nav-tabs .nav-link.active {
color: var(--primary-dark);
background: transparent;
border: none;
font-weight: 600;
}
.nav-tabs .nav-link.active::after {
content: '';
position: absolute;
bottom: -2px;
left: 0;
right: 0;
height: 2px;
background: var(--primary-color);
}
/* NEW: Tooltip styles */
.tooltip {
font-size: 0.875rem;
}
/* NEW: Mobile responsive improvements */
@media (max-width: 768px) {
.container-fluid {
padding: 12px;
}
.app-header {
padding: 2rem 1rem;
border-radius: 12px;
}
.card {
margin-bottom: 1rem;
border-radius: 12px;
}
.btn {
padding: 0.625rem 1.25rem;
font-size: 0.875rem;
}
.btn-group-export { justify-content: center; }
.btn-group-export {
flex-direction: column;
}
.results-container {
height: 400px;
}
.nav-tabs .nav-link {
padding: 0.5rem 0.75rem;
font-size: 0.875rem;
}
}
@media (max-width: 480px) {
.app-header h1 {
font-size: 1.75rem;
}
.app-header p {
font-size: 0.875rem;
}
.table {
font-size: 0.75rem;
}
.table thead th,
.table tbody td {
padding: 8px 6px;
}
}
/* Simple transitions */
.fade-in {
animation: fadeIn 0.2s ease;
}
@keyframes fadeIn {
from { opacity: 0; }
to { opacity: 1; }
}
/* NEW: File input styling */
.file-upload-area {
border: 2px dashed var(--border-color);
border-radius: 12px;
padding: 2rem;
background: var(--surface);
text-align: center;
transition: all 0.3s ease;
cursor: pointer;
}
.file-upload-area:hover {
border-color: var(--primary-color);
background: var(--background);
box-shadow: var(--shadow-sm);
}
/* Alert animations */
.alert-animated {
animation: fadeIn 0.2s ease;
}
/* Fix Shiny's file input progress bar - center text vertically */
.shiny-input-container .progress {
min-height: 1.5rem !important;
height: 1.5rem !important;
display: flex !important;
align-items: center !important;
}
.shiny-input-container .progress-bar {
min-height: 1.5rem !important;
height: 1.5rem !important;
line-height: 1 !important; /* Reset line-height */
font-size: 0.875rem;
display: flex !important;
align-items: center !important;
justify-content: center !important;
padding: 0 0.5rem;
}
/* Make the 'Loaded X rows' status alerts thinner */
#input_file_status .alert,
#target_file_status .alert {
padding: 0.4rem 0.75rem !important;
margin-bottom: 0.25rem;
font-size: 0.875rem;
}
#input_file_status p,
#target_file_status p {
margin: 0;
line-height: 1.2;
}
"""
# Minimal custom CSS to preserve app behaviors while letting themes show
custom_css_min = """
/* Fix Shiny's file input progress bar - thinner with centered text */
.sidebar { min-height: 100vh; }
.page-sidebar .sidebar { min-height: 100vh; }
/* Small, theme-friendly footer shown on all pages */
.footer {
margin-top: 1rem !important;
padding: 0.75rem 1rem !important;
border-top: 1px solid var(--bs-border-color, #dee2e6) !important;
text-align: center !important;
color: var(--bs-secondary-color, inherit) !important;
font-size: 0.9rem !important;
}
.shiny-input-container .progress {
min-height: 1.4rem !important;
height: 1.4rem !important;
margin-bottom: 0 !important; /* Remove bottom margin */
}
/* Let Shiny's default animation work - only adjust height and text position */
.shiny-input-container .progress-bar {
min-height: 1.4rem !important;
height: 1.4rem !important;
line-height: 1.4rem !important;
font-size: 0.8rem !important;
padding-top: 0.1rem !important; /* Small padding to center text */
/* Allow Shiny's default transition animation */
transition: width 0.6s ease !important;
}
/* Make the 'Loaded X rows' status alerts much thinner and even closer to upload bar */
#input_file_status .alert,
#target_file_status .alert {
padding: 0.25rem 0.5rem !important;
margin-bottom: 0.5rem !important;
margin-top: -0.75rem !important; /* Even more negative margin to bring closer */
font-size: 0.85rem !important;
line-height: 1.1 !important;
min-height: auto !important;
position: relative !important;
top: -0.25rem !important; /* Additional upward shift */
}
/* Also adjust the container divs to reduce spacing */
#input_status,
#target_status {
margin-top: -0.25rem !important;
margin-bottom: 0 !important;
padding-top: 0 !important;
}
#input_file_status p,
#target_file_status p {
margin: 0 !important;
padding: 0 !important;
line-height: 1.1 !important;
}
/* Results container: bounded height and scrolling */
.results-container {
height: 520px;
overflow: auto;
}
/* Compact density mode for results table */
#results_container.compact table tbody td {
padding: 6px 12px;
font-size: 0.875rem;
line-height: 1.25;
}
#results_container.compact table thead th {
padding: 8px 12px;
}
/* Numeric/text alignment used by JS alignment helper */
.table th.num, .table td.num { text-align: right; font-variant-numeric: tabular-nums; }
.table th.text, .table td.text { text-align: left; }
/* Score bar visualization */
.scorebar {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
white-space: nowrap;
letter-spacing: 0.5px;
}
/* Simple spinner for long-running tasks */
.spinner-container { display: flex; justify-content: center; align-items: center; padding: 2rem; }
.spinner { width: 40px; height: 40px; border: 3px solid rgba(0,0,0,0.1); border-top-color: currentColor; border-radius: 50%; animation: spin 1s linear infinite; }
@keyframes spin { to { transform: rotate(360deg); } }
/* Export button group layout */
.btn-group-export { display: flex; gap: 1rem; justify-content: center; flex-wrap: wrap; }
/* Optional small fade-in animation class used by notifications */
.alert-animated { animation: fadeIn 0.25s ease-out both; }
@keyframes fadeIn { from { opacity: 0; transform: translateY(4px);} to { opacity: 1; transform: none; } }
/* Shiny progress panel: polished and theme-adaptive */
.shiny-progress-panel {
width: 420px !important;
min-width: 360px !important;
max-width: 90vw;
padding: 1.25rem;
border-radius: 0.5rem;
background-color: var(--bs-body-bg, white) !important;
border: 1px solid var(--bs-border-color, rgba(0,0,0,0.125)) !important;
box-shadow: 0 0.5rem 1rem rgba(0,0,0,0.15), 0 0.125rem 0.25rem rgba(0,0,0,0.075);
animation: slide-in-bottom 0.3s cubic-bezier(0.25, 0.46, 0.45, 0.94) both;
/* Position at bottom right with proper spacing */
position: fixed !important;
right: 20px !important;
bottom: 20px !important;
left: auto !important;
top: auto !important;
transform: none !important;
z-index: 9999 !important;
}
.shiny-progress-panel .progress-text,
.shiny-progress-panel p {
color: var(--bs-body-color, #212529) !important;
font-size: 0.875rem !important;
font-weight: 500 !important;
margin-bottom: 0.75rem !important;
line-height: 1.4 !important;
display: block !important;
text-align: left !important;
}
.shiny-progress-panel .progress {
height: 1.25rem !important;
background-color: var(--bs-gray-200, #e9ecef) !important;
border-radius: 0.375rem !important;
overflow: hidden !important;
margin-bottom: 0.5rem !important;
box-shadow: inset 0 1px 2px rgba(0,0,0,0.075) !important;
}
.shiny-progress-panel .progress-bar {
background: linear-gradient(90deg, var(--bs-primary, #0d6efd), var(--bs-info, #0dcaf0)) !important;
transition: width 0.6s cubic-bezier(0.25, 0.46, 0.45, 0.94) !important;
font-size: 0.75rem !important;
font-weight: 600 !important;
color: white !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
position: relative !important;
overflow: hidden !important;
}
.shiny-progress-panel .progress-bar::after {
content: '';
position: absolute;
top: 0;
left: 0;
bottom: 0;
right: 0;
background: linear-gradient(
90deg,
transparent,
rgba(255, 255, 255, 0.2),
transparent
);
animation: shimmer 2s infinite;
}
@keyframes shimmer {
0% { transform: translateX(-100%); }
100% { transform: translateX(100%); }
}
.shiny-progress-panel .progress-bar-animated {
background-image: linear-gradient(
45deg,
rgba(255,255,255,.15) 25%,
transparent 25%,
transparent 50%,
rgba(255,255,255,.15) 50%,
rgba(255,255,255,.15) 75%,
transparent 75%,
transparent
) !important;
background-size: 1rem 1rem !important;
animation: progress-bar-stripes 1s linear infinite !important;
}
@keyframes slide-in-bottom {
0% {
transform: translateY(50px);
opacity: 0;
}
100% {
transform: translateY(0);
opacity: 1;
}
}
@keyframes progress-bar-stripes {
from { background-position: 1rem 0; }
to { background-position: 0 0; }
}
/* Enhanced navbar tab styling */
.navbar-nav .nav-item .nav-link {
border-radius: 0.375rem;
padding: 0.5rem 1rem !important;
margin: 0 0.25rem;
transition: all 0.2s ease;
position: relative;
color: var(--bs-nav-link-color, #495057) !important;
}
.navbar-nav .nav-item .nav-link.active {
background-color: var(--bs-primary, #0d6efd) !important;
color: white !important;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.navbar-nav .nav-item .nav-link:hover:not(.active) {
background-color: var(--bs-gray-200, #e9ecef);
color: var(--bs-body-color, #212529) !important;
}
/* Remove default underline */
.navbar-nav .nav-item .nav-link.active::after {
display: none !important;
}
/* Matching configuration panel - theme friendly */
.config-card {
background-color: var(--bs-tertiary-bg, var(--bs-body-bg)) !important;
border: 1px solid var(--bs-border-color, #dee2e6) !important;
border-radius: .5rem !important;
padding: 1rem !important;
}
/* threshold-badge removed (using slider only) */
.config-row .btn { width: 100%; }
/* Step 1 preview tables: keep left-aligned and full width at all sizes */
#input_col_preview, #target_col_preview {
text-align: left !important;
display: block !important;
margin: 0 !important;
padding: 0 !important;
width: 100% !important;
}
#input_preview_wrap, #target_preview_wrap {
display: flex !important;
width: 100% !important;
}
#input_preview_wrap > *, #target_preview_wrap > * {
flex: 1 1 auto !important;
width: 100% !important;
}
#input_col_preview .shiny-data-frame, #target_col_preview .shiny-data-frame {
display: block !important;
text-align: left !important;
margin: 0 !important;
padding: 0 !important;
width: 100% !important;
}
#input_col_preview .gridjs-container, #target_col_preview .gridjs-container,
#input_col_preview .gridjs-wrapper, #target_col_preview .gridjs-wrapper {
width: 100% !important;
max-width: none !important;
margin-left: 0 !important;
margin-right: 0 !important;
}
/* Improve text density so more characters fit on two lines */
#input_col_preview .gridjs-table td.gridjs-td,
#target_col_preview .gridjs-table td.gridjs-td {
padding-left: 6px !important;
padding-right: 6px !important;
font-size: 0.95rem !important;
line-height: 1.2 !important;
white-space: normal !important;
word-break: break-word !important;
overflow-wrap: anywhere !important;
hyphens: auto !important;
}
#input_col_preview .gridjs-table th.gridjs-th,
#target_col_preview .gridjs-table th.gridjs-th {
padding-left: 8px !important;
padding-right: 8px !important;
}
#input_col_preview .shiny-data-frame > div, #target_col_preview .shiny-data-frame > div {
display: block !important;
text-align: left !important;
margin: 0 auto 0 0 !important; /* prevent auto-centering */
padding: 0 !important;
width: 100% !important;
max-width: none !important;
}
#input_col_preview .shiny-data-frame table, #target_col_preview .shiny-data-frame table,
#input_col_preview .gridjs-table, #target_col_preview .gridjs-table {
width: 100% !important;
/* Allow content to determine width so text columns expand and 'Row' stays narrow */
table-layout: auto !important;
margin: 0 !important;
margin-left: 0 !important;
margin-right: auto !important;
border-collapse: collapse !important;
}
#input_preview_wrap .table-responsive, #target_preview_wrap .table-responsive {
width: 100% !important;
margin: 0 !important;
}
/* Make any Shiny fill wrappers align to the left and fill width */
#input_col_preview .html-fill-container, #target_col_preview .html-fill-container,
#input_col_preview .html-fill-item, #target_col_preview .html-fill-item {
display: block !important;
justify-content: flex-start !important;
align-items: stretch !important;
width: 100% !important;
max-width: none !important;
margin-left: 0 !important;
margin-right: 0 !important;
}
/* Catch-all for inline auto-centering styles */
#input_col_preview div[style*="margin: 0 auto"],
#target_col_preview div[style*="margin: 0 auto"],
#input_col_preview div[style*="margin-left: auto"],
#input_col_preview div[style*="margin-right: auto"],
#target_col_preview div[style*="margin-left: auto"],
#target_col_preview div[style*="margin-right: auto"] {
margin-left: 0 !important;
margin-right: 0 !important;
width: 100% !important;
max-width: none !important;
}
/* Shiny fill layout elements sometimes center children with auto margins */
#input_col_preview .html-fill-item, #target_col_preview .html-fill-item,
#input_col_preview .html-fill-container, #target_col_preview .html-fill-container {
margin-left: 0 !important;
margin-right: 0 !important;
width: 100% !important;
max-width: none !important;
}
/* In case an inner wrapper uses inline style widths, let it stretch */
#input_col_preview div[style*="margin-left: auto"],
#input_col_preview div[style*="margin-right: auto"],
#target_col_preview div[style*="margin-left: auto"],
#target_col_preview div[style*="margin-right: auto"] {
margin-left: 0 !important;
margin-right: 0 !important;
width: 100% !important;
}
/* Header/data alignment for consistency */
#input_col_preview .shiny-data-frame thead th,
#target_col_preview .shiny-data-frame thead th { text-align: left !important; }
#input_col_preview .shiny-data-frame thead th:first-child,
#target_col_preview .shiny-data-frame thead th:first-child { width: 36px !important; text-align: center !important; }
#input_col_preview .shiny-data-frame tbody td,
#target_col_preview .shiny-data-frame tbody td { text-align: left !important; vertical-align: top !important; }
#input_col_preview .shiny-data-frame tbody td:first-child,
#target_col_preview .shiny-data-frame tbody td:first-child { width: 36px !important; text-align: center !important; white-space: nowrap !important; }
/* Also constrain first column via colgroup to override library sizing */
#input_col_preview .shiny-data-frame colgroup col:first-child,
#target_col_preview .shiny-data-frame colgroup col:first-child {
width: 36px !important;
min-width: 36px !important;
max-width: 36px !important;
}
/* Tighten padding on the small first column */
#input_col_preview .shiny-data-frame thead th:first-child,
#input_col_preview .shiny-data-frame tbody td:first-child,
#target_col_preview .shiny-data-frame thead th:first-child,
#target_col_preview .shiny-data-frame tbody td:first-child {
padding-left: 6px !important;
padding-right: 6px !important;
}
/* Grid.js (Shiny DataGrid) often enforces a min column width (~120px).
Explicitly override only for the first column within these two previews. */
#input_col_preview .gridjs-table thead th:first-child,
#input_col_preview .gridjs-table tbody td:first-child,
#input_col_preview .gridjs-header .gridjs-th:first-child,
#input_col_preview .gridjs-body .gridjs-td:first-child,
#target_col_preview .gridjs-table thead th:first-child,
#target_col_preview .gridjs-table tbody td:first-child,
#target_col_preview .gridjs-header .gridjs-th:first-child,
#target_col_preview .gridjs-body .gridjs-td:first-child {
width: 36px !important;
min-width: 36px !important;
max-width: 40px !important;
text-align: center !important;
white-space: nowrap !important;
}
/* Make sure the table can use the freed space for text columns */
#input_col_preview .gridjs-table,
#target_col_preview .gridjs-table { width: 100% !important; }
/* Center overlay progress (theme-friendly) */
.center-progress-backdrop { position: fixed; inset: 0; background: rgba(0,0,0,0.25); display: flex; align-items: center; justify-content: center; z-index: 2000; }
.center-progress-panel { width: 720px; max-width: 90vw; }
/* App title size */
.app-title { font-size: 2rem; font-weight: 700; margin: 0; }
/* Highlight NO MATCH rows with light red background */
.no-match-row {
background-color: rgba(220, 53, 69, 0.08) !important;
}
.no-match-row:hover {
background-color: rgba(220, 53, 69, 0.15) !important;
}
"""
# ============================================================================
# EMBEDDING CONFIGURATION
# ============================================================================
# Model and API settings
DEEPINFRA_MODEL = "thenlper/gte-large"
# Concurrency settings
MAX_CONCURRENCY = int(os.environ.get("EMBEDDING_CONCURRENCY", "100"))
EMBED_BATCH_SIZE = int(os.environ.get("EMBEDDING_BATCH_SIZE", "200"))
USE_PRIORITY_TIER = os.environ.get("DEEPINFRA_PRIORITY", "false").lower() in {"1", "true", "yes", "on"}
USE_ASYNC = os.environ.get("EMBEDDING_ASYNC", "true").lower() in {"1", "true", "yes", "on"}
# Fallback behavior
API_EMBED_TIMEOUT_SECS = int(os.environ.get("API_EMBED_TIMEOUT_SECS", "45")) # overall call timeout
API_MAX_FAILURES = int(os.environ.get("API_EMBED_MAX_FAILURES", "3")) # consecutive failures before CPU fallback
MODEL_FALLBACK_MODE = os.environ.get("MODEL_FALLBACK_MODE", "auto").lower()
# Values: "auto" (try API then fallback), "api" (force API only), "local" (force CPU), "off" (no fallback)
# Runtime state
FALLBACK_ACTIVE: bool = False
_API_FAILURES: int = 0
_LOCAL_ST_MODEL: Optional["SentenceTransformer"] = None # lazy-loaded cache
def get_api_key():
"""Get API key from environment variable or HuggingFace secret"""
# Try HuggingFace secret first
api_key = os.environ.get("DEEPINFRA_API_KEY")
if not api_key:
# Try standard environment variable
api_key = os.environ.get("DEEPINFRA_TOKEN")
return api_key
# Client caching for connection pooling
_CLIENT_CACHE: Dict[str, OpenAI] = {}
_ASYNC_CLIENT_CACHE: Dict[str, AsyncOpenAI] = {}
def get_openai_client(api_key: str) -> OpenAI:
"""Create or retrieve cached OpenAI client configured for DeepInfra"""
if api_key in _CLIENT_CACHE:
return _CLIENT_CACHE[api_key]
client = OpenAI(
api_key=api_key,
base_url="https://api.deepinfra.com/v1/openai"
)
_CLIENT_CACHE[api_key] = client
return client
def get_async_openai_client(api_key: str) -> AsyncOpenAI:
"""Create or retrieve cached AsyncOpenAI client configured for DeepInfra"""
if api_key in _ASYNC_CLIENT_CACHE:
return _ASYNC_CLIENT_CACHE[api_key]
# Configure HTTP client for higher concurrency
limits = httpx.Limits(
max_connections=max(10, MAX_CONCURRENCY),
max_keepalive_connections=max(10, MAX_CONCURRENCY),
)
timeout = httpx.Timeout(60.0, connect=20.0, read=60.0, write=60.0)
# Enable HTTP/2 only if the 'h2' package is available
http_client = httpx.AsyncClient(limits=limits, timeout=timeout, http2=HTTP2_AVAILABLE)
if not HTTP2_AVAILABLE:
print("[async] HTTP/2 not available (h2 not installed). Falling back to HTTP/1.1")
client = AsyncOpenAI(
api_key=api_key,
base_url="https://api.deepinfra.com/v1/openai",
http_client=http_client,
)
_ASYNC_CLIENT_CACHE[api_key] = client
return client
def compute_embeddings_deepinfra(texts: List[str], api_key: str) -> np.ndarray:
"""Compute embeddings using DeepInfra API via OpenAI client"""
client = get_openai_client(api_key)
try:
extra_body = {"normalize": True}
if USE_PRIORITY_TIER:
extra_body["service_tier"] = "priority"
# Create embeddings using OpenAI client
response = client.embeddings.create(
model=DEEPINFRA_MODEL,
input=texts,
encoding_format="float",
extra_body=extra_body,
),
# Extract embeddings from response (already normalized if normalize=True)
embeddings = np.array([data.embedding for data in response.data], dtype=np.float32)
return embeddings
except Exception as e:
raise Exception(f"DeepInfra API error: {str(e)}")
async def compute_embeddings_deepinfra_async(texts: List[str], api_key: str) -> np.ndarray:
"""Async embeddings using DeepInfra via AsyncOpenAI client"""
client = get_async_openai_client(api_key)
try:
extra_body = {"normalize": True}
if USE_PRIORITY_TIER:
extra_body["service_tier"] = "priority"
# Retry loop to avoid transient throttling
last_err = None
for attempt in range(5):
try:
t0 = time.perf_counter()
response = await client.embeddings.create(
model=DEEPINFRA_MODEL,
input=texts,
encoding_format="float",
extra_body=extra_body,
)
dt = time.perf_counter() - t0
embeddings = np.array([data.embedding for data in response.data], dtype=np.float32)
print(f"[async] embeddings.create batch_size={len(texts)} took {dt:.2f}s")
return embeddings
except Exception as e:
last_err = e
await asyncio.sleep(min(4.0, 0.25 * (2 ** attempt)))
raise Exception(f"DeepInfra API error after retries: {str(last_err)}")
except Exception as e:
raise Exception(f"DeepInfra API error: {str(e)}")
def _chunk_indices(total: int, chunk_size: int) -> List[Tuple[int, int]]:
return [(i, min(i + chunk_size, total)) for i in range(0, total, chunk_size)]
def _embed_batch_slice(args: Tuple[int, int, List[str], str]) -> Tuple[int, np.ndarray]:
start, end, texts, api_key = args
batch_vecs = compute_embeddings_deepinfra(texts[start:end], api_key)
return start, batch_vecs
def compute_embeddings_parallel(
texts: List[str],
api_key: str,
batch_size: int = EMBED_BATCH_SIZE,
max_concurrency: int = MAX_CONCURRENCY,
progress_callback=None,
) -> np.ndarray:
"""Concurrent embedding across batches while preserving order."""
n = len(texts)
if n == 0:
return np.empty((0, 0), dtype=np.float32)
slices = _chunk_indices(n, batch_size)
results: Dict[int, np.ndarray] = {}
total_batches = len(slices)
completed = 0
with ThreadPoolExecutor(max_workers=max_concurrency) as ex:
futures = [
ex.submit(_embed_batch_slice, (start, end, texts, api_key))
for (start, end) in slices
]
for fut in as_completed(futures):
start, vecs = fut.result()
results[start] = vecs
completed += 1
if progress_callback:
pct = int((completed / total_batches) * 100)
progress_callback(f"Embedding batches: {pct}% ({completed}/{total_batches})")
# Assemble in order
ordered_starts = sorted(results.keys())
assembled = np.vstack([results[s] for s in ordered_starts])
return assembled
async def compute_embeddings_parallel_async(
texts: List[str],
api_key: str,
batch_size: int = EMBED_BATCH_SIZE,
max_concurrency: int = MAX_CONCURRENCY,
progress_callback=None,
) -> np.ndarray:
"""Async concurrent embedding across batches while preserving order."""
n = len(texts)
if n == 0:
return np.empty((0, 0), dtype=np.float32)
slices = _chunk_indices(n, batch_size)
results: Dict[int, np.ndarray] = {}
sem = asyncio.Semaphore(max_concurrency)
total_batches = len(slices)
completed = 0
async def worker(start: int, end: int):
nonlocal completed
print(f"[async] launch target slice {start}:{end}")
async with sem:
vecs = await compute_embeddings_deepinfra_async(texts[start:end], api_key)
print(f"[async] done target slice {start}:{end}")
results[start] = vecs
completed += 1
if progress_callback:
pct = int((completed / total_batches) * 100)
progress_callback(f"Embedding batches: {pct}% ({completed}/{total_batches})")
await asyncio.gather(*(worker(start, end) for (start, end) in slices))
ordered_starts = sorted(results.keys())
return np.vstack([results[s] for s in ordered_starts])
# Local CPU Embedding Backend (async-compatible)
async def _load_local_model() -> "SentenceTransformer":
global _LOCAL_ST_MODEL
if _LOCAL_ST_MODEL is not None:
return _LOCAL_ST_MODEL
# Show notification that model is loading (may need to download)
try:
from shiny import ui
ui.notification_show(
"Loading local embedding model (thenlper/gte-large). "
"First-time loading may take a few minutes to download the model (~670MB).",
type="info",
duration=None, # Keep showing until we dismiss it
id="model_loading"
)
except:
pass
# Lazy import to avoid heavy import if API is healthy
from sentence_transformers import SentenceTransformer
# Load CPU model (default behavior). This may take time on first run (download + init).
model = await asyncio.to_thread(SentenceTransformer, "thenlper/gte-large")
_LOCAL_ST_MODEL = model
# Dismiss loading notification
try:
from shiny import ui
ui.notification_remove("model_loading")
ui.notification_show(
"Local embedding model loaded successfully.",
type="success",
duration=3
)
except:
pass
return model
async def compute_embeddings_local_async(texts: List[str]) -> np.ndarray:
# Minimal cleaning consistent with your embedding path
texts = clean_text_for_embedding(texts)
if len(texts) == 0:
return np.empty((0, 0), dtype=np.float32)
model = await _load_local_model()
# Batch via your existing chunking to keep memory bounded
slices = _chunk_indices(len(texts), EMBED_BATCH_SIZE)
results: Dict[int, np.ndarray] = {}
completed = 0
total = len(slices)
async def work(start: int, end: int):
# Run CPU-bound encode in a thread to keep event loop responsive
vecs = await asyncio.to_thread(model.encode, texts[start:end], normalize_embeddings=True)
# vecs is a numpy array
results[start] = vecs.astype(np.float32, copy=False)
await asyncio.gather(*(work(s, e) for (s, e) in slices))
# Assemble in order
ordered = [results[s] for s in sorted(results.keys())]
return np.vstack(ordered) if ordered else np.empty((0, 0), dtype=np.float32)
# Resilient Wrapper (API first, CPU fallback)
async def _try_api_embeddings(texts: List[str], api_key: str, progress_callback=None) -> np.ndarray:
# Wrap your existing async API call with a timeout
coro = compute_embeddings_deepinfra_async(texts, api_key)
return await asyncio.wait_for(coro, timeout=API_EMBED_TIMEOUT_SECS)
async def compute_embeddings_resilient_async(
texts: List[str],
api_key: str,
progress_callback=None,
) -> np.ndarray:
global FALLBACK_ACTIVE, _API_FAILURES
mode = MODEL_FALLBACK_MODE # "auto" | "api" | "local" | "off"
# Force-local mode
if mode == "local":
FALLBACK_ACTIVE = True
if progress_callback:
progress_callback("Local CPU embeddings (forced).")
return await compute_embeddings_local_async(texts)
# Force-API mode
if mode == "api":
FALLBACK_ACTIVE = False
if progress_callback:
progress_callback("Using API embeddings (forced).")
return await _try_api_embeddings(texts, api_key, progress_callback)
# Fallback disabled entirely
if mode == "off":
FALLBACK_ACTIVE = False
return await _try_api_embeddings(texts, api_key, progress_callback)
# Auto mode: try API, fallback on failure or repeated errors
if FALLBACK_ACTIVE:
# Circuit open: stay on local until next run
if progress_callback:
progress_callback("Local CPU embeddings (fallback active).")
return await compute_embeddings_local_async(texts)
try:
vecs = await _try_api_embeddings(texts, api_key, progress_callback)
# On success, reset failure counter
_API_FAILURES = 0
FALLBACK_ACTIVE = False
return vecs
except Exception as e:
_API_FAILURES += 1
if _API_FAILURES >= API_MAX_FAILURES:
FALLBACK_ACTIVE = True
# Show immediate notification when switching to CPU fallback
try:
from shiny import ui
ui.notification_show(
f"API failed after {_API_FAILURES} attempts. Switching to LOCAL CPU processing. "
f"This will be much slower. Processing {len(texts)} items may take several minutes.",
type="warning",
duration=10,
)
except:
pass # ui might not be available in all contexts
# Loggable note
if progress_callback:
progress_callback("API unavailable. Falling back to local CPU.")
return await compute_embeddings_local_async(texts)
# Re-raise before we hit threshold so upstream can decide (e.g., show an error or retry)
raise
def clean_text_simple(text_list: List[str]) -> List[str]:
"""Clean text by removing punctuation and extra spaces"""
cleaned = []
for text in text_list:
text = str(text).strip()
text = re.sub(r'\s+', ' ', text) # Multiple spaces to single
text = re.sub(r'[^\w\s,.-]', '', text) # Keep basic punctuation
cleaned.append(text.lower())
return cleaned
def clean_text_for_embedding(text_list: List[str]) -> List[str]:
"""Minimal cleaning for embedding models"""
cleaned = []
for text in text_list:
text = str(text).strip()
text = re.sub(r'\s+', ' ', text)
cleaned.append(text)
return cleaned
def run_fuzzy_match(input_list: List[str], target_list: List[str], clean: bool = True) -> Dict:
"""Run fuzzy string matching"""
if clean:
input_list = clean_text_simple(input_list)
target_list = clean_text_simple(target_list)
matches = []
scores = []
for input_desc in input_list:
best_match, score, _ = process.extractOne(
input_desc,
target_list,
scorer=fuzz.ratio
)
matches.append(best_match)
scores.append(score / 100.0) # Normalize to 0-1
return {"match": matches, "score": scores}
def run_tfidf_match(input_list: List[str], target_list: List[str], clean: bool = True) -> Dict:
"""Run TF-IDF matching with cosine similarity"""
if clean:
input_list = clean_text_simple(input_list)
target_list = clean_text_simple(target_list)
combined = input_list + target_list
vectorizer = TfidfVectorizer()
vectorizer.fit(combined)
tfidf_input = vectorizer.transform(input_list)
tfidf_target = vectorizer.transform(target_list)
similarity_matrix = cosine_similarity(tfidf_input, tfidf_target)
matches = []
scores = []
for row in similarity_matrix:
best_idx = np.argmax(row)
best_score = row[best_idx]
best_match = target_list[best_idx]
matches.append(best_match)
scores.append(float(best_score))
return {"match": matches, "score": scores}
def run_embed_match(
input_list: List[str],
target_list: List[str],
api_key: str,
batch_size: int = EMBED_BATCH_SIZE,
progress_callback=None,
max_concurrency: int = MAX_CONCURRENCY,
clean_input: bool = False,
clean_target: bool = False,
) -> Dict:
"""Run semantic embedding matching using DeepInfra API with concurrent batching.
Respects DeepInfra's 1024 max batch size and uses up to `max_concurrency`
concurrent requests to reduce wall-clock time. Results are reassembled in
the original order.
"""
# Apply cleaning based on user selection
input_list_clean = clean_text_for_embedding(input_list) if clean_input else input_list
target_list_clean = clean_text_for_embedding(target_list) if clean_target else target_list
total_inputs = len(input_list_clean)
total_targets = len(target_list_clean)
# 1) Compute target embeddings concurrently (once per run)
if progress_callback:
progress_callback("Computing target embeddings (concurrent)...")
target_embeddings = compute_embeddings_parallel(
target_list_clean,
api_key,
batch_size=min(batch_size, 1024),
max_concurrency=max_concurrency,
progress_callback=progress_callback,
)
# 2) Prepare concurrent input embedding + local similarity
if progress_callback:
progress_callback("Computing input embeddings (concurrent)...")
input_slices = _chunk_indices(total_inputs, min(batch_size, 1024))
results_match: Dict[int, List[str]] = {}
results_score: Dict[int, List[float]] = {}
def _process_input_slice(args: Tuple[int, int]) -> Tuple[int, List[str], List[float]]:
start, end = args
emb = compute_embeddings_deepinfra(input_list_clean[start:end], api_key)
sim = cosine_similarity(emb, target_embeddings)
batch_matches: List[str] = []
batch_scores: List[float] = []
for row in sim:
idx = int(np.argmax(row))
batch_matches.append(target_list[idx])
batch_scores.append(float(row[idx]))
return start, batch_matches, batch_scores
total_batches = len(input_slices)
completed = 0
with ThreadPoolExecutor(max_workers=max_concurrency) as ex:
futures = [ex.submit(_process_input_slice, sl) for sl in input_slices]
for fut in as_completed(futures):
start, m, s = fut.result()
results_match[start] = m
results_score[start] = s
completed += 1
if progress_callback:
pct = int((completed / total_batches) * 100)
progress_callback(f"Matching: {pct}% ({completed}/{total_batches})")
# 3) Assemble results in order
matches: List[str] = []
scores: List[float] = []
for start in sorted(results_match.keys()):
matches.extend(results_match[start])
scores.extend(results_score[start])
if progress_callback:
progress_callback("Finalizing results...")
return {"match": matches, "score": scores}
async def run_embed_match_async(
input_list: List[str],
target_list: List[str],
api_key: str,
batch_size: int = EMBED_BATCH_SIZE,
progress_callback=None,
max_concurrency: int = MAX_CONCURRENCY,
clean_input: bool = False,
clean_target: bool = False,
) -> Dict:
"""Async version using AsyncOpenAI and asyncio concurrency."""
# Apply cleaning based on user selection
input_list_clean = clean_text_for_embedding(input_list) if clean_input else input_list
target_list_clean = clean_text_for_embedding(target_list) if clean_target else target_list
# 1) Targets once
if progress_callback:
progress_callback("Computing target embeddings (async concurrent)...")
target_embeddings = await compute_embeddings_resilient_async(
target_list_clean,
api_key,
progress_callback=progress_callback,
)
# 2) Inputs concurrent and local similarity
if progress_callback:
progress_callback("Computing input embeddings (async concurrent)...")
input_slices = _chunk_indices(len(input_list_clean), min(batch_size, 1024))
results_match: Dict[int, List[str]] = {}
results_score: Dict[int, List[float]] = {}
sem = asyncio.Semaphore(max_concurrency)
completed = 0
total_batches = len(input_slices)
async def worker(start: int, end: int):
nonlocal completed
print(f"[async] launch input slice {start}:{end}")
async with sem:
emb = await compute_embeddings_resilient_async(input_list_clean[start:end], api_key, progress_callback)
print(f"[async] done input slice {start}:{end}")
sim = cosine_similarity(emb, target_embeddings)
batch_matches: List[str] = []
batch_scores: List[float] = []
for row in sim:
idx = int(np.argmax(row))
batch_matches.append(target_list[idx])
batch_scores.append(float(row[idx]))
results_match[start] = batch_matches
results_score[start] = batch_scores
completed += 1
if progress_callback:
pct = int((completed / total_batches) * 100)
progress_callback(f"Matching: {pct}% ({completed}/{total_batches})")
await asyncio.gather(*(worker(s, e) for (s, e) in input_slices))
matches: List[str] = []
scores: List[float] = []
for start in sorted(results_match.keys()):
matches.extend(results_match[start])
scores.extend(results_score[start])
if progress_callback:
progress_callback("Finalizing results...")
return {"match": matches, "score": scores}
def get_sample_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Get sample datasets for demonstration"""
# Sample input data
input_data = pd.DataFrame({
"id": range(1, 26),
"description": [
"apple juice", "chicken breast grilled", "whole milk",
"orange juice fresh", "bread whole wheat", "cheddar cheese", "scrambled eggs",
"pasta with tomato sauce", "beef steak medium rare", "yogurt plain",
"brown rice cooked", "salmon fillet baked",
"vegetable soup", "fruit salad mixed", "pizza slice pepperoni",
"ice cream vanilla", "coffee with cream",
"energy drink", "protein bar chocolate", "trail mix nuts", "smoothie berry",
"xyz123 test item", "random text here", "unknown food item 999", "synthetic compound ABC"
]
})
# Sample target data
target_data = pd.DataFrame({
"code": [f"A{i:03d}" for i in range(1, 26)],
"reference": [
"Apple juice, unsweetened, bottled, without added ascorbic acid",
"Chicken, broilers or fryers, breast, meat only, cooked, grilled",
"Milk, whole, 3.25% milkfat, with added vitamin D",
"Orange juice, raw, includes from concentrate, fortified with calcium",
"Bread, whole-wheat, commercially prepared",
"Cheese, cheddar, sharp, sliced",
"Egg, whole, cooked, scrambled",
"Pasta with tomato-based sauce",
"Beef, short loin, t-bone steak, separable lean and fat, trimmed to 1/8\" fat, all grades, cooked, grilled",
"Yogurt, plain, whole milk",
"Rice, brown, medium-grain, cooked",
"Fish, salmon, Atlantic, farmed, cooked, dry heat",
"Soup, vegetable with beef broth, canned, prepared with equal volume water",
"Fruit salad, (pineapple and papaya and banana and guava), tropical, canned, heavy syrup, solids and liquids",
"Pizza, meat and vegetable topping, regular crust, frozen, cooked",
"Ice creams, vanilla",
"Coffee, brewed from grounds, prepared with tap water, decaffeinated",
"Beverages, Energy drink, RED BULL",
"Snacks, granola bar, chocolate chip",
"Snacks, trail mix, regular, unsalted",
"Beverages, Smoothie, strawberry",
"Water, tap, municipal",
"Crackers, standard snack-type, regular",
"Cookies, chocolate chip, commercially prepared, regular",
"Candies, milk chocolate"
]
})
return input_data, target_data
# Create Shiny app with modern theme
''' LEGACY LAYOUT (disabled)
app_ui = ui.page_sidebar(
# Sidebar must be the first positional argument
ui.sidebar(
ui.h5("Quick Start"),
ui.input_action_button(
"load_sample",
"Load Sample Dataset",
class_="btn btn-success w-100"
),
ui.div(id="sample_status", class_="mt-2"),
ui.hr(),
ui.h5("Upload Data"),
ui.input_file("input_file", "Input CSV", accept=[".csv"], multiple=False),
ui.div(id="input_status", class_="mt-2"),
ui.input_file("target_file", "Target CSV", accept=[".csv"], multiple=False),
ui.div(id="target_status", class_="mt-2"),
ui.hr(),
ui.output_ui("sidebar_results_summary_block"),
open="open",
),
# Then page contents (positional)
ui.tags.head(
ui.tags.link(rel="stylesheet", href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap"),
ui.tags.link(rel="stylesheet", href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.10.0/font/bootstrap-icons.css"),
ui.tags.link(rel="stylesheet", href="https://unpkg.com/tabulator-tables@5.5.2/dist/css/tabulator.min.css"),
ui.tags.style(custom_css_min),
ui.tags.script(src="https://unpkg.com/tabulator-tables@5.5.2/dist/js/tabulator.min.js"),
# JavaScript for tooltips and table features
ui.tags.script("""
// Initialize tooltips
document.addEventListener('DOMContentLoaded', () => {
const tooltipTriggerList = [].slice.call(document.querySelectorAll('[data-bs-toggle="tooltip"]'));
tooltipTriggerList.map(function (tooltipTriggerEl) {
return new bootstrap.Tooltip(tooltipTriggerEl);
});
});
// Toggle compact mode for results table
document.addEventListener('change', function(e){
if(e.target && e.target.id === 'compact_density'){
const c = document.getElementById('results_container');
if(c){ e.target.checked ? c.classList.add('compact') : c.classList.remove('compact'); }
}
});
// NEW: Apply column alignment classes after table render
document.addEventListener('shiny:value', function(ev) {
if (ev.detail && ev.detail.name === 'results_table') {
setTimeout(() => {
const table = document.querySelector('#results_container table');
if (!table) return;
// Apply compact mode if needed
const compact = window.Shiny && Shiny.shinyapp && Shiny.shinyapp.$inputValues['compact_density'];
const container = document.querySelector('#results_container');
if (container) container.classList.toggle('compact', !!compact);
// Fix column alignment
const headerCells = Array.from(table.querySelectorAll('thead th'));
const bodyRows = Array.from(table.querySelectorAll('tbody tr'));
// Find the status column index
let statusColIdx = -1;
headerCells.forEach((th, idx) => {
const header = th.innerText || '';
if (header.toLowerCase() === 'status') {
statusColIdx = idx;
}
});
headerCells.forEach((th, idx) => {
const header = th.innerText || '';
const isNum = /score|similarity|count|percent|^\\d+/.test(header.toLowerCase());
th.classList.toggle('num', isNum);
th.classList.toggle('text', !isNum);
bodyRows.forEach(tr => {
const td = tr.children[idx];
if (!td) return;
const txt = td.innerText || '';
const looksNum = isNum || /^[\\d\\s\\.\\,\\-]+%?$/.test(txt);
td.classList.toggle('num', looksNum);
td.classList.toggle('text', !looksNum);
// Add scorebar class for bar columns
if (/bar$/i.test(header)) {
td.classList.add('scorebar');
}
});
});
// Apply no-match-row class to rows with NO MATCH status
if (statusColIdx >= 0) {
bodyRows.forEach(tr => {
const statusCell = tr.children[statusColIdx];
if (statusCell && statusCell.innerText.trim() === 'NO MATCH') {
tr.classList.add('no-match-row');
}
});
}
}, 100);
}
});
// Ensure Step 1 preview tables stick left and first column stays narrow
function adjustPreview(id){
var root = document.getElementById(id);
if(!root) return;
// Remove auto-centering and allow full width on wrappers
var nodes = [root].concat(Array.from(root.querySelectorAll('.shiny-data-frame, .html-fill-container, .html-fill-item, .table-responsive, table, div')));
nodes.forEach(function(el){
try {
el.style.marginLeft = '0';
el.style.marginRight = '0';
el.style.width = '100%';
el.style.maxWidth = 'none';
} catch(e){}
});
var table = root.querySelector('table');
if(!table) return;
table.style.width = '100%';
table.style.tableLayout = 'auto';
// Add/adjust colgroup for first column
var colgroup = table.querySelector('colgroup');
var colCount = (table.querySelectorAll('thead th').length) || (table.querySelectorAll('tbody tr:first-child td').length);
if(!colgroup && colCount){
colgroup = document.createElement('colgroup');
for (var i=0;i<colCount;i++){ colgroup.appendChild(document.createElement('col')); }
table.insertBefore(colgroup, table.firstChild);
}
if(colgroup && colgroup.firstElementChild){
var c0 = colgroup.firstElementChild;
c0.style.width = '36px';
c0.style.minWidth = '36px';
c0.style.maxWidth = '40px';
}
// Set widths on first TH/TDs as well
var th0 = table.querySelector('thead th:first-child');
if(th0){ th0.style.width = '36px'; th0.style.minWidth = '36px'; th0.style.whiteSpace = 'nowrap'; th0.style.textAlign = 'center'; }
table.querySelectorAll('tbody td:first-child').forEach(function(td){
td.style.width = '36px';
td.style.minWidth = '36px';
td.style.whiteSpace = 'nowrap';
td.style.textAlign = 'center';
});
}
document.addEventListener('shiny:value', function(ev){
if (ev.detail && (ev.detail.name === 'input_col_preview' || ev.detail.name === 'target_col_preview')){
setTimeout(function(){ adjustPreview(ev.detail.name); }, 0);
}
});
document.addEventListener('DOMContentLoaded', function(){
adjustPreview('input_col_preview');
adjustPreview('target_col_preview');
});
""")
,
ui.tags.script("""
(function(){
// Debounced search mirror -> search_debounced
var _t;
document.addEventListener('input', function(e){
if (e.target && e.target.id === 'search_filter'){
clearTimeout(_t);
var v = e.target.value;
_t = setTimeout(function(){
if (window.Shiny && Shiny.setInputValue){
Shiny.setInputValue('search_debounced', v, {priority:'event'});
}
}, 200);
}
});
})();
""")
),
ui.navset_bar(
ui.nav_panel(
"Data & Configure",
ui.card(
ui.card_header("Matching Setup"),
ui.card_body(
ui.row(
ui.column(6,
ui.input_select("input_column", "Input Column:", choices=[], selected=None),
ui.output_table("input_col_preview")
),
ui.column(6,
ui.input_select("target_column", "Target Column:", choices=[], selected=None),
ui.output_table("target_col_preview")
)
),
ui.hr(),
ui.row(
ui.column(6,
ui.p(ui.strong("Method:"), " Semantic Embedding (", ui.a("thenlper/gte-large", href="https://huggingface.co/thenlper/gte-large", target="_blank"), ")")
),
ui.column(3,
ui.div(
ui.span(
"Similarity Threshold ",
ui.tags.i(
class_="bi bi-question-circle text-muted",
**{"data-bs-toggle": "tooltip", "data-bs-placement": "top", "title": "Items below this score are marked as NO MATCH"}
)
),
ui.input_slider("threshold", "", min=0.0, max=1.0, value=0.85, step=0.05)
)
),
ui.column(3,
ui.input_checkbox("clean_text", "Apply text cleaning", value=False)
)
),
ui.br(),
ui.input_action_button("run_matching", "Start Mapping", class_="btn btn-primary", style="padding: 0.4rem 1rem;")
)
),
ui.br(),
ui.output_ui("center_progress_overlay")
),
ui.nav_panel(
"Results",
ui.div(
ui.div(id="process_status"),
ui.div(id="process_progress"),
ui.div(id="process_summary"),
ui.div(id="method_chips"),
style="margin-bottom: 12px;"
),
ui.card(
ui.card_header("Filter & View"),
ui.card_body(
ui.row(
ui.column(6, ui.input_text("search_filter", "Search:", placeholder="Type to filter...")),
ui.column(6,
ui.div(
ui.input_checkbox("show_no_match", "Only NO MATCH", value=False),
ui.input_checkbox("sort_by_score", "Sort by score", value=True),
ui.input_checkbox("compact_density", "Compact", value=False),
ui.input_checkbox("use_grid", "Interactive grid", value=True),
class_="d-flex gap-3 flex-wrap justify-content-end"
)
),
)
)
),
ui.output_ui("results_tabulator"),
ui.div(ui.output_table("results_table"), class_="results-container", id="results_container"),
ui.br(),
ui.card(
ui.card_header("Export"),
ui.card_body(
ui.div(
ui.download_button("download_results", ui.span(ui.tags.i(class_="bi bi-download me-2"), "Export All Results"), class_="btn btn-success"),
ui.download_button("download_no_match", ui.span(ui.tags.i(class_="bi bi-exclamation-triangle me-2"), "Export NO MATCH Only"), class_="btn btn-outline-warning"),
ui.input_action_button("reset_analysis", ui.span(ui.tags.i(class_="bi bi-arrow-repeat me-2"), "Start New Matching"), class_="btn btn-primary"),
class_="btn-group-export"
)
)
)
),
ui.nav_panel(
"Visualizations",
ui.card(
ui.card_header("Similarity Score Distribution"),
ui.card_body(
ui.row(
ui.column(6, ui.input_select("viz_type", "Chart", choices={"hist":"Histogram","cdf":"Cumulative","threshold":"Threshold curve","match_pie":"Matches vs No Matches"}, selected="hist")),
),
ui.output_plot("score_hist")
)
)
),
ui.nav_panel(
"About",
ui.card(
ui.card_header("About Food Mapper"),
ui.card_body(
ui.h4("Food Mapper", class_="mb-3"),
ui.p("A state-of-the-art tool for matching food descriptions across different databases using advanced natural language processing techniques.", class_="lead"),
ui.hr(),
ui.h5(ui.tags.i(class_="bi bi-stars me-2"), "Key Features"),
ui.tags.ul(
ui.tags.li(ui.HTML("Semantic embedding via <a href=\"https://huggingface.co/thenlper/gte-large\" target=\"_blank\">thenlper/gte-large</a>")),
ui.tags.li("Batch processing with progress tracking"),
ui.tags.li("Clear results table with spreadsheet-like view"),
ui.tags.li("Export matched results and NO MATCH items")
)
)
)
),
id="main_tabs",
title="Food Mapper"
),
ui.br(),
ui.div(
ui.strong("Western Human Nutrition Research Center"),
" | Davis, CA",
ui.br(),
"Diet, Microbiome and Immunity Research Unit",
ui.br(),
"United States Department of Agriculture | Agricultural Research Service",
),
theme=shinyswatch.theme.yeti()
)
''' # end legacy layout
def make_sidebar():
return ui.sidebar(
ui.div(
ui.h5(
"Upload Your Data",
ui.input_action_link(
"show_upload_help",
ui.tags.i(class_="bi bi-question-circle ms-2", style="font-size: 0.8rem;"),
class_="text-muted",
style="text-decoration: none;"
),
class_="d-flex justify-content-between align-items-center"
)
),
ui.input_file("input_file", "Input CSV", accept=[".csv"], multiple=False),
ui.div(id="input_status", class_="mt-2"),
ui.input_file("target_file", "Target CSV", accept=[".csv"], multiple=False),
ui.div(id="target_status", class_="mt-2"),
# Add sample data button for easier mobile access
ui.div(
ui.input_action_button(
"sidebar_sample_data",
"Or use sample data",
class_="btn btn-outline-primary btn-sm w-100 mt-3"
),
class_="mb-3"
),
ui.hr(),
ui.output_ui("sidebar_results_summary_block"),
ui.hr(),
# Dynamic navigation button that changes based on current step
ui.output_ui("sidebar_navigation_button"),
open="open",
id="main_sidebar",
)
def make_footer():
return ui.div(
ui.strong("Western Human Nutrition Research Center"),
" | Davis, CA",
ui.br(),
"Diet, Microbiome and Immunity Research Unit",
ui.br(),
"United States Department of Agriculture | Agricultural Research Service",
class_="footer"
)
# Rebuild app UI with top navbar and nested tabs structure
app_ui = ui.page_navbar(
# Top navigation bar panels
ui.nav_panel(
"Semantic Embedder",
ui.page_sidebar(
make_sidebar(),
ui.tags.style(custom_css_min),
ui.navset_tab(
ui.nav_panel(
"Tutorial",
ui.card(
ui.card_header("Get Started"),
ui.card_body(
ui.row(
ui.column(6,
ui.h6("Quick Start", class_="mb-2"),
ui.tags.ul(
ui.tags.li("Upload Input CSV"),
ui.tags.li("Upload Target CSV"),
ui.tags.li("Pick columns → Start")
, class_="mb-2"),
ui.tags.small(ui.tags.i(class_="bi bi-upload me-1"), "Use the sidebar on the left to add your files.", class_="text-muted d-block mb-2"),
ui.div(
ui.tags.small("No data? ", class_="text-muted"),
ui.input_action_button("load_sample", "Try with sample data", class_="btn btn-outline-primary btn-sm"),
class_="mt-2 mb-3"
)
),
ui.column(6,
ui.h6(
ui.span("Data Requirements"),
ui.input_action_link(
"show_requirements",
ui.tags.i(class_="bi bi-info-circle ms-2"),
class_="text-primary"
),
class_="mb-2"
),
ui.tags.ul(
ui.tags.li("CSV files with headers"),
ui.tags.li("Input: items to match"),
ui.tags.li("Target: reference list")
, class_="mb-2")
)
, class_="g-4 align-items-start"),
ui.hr(class_="my-3"),
ui.div(
ui.h6("What This Tool Does", class_="mb-2"),
ui.p(
"This application matches text descriptions between two datasets using AI-powered semantic analysis. "
"Upload your input items and target reference list, select the columns to match, "
"and the tool will find the best semantic matches based on meaning rather than exact text.",
class_="text-muted small"
),
ui.h6("Key Features", class_="mb-2 mt-3"),
ui.tags.ul(
ui.tags.li("Semantic matching using state-of-the-art embeddings", class_="small text-muted"),
ui.tags.li("Adjustable similarity threshold for fine-tuning", class_="small text-muted"),
ui.tags.li("Interactive visualizations and data export", class_="small text-muted"),
ui.tags.li("Text cleaning options for better matches", class_="small text-muted")
),
ui.div(id="sample_status", class_="mt-2")
)
)
)
),
ui.nav_panel(
"Step 1: Data & Configure",
ui.card(
ui.card_header("Matching Setup"),
ui.card_body(
ui.row(
ui.column(6,
ui.input_select("input_column", "Input Column:", choices=[], selected=None),
ui.input_switch("clean_input", "Apply text cleaning to input", value=False),
ui.div(
ui.output_data_frame("input_col_preview"),
id="input_preview_wrap",
style="margin: 0 !important; padding: 0 !important; text-align: left !important; width: 100% !important;"
)
),
ui.column(6,
ui.input_select("target_column", "Target Column:", choices=[], selected=None),
ui.input_switch("clean_target", "Apply text cleaning to target", value=False),
ui.div(
ui.output_data_frame("target_col_preview"),
id="target_preview_wrap",
style="margin: 0 !important; padding: 0 !important; text-align: left !important; width: 100% !important;"
)
)
),
ui.hr(),
ui.div(
ui.h5("Similarity Threshold", class_="text-center mb-2"),
# Three-column layout for professional appearance (centered slider and button)
ui.row(
# Left column - Method info
ui.column(3,
ui.div(
ui.div(
ui.tags.i(class_="bi bi-cpu text-primary me-1"),
ui.strong("Method"),
class_="mb-1 text-center small"
),
ui.div("Semantic Embedding", class_="text-muted mb-2 text-center small"),
ui.div(
ui.tags.i(class_="bi bi-diagram-3 text-primary me-1"),
ui.strong("Model"),
class_="mb-1 text-center small"
),
ui.div(
ui.a("thenlper/gte-large",
href="https://huggingface.co/thenlper/gte-large",
target="_blank",
class_="text-primary small"),
class_="text-center small"
),
class_="border-end pe-3 py-2"
)
),
# Center column - Threshold slider and button stacked and centered
ui.column(6,
ui.div(
ui.div(
ui.div(
ui.input_slider("threshold", "", min=0.0, max=1.0, value=0.85, step=0.01),
class_="mx-auto",
style="max-width: 400px;"
),
class_="d-flex justify-content-center mb-1"
),
ui.div(
ui.input_action_button(
"run_matching",
"Start Mapping",
class_="btn btn-primary px-5 shadow-sm",
style="padding: 0.5rem 2rem;"
),
class_="text-center mt-2"
)
)
),
# Right column - Threshold note
ui.column(3,
ui.div(
ui.div(
ui.tags.i(class_="bi bi-info-circle text-muted me-1"),
ui.tags.small(ui.strong("Note"), class_="text-muted"),
class_="mb-1"
),
ui.tags.small(
ui.div("Items below threshold", class_="text-muted lh-sm"),
ui.div("marked as NO MATCH.", class_="text-muted lh-sm"),
ui.div("Adjust for performance", class_="text-muted lh-sm mt-1"),
ui.div("for your dataset.", class_="text-muted lh-sm")
),
class_="border-start ps-3 py-2"
)
)
),
class_="matching-config-card"
)
)
),
ui.br(),
ui.output_ui("center_progress_overlay")
),
ui.nav_panel(
"Step 2: Results",
ui.navset_pill(
ui.nav_panel(
"View Mappings",
ui.div(
ui.div(id="process_status"),
ui.div(id="process_progress"),
ui.div(id="process_summary"),
ui.div(id="method_chips"),
style="margin-bottom: 12px;"
),
ui.output_ui("results_tabulator"),
# Add tip below the results table with clickable link
ui.div(
ui.tags.small(
ui.tags.i(class_="bi bi-lightbulb me-1"),
"Tip: Return to ",
ui.input_action_link("goto_step1_from_tip", "Step 1: Data & Configure", class_="text-primary"),
" to adjust threshold or column selections, then re-run mapping.",
class_="text-muted"
),
class_="mt-3 mb-3 text-center"
),
ui.div(
ui.download_button("download_all_data", ui.span(ui.tags.i(class_="bi bi-download me-2"), "Export All Data"), class_="btn btn-success"),
ui.download_button("download_matches", ui.span(ui.tags.i(class_="bi bi-file-earmark-check me-2"), "Export Matches"), class_="btn btn-info"),
ui.input_action_button("reset_analysis", ui.span(ui.tags.i(class_="bi bi-arrow-repeat me-2"), "Start New Mapping"), class_="btn btn-primary"),
class_="btn-group-export"
)
),
ui.nav_panel(
"Visualizations",
ui.card(
ui.card_header("Interactive Visualizations"),
ui.card_body(
ui.row(
ui.column(7,
ui.input_select("plotly_viz_type", "Chart Type",
choices={
"density": "Density Plot - Score distribution shape",
"histogram": "Histogram - Frequency of score ranges",
"threshold": "Threshold Analysis - Match rate at different cutoffs"
# HIDDEN VISUALIZATIONS - Uncomment lines below to restore
# "box": "Box Plot - Quartiles & outliers",
# "violin": "Violin Plot - Match vs No-match comparison",
# "scatter": "Scatter Plot - Sequential patterns",
# "ecdf": "Cumulative Distribution - Probability curve",
# "sunburst": "Match Breakdown - Hierarchical match statistics"
},
selected="density",
width="100%"
)
),
ui.column(5,
ui.input_checkbox("show_threshold_line", "Show threshold line", value=True)
)
),
ui.output_ui("chart_description"),
output_widget("plotly_viz")
)
)
),
id="results_subtabs"
)
),
id="workflow_tabs"
),
# JS helpers for Step 1 preview layout and narrow first column
ui.tags.script("""
(function(){
// Initialize Bootstrap tooltips on demand
function initTooltips(){
if (window.bootstrap && bootstrap.Tooltip) {
document.querySelectorAll('[data-bs-toggle="tooltip"]').forEach(function(el){
try { new bootstrap.Tooltip(el, {container:'body'}); } catch(e){}
});
}
}
document.addEventListener('DOMContentLoaded', initTooltips);
document.addEventListener('shiny:value', initTooltips);
// Threshold value badge removed; rely on slider only
function adjustPreview(id){
var root = document.getElementById(id);
if(!root) return;
// Ensure containers don't center and take full width
var containers = root.querySelectorAll('.shiny-data-frame, .html-fill-container, .html-fill-item, .gridjs-container, .gridjs-wrapper');
containers.forEach(function(el){
try {
el.style.marginLeft = '0';
el.style.marginRight = '0';
el.style.width = '100%';
el.style.maxWidth = 'none';
} catch(e){}
});
var table = root.querySelector('.gridjs-table');
if(!table) return;
table.style.width = '100%';
table.style.tableLayout = 'auto';
// Force first column narrow across header and body (Grid.js)
var th0 = table.querySelector('thead.gridjs-thead th.gridjs-th:first-child');
if (th0) {
th0.style.setProperty('width','36px','important');
th0.style.setProperty('min-width','36px','important');
th0.style.setProperty('max-width','40px','important');
th0.style.setProperty('white-space','nowrap','important');
th0.style.setProperty('text-align','center','important');
}
table.querySelectorAll('tbody.gridjs-tbody td.gridjs-td:first-child').forEach(function(td){
td.style.setProperty('width','36px','important');
td.style.setProperty('min-width','36px','important');
td.style.setProperty('max-width','40px','important');
td.style.setProperty('white-space','nowrap','important');
td.style.setProperty('text-align','center','important');
});
}
document.addEventListener('shiny:value', function(ev){
if (ev.detail && (ev.detail.name === 'input_col_preview' || ev.detail.name === 'target_col_preview')){
setTimeout(function(){ adjustPreview(ev.detail.name); }, 0);
}
});
document.addEventListener('DOMContentLoaded', function(){
adjustPreview('input_col_preview');
adjustPreview('target_col_preview');
});
})();
"""),
make_footer()
)
),
ui.nav_panel(
"About",
ui.div(
ui.card(
ui.card_header(
ui.h4("About Food Mapper", class_="mb-0")
),
ui.card_body(
# Hero Section
ui.div(
ui.h2("Food Mapper", class_="text-center mb-3"),
ui.p(
"Advanced semantic matching tool for aligning food descriptions across nutritional databases",
class_="lead text-center text-muted mb-4"
),
ui.hr(class_="my-4")
),
# Overview Section
ui.div(
ui.h5(
ui.tags.i(class_="bi bi-info-circle me-2"),
"Overview",
class_="mb-3"
),
ui.p(
"Food Mapper solves a major problem in nutritional research: accurately matching "
"food items between different databases that use varying naming conventions and descriptions. "
"This tool uses neural language processing to find semantic matches "
"based on meaning rather than exact text matching.",
class_="mb-4"
)
),
# Problem Statement
ui.div(
ui.h5(
ui.tags.i(class_="bi bi-question-circle me-2"),
"The Challenge",
class_="mb-3 mt-4"
),
ui.p(
"Nutritional databases often describe the same foods differently:",
class_="mb-2"
),
ui.tags.ul(
ui.tags.li('"2% milk" vs "Milk, reduced fat, 2% milkfat"'),
ui.tags.li('"OJ" vs "Orange juice, raw"'),
ui.tags.li('"Whole wheat bread" vs "Bread, whole-wheat, commercially prepared"'),
class_="mb-3"
),
ui.p(
"Traditional text matching fails to recognize these as the same items, leading to "
"incomplete or inaccurate nutritional analyses.",
class_="text-muted mb-4"
)
),
# Solution Section
ui.div(
ui.h5(
ui.tags.i(class_="bi bi-lightbulb me-2"),
"Our Solution",
class_="mb-3 mt-4"
),
ui.p(
"Food Mapper uses semantic embeddings to understand the meaning behind food descriptions, "
"enabling accurate matches even when the exact wording differs.",
class_="mb-3"
),
ui.div(
ui.row(
ui.column(6,
ui.div(
ui.tags.i(class_="bi bi-cpu text-primary fs-3 mb-2 d-block"),
ui.h6("AI Model", class_="mb-2"),
ui.p(
ui.HTML('Powered by <a href="https://huggingface.co/thenlper/gte-large" target="_blank" class="text-decoration-none">GTE-Large</a>'),
ui.br(),
ui.tags.small("Neural embedding model", class_="text-muted"),
class_="small"
),
class_="text-center p-3 border rounded mb-3"
)
),
ui.column(6,
ui.div(
ui.tags.i(class_="bi bi-speedometer2 text-success fs-3 mb-2 d-block"),
ui.h6("Performance", class_="mb-2"),
ui.p(
"Process thousands of items/minute",
ui.br(),
ui.tags.small("Batch processing system", class_="text-muted"),
class_="small"
),
class_="text-center p-3 border rounded mb-3"
)
)
),
ui.row(
ui.column(6,
ui.div(
ui.tags.i(class_="bi bi-bullseye text-info fs-3 mb-2 d-block"),
ui.h6("Accuracy", class_="mb-2"),
ui.p(
"Semantic understanding",
ui.br(),
ui.tags.small("Matches based on meaning", class_="text-muted"),
class_="small"
),
class_="text-center p-3 border rounded mb-3"
)
),
ui.column(6,
ui.div(
ui.tags.i(class_="bi bi-sliders text-warning fs-3 mb-2 d-block"),
ui.h6("Control", class_="mb-2"),
ui.p(
"Adjustable thresholds",
ui.br(),
ui.tags.small("Fine-tune match sensitivity", class_="text-muted"),
class_="small"
),
class_="text-center p-3 border rounded mb-3"
)
)
)
)
),
# Key Features
ui.div(
ui.h5(
ui.tags.i(class_="bi bi-star me-2"),
"Key Features",
class_="mb-3 mt-4"
),
ui.tags.ul(
ui.tags.li(
ui.strong("Semantic Matching:"),
" Understands food descriptions using neural embeddings"
),
ui.tags.li(
ui.strong("Batch Processing:"),
" Handle thousands of items efficiently with concurrent processing"
),
ui.tags.li(
ui.strong("Interactive Visualizations:"),
" Explore match distributions and patterns with 8 chart types"
),
ui.tags.li(
ui.strong("Data Export:"),
" Download results as CSV with all original data preserved"
),
ui.tags.li(
ui.strong("Text Cleaning:"),
" Optional preprocessing to potentially improve match quality"
),
ui.tags.li(
ui.strong("Real-time Preview:"),
" See data transformations before processing"
),
class_="mb-4"
)
),
# Use Cases
ui.div(
ui.h5(
ui.tags.i(class_="bi bi-diagram-3 me-2"),
"Use Cases",
class_="mb-3 mt-4"
),
ui.tags.ul(
ui.tags.li("Harmonizing dietary intake data with nutrient databases"),
ui.tags.li("Linking research datasets to food composition tables"),
ui.tags.li("Standardizing food nomenclature across studies"),
ui.tags.li("Quality control for nutritional data entry"),
ui.tags.li("Cross-referencing international food databases"),
class_="mb-4"
)
),
# Credits Section
ui.div(
ui.hr(class_="my-4"),
ui.h5(
ui.tags.i(class_="bi bi-people me-2"),
"Development Team",
class_="mb-3"
),
ui.div(
ui.p(
ui.strong("Principal Investigator:"),
" Dr. Danielle G. Lemay",
ui.br(),
ui.tags.small("Research Molecular Biologist", class_="text-muted"),
class_="mb-2"
),
ui.p(
ui.strong("Developer:"),
" Richard Stoker",
ui.br(),
ui.tags.small("IT Specialist (Scientific)", class_="text-muted"),
class_="mb-2"
),
ui.p(
ui.strong("Organization:"),
ui.br(),
"USDA Agricultural Research Service",
ui.br(),
"Western Human Nutrition Research Center",
ui.br(),
ui.tags.small("Davis, California", class_="text-muted"),
class_="mb-3"
)
)
),
# Version and Contact
ui.div(
ui.hr(class_="my-4"),
ui.row(
ui.column(6,
ui.p(
ui.tags.i(class_="bi bi-tag me-1"),
ui.strong("Version:"),
" 1.0.0",
class_="text-muted small mb-0"
)
),
ui.column(6,
ui.p(
ui.tags.i(class_="bi bi-envelope me-1"),
ui.strong("Contact:"),
" richard.stoker@usda.gov",
ui.br(),
ui.HTML('<a href="https://github.com/RichardStoker-USDA/Food-Mapper" target="_blank" class="text-decoration-none"><i class="bi bi-github me-1"></i>GitHub</a>'),
class_="text-muted small mb-0 text-end"
)
)
)
)
)
),
make_footer(),
class_="container-fluid",
style="max-width: 1200px; margin: 0 auto; padding: 20px;"
)
),
title=ui.tags.div(
ui.tags.h2(
"Food Mapper",
style="margin: 0 1rem 0 0; font-size: 1.9rem; font-weight: 700; letter-spacing: 0.5px; padding-right: 1rem; border-right: 2px solid var(--bs-gray-400, #ced4da);"
),
ui.tags.div(
ui.input_dark_mode(id="dark_mode", mode="light"),
style="margin-left: 1rem;"
),
style="display: flex; align-items: center;"
),
id="main_navbar",
# Remove hardcoded bg color to let theme control it
theme=shinyswatch.theme.yeti()
)
# UI utility functions
def create_score_bar(score: float, width: int = 12, min_scale: float = 0.5) -> str:
"""Create a text-based progress bar for score visualization
For semantic embeddings, scales the bar relative to a minimum threshold
since scores rarely go below 0.5, making differences more visible.
"""
try:
score = 0.0 if score is None or pd.isna(score) else float(score)
except:
score = 0.0
# For semantic embeddings, scale relative to min_scale (default 0.5)
# This makes the bar show relative differences better
if score < min_scale:
# If below min_scale, show as empty or minimal
filled = 0
else:
# Scale from min_scale to 1.0 across the full bar width
scaled_score = (score - min_scale) / (1.0 - min_scale)
scaled_score = max(0.0, min(1.0, scaled_score))
filled = int(round(scaled_score * width))
return "█" * filled + "░" * (width - filled)
def create_status_badge(value: str) -> str:
"""Create HTML status badge based on match status"""
if str(value).upper() == "NO MATCH":
return '<span class="status-badge status-warn">NO MATCH</span>'
else:
return '<span class="status-badge status-ok">Match</span>'
def server(input: Inputs, output: Outputs, session: Session):
# Reactive values for data storage
input_df = reactive.value(pd.DataFrame())
target_df = reactive.value(pd.DataFrame())
results_df = reactive.value(pd.DataFrame())
current_step = reactive.value(1)
progress_message = reactive.value("Starting...")
# State for optional centered overlay (currently returns None by default)
progress_show = reactive.value(False)
progress_percent = reactive.value(0)
# Show welcome splash screen on app load (controlled by environment variable)
@reactive.effect
def show_splash():
# Check environment variable to control splash screen visibility
# HIDE_SPLASH_SCREEN: Set to "true" or "1" to hide the splash screen
# Default behavior is to show the splash screen if variable is not set
hide_splash = os.environ.get("HIDE_SPLASH_SCREEN", "").lower() in ["true", "1", "yes"]
if not hide_splash:
ui.modal_show(
ui.modal(
ui.div(
# Header
ui.h2("Food Mapper", class_="text-center mb-2"),
ui.p(
"Research Tool for Dietary Data Mapping",
class_="text-center text-muted mb-4"
),
ui.hr(),
# Research Paper Section
ui.div(
ui.h5(
ui.tags.i(class_="bi bi-journal-text me-2"),
"Research Publication",
class_="mb-3"
),
ui.div(
ui.p(
"This application was developed as part of ongoing research on automated methods "
"for mapping dietary intake data to food composition databases.",
class_="mb-3"
),
ui.div(
ui.p(
ui.strong("Paper Title:"),
ui.br(),
ui.tags.small(
"[Title Placeholder - To Be Updated]",
class_="text-muted"
),
class_="mb-2"
),
ui.p(
ui.strong("Authors:"),
ui.br(),
ui.tags.small(
"Lemay DG, Strohmeier MP, Stoker RB, Larke JA, Wilson SMG",
class_="text-muted"
),
class_="mb-2"
),
ui.p(
ui.strong("Learn More:"),
ui.br(),
ui.tags.small(
ui.HTML('[<a href="#" target="_blank">Link to paper - Coming Soon</a>]'),
class_="text-muted"
),
class_="mb-2"
),
ui.p(
ui.HTML('<a href="https://github.com/RichardStoker-USDA/Food-Mapper" target="_blank" class="text-decoration-none text-muted"><i class="bi bi-github"></i> View on GitHub</a>'),
class_="mb-3 small"
),
class_="ms-3 border-start ps-3"
)
),
class_="mb-4"
),
# How It Works Section
ui.div(
ui.h5(
ui.tags.i(class_="bi bi-info-circle me-2"),
"How It Works",
class_="mb-3"
),
ui.p(
"Food Mapper uses the GTE-Large neural embedding model to understand the meaning "
"behind food descriptions. This enables accurate matching even when foods are described "
"differently across databases.",
class_="small mb-2"
),
ui.p(
"Traditional manual mapping takes ~28 minutes per food item. "
"This tool automates the process, handling thousands of items in minutes.",
class_="small text-muted"
),
class_="mb-4"
),
# Disclaimer
ui.div(
ui.hr(),
ui.p(
ui.tags.i(class_="bi bi-exclamation-triangle me-1"),
ui.strong("Research Tool Disclaimer"),
class_="text-center mb-2"
),
ui.p(
"This application is a research tool intended for scientific use in nutritional and dietary studies. "
"Results should be validated by domain experts. For research purposes only.",
class_="small text-muted text-center"
),
class_="mt-3"
),
# Get Started Button
ui.div(
ui.input_action_button(
"close_splash",
"Get Started",
class_="btn btn-primary btn-lg"
),
class_="text-center mt-4"
),
class_="p-4"
),
title="",
footer=None,
size="m",
easy_close=True,
fade=True
)
)
# Close splash screen handler
@reactive.effect
@reactive.event(input.close_splash)
def close_splash():
ui.modal_remove()
# Search input debouncing implementation
@reactive.calc
def debounced_search():
try:
return input.search_debounced()
except Exception:
return input.search_filter()
# Threshold badge removed; using slider only
# Dynamic sidebar navigation button
@render.ui
def sidebar_navigation_button():
# Get current active tab - workflow_tabs tracks the main steps
try:
current_tab = input.workflow_tabs()
except:
current_tab = "Tutorial" # Default to Step 0
if current_tab == "Tutorial":
# Check if both files are loaded
in_df = input_df.get()
tgt_df = target_df.get()
files_loaded = (not in_df.empty) and (not tgt_df.empty)
if files_loaded:
return ui.input_action_button(
"sidebar_next",
ui.span(ui.tags.i(class_="bi bi-arrow-right-circle-fill me-2"), "Next: Configure Data"),
class_="btn btn-primary w-100"
)
else:
return ui.input_action_button(
"sidebar_next",
ui.span(ui.tags.i(class_="bi bi-arrow-right-circle-fill me-2"), "Next: Configure Data"),
class_="btn btn-primary w-100",
disabled=True
)
elif current_tab == "Step 1: Data & Configure":
# Check if results are available
if not results_df.get().empty:
return ui.input_action_button(
"sidebar_next",
ui.span(ui.tags.i(class_="bi bi-arrow-right-circle-fill me-2"), "Next: View Results"),
class_="btn btn-primary w-100"
)
else:
return ui.input_action_button(
"sidebar_next",
ui.span(ui.tags.i(class_="bi bi-arrow-right-circle-fill me-2"), "Next: View Results"),
class_="btn btn-primary w-100 disabled",
disabled=True
)
elif current_tab == "Step 2: Results":
return ui.input_action_button(
"sidebar_reset",
ui.span(ui.tags.i(class_="bi bi-arrow-counterclockwise me-2"), "Start New Mapping"),
class_="btn btn-primary w-100"
)
else:
return None
# Sidebar navigation button handler
@reactive.effect
@reactive.event(input.sidebar_next)
def handle_sidebar_next():
try:
current_tab = input.workflow_tabs()
except:
current_tab = "Tutorial"
if current_tab == "Tutorial":
# Only navigate if files are loaded
in_df = input_df.get()
tgt_df = target_df.get()
if (not in_df.empty) and (not tgt_df.empty):
ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
elif current_tab == "Step 1: Data & Configure":
ui.update_navs("workflow_tabs", selected="Step 2: Results")
# Sidebar reset button handler
@reactive.effect
@reactive.event(input.sidebar_reset)
def handle_sidebar_reset():
# Clear everything for a fresh start
reset_for_new_analysis()
# Navigate to Step 1 (not Step 0)
ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
# Expand the sidebar so user can upload new files
ui.update_sidebar("main_sidebar", show=True)
# Show notification
ui.notification_show(
"Ready for new mapping. Upload your data files.",
type="info",
duration=3
)
# Load sample data from sidebar button (mobile-friendly)
@reactive.effect
@reactive.event(input.sidebar_sample_data)
def load_sample_from_sidebar():
# Load the sample data
sample_input, sample_target = get_sample_data()
input_df.set(sample_input)
target_df.set(sample_target)
# Update column choices
ui.update_select("input_column",
choices=sample_input.columns.tolist(),
selected="description")
ui.update_select("target_column",
choices=sample_target.columns.tolist(),
selected="reference")
# Enable buttons since data is loaded
check_files_loaded()
# Close the sidebar (important for mobile)
ui.update_sidebar("main_sidebar", show=False)
# Navigate to Step 1
ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
# Show success notification
ui.notification_show(
"Sample data loaded! Configure your matching settings.",
type="success",
duration=3
)
# Load sample data from tutorial page
@reactive.effect
@reactive.event(input.load_sample)
def load_sample_data():
# Remove previous sample status message if present
try:
ui.remove_ui(selector="#sample_status_msg")
except Exception:
pass
sample_input, sample_target = get_sample_data()
input_df.set(sample_input)
target_df.set(sample_target)
# Update column choices
ui.update_select("input_column",
choices=sample_input.columns.tolist(),
selected="description")
ui.update_select("target_column",
choices=sample_target.columns.tolist(),
selected="reference")
# Show status
ui.insert_ui(
ui.div(
ui.p("Sample data loaded successfully! Moving to Data Setup...",
class_="alert alert-success alert-animated"),
id="sample_status_msg"
),
selector="#sample_status",
where="afterEnd"
)
# Enable buttons since data is loaded
check_files_loaded()
# Automatically navigate to Step 1 after loading sample data
ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
# Navigate to Step 1 from tip link
@reactive.effect
@reactive.event(input.goto_step1_from_tip)
def goto_step1_from_tip():
ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
# Show Upload Help modal when question icon clicked in sidebar
@reactive.effect
@reactive.event(input.show_upload_help)
def show_upload_help_modal():
ui.modal_show(
ui.modal(
ui.div(
ui.h4("Upload Requirements", class_="mb-3"),
ui.hr(),
ui.h6("File Format"),
ui.tags.ul(
ui.tags.li("CSV format (.csv) required"),
ui.tags.li("Include headers in first row"),
ui.tags.li("UTF-8 encoding recommended")
),
ui.h6("Input File", class_="mt-3"),
ui.p("Items you want to match (one per row)", class_="text-muted"),
ui.h6("Target File", class_="mt-3"),
ui.p("Reference database to match against", class_="text-muted"),
ui.hr(),
ui.p(
ui.tags.small(
"Need help? ",
ui.input_action_link("close_help_goto_tutorial", "View tutorial", class_="text-primary"),
class_="text-muted"
)
),
class_="p-2"
),
footer=ui.input_action_button("close_upload_help", "Got it", class_="btn btn-primary"),
easy_close=True,
size="m",
title=""
)
)
# Close upload help modal
@reactive.effect
@reactive.event(input.close_upload_help)
def close_upload_help():
ui.modal_remove()
# Close help and go to tutorial
@reactive.effect
@reactive.event(input.close_help_goto_tutorial)
def close_help_goto_tutorial():
ui.modal_remove()
ui.update_sidebar("main_sidebar", show=False)
ui.update_navs("workflow_tabs", selected="Tutorial")
# Show Data Requirements modal when info icon clicked
@reactive.effect
@reactive.event(input.show_requirements)
def show_data_requirements_modal():
ui.modal_show(
ui.modal(
ui.div(
ui.h4("Data Requirements", class_="mb-3"),
ui.hr(),
ui.h6("File Format"),
ui.tags.ul(
ui.tags.li("Files must be in CSV format (.csv)"),
ui.tags.li("Must include column headers in first row"),
ui.tags.li("UTF-8 encoding recommended")
),
ui.h6("Input File", class_="mt-3"),
ui.p("Contains the items you want to match. Each row represents one item to find a match for.", class_="text-muted"),
ui.h6("Target File", class_="mt-3"),
ui.p("Contains the reference dataset. The system will find the best match from this list for each input item.", class_="text-muted"),
ui.h6("Best Practices", class_="mt-3"),
ui.tags.ul(
ui.tags.li("Choose columns with descriptive text for best semantic matching"),
ui.tags.li("Remove or clean special characters if needed"),
ui.tags.li("Longer descriptions generally produce better matches")
),
class_="p-2"
),
footer=ui.input_action_button("close_req", "Got it", class_="btn btn-primary"),
easy_close=True,
size="m",
title=""
)
)
# Close Data Requirements modal
@reactive.effect
@reactive.event(input.close_req)
def close_data_requirements_modal():
ui.modal_remove()
# Handle file uploads
@reactive.effect
@reactive.event(input.input_file)
def handle_input_file():
file: list[FileInfo] | None = input.input_file()
if file and len(file) > 0:
df = pd.read_csv(file[0]["datapath"])
input_df.set(df)
# Update column choices
ui.update_select("input_column",
choices=df.columns.tolist(),
selected=df.columns[0])
# Remove previous message then insert a fresh, readable status with filename
try:
ui.remove_ui(selector="#input_file_status")
except Exception:
pass
ui.insert_ui(
ui.div(
ui.p(f"Loaded {len(df):,} inputs",
class_="alert alert-success alert-animated"),
id="input_file_status"
),
selector="#input_status",
where="afterEnd"
)
# Check if both files loaded
check_files_loaded()
# NOTE: Clear files handler removed as button was removed from UI
@reactive.effect
@reactive.event(input.target_file)
def handle_target_file():
file: list[FileInfo] | None = input.target_file()
if file and len(file) > 0:
df = pd.read_csv(file[0]["datapath"])
target_df.set(df)
# Update column choices
ui.update_select("target_column",
choices=df.columns.tolist(),
selected=df.columns[0])
# Remove previous message then insert a fresh, readable status with filename
try:
ui.remove_ui(selector="#target_file_status")
except Exception:
pass
ui.insert_ui(
ui.div(
ui.p(f"Loaded {len(df):,} targets",
class_="alert alert-success alert-animated"),
id="target_file_status"
),
selector="#target_status",
where="afterEnd"
)
# Check if both files loaded
check_files_loaded()
# Preview tables
@render.table
def input_preview():
df = input_df.get()
if not df.empty:
return df.head(5)
return pd.DataFrame()
@render.table
def target_preview():
df = target_df.get()
if not df.empty:
return df.head(5)
return pd.DataFrame()
@render.data_frame
def input_col_preview():
df = input_df.get()
col = input.input_column()
if not df.empty and col and col in df.columns:
sample_values = df[col].dropna().head(5).tolist()
# Apply text cleaning if toggle is on
if input.clean_input():
original_values = sample_values.copy()
cleaned_values = clean_text_simple(sample_values)
preview_df = pd.DataFrame({
"Row": range(1, len(sample_values) + 1),
"Original": original_values,
"After Cleaning": cleaned_values
})
else:
preview_df = pd.DataFrame({
"Row": range(1, len(sample_values) + 1),
"Sample Values": sample_values
})
# Use Shiny DataGrid (theme-aware, interactive)
return render.DataGrid(preview_df)
return pd.DataFrame()
@render.data_frame
def target_col_preview():
df = target_df.get()
col = input.target_column()
if not df.empty and col and col in df.columns:
sample_values = df[col].dropna().head(5).tolist()
# Apply text cleaning if toggle is on
if input.clean_target():
original_values = sample_values.copy()
cleaned_values = clean_text_simple(sample_values)
preview_df = pd.DataFrame({
"Row": range(1, len(sample_values) + 1),
"Original": original_values,
"After Cleaning": cleaned_values
})
else:
preview_df = pd.DataFrame({
"Row": range(1, len(sample_values) + 1),
"Sample Values": sample_values
})
# Use Shiny DataGrid (theme-aware, interactive)
return render.DataGrid(preview_df)
return pd.DataFrame()
# Helper function to check readiness for running mapping
def check_files_loaded():
ready = (not input_df.get().empty) and (not target_df.get().empty)
try:
in_col = input.input_column()
tgt_col = input.target_column()
ready = ready and bool(in_col) and bool(tgt_col)
except Exception:
pass
ui.update_action_button("run_matching", disabled=(not ready))
# Watch column selection to enable/disable run button
@reactive.effect
def _watch_columns_for_run():
try:
_ = (input.input_column(), input.target_column())
except Exception:
pass
check_files_loaded()
# Navigation handlers
# Navigation effects removed in sidebar layout
# Reset analysis handler from Results page button
@reactive.effect
@reactive.event(input.reset_analysis)
def handle_reset_analysis():
# Clear everything for a fresh start
reset_for_new_analysis()
# Navigate to Step 1 (not Step 0)
ui.update_navs("workflow_tabs", selected="Step 1: Data & Configure")
# Expand the sidebar so user can upload new files
ui.update_sidebar("main_sidebar", show=True)
# Show notification
ui.notification_show(
"Ready for new mapping. Upload your data files.",
type="info",
duration=3
)
def reset_for_new_analysis():
# Clear results
results_df.set(pd.DataFrame())
# Clear input and target datasets and selections
input_df.set(pd.DataFrame())
target_df.set(pd.DataFrame())
ui.update_select("input_column", choices=[], selected=None)
ui.update_select("target_column", choices=[], selected=None)
# Disable run button until files and columns are (re)selected
ui.update_action_button("run_matching", disabled=True)
# Clear file upload status messages
for sel in ["#sample_status_msg", "#process_status > *", "#process_progress > *", "#process_summary > *", "#results_summary_msg", "#input_file_status", "#target_file_status"]:
try:
ui.remove_ui(selector=sel)
except Exception:
pass
# Optional centered overlay (placeholder: disabled by default)
@render.ui
def center_progress_overlay():
# Return None so nothing is rendered; keep hook for future use
return None
# Sidebar summary block: only show after results exist
@render.ui
def sidebar_results_summary_block():
df = results_df.get()
if df.empty:
return None
# derive summary
total_inputs = len(df)
no_matches = (df.get('status', '').astype(str).str.upper() == 'NO MATCH').sum()
successful = total_inputs - no_matches
avg_score = df[df.get('status', '').astype(str).str.upper() != 'NO MATCH']['similarity_score'].mean() if 'similarity_score' in df.columns else None
avg_score_str = f"{avg_score:.3f}" if avg_score is not None and not pd.isna(avg_score) else "N/A"
return ui.div(
ui.h5("Results Summary"),
ui.p(f"Total Inputs: {total_inputs}"),
ui.p(f"Successful Matches: {successful}"),
ui.p(f"No Matches: {no_matches}"),
ui.p(f"Average Score: {avg_score_str}"),
class_="alert alert-info alert-animated"
)
# Run matching process
@reactive.effect
@reactive.event(input.run_matching)
async def run_matching():
# Disable the button immediately to prevent double-clicks
ui.update_action_button("run_matching", disabled=True)
# Stay on current tab; navigate to Results after processing
# Clear existing spinner before processing
try:
ui.remove_ui(selector="#processing_spinner")
except Exception:
pass
# Get data
in_df = input_df.get()
tgt_df = target_df.get()
if in_df.empty or tgt_df.empty:
ui.notification_show("Please upload both input and target files", type="warning")
ui.update_action_button("run_matching", disabled=False) # Re-enable on early return
return
# Get settings
in_col = input.input_column()
tgt_col = input.target_column()
threshold = input.threshold()
clean_input_text = input.clean_input()
clean_target_text = input.clean_target()
if not in_col or not tgt_col:
ui.modal_show(
ui.modal(
ui.h5("Select Columns First"),
ui.p("Please choose the input and target columns to match in the 'Data & Configure' tab before running."),
easy_close=True,
footer=ui.input_action_button("dismiss_modal", "OK", class_="btn btn-primary")
)
)
ui.update_action_button("run_matching", disabled=False) # Re-enable on early return
return
# Get API key for semantic embeddings
api_key = get_api_key()
if not api_key:
ui.notification_show("DeepInfra API key not found. Please set DEEPINFRA_API_KEY", type="error")
ui.update_action_button("run_matching", disabled=False) # Re-enable on early return
return
# Reset circuit for this run
global _API_FAILURES, FALLBACK_ACTIVE
_API_FAILURES = 0
FALLBACK_ACTIVE = False
# Clear previous results
ui.remove_ui(selector="#process_status > *")
ui.remove_ui(selector="#process_progress > *")
ui.remove_ui(selector="#process_summary > *")
ui.remove_ui(selector="#method_chips > *")
# Add busy indicator at top of page
ui.busy_indicators.use()
# Use Shiny's built-in Progress with better formatting
with ui.Progress(min=0, max=100) as p:
p.set(5, message="Initializing", detail="Preparing data...")
try:
# Show loading spinner during processing
try:
ui.insert_ui(
ui.div(ui.div(class_="spinner"), class_="spinner-container", id="processing_spinner"),
selector="#process_progress", where="afterBegin"
)
except Exception:
pass
# Prepare data
input_list = in_df[in_col].dropna().tolist()
target_list = tgt_df[tgt_col].dropna().tolist()
# Remove duplicates from target
target_list_unique = list(dict.fromkeys(target_list))
# Apply cleaning to display text if toggles are on
# Store both original and cleaned versions
input_list_display = clean_text_simple(input_list) if clean_input_text else input_list
p.set(10, message="Data Prepared",
detail=f"{len(input_list):,} inputs • {len(target_list_unique):,} targets")
await asyncio.sleep(0.1)
# Initialize results with potentially cleaned input text for display
results = pd.DataFrame({
'input_description': input_list_display
})
# Calculate progress steps (semantic only)
total_methods = 1
progress_per_method = 80 / total_methods
current_progress = 10
# Run semantic embeddings only
effective_batch = min(EMBED_BATCH_SIZE, 1024)
# Check if we'll be using CPU and notify IMMEDIATELY
if MODEL_FALLBACK_MODE == "local":
ui.notification_show(
"Using LOCAL CPU for embeddings. This will take significantly longer than API/GPU processing. "
f"Processing {len(input_list)} items may take several minutes.",
type="warning",
duration=10,
)
FALLBACK_ACTIVE = True
elif FALLBACK_ACTIVE: # Already in fallback from previous failures
ui.notification_show(
"API unavailable. Using LOCAL CPU fallback - processing will be much slower.",
type="warning",
duration=8,
)
# Update progress message if in fallback mode
progress_msg = "Computing semantic embeddings..."
if FALLBACK_ACTIVE or MODEL_FALLBACK_MODE == "local":
progress_msg = "Computing semantic embeddings (CPU - this will take time)..."
p.set(current_progress + 5,
message=progress_msg,
detail=f"Processing {len(input_list):,} items")
await asyncio.sleep(0.1)
# Simple progress callback for embedding batches
batches_total = ((len(input_list) + effective_batch - 1) // effective_batch) + \
((len(target_list_unique) + effective_batch - 1) // effective_batch)
batch_num = [0]
def progress_callback(msg: str):
# Only increment when a batch completes (Embedding batches or Matching)
if msg.startswith("Embedding batches:") or msg.startswith("Matching:"):
batch_num[0] = min(batch_num[0] + 1, batches_total)
progress_pct = current_progress + (batch_num[0] / batches_total) * progress_per_method
p.set(
int(progress_pct),
message="Computing Embeddings",
detail=f"Batch {batch_num[0]:,} of {batches_total:,}"
)
if USE_ASYNC:
embed_results = await run_embed_match_async(
input_list,
target_list_unique,
api_key,
progress_callback=progress_callback,
clean_input=clean_input_text,
clean_target=clean_target_text,
)
else:
embed_results = run_embed_match(
input_list,
target_list_unique,
api_key,
progress_callback=progress_callback,
clean_input=clean_input_text,
clean_target=clean_target_text,
)
# Apply cleaning to matched target text if toggle is on
matched_targets = embed_results['match']
if clean_target_text:
# Clean the matched target text for display
matched_targets = clean_text_simple(matched_targets)
results['best_match'] = matched_targets
results['similarity_score'] = embed_results['score']
# Keep the best match text without decoration for clean exports
current_progress += progress_per_method
p.set(int(current_progress), message="Embeddings Complete", detail="Processing results...")
await asyncio.sleep(0.1)
# Round scores for display
for col in results.columns:
if 'score' in col:
results[col] = results[col].round(4)
# Generate score visualization and status indicators
for col in results.columns:
if 'score' in col.lower() or 'similarity' in col.lower():
bar_col = f"{col}_bar"
results[bar_col] = results[col].apply(create_score_bar)
# Add match status column based on score threshold
if 'best_match' in results.columns:
results.insert(0, 'status', results['similarity_score'].apply(
lambda s: 'NO MATCH' if (pd.notna(s) and float(s) < float(threshold)) else 'Match'
))
p.set(95, message="Finalizing", detail="Preparing visualizations...")
# Store results
results_df.set(results)
# Navigate to results tab automatically
ui.update_navs("workflow_tabs", selected="Step 2: Results")
# Generate summary statistics
total_inputs = len(results)
if 'status' in results.columns:
no_matches = (results['status'] == 'NO MATCH').sum()
successful_matches = total_inputs - no_matches
avg_score = results[results['status'] != 'NO MATCH']['similarity_score'].mean()
avg_score_str = f"{avg_score:.3f}" if not pd.isna(avg_score) else "N/A"
else:
no_matches = 0
successful_matches = total_inputs
avg_score_str = "N/A"
p.set(100, message="Complete", detail="Ready to view results")
await asyncio.sleep(0.5)
# Hide loading spinner
try:
ui.remove_ui(selector="#processing_spinner")
except:
pass
# Show results summary using notifications
ui.notification_show(
f"Processing complete! Generated {len(results)} results.",
type="success",
duration=5
)
# Insert summary into the page
# Replace previous summary (if any), then insert a fresh one with animation
try:
ui.remove_ui(selector="#results_summary_msg")
except Exception:
pass
ui.insert_ui(
ui.div(
ui.p(f"Total Inputs: {total_inputs}"),
ui.p(f"Successful Matches: {successful_matches}"),
ui.p(f"No Matches: {no_matches}"),
ui.p(f"Average Score: {avg_score_str}"),
class_="alert alert-info alert-animated",
id="results_summary_msg"
),
selector="#sidebar_results_summary",
where="afterBegin"
)
# Show processing method indicator
# Update method chip to reflect CPU fallback if active
method_label = "Semantic Embedding (thenlper/gte-large)"
if FALLBACK_ACTIVE:
method_label += " — CPU fallback"
ui.insert_ui(
ui.div(
ui.span(method_label, class_="method-chip"),
class_="text-center mt-3"
),
selector="#method_chips",
where="afterBegin"
)
# After processing completes, navigate to Results tab
ui.update_navs("workflow_tabs", selected="Step 2: Results")
# Re-enable the button after successful completion
ui.update_action_button("run_matching", disabled=False)
except Exception as e:
# Hide loading spinner on error
try:
ui.remove_ui(selector="#processing_spinner")
except:
pass
ui.notification_show(f"Error: {str(e)}", type="error", duration=10)
# Re-enable the button on error
ui.update_action_button("run_matching", disabled=False)
raise
# Filter and display results
@render.table
def results_table():
df = results_df.get()
try:
if input.use_grid():
return pd.DataFrame()
except Exception:
pass
if df.empty:
return pd.DataFrame()
# Apply filters
filtered_df = df.copy()
# Apply search debouncing
search_term = debounced_search()
if search_term and search_term.strip():
mask = filtered_df.apply(
lambda row: row.astype(str).str.contains(search_term, case=False, na=False).any(),
axis=1
)
filtered_df = filtered_df[mask]
# NO MATCH filter
if input.show_no_match():
if 'status' in filtered_df.columns:
filtered_df = filtered_df[filtered_df['status'] == 'NO MATCH']
# Sort by score
if input.sort_by_score():
score_cols = [col for col in filtered_df.columns if 'score' in col.lower()]
if score_cols:
filtered_df = filtered_df.sort_values(score_cols[0], ascending=False)
return filtered_df
# Export All Data - includes original columns from input and target CSVs
@render.download(filename=lambda: f"all_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
def download_all_data():
df = results_df.get()
in_df = input_df.get()
tgt_df = target_df.get()
if df.empty:
return io.BytesIO(b"No results to export")
# Start with the original input dataframe
export_df = in_df.copy()
# Check if text cleaning was applied to input column
# If so, replace the input column with the cleaned version from results
in_col = input.input_column()
if input.clean_input() and in_col and 'input_description' in df.columns:
# Replace the original column with the cleaned version
export_df[in_col] = df['input_description'].values[:len(export_df)]
# Add the status and similarity score columns from results
if 'status' in df.columns:
export_df['status'] = df['status'].values[:len(export_df)]
if 'similarity_score' in df.columns:
export_df['similarity_score'] = df['similarity_score'].values[:len(export_df)]
# Add the matched target data (already cleaned if toggle was on)
if 'best_match' in df.columns:
export_df['matched_target'] = df['best_match'].values[:len(export_df)]
# Try to merge with target dataframe to get all target columns
# This assumes the target column selected contains unique identifiers
tgt_col = input.target_column()
if tgt_col and not tgt_df.empty:
# Create a mapping from target description to full target row
tgt_df_unique = tgt_df.drop_duplicates(subset=[tgt_col])
# Merge based on the matched values
merged = export_df.merge(
tgt_df_unique,
left_on='matched_target',
right_on=tgt_col,
how='left',
suffixes=('', '_target')
)
export_df = merged
# Remove UI-only columns like score bars
bar_cols = [c for c in export_df.columns if c.endswith('_bar')]
export_df = export_df.drop(columns=bar_cols, errors='ignore')
return io.BytesIO(export_df.to_csv(index=False).encode())
# Export Matches - current functionality (results with mappings)
@render.download(filename=lambda: f"matches_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
def download_matches():
df = results_df.get()
if not df.empty:
# Remove UI-only columns like score bars
export_df = df.copy()
bar_cols = [c for c in export_df.columns if c.endswith('_bar')]
export_df = export_df.drop(columns=bar_cols, errors='ignore')
return io.BytesIO(export_df.to_csv(index=False).encode())
return io.BytesIO(b"No results to download")
# Build interactive grid (Tabulator)
@render.ui
def results_tabulator():
df = results_df.get()
if df.empty:
return None
# Convert DataFrame to records for Tabulator
records = df.to_dict(orient='records')
cols = []
for c in df.columns:
col = {"title": c, "field": c}
lc = c.lower()
if ("score" in lc) or ("similarity" in lc):
col["hozAlign"] = "right"
col["sorter"] = "number"
# Disable filter/sort for bar columns
if lc.endswith("_bar"):
col["headerFilter"] = False
col["headerSort"] = False
else:
col["headerFilter"] = "input"
cols.append(col)
data_json = json.dumps(records)
cols_json = json.dumps(cols)
html = (
"<div style=\"height:600px\" id=\"tabulator_results\"></div>\n"
"<script>\n"
"(function(){\n"
" function ensureTabulatorAssets(cb){\n"
" var cssId='tabulator-css';\n"
" if(!document.getElementById(cssId)){\n"
" var l=document.createElement('link'); l.id=cssId; l.rel='stylesheet'; l.href='https://unpkg.com/tabulator-tables@5.5.2/dist/css/tabulator.min.css'; document.head.appendChild(l);\n"
" }\n"
" if(window.Tabulator){ cb(); return; }\n"
" var s=document.createElement('script'); s.src='https://unpkg.com/tabulator-tables@5.5.2/dist/js/tabulator.min.js'; s.onload=cb; document.body.appendChild(s);\n"
" }\n"
" function init(){\n"
" var el = document.getElementById('tabulator_results');\n"
" if(!el) return;\n"
" if (el._tabulator) { el._tabulator.destroy(); }\n"
" var table = new Tabulator(el, {\n"
" data: " + data_json + ",\n"
" reactiveData: false,\n"
" layout: 'fitDataStretch',\n"
" height: '600px',\n"
" movableColumns: true,\n"
" resizableColumns: true,\n"
" columnDefaults: {headerSort: true, editable: false},\n"
" columns: " + cols_json + ",\n"
" initialSort: [{column: 'similarity_score', dir: 'desc'}],\n"
" rowFormatter: function(row){\n"
" var data = row.getData();\n"
" if(data.status === 'NO MATCH'){\n"
" row.getElement().style.backgroundColor = 'rgba(220, 53, 69, 0.08)';\n"
" }\n"
" }\n"
" });\n"
" el._tabulator = table;\n"
" }\n"
" ensureTabulatorAssets(init);\n"
"})();\n"
"</script>\n"
)
return ui.HTML(html)
# Chart description helper
@render.ui
def chart_description():
viz_type = input.plotly_viz_type()
descriptions = {
"density": "**Density Plot**: Shows probability density of scores. Higher peaks = more concentrated scores. Rug plot below shows individual points.",
"histogram": "**Histogram**: Frequency of scores in bins. Height = count in each range. Shows mean and median.",
"threshold": "**Threshold Analysis**: Shows how match percentage changes at different threshold values. Helps find optimal cutoff point."
# HIDDEN DESCRIPTIONS - Keep in sync with hidden visualizations above
# "box": "**Box Plot**: Shows quartiles (Q1, median, Q3) and outliers. Box = middle 50% of data.",
# "violin": "**Violin Plot**: Distribution shape split by match/no-match status. Width = data density.",
# "scatter": "**Scatter Plot**: Each point is one item (index vs score). Blue = match, red = no match. Shows sequential patterns.",
# "ecdf": "**Cumulative Distribution**: Shows % of data at or below each score. Answers 'what % scores below X?'",
# "sunburst": "**Match Breakdown**: Hierarchical view of match statistics. Inner ring shows overall split, outer rings show score ranges."
}
desc = descriptions.get(viz_type, "")
if desc:
return ui.div(
ui.markdown(desc),
class_="border rounded",
style="padding: 0.75rem 1rem; margin-bottom: 1rem; font-size: 0.9rem; background-color: var(--bs-tertiary-bg, transparent); color: inherit;"
)
return None
# Interactive Plotly visualizations
@render_widget
def plotly_viz():
# Access reactive values to establish dependencies
df = results_df.get()
if df.empty:
# Return empty figure when no data
import plotly.graph_objects as go
fig = go.Figure()
fig.add_annotation(
text="Run mapping to view interactive charts",
xref="paper", yref="paper",
x=0.5, y=0.5,
showarrow=False,
font=dict(size=14, color="#666")
)
fig.update_layout(
xaxis=dict(visible=False),
yaxis=dict(visible=False),
height=400
)
return fig
# Check for similarity score column
if "similarity_score" not in df.columns:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_annotation(
text="No similarity scores available",
xref="paper", yref="paper",
x=0.5, y=0.5,
showarrow=False,
font=dict(size=14, color="#666")
)
fig.update_layout(
xaxis=dict(visible=False),
yaxis=dict(visible=False),
height=400
)
return fig
# Get input values - these trigger reactive updates
viz_type = input.plotly_viz_type()
show_threshold = input.show_threshold_line()
threshold = input.threshold()
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
# Keep full dataframe for scatter plot, filter for other plots
df_clean = df.dropna(subset=['similarity_score']).copy()
scores = df_clean["similarity_score"]
# Create figure based on visualization type
if viz_type == "density":
# Kernel Density Estimation
from scipy import stats
density = stats.gaussian_kde(scores)
x_range = np.linspace(0, 1, 200)
y_density = density(x_range)
fig = go.Figure()
# Add density trace
fig.add_trace(go.Scatter(
x=x_range,
y=y_density,
mode='lines',
fill='tozeroy',
name='Density',
line=dict(color='#4e79a7', width=2),
fillcolor='rgba(78, 121, 167, 0.3)',
hovertemplate='Score: %{x:.3f}<br>Density: %{y:.3f}<extra></extra>'
))
# Add rug plot for actual data points
fig.add_trace(go.Scatter(
x=scores,
y=[-0.01 * max(y_density)] * len(scores),
mode='markers',
name='Data points',
marker=dict(color='#4e79a7', size=2, symbol='line-ns', line=dict(width=1, color='#4e79a7')),
hovertemplate='Score: %{x:.3f}<extra></extra>'
))
fig.update_layout(
title="Similarity Score Density Distribution",
xaxis_title="Similarity Score",
yaxis_title="Density",
showlegend=True
)
elif viz_type == "histogram":
fig = go.Figure()
fig.add_trace(go.Histogram(
x=scores,
nbinsx=30,
name='Scores',
marker_color='#4e79a7',
opacity=0.8,
hovertemplate='Score range: %{x}<br>Count: %{y}<extra></extra>'
))
# Add statistics annotation
mean_score = scores.mean()
median_score = scores.median()
fig.add_annotation(
text=f"Mean: {mean_score:.3f}<br>Median: {median_score:.3f}",
xref="paper", yref="paper",
x=0.98, y=0.98,
showarrow=False,
bgcolor="white",
bordercolor="#4e79a7",
borderwidth=1
)
fig.update_layout(
title="Interactive Histogram of Similarity Scores",
xaxis_title="Similarity Score",
yaxis_title="Count",
bargap=0.05
)
elif viz_type == "box":
# Box plot with individual points
fig = go.Figure()
fig.add_trace(go.Box(
y=scores,
name='Scores',
marker_color='#4e79a7',
boxpoints='outliers',
jitter=0.3,
pointpos=-1.8,
hovertemplate='Score: %{y:.3f}<extra></extra>'
))
# Add violin for comparison
fig.add_trace(go.Violin(
y=scores,
name='Distribution',
side='positive',
opacity=0.3,
marker_color='#e15759',
hovertemplate='Score: %{y:.3f}<extra></extra>'
))
fig.update_layout(
title="Box Plot with Distribution Shape",
yaxis_title="Similarity Score",
showlegend=True
)
elif viz_type == "violin":
fig = go.Figure()
# Split by match status using cleaned dataframe
matched_scores = df_clean[df_clean['status'] != 'NO MATCH']['similarity_score']
no_match_scores = df_clean[df_clean['status'] == 'NO MATCH']['similarity_score']
if len(matched_scores) > 0:
fig.add_trace(go.Violin(
y=matched_scores,
name='Matched',
side='negative',
marker_color='#4e79a7',
line_color='#4e79a7',
meanline_visible=True,
hovertemplate='Matched<br>Score: %{y:.3f}<extra></extra>'
))
if len(no_match_scores) > 0:
fig.add_trace(go.Violin(
y=no_match_scores,
name='No Match',
side='positive',
marker_color='#e15759',
line_color='#e15759',
meanline_visible=True,
hovertemplate='No Match<br>Score: %{y:.3f}<extra></extra>'
))
fig.update_layout(
title="Violin Plot: Score Distribution by Match Status",
yaxis_title="Similarity Score",
violingap=0.3,
violinmode='overlay'
)
elif viz_type == "scatter":
# Scatter plot with color by match status
colors = ['#4e79a7' if status != 'NO MATCH' else '#e15759'
for status in df_clean['status']]
fig = go.Figure()
fig.add_trace(go.Scatter(
x=list(range(len(df_clean))),
y=df_clean['similarity_score'],
mode='markers',
marker=dict(
color=colors,
size=8,
opacity=0.6,
line=dict(width=1, color='white')
),
text=df_clean['input_description'],
hovertemplate='Index: %{x}<br>Score: %{y:.3f}<br>Input: %{text}<extra></extra>'
))
fig.update_layout(
title="Similarity Scores by Index",
xaxis_title="Item Index",
yaxis_title="Similarity Score"
)
elif viz_type == "ecdf":
# Empirical Cumulative Distribution Function
sorted_scores = np.sort(scores)
ecdf = np.arange(1, len(sorted_scores) + 1) / len(sorted_scores)
fig = go.Figure()
fig.add_trace(go.Scatter(
x=sorted_scores,
y=ecdf,
mode='lines',
name='ECDF',
line=dict(color='#4e79a7', width=2),
hovertemplate='Score: %{x:.3f}<br>Cumulative %: %{y:.1%}<extra></extra>'
))
# Add markers at quartiles
q25, q50, q75 = np.percentile(scores, [25, 50, 75])
fig.add_trace(go.Scatter(
x=[q25, q50, q75],
y=[0.25, 0.50, 0.75],
mode='markers+text',
name='Quartiles',
marker=dict(color='#e15759', size=10),
text=['Q1', 'Median', 'Q3'],
textposition='top center',
hovertemplate='%{text}<br>Score: %{x:.3f}<extra></extra>'
))
fig.update_layout(
title="Empirical Cumulative Distribution",
xaxis_title="Similarity Score",
yaxis_title="Cumulative Probability",
yaxis=dict(tickformat='.0%')
)
elif viz_type == "threshold":
# Threshold Analysis - shows match rate at different thresholds
thresholds = np.linspace(0, 1, 101)
match_rates = [(scores >= t).mean() for t in thresholds]
fig = go.Figure()
# Main threshold curve
fig.add_trace(go.Scatter(
x=thresholds,
y=match_rates,
mode='lines',
name='Match Rate',
line=dict(color='#4e79a7', width=3),
hovertemplate='Threshold: %{x:.3f}<br>Match Rate: %{y:.1%}<extra></extra>'
))
# Add current threshold marker
if threshold:
current_match_rate = (scores >= threshold).mean()
fig.add_trace(go.Scatter(
x=[threshold],
y=[current_match_rate],
mode='markers+text',
name='Current Threshold',
marker=dict(color='#e15759', size=12),
text=[f'{current_match_rate:.1%}'],
textposition='top center',
hovertemplate='Current: %{x:.3f}<br>Rate: %{y:.1%}<extra></extra>'
))
# Add reference lines
fig.add_vline(x=threshold, line_dash="dash", line_color="#e15759", opacity=0.5)
fig.add_hline(y=current_match_rate, line_dash="dot", line_color="#e15759", opacity=0.5)
# Add 50% reference line
fig.add_hline(y=0.5, line_dash="dash", line_color="gray", opacity=0.3)
fig.update_layout(
title="Threshold Analysis - Match Rate vs Cutoff",
xaxis_title="Threshold Value",
yaxis_title="Match Rate",
yaxis=dict(tickformat='.0%', range=[0, 1.05]),
xaxis=dict(range=[0, 1])
)
# Hidden chart - uncomment block to re-enable
# elif viz_type == "sunburst":
# # Sunburst chart - hierarchical match breakdown
# # Create hierarchical data for sunburst
# total = len(df_clean)
#
# # Calculate match/no-match counts
# if 'status' in df_clean.columns:
# match_mask = df_clean['status'] != 'NO MATCH'
# else:
# match_mask = df_clean['similarity_score'] >= threshold if threshold else df_clean['similarity_score'] >= 0.85
#
# matched = match_mask.sum()
# no_match = total - matched
#
# # Create score bins for matched items
# matched_df = df_clean[match_mask]
#
# # Define score ranges
# labels = []
# parents = []
# values = []
# colors = []
#
# # Root level
# labels.append("All Items")
# parents.append("")
# values.append(total)
# colors.append("#94a3b8")
#
# # Match/No Match level
# if matched > 0:
# labels.append("Matched")
# parents.append("All Items")
# values.append(matched)
# colors.append("#4e79a7")
#
# # Score ranges for matched items - dynamic based on threshold
# if len(matched_df) > 0:
# scores_matched = matched_df['similarity_score']
#
# # Calculate dynamic ranges based on user's threshold
# # Use the actual threshold value, defaulting to 0.85 if not set
# thresh_val = threshold if threshold else 0.85
#
# # High confidence: threshold + 0.10 or higher (capped at 1.0)
# high_threshold = min(thresh_val + 0.10, 1.0)
# high_conf = (scores_matched >= high_threshold).sum()
# if high_conf > 0:
# labels.append(f"High (≥{high_threshold:.2f}): {high_conf}")
# parents.append("Matched")
# values.append(high_conf)
# colors.append("#059669")
#
# # Good confidence: threshold + 0.05 to threshold + 0.10
# good_lower = thresh_val + 0.05
# good_upper = high_threshold
# if good_lower < 1.0: # Only show if range is valid
# good_conf = ((scores_matched >= good_lower) & (scores_matched < good_upper)).sum()
# if good_conf > 0:
# labels.append(f"Good ({good_lower:.2f}-{good_upper:.2f}): {good_conf}")
# parents.append("Matched")
# values.append(good_conf)
# colors.append("#0ea5e9")
#
# # Moderate confidence: threshold to threshold + 0.05
# mod_lower = thresh_val
# mod_upper = min(thresh_val + 0.05, 1.0)
# moderate_conf = ((scores_matched >= mod_lower) & (scores_matched < mod_upper)).sum()
# if moderate_conf > 0:
# labels.append(f"Moderate ({mod_lower:.2f}-{mod_upper:.2f}): {moderate_conf}")
# parents.append("Matched")
# values.append(moderate_conf)
# colors.append("#8b5cf6")
#
# # Note: There shouldn't be any "Low" matches below threshold
# # since we filter by threshold, but check just in case
# low_conf = (scores_matched < thresh_val).sum()
# if low_conf > 0:
# labels.append(f"Below threshold (<{thresh_val:.2f}): {low_conf}")
# parents.append("Matched")
# values.append(low_conf)
# colors.append("#f59e0b")
#
# if no_match > 0:
# labels.append("No Match")
# parents.append("All Items")
# values.append(no_match)
# colors.append("#e15759")
#
# # Create sunburst
# fig = go.Figure(go.Sunburst(
# labels=labels,
# parents=parents,
# values=values,
# branchvalues="total",
# marker=dict(colors=colors),
# textinfo="label+percent parent",
# hovertemplate='<b>%{label}</b><br>Count: %{value}<br>%{percentParent}<extra></extra>'
# ))
#
# fig.update_layout(
# title="Match Statistics Breakdown",
# height=500
# )
# Add threshold line if requested (but not for sunburst chart where it doesn't apply)
if show_threshold and threshold and viz_type != "sunburst":
fig.add_vline(
x=threshold,
line_dash="dash",
line_color="red",
annotation_text=f"Threshold: {threshold:.2f}",
annotation_position="top"
)
# Common layout updates (individual charts already set their axis ranges)
fig.update_layout(
template="plotly_white",
hovermode='closest',
height=500,
margin=dict(l=50, r=50, t=50, b=50),
font=dict(family="system-ui, -apple-system, sans-serif")
)
# Return the Plotly figure directly for render_widget
return fig
# Create the app
app = App(app_ui, server)