OCR_MONITORING / app.py
pitcserverbilal's picture
Update app.py
75ea906 verified
Raw
History Blame Contribute Delete
37.5 kB
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from datetime import datetime, timedelta
import os
import io
# =========================
# PAGE CONFIG
# =========================
st.set_page_config(
page_title="OCR Batch Intelligence Dashboard",
page_icon="πŸ“Š",
layout="wide",
initial_sidebar_state="expanded"
)
# =========================
# DISCO MAPPING
# =========================
DISCO_MAP = {
"11": "LESCO",
"12": "GEPCO",
"13": "FESCO",
"14": "IESCO",
"15": "MEPCO",
"26": "PESCO",
"27": "HAZECO",
"37": "HESCO",
"38": "SEPCO",
"48": "QESCO",
"59": "TESCO"
}
# =========================
# POWER BI MODERN STYLING
# =========================
st.markdown("""
<style>
:root {
--primary-blue: #0078D4;
--secondary-blue: #50E6FF;
--accent-blue: #1890FF;
--dark-blue: #005A9E;
--light-blue: #E6F7FF;
--dark-gray: #2C2C2C;
--medium-gray: #4A4A4A;
--light-gray: #F3F2F1;
--white: #FFFFFF;
--success-green: #107C10;
--light-green: #DFF6DD;
--warning-orange: #FFB900;
--light-orange: #FFF4CE;
--danger-red: #D13438;
--light-red: #FDE7E9;
--accent-purple: #8661C5;
--light-purple: #F3F0FF;
}
.enhanced-header {
background: linear-gradient(135deg, var(--dark-blue), var(--primary-blue));
padding: 30px;
border-radius: 16px;
margin-bottom: 30px;
text-align: center;
color: white;
box-shadow: 0 10px 40px rgba(0, 120, 212, 0.2);
border: 1px solid rgba(255, 255, 255, 0.15);
}
.enhanced-kpi {
background: var(--white);
border-radius: 16px;
padding: 25px 15px;
color: var(--dark-gray);
text-align: center;
height: 200px;
width: 100%;
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.08);
margin: 0 5px 15px 5px;
}
.enhanced-kpi.blue {
background: linear-gradient(135deg, var(--primary-blue), var(--dark-blue));
color: white;
}
.enhanced-kpi.green {
background: linear-gradient(135deg, var(--success-green), #0D5C0D);
color: white;
}
.enhanced-kpi.orange {
background: linear-gradient(135deg, var(--warning-orange), #D69C00);
color: white;
}
.enhanced-kpi.red {
background: linear-gradient(135deg, var(--danger-red), #B02A30);
color: white;
}
.enhanced-kpi.purple {
background: linear-gradient(135deg, var(--accent-purple), #6A4CA6);
color: white;
}
.enhanced-kpi .kpi-icon {
font-size: 30px;
margin-bottom: 5px;
opacity: 0.9;
}
.enhanced-kpi .kpi-value {
font-size: 48px;
font-weight: 800;
line-height: 1;
margin: 15px 0;
text-shadow: 1px 1px 3px rgba(0, 0, 0, 0.2);
}
.enhanced-kpi .kpi-label {
font-size: 15px;
opacity: 0.9;
text-transform: uppercase;
letter-spacing: 1.2px;
font-weight: 600;
margin-top: 5px;
padding: 0 20px;
text-align: center;
line-height: 1.2;
}
.enhanced-card {
background: var(--white);
border-radius: 16px;
padding: 25px;
margin-bottom: 25px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.08);
border: 1px solid rgba(0, 0, 0, 0.06);
}
.flag-card {
border-radius: 12px;
padding: 25px 15px;
text-align: center;
margin: 5px;
height: 200px;
width: 100%;
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
}
.flag-A {
background: linear-gradient(135deg, rgba(16, 124, 16, 0.1), rgba(16, 124, 16, 0.05));
border: 5px solid var(--success-green);
color: var(--success-green);
}
.flag-C {
background: linear-gradient(135deg, rgba(255, 185, 0, 0.1), rgba(255, 185, 0, 0.05));
border: 5px solid var(--warning-orange);
color: var(--warning-orange);
}
.flag-D {
background: linear-gradient(135deg, rgba(253, 126, 20, 0.1), rgba(253, 126, 20, 0.05));
border: 5px solid #fd7e14;
color: #fd7e14;
}
.flag-E {
background: linear-gradient(135deg, rgba(220, 53, 69, 0.1), rgba(220, 53, 69, 0.05));
border: 5px solid var(--danger-red);
color: var(--danger-red);
}
.flag-N {
background: linear-gradient(135deg, rgba(108, 117, 125, 0.1), rgba(108, 117, 125, 0.05));
border: 5px solid #6c757d;
color: #6c757d;
}
.status-indicator {
display: inline-flex;
align-items: center;
padding: 8px 14px;
border-radius: 20px;
font-size: 14px;
font-weight: 600;
margin: 2px;
}
.status-success {
background: var(--light-green);
color: var(--success-green);
}
.status-warning {
background: var(--light-orange);
color: var(--warning-orange);
}
.status-info {
background: var(--light-blue);
color: var(--primary-blue);
}
.recently-added {
background: linear-gradient(135deg, rgba(0, 120, 212, 0.1), rgba(0, 120, 212, 0.05));
border-left: 5px solid var(--primary-blue);
padding: 10px;
margin: 5px 0;
border-radius: 8px;
}
.new-records-badge {
background: linear-gradient(135deg, #ff6b6b, #ee5a52);
color: white;
padding: 4px 12px;
border-radius: 20px;
font-size: 12px;
font-weight: bold;
margin-left: 8px;
}
</style>
""", unsafe_allow_html=True)
# =========================
# DATA LOADING FUNCTIONS
# =========================
@st.cache_data(ttl=300, show_spinner=False)
def load_data_from_file(uploaded_file):
"""Load data from uploaded file and add processing timestamp"""
try:
file_extension = uploaded_file.name.split('.')[-1].lower()
if file_extension == 'parquet':
df = pd.read_parquet(uploaded_file)
st.sidebar.success(f"βœ… Loaded {len(df):,} records from Parquet file")
elif file_extension == 'feather':
df = pd.read_feather(uploaded_file)
st.sidebar.success(f"βœ… Loaded {len(df):,} records from Feather file")
elif file_extension == 'csv':
df = pd.read_csv(uploaded_file)
st.sidebar.success(f"βœ… Loaded {len(df):,} records from CSV file")
elif file_extension in ['xlsx', 'xls']:
df = pd.read_excel(uploaded_file)
st.sidebar.success(f"βœ… Loaded {len(df):,} records from Excel file")
else:
st.sidebar.error(f"❌ Unsupported file format: {file_extension}")
return None
# Check required columns
required_columns = ['REF_DIGITS', 'DISCO_CODE', 'BILMONTH', 'IMAGE_VERIFY_CODE_PITC']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
st.sidebar.error(f"❌ Missing required columns: {missing_columns}")
return None
# =========================
# NEW: ADD PROCESSING TIMESTAMP
# =========================
# Generate a unique batch timestamp for this upload
current_time = datetime.now()
# Check if PROCESSING_TIMESTAMP already exists
if 'PROCESSING_TIMESTAMP' not in df.columns:
# If timestamp doesn't exist, add it for all records
df['PROCESSING_TIMESTAMP'] = current_time
df['PROCESSING_DATE'] = current_time.date()
df['PROCESSING_HOUR'] = current_time.hour
# Store original upload time
df['UPLOAD_TIMESTAMP'] = current_time
st.session_state['last_upload_time'] = current_time
else:
# Convert existing timestamp to datetime if needed
if not pd.api.types.is_datetime64_any_dtype(df["PROCESSING_TIMESTAMP"]):
df["PROCESSING_TIMESTAMP"] = pd.to_datetime(df["PROCESSING_TIMESTAMP"])
# Extract date and hour
df['PROCESSING_DATE'] = df['PROCESSING_TIMESTAMP'].dt.date
df['PROCESSING_HOUR'] = df['PROCESSING_TIMESTAMP'].dt.hour
# Store the latest processing time
latest_time = df['PROCESSING_TIMESTAMP'].max()
st.session_state['last_upload_time'] = latest_time
# Process the data
df["DISCO_NAME"] = df["DISCO_CODE"].map(DISCO_MAP).fillna("UNKNOWN")
df["BATCH_NO"] = df["REF_DIGITS"].str[:2]
df["SUB_DIV"] = df["REF_DIGITS"].str[:5]
df["BATCH_ID"] = df["BATCH_NO"] + "-" + df["DISCO_CODE"]
df["PROCESSING_STATUS"] = df["IMAGE_VERIFY_CODE_PITC"].apply(
lambda x: "Processed" if pd.notna(x) else "Pending"
)
df["ACCURACY_CATEGORY"] = df["IMAGE_VERIFY_CODE_PITC"].apply(
lambda x: "Success (A,C,D)" if x in ['A', 'C', 'D'] else
"Images Not Available (E)" if x == 'E' else
"Reading Not Matched (N)" if x == 'N' else
"Not Processed" if pd.isna(x) else "Other"
)
# Convert BILMONTH to datetime if needed
if not pd.api.types.is_datetime64_any_dtype(df["BILMONTH"]):
df["BILMONTH"] = pd.to_datetime(df["BILMONTH"])
return df
except Exception as e:
st.sidebar.error(f"❌ Error loading file: {str(e)}")
return None
def create_sample_data():
"""Create sample data for demonstration with timestamps"""
sample_size = 5000
codes = list(DISCO_MAP.keys())[:5]
data = {
'REF_DIGITS': [],
'DISCO_CODE': [],
'BILMONTH': [],
'IMAGE_VERIFY_CODE_PITC': []
}
for i in range(sample_size):
disco = np.random.choice(codes)
batch_no = f"{np.random.randint(1, 50):02d}"
sub_div = f"{batch_no}{disco}{np.random.randint(1, 5)}"
ref_digits = f"{sub_div}{np.random.randint(100000000, 999999999)}"
data['REF_DIGITS'].append(ref_digits)
data['DISCO_CODE'].append(disco)
data['BILMONTH'].append(np.random.choice(pd.date_range('2024-01-01', periods=6, freq='MS')))
data['IMAGE_VERIFY_CODE_PITC'].append(np.random.choice(['A', 'C', 'D', 'E', 'N', None],
p=[0.35, 0.2, 0.15, 0.1, 0.05, 0.15]))
df = pd.DataFrame(data)
# =========================
# ADD TIMESTAMPS TO SAMPLE DATA
# =========================
# Simulate records processed at different times
current_time = datetime.now()
# Create varying timestamps for the last 7 days
time_stamps = []
for i in range(len(df)):
# 70% processed today, 20% yesterday, 10% older
rand = np.random.random()
if rand < 0.7:
# Today with varying hours
hour = np.random.randint(0, 24)
delta = timedelta(hours=hour, minutes=np.random.randint(0, 60))
time_stamps.append(current_time - delta)
elif rand < 0.9:
# Yesterday
delta = timedelta(days=1, hours=np.random.randint(0, 24))
time_stamps.append(current_time - delta)
else:
# 2-7 days ago
days = np.random.randint(2, 8)
delta = timedelta(days=days, hours=np.random.randint(0, 24))
time_stamps.append(current_time - delta)
df['PROCESSING_TIMESTAMP'] = time_stamps
df['PROCESSING_DATE'] = pd.to_datetime(df['PROCESSING_TIMESTAMP']).dt.date
df['PROCESSING_HOUR'] = pd.to_datetime(df['PROCESSING_TIMESTAMP']).dt.hour
df['UPLOAD_TIMESTAMP'] = current_time
# Process the data
df["DISCO_NAME"] = df["DISCO_CODE"].map(DISCO_MAP)
df["BATCH_NO"] = df["REF_DIGITS"].str[:2]
df["SUB_DIV"] = df["REF_DIGITS"].str[:5]
df["BATCH_ID"] = df["BATCH_NO"] + "-" + df["DISCO_CODE"]
df["PROCESSING_STATUS"] = df["IMAGE_VERIFY_CODE_PITC"].apply(
lambda x: "Processed" if pd.notna(x) else "Pending"
)
df["ACCURACY_CATEGORY"] = df["IMAGE_VERIFY_CODE_PITC"].apply(
lambda x: "Success (A,C,D)" if x in ['A', 'C', 'D'] else
"Images Not Available (E)" if x == 'E' else
"Reading Not Matched (N)" if x == 'N' else
"Not Processed" if pd.isna(x) else "Other"
)
# Store the latest timestamp
st.session_state['last_upload_time'] = df['PROCESSING_TIMESTAMP'].max()
return df
# =========================
# HELPER FUNCTIONS
# =========================
def get_batch_statistics(df):
"""Get statistics for all batches"""
batch_stats = []
for batch_id in sorted(df["BATCH_ID"].unique()):
batch_df = df[df["BATCH_ID"] == batch_id]
total_records = len(batch_df)
processed_records = batch_df["IMAGE_VERIFY_CODE_PITC"].notna().sum()
flag_counts = {}
for flag in ['A', 'C', 'D', 'E', 'N']:
flag_counts[flag] = (batch_df["IMAGE_VERIFY_CODE_PITC"] == flag).sum()
successful = flag_counts['A'] + flag_counts['C'] + flag_counts['D']
total_an = flag_counts['A'] + flag_counts['N']
ocr_accuracy = (flag_counts['A'] / total_an * 100) if total_an > 0 else 0
processing_rate = (processed_records / total_records * 100) if total_records > 0 else 0
success_rate = (successful / processed_records * 100) if processed_records > 0 else 0
# Get latest processing time for this batch
latest_processing = batch_df['PROCESSING_TIMESTAMP'].max() if 'PROCESSING_TIMESTAMP' in batch_df.columns else None
batch_stats.append({
"Batch ID": batch_id,
"Batch No": batch_df["BATCH_NO"].iloc[0],
"DISCO": batch_df["DISCO_NAME"].iloc[0],
"Total Records": total_records,
"Processed": processed_records,
"Successful (A,C,D)": successful,
"Images Not Available (E)": flag_counts['E'],
"Reading Not Matched (N)": flag_counts['N'],
"Success Rate (A,C,D)": success_rate,
"Processing Rate": processing_rate,
"OCR Model Accuracy (A vs N)": ocr_accuracy,
"Flag A": flag_counts['A'],
"Flag C": flag_counts['C'],
"Flag D": flag_counts['D'],
"Flag E": flag_counts['E'],
"Flag N": flag_counts['N'],
"Latest Processing Time": latest_processing,
})
return pd.DataFrame(batch_stats)
def calculate_ocr_model_accuracy(df):
"""Calculate OCR model accuracy based on A vs N flags only"""
count_a = (df["IMAGE_VERIFY_CODE_PITC"] == 'A').sum()
count_n = (df["IMAGE_VERIFY_CODE_PITC"] == 'N').sum()
total_an = count_a + count_n
if total_an > 0:
accuracy = (count_a / total_an * 100)
else:
accuracy = 0
return {
"count_a": count_a,
"count_n": count_n,
"total_an": total_an,
"accuracy": accuracy
}
def get_recently_processed(df, hours=24):
"""Get records processed in the last N hours"""
if 'PROCESSING_TIMESTAMP' not in df.columns:
return pd.DataFrame()
cutoff_time = datetime.now() - timedelta(hours=hours)
recent_df = df[df['PROCESSING_TIMESTAMP'] > cutoff_time].copy()
return recent_df
def get_processing_trend(df):
"""Get processing trend by date/hour"""
if 'PROCESSING_DATE' not in df.columns:
return pd.DataFrame()
# Group by processing date
daily_trend = df.groupby('PROCESSING_DATE').size().reset_index(name='Records_Processed')
daily_trend = daily_trend.sort_values('PROCESSING_DATE')
# Add running total
daily_trend['Cumulative_Total'] = daily_trend['Records_Processed'].cumsum()
return daily_trend
# =========================
# SIDEBAR
# =========================
with st.sidebar:
st.markdown('<div class="enhanced-card">', unsafe_allow_html=True)
st.markdown("### πŸ“‚ DATA SOURCE")
data_source = st.radio(
"Select data source:",
["Upload File", "Sample Data"],
index=0
)
df = None
if data_source == "Upload File":
uploaded_file = st.file_uploader(
"Choose a file",
type=['parquet', 'feather', 'csv', 'xlsx', 'xls'],
help="Upload Parquet, Feather, CSV, or Excel file"
)
if uploaded_file is not None:
with st.spinner(f"Loading {uploaded_file.name}..."):
df = load_data_from_file(uploaded_file)
if df is not None:
st.info(f"**File:** {uploaded_file.name}")
st.info(f"**Records:** {len(df):,}")
st.info(f"**DISCOs:** {df['DISCO_NAME'].nunique()}")
# Show timestamp info
if 'PROCESSING_TIMESTAMP' in df.columns:
latest_time = df['PROCESSING_TIMESTAMP'].max()
st.info(f"**Latest Processing:** {latest_time.strftime('%Y-%m-%d %H:%M:%S')}")
else: # Sample Data
if st.button("Load Sample Data", use_container_width=True):
with st.spinner("Creating sample data..."):
df = create_sample_data()
st.success(f"βœ… Loaded {len(df):,} sample records")
st.markdown('</div>', unsafe_allow_html=True)
if df is not None:
st.markdown('<div class="enhanced-card">', unsafe_allow_html=True)
st.markdown("### 🎯 ANALYSIS SETTINGS")
# DISCO selection
selection_mode = st.radio(
"Analysis Mode:",
["Single DISCO", "All DISCOS"],
index=0
)
if selection_mode == "Single DISCO":
disco_choice = st.selectbox(
"Select DISCO:",
options=sorted(df["DISCO_NAME"].unique().tolist()),
index=0
)
disco_code = None
for code, name in DISCO_MAP.items():
if name == disco_choice:
disco_code = code
break
else:
disco_code = None
disco_choice = "ALL DISCOS"
# Filters
st.markdown("#### βš™οΈ FILTERS")
date_filter = st.checkbox("Apply Date Filter", value=False)
if date_filter:
min_date = df["BILMONTH"].min().date()
max_date = df["BILMONTH"].max().date()
start_date, end_date = st.date_input(
"Select Date Range:",
value=(min_date, max_date),
min_value=min_date,
max_value=max_date
)
# NEW: Processing Time Filter
st.markdown("#### ⏰ PROCESSING TIME")
show_recent = st.checkbox("Show only recently processed records", value=False)
if show_recent:
recent_hours = st.slider(
"Last N hours:",
min_value=1,
max_value=168,
value=24,
help="Show records processed in the last N hours"
)
show_only_processed = st.checkbox("Show only processed records", value=False)
# Flag filters
selected_flags = st.multiselect(
"Filter by Flag:",
["A", "C", "D", "E", "N", "Unprocessed"],
default=["A", "C", "D", "E", "N", "Unprocessed"]
)
# Batch filter
batch_filter = st.text_input(
"Filter by Batch ID:",
placeholder="e.g., 01-15, 02-15"
)
st.markdown('</div>', unsafe_allow_html=True)
# Refresh button
if st.button("πŸ”„ Refresh Dashboard", use_container_width=True, type="primary"):
st.cache_data.clear()
st.rerun()
# =========================
# MAIN DASHBOARD
# =========================
if df is None:
# Show welcome screen
st.markdown(f"""
<div class="enhanced-header">
<h1 style="margin: 0; font-size: 42px; font-weight: 800;">
πŸ“Š OCR BATCH INTELLIGENCE DASHBOARD
</h1>
<p style="margin: 10px 0 0 0; font-size: 18px; opacity: 0.95;">
Upload your data file to get started
</p>
</div>
""", unsafe_allow_html=True)
st.markdown('<div class="enhanced-card">', unsafe_allow_html=True)
st.markdown("### πŸ“‹ HOW TO USE")
col1, col2 = st.columns(2)
with col1:
st.markdown("#### πŸ“₯ **Step 1: Extract Data**")
st.markdown("""
1. Run the extraction script on your local machine
2. Connect to your Oracle database
3. Export data as **Parquet file** (recommended)
4. The script will create `ocr_data_YYYYMMDD_HHMMSS.parquet`
**Add timestamp during extraction:**
- Add a timestamp column during Oracle extraction
- This helps track when each record was processed
""")
with col2:
st.markdown("#### πŸ“€ **Step 2: Upload Here**")
st.markdown("""
1. Go to sidebar β†’ **Upload File**
2. Select your Parquet file
3. Wait for data to load
4. Start analyzing!
**Automatic timestamp:**
- If no timestamp exists, one will be added
- You can filter by recent processing time
- Track new records easily
""")
st.markdown("#### ⏰ **Time Tracking Features:**")
st.markdown("""
βœ… **Processing Timestamp** - When each record was processed
βœ… **Recent Records Filter** - View records from last N hours
βœ… **Processing Trend** - See daily processing volume
βœ… **New Records Alert** - Identify newly added records
""")
st.markdown('</div>', unsafe_allow_html=True)
st.stop()
# =========================
# APPLY FILTERS
# =========================
# Apply DISCO filter
if disco_code:
filtered_df = df[df["DISCO_CODE"] == disco_code].copy()
else:
filtered_df = df.copy()
# Apply date filter
if 'date_filter' in locals() and date_filter:
filtered_df = filtered_df[
(filtered_df["BILMONTH"] >= pd.Timestamp(start_date)) &
(filtered_df["BILMONTH"] <= pd.Timestamp(end_date))
]
# NEW: Apply recent processing filter
if 'show_recent' in locals() and show_recent:
recent_df = get_recently_processed(filtered_df, hours=recent_hours)
if not recent_df.empty:
filtered_df = recent_df
else:
st.warning(f"No records processed in the last {recent_hours} hours")
# Apply processed filter
if 'show_only_processed' in locals() and show_only_processed:
filtered_df = filtered_df[filtered_df["IMAGE_VERIFY_CODE_PITC"].notna()]
# Apply flag filter
if 'selected_flags' in locals() and selected_flags:
if "Unprocessed" in selected_flags:
flag_filter = filtered_df["IMAGE_VERIFY_CODE_PITC"].isna()
else:
flag_filter = filtered_df["IMAGE_VERIFY_CODE_PITC"].isin([f for f in selected_flags if f != "Unprocessed"])
if "Unprocessed" in selected_flags and len(selected_flags) > 1:
flag_filter = flag_filter | filtered_df["IMAGE_VERIFY_CODE_PITC"].isin([f for f in selected_flags if f != "Unprocessed"])
filtered_df = filtered_df[flag_filter]
# Apply batch filter
if 'batch_filter' in locals() and batch_filter:
filtered_df = filtered_df[filtered_df["BATCH_ID"].str.contains(batch_filter, case=False, na=False)]
# Calculate statistics
batch_stats = get_batch_statistics(filtered_df)
ocr_accuracy = calculate_ocr_model_accuracy(filtered_df)
# =========================
# HEADER WITH TIME INFO
# =========================
# Get timestamp info
if 'PROCESSING_TIMESTAMP' in filtered_df.columns and not filtered_df.empty:
latest_time = filtered_df['PROCESSING_TIMESTAMP'].max()
earliest_time = filtered_df['PROCESSING_TIMESTAMP'].min()
time_range = f"{earliest_time.strftime('%Y-%m-%d')} to {latest_time.strftime('%Y-%m-%d')}"
else:
latest_time = "N/A"
time_range = "N/A"
st.markdown(f"""
<div class="enhanced-header">
<h1 style="margin: 0; font-size: 42px; font-weight: 800;">
πŸ“Š OCR BATCH INTELLIGENCE DASHBOARD
</h1>
<p style="margin: 10px 0 0 0; font-size: 18px; opacity: 0.95;">
Advanced Analytics | Time-Based Tracking | OCR Model Performance
</p>
<div style="margin-top: 20px; display: flex; justify-content: center; gap: 20px; flex-wrap: wrap;">
<span class="status-indicator status-success">πŸ“ File Data</span>
<span class="status-indicator status-info">πŸ“… Current: {datetime.now().strftime('%Y-%m-%d %H:%M')}</span>
<span class="status-indicator status-info">🏒 {disco_choice}</span>
<span class="status-indicator status-info">πŸ“Š {len(filtered_df):,} Records</span>
<span class="status-indicator status-info">⏰ Latest: {latest_time.strftime('%Y-%m-%d %H:%M') if isinstance(latest_time, datetime) else 'N/A'}</span>
</div>
</div>
""", unsafe_allow_html=True)
# =========================
# NEW: RECENT PROCESSING SECTION
# =========================
st.markdown('<div class="enhanced-card">', unsafe_allow_html=True)
st.markdown("### ⏰ RECENT PROCESSING ACTIVITY")
# Calculate recent records
current_time = datetime.now()
if 'PROCESSING_TIMESTAMP' in df.columns:
# Records from last 24 hours
last_24h = get_recently_processed(df, hours=24)
last_7d = get_recently_processed(df, hours=168)
col1, col2, col3, col4 = st.columns(4)
with col1:
recent_1h = get_recently_processed(df, hours=1)
st.metric(
label="Last 1 Hour",
value=f"{len(recent_1h):,}",
delta=f"{len(recent_1h)} new" if len(recent_1h) > 0 else "0 new"
)
with col2:
recent_24h = get_recently_processed(df, hours=24)
st.metric(
label="Last 24 Hours",
value=f"{len(recent_24h):,}",
delta=f"{len(recent_24h)} processed"
)
with col3:
recent_7d = get_recently_processed(df, hours=168)
st.metric(
label="Last 7 Days",
value=f"{len(recent_7d):,}",
delta=f"{len(recent_7d)} processed"
)
with col4:
# Calculate processing rate per day
if len(recent_7d) > 0:
avg_per_day = len(recent_7d) / 7
st.metric(
label="Avg/Day (7D)",
value=f"{avg_per_day:,.0f}",
delta=""
)
else:
st.metric("Avg/Day (7D)", "0", delta="")
# Show processing trend chart
processing_trend = get_processing_trend(df)
if not processing_trend.empty:
fig = px.line(
processing_trend,
x='PROCESSING_DATE',
y='Records_Processed',
title='Daily Processing Trend',
markers=True,
color_discrete_sequence=['#0078D4']
)
fig.update_layout(
xaxis_title='Date',
yaxis_title='Records Processed',
hovermode='x unified'
)
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No processing timestamp data available. Timestamps will be added on next upload.")
st.markdown('</div>', unsafe_allow_html=True)
# =========================
# EXECUTIVE KPIs
# =========================
st.markdown('<div class="enhanced-card">', unsafe_allow_html=True)
st.markdown("### πŸ“ˆ EXECUTIVE DASHBOARD")
total_records = len(filtered_df)
processed_records = filtered_df["IMAGE_VERIFY_CODE_PITC"].notna().sum()
successful_records = filtered_df["IMAGE_VERIFY_CODE_PITC"].isin(['A', 'C', 'D']).sum()
processing_rate = (processed_records / total_records * 100) if total_records > 0 else 0
success_rate = (successful_records / processed_records * 100) if processed_records > 0 else 0
kpi_cols = st.columns(5)
with kpi_cols[0]:
st.markdown(f"""
<div class="enhanced-kpi blue">
<div class="kpi-icon">πŸ“Š</div>
<div class="kpi-value">{total_records:,}</div>
<div class="kpi-label">Total Records</div>
<div style="margin-top: 10px; font-size: 14px; opacity: 0.8;">
{processing_rate:.1f}% processed
</div>
</div>
""", unsafe_allow_html=True)
with kpi_cols[1]:
st.markdown(f"""
<div class="enhanced-kpi green">
<div class="kpi-icon">βœ…</div>
<div class="kpi-value">{success_rate:.1f}%</div>
<div class="kpi-label">Success Rate (A,C,D)</div>
<div style="margin-top: 10px; font-size: 14px; opacity: 0.8;">
{successful_records:,} successful
</div>
</div>
""", unsafe_allow_html=True)
with kpi_cols[2]:
st.markdown(f"""
<div class="enhanced-kpi purple">
<div class="kpi-icon">πŸ€–</div>
<div class="kpi-value">{ocr_accuracy['accuracy']:.1f}%</div>
<div class="kpi-label">OCR Model Accuracy</div>
<div style="margin-top: 10px; font-size: 14px; opacity: 0.8;">
A vs N only
</div>
</div>
""", unsafe_allow_html=True)
with kpi_cols[3]:
perfect_matches = (filtered_df["IMAGE_VERIFY_CODE_PITC"] == 'A').sum()
perfect_match_rate = (perfect_matches / processed_records * 100) if processed_records > 0 else 0
st.markdown(f"""
<div class="enhanced-kpi orange">
<div class="kpi-icon">⭐</div>
<div class="kpi-value">{perfect_match_rate:.1f}%</div>
<div class="kpi-label">Perfect Match (A)</div>
<div style="margin-top: 10px; font-size: 14px; opacity: 0.8;">
{perfect_matches:,} perfect
</div>
</div>
""", unsafe_allow_html=True)
with kpi_cols[4]:
image_issues = (filtered_df["IMAGE_VERIFY_CODE_PITC"] == 'E').sum()
image_issue_rate = (image_issues / processed_records * 100) if processed_records > 0 else 0
st.markdown(f"""
<div class="enhanced-kpi red">
<div class="kpi-icon">πŸ–ΌοΈ</div>
<div class="kpi-value">{image_issue_rate:.1f}%</div>
<div class="kpi-label">Image Issues (E)</div>
<div style="margin-top: 10px; font-size: 14px; opacity: 0.8;">
{image_issues:,} records
</div>
</div>
""", unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)
# =========================
# FLAG DISTRIBUTION
# =========================
st.markdown('<div class="enhanced-card">', unsafe_allow_html=True)
st.markdown("### 🏷️ FLAG CLASSIFICATION DISTRIBUTION")
flag_counts = filtered_df["IMAGE_VERIFY_CODE_PITC"].value_counts()
flag_cols = st.columns(5)
for idx, flag in enumerate(['A', 'C', 'D', 'E', 'N']):
count = flag_counts.get(flag, 0)
with flag_cols[idx]:
percentage = (count / processed_records * 100) if processed_records > 0 else 0
flag_labels = {
'A': 'βœ… PERFECT MATCH',
'C': '⚠️ FLAG C',
'D': '⚠️ FLAG D',
'E': '❌ IMAGE ISSUE',
'N': '❌ READING ISSUE'
}
flag_descs = {
'A': 'Successful with perfect reading (80%+)',
'C': 'Successful with minor issues',
'D': 'Successful but needs review',
'E': 'Image not available',
'N': 'Reading not matched (<80%)'
}
st.markdown(f"""
<div class="flag-card flag-{flag}">
<div style="font-size: 32px; font-weight: 800;">{flag}</div>
<div style="font-size: 28px; font-weight: 900;">{count:,}</div>
<div style="font-size: 12px; font-weight: 600; margin: 5px 0;">
{flag_labels.get(flag, f'FLAG {flag}')}
</div>
<div style="font-size: 14px; color: #666; margin: 5px 0;">
{percentage:.1f}% of processed
</div>
<div style="font-size: 12px; color: #888; font-style: italic; margin-top: 5px;">
{flag_descs.get(flag, 'Unknown flag')}
</div>
</div>
""", unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)
# =========================
# DATA PREVIEW WITH TIMESTAMP
# =========================
st.markdown('<div class="enhanced-card">', unsafe_allow_html=True)
st.markdown("### πŸ“‹ DATA OVERVIEW")
tab1, tab2, tab3 = st.tabs(["πŸ“Š Records", "πŸ“ˆ Batch Statistics", "⏰ Recent Activity"])
with tab1:
display_columns = ["REF_DIGITS", "DISCO_NAME", "BATCH_ID",
"IMAGE_VERIFY_CODE_PITC", "ACCURACY_CATEGORY"]
# Add timestamp if available
if 'PROCESSING_TIMESTAMP' in filtered_df.columns:
display_columns.append("PROCESSING_TIMESTAMP")
st.dataframe(
filtered_df[display_columns].head(100),
use_container_width=True,
column_config={
"PROCESSING_TIMESTAMP": st.column_config.DatetimeColumn(
"Processing Time",
format="YYYY-MM-DD HH:mm:ss"
)
} if 'PROCESSING_TIMESTAMP' in display_columns else None
)
with tab2:
if not batch_stats.empty:
display_stats = batch_stats[["Batch ID", "DISCO", "Total Records", "Processed",
"Success Rate (A,C,D)", "OCR Model Accuracy (A vs N)",
"Latest Processing Time"]]
st.dataframe(display_stats, use_container_width=True)
else:
st.info("No batch statistics available")
with tab3:
# Show recently processed records
if 'PROCESSING_TIMESTAMP' in df.columns:
recent_records = get_recently_processed(df, hours=24)
if not recent_records.empty:
st.markdown(f"### πŸ†• Records Processed in Last 24 Hours: {len(recent_records):,}")
# Group by hour
recent_records['PROCESSING_HOUR'] = pd.to_datetime(recent_records['PROCESSING_TIMESTAMP']).dt.floor('H')
hourly_counts = recent_records.groupby('PROCESSING_HOUR').size().reset_index(name='Count')
hourly_counts = hourly_counts.sort_values('PROCESSING_HOUR', ascending=False)
# Show hourly breakdown
col1, col2 = st.columns([2, 1])
with col1:
st.dataframe(
recent_records[["REF_DIGITS", "DISCO_NAME", "BATCH_ID",
"IMAGE_VERIFY_CODE_PITC", "PROCESSING_TIMESTAMP"]]
.sort_values('PROCESSING_TIMESTAMP', ascending=False)
.head(50),
use_container_width=True
)
with col2:
st.markdown("#### πŸ“Š By Hour")
st.dataframe(hourly_counts, use_container_width=True)
# Show latest batch
if not recent_records.empty:
latest_batch = recent_records.loc[recent_records['PROCESSING_TIMESTAMP'].idxmax()]
st.markdown("#### πŸ” Latest Record")
st.write(f"**REF:** {latest_batch['REF_DIGITS']}")
st.write(f"**DISCO:** {latest_batch['DISCO_NAME']}")
st.write(f"**Batch:** {latest_batch['BATCH_ID']}")
st.write(f"**Flag:** {latest_batch['IMAGE_VERIFY_CODE_PITC']}")
st.write(f"**Time:** {latest_batch['PROCESSING_TIMESTAMP']}")
else:
st.info("No records processed in the last 24 hours")
else:
st.info("No timestamp data available")
st.markdown('</div>', unsafe_allow_html=True)
# =========================
# FOOTER WITH ENHANCED TIME INFO
# =========================
# Calculate time-based metrics
if 'PROCESSING_TIMESTAMP' in filtered_df.columns and not filtered_df.empty:
time_metrics = {
"latest_time": filtered_df['PROCESSING_TIMESTAMP'].max(),
"earliest_time": filtered_df['PROCESSING_TIMESTAMP'].min(),
"records_today": len(get_recently_processed(df, hours=24)),
"unique_dates": filtered_df['PROCESSING_DATE'].nunique() if 'PROCESSING_DATE' in filtered_df.columns else 0
}
else:
time_metrics = {
"latest_time": "N/A",
"earliest_time": "N/A",
"records_today": 0,
"unique_dates": 0
}
st.markdown(f"""
<div style="
background: linear-gradient(135deg, var(--dark-gray), #1a1a1a);
color: white;
padding: 20px;
border-radius: 12px;
margin-top: 30px;
text-align: center;
font-size: 12px;
">
<div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 10px;">
<div>
πŸ“… <strong>Last Updated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
</div>
<div>
🏒 <strong>DISCO:</strong> {disco_choice}
</div>
<div>
πŸ“Š <strong>Records:</strong> {len(filtered_df):,}
</div>
<div>
βœ… <strong>Success Rate:</strong> {success_rate:.1f}%
</div>
<div>
πŸ€– <strong>OCR Accuracy:</strong> {ocr_accuracy['accuracy']:.1f}%
</div>
<div>
⏰ <strong>Latest Processing:</strong> {time_metrics['latest_time'].strftime('%Y-%m-%d %H:%M') if isinstance(time_metrics['latest_time'], datetime) else 'N/A'}
</div>
</div>
<div style="margin-top: 10px; font-size: 11px; opacity: 0.7;">
πŸš€ OCR Intelligence Dashboard | Time-Based Tracking | Parquet Format Recommended
</div>
</div>
""", unsafe_allow_html=True)