Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import numpy as np | |
| from datetime import datetime, timedelta | |
| import os | |
| import io | |
| # ========================= | |
| # PAGE CONFIG | |
| # ========================= | |
| st.set_page_config( | |
| page_title="OCR Batch Intelligence Dashboard", | |
| page_icon="π", | |
| layout="wide", | |
| initial_sidebar_state="expanded" | |
| ) | |
| # ========================= | |
| # DISCO MAPPING | |
| # ========================= | |
| DISCO_MAP = { | |
| "11": "LESCO", | |
| "12": "GEPCO", | |
| "13": "FESCO", | |
| "14": "IESCO", | |
| "15": "MEPCO", | |
| "26": "PESCO", | |
| "27": "HAZECO", | |
| "37": "HESCO", | |
| "38": "SEPCO", | |
| "48": "QESCO", | |
| "59": "TESCO" | |
| } | |
| # ========================= | |
| # POWER BI MODERN STYLING | |
| # ========================= | |
| st.markdown(""" | |
| <style> | |
| :root { | |
| --primary-blue: #0078D4; | |
| --secondary-blue: #50E6FF; | |
| --accent-blue: #1890FF; | |
| --dark-blue: #005A9E; | |
| --light-blue: #E6F7FF; | |
| --dark-gray: #2C2C2C; | |
| --medium-gray: #4A4A4A; | |
| --light-gray: #F3F2F1; | |
| --white: #FFFFFF; | |
| --success-green: #107C10; | |
| --light-green: #DFF6DD; | |
| --warning-orange: #FFB900; | |
| --light-orange: #FFF4CE; | |
| --danger-red: #D13438; | |
| --light-red: #FDE7E9; | |
| --accent-purple: #8661C5; | |
| --light-purple: #F3F0FF; | |
| } | |
| .enhanced-header { | |
| background: linear-gradient(135deg, var(--dark-blue), var(--primary-blue)); | |
| padding: 30px; | |
| border-radius: 16px; | |
| margin-bottom: 30px; | |
| text-align: center; | |
| color: white; | |
| box-shadow: 0 10px 40px rgba(0, 120, 212, 0.2); | |
| border: 1px solid rgba(255, 255, 255, 0.15); | |
| } | |
| .enhanced-kpi { | |
| background: var(--white); | |
| border-radius: 16px; | |
| padding: 25px 15px; | |
| color: var(--dark-gray); | |
| text-align: center; | |
| height: 200px; | |
| width: 100%; | |
| display: flex; | |
| flex-direction: column; | |
| justify-content: center; | |
| align-items: center; | |
| box-shadow: 0 8px 32px rgba(0, 0, 0, 0.08); | |
| margin: 0 5px 15px 5px; | |
| } | |
| .enhanced-kpi.blue { | |
| background: linear-gradient(135deg, var(--primary-blue), var(--dark-blue)); | |
| color: white; | |
| } | |
| .enhanced-kpi.green { | |
| background: linear-gradient(135deg, var(--success-green), #0D5C0D); | |
| color: white; | |
| } | |
| .enhanced-kpi.orange { | |
| background: linear-gradient(135deg, var(--warning-orange), #D69C00); | |
| color: white; | |
| } | |
| .enhanced-kpi.red { | |
| background: linear-gradient(135deg, var(--danger-red), #B02A30); | |
| color: white; | |
| } | |
| .enhanced-kpi.purple { | |
| background: linear-gradient(135deg, var(--accent-purple), #6A4CA6); | |
| color: white; | |
| } | |
| .enhanced-kpi .kpi-icon { | |
| font-size: 30px; | |
| margin-bottom: 5px; | |
| opacity: 0.9; | |
| } | |
| .enhanced-kpi .kpi-value { | |
| font-size: 48px; | |
| font-weight: 800; | |
| line-height: 1; | |
| margin: 15px 0; | |
| text-shadow: 1px 1px 3px rgba(0, 0, 0, 0.2); | |
| } | |
| .enhanced-kpi .kpi-label { | |
| font-size: 15px; | |
| opacity: 0.9; | |
| text-transform: uppercase; | |
| letter-spacing: 1.2px; | |
| font-weight: 600; | |
| margin-top: 5px; | |
| padding: 0 20px; | |
| text-align: center; | |
| line-height: 1.2; | |
| } | |
| .enhanced-card { | |
| background: var(--white); | |
| border-radius: 16px; | |
| padding: 25px; | |
| margin-bottom: 25px; | |
| box-shadow: 0 8px 32px rgba(0, 0, 0, 0.08); | |
| border: 1px solid rgba(0, 0, 0, 0.06); | |
| } | |
| .flag-card { | |
| border-radius: 12px; | |
| padding: 25px 15px; | |
| text-align: center; | |
| margin: 5px; | |
| height: 200px; | |
| width: 100%; | |
| display: flex; | |
| flex-direction: column; | |
| justify-content: center; | |
| align-items: center; | |
| } | |
| .flag-A { | |
| background: linear-gradient(135deg, rgba(16, 124, 16, 0.1), rgba(16, 124, 16, 0.05)); | |
| border: 5px solid var(--success-green); | |
| color: var(--success-green); | |
| } | |
| .flag-C { | |
| background: linear-gradient(135deg, rgba(255, 185, 0, 0.1), rgba(255, 185, 0, 0.05)); | |
| border: 5px solid var(--warning-orange); | |
| color: var(--warning-orange); | |
| } | |
| .flag-D { | |
| background: linear-gradient(135deg, rgba(253, 126, 20, 0.1), rgba(253, 126, 20, 0.05)); | |
| border: 5px solid #fd7e14; | |
| color: #fd7e14; | |
| } | |
| .flag-E { | |
| background: linear-gradient(135deg, rgba(220, 53, 69, 0.1), rgba(220, 53, 69, 0.05)); | |
| border: 5px solid var(--danger-red); | |
| color: var(--danger-red); | |
| } | |
| .flag-N { | |
| background: linear-gradient(135deg, rgba(108, 117, 125, 0.1), rgba(108, 117, 125, 0.05)); | |
| border: 5px solid #6c757d; | |
| color: #6c757d; | |
| } | |
| .status-indicator { | |
| display: inline-flex; | |
| align-items: center; | |
| padding: 8px 14px; | |
| border-radius: 20px; | |
| font-size: 14px; | |
| font-weight: 600; | |
| margin: 2px; | |
| } | |
| .status-success { | |
| background: var(--light-green); | |
| color: var(--success-green); | |
| } | |
| .status-warning { | |
| background: var(--light-orange); | |
| color: var(--warning-orange); | |
| } | |
| .status-info { | |
| background: var(--light-blue); | |
| color: var(--primary-blue); | |
| } | |
| .recently-added { | |
| background: linear-gradient(135deg, rgba(0, 120, 212, 0.1), rgba(0, 120, 212, 0.05)); | |
| border-left: 5px solid var(--primary-blue); | |
| padding: 10px; | |
| margin: 5px 0; | |
| border-radius: 8px; | |
| } | |
| .new-records-badge { | |
| background: linear-gradient(135deg, #ff6b6b, #ee5a52); | |
| color: white; | |
| padding: 4px 12px; | |
| border-radius: 20px; | |
| font-size: 12px; | |
| font-weight: bold; | |
| margin-left: 8px; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| # ========================= | |
| # DATA LOADING FUNCTIONS | |
| # ========================= | |
| def load_data_from_file(uploaded_file): | |
| """Load data from uploaded file and add processing timestamp""" | |
| try: | |
| file_extension = uploaded_file.name.split('.')[-1].lower() | |
| if file_extension == 'parquet': | |
| df = pd.read_parquet(uploaded_file) | |
| st.sidebar.success(f"β Loaded {len(df):,} records from Parquet file") | |
| elif file_extension == 'feather': | |
| df = pd.read_feather(uploaded_file) | |
| st.sidebar.success(f"β Loaded {len(df):,} records from Feather file") | |
| elif file_extension == 'csv': | |
| df = pd.read_csv(uploaded_file) | |
| st.sidebar.success(f"β Loaded {len(df):,} records from CSV file") | |
| elif file_extension in ['xlsx', 'xls']: | |
| df = pd.read_excel(uploaded_file) | |
| st.sidebar.success(f"β Loaded {len(df):,} records from Excel file") | |
| else: | |
| st.sidebar.error(f"β Unsupported file format: {file_extension}") | |
| return None | |
| # Check required columns | |
| required_columns = ['REF_DIGITS', 'DISCO_CODE', 'BILMONTH', 'IMAGE_VERIFY_CODE_PITC'] | |
| missing_columns = [col for col in required_columns if col not in df.columns] | |
| if missing_columns: | |
| st.sidebar.error(f"β Missing required columns: {missing_columns}") | |
| return None | |
| # ========================= | |
| # NEW: ADD PROCESSING TIMESTAMP | |
| # ========================= | |
| # Generate a unique batch timestamp for this upload | |
| current_time = datetime.now() | |
| # Check if PROCESSING_TIMESTAMP already exists | |
| if 'PROCESSING_TIMESTAMP' not in df.columns: | |
| # If timestamp doesn't exist, add it for all records | |
| df['PROCESSING_TIMESTAMP'] = current_time | |
| df['PROCESSING_DATE'] = current_time.date() | |
| df['PROCESSING_HOUR'] = current_time.hour | |
| # Store original upload time | |
| df['UPLOAD_TIMESTAMP'] = current_time | |
| st.session_state['last_upload_time'] = current_time | |
| else: | |
| # Convert existing timestamp to datetime if needed | |
| if not pd.api.types.is_datetime64_any_dtype(df["PROCESSING_TIMESTAMP"]): | |
| df["PROCESSING_TIMESTAMP"] = pd.to_datetime(df["PROCESSING_TIMESTAMP"]) | |
| # Extract date and hour | |
| df['PROCESSING_DATE'] = df['PROCESSING_TIMESTAMP'].dt.date | |
| df['PROCESSING_HOUR'] = df['PROCESSING_TIMESTAMP'].dt.hour | |
| # Store the latest processing time | |
| latest_time = df['PROCESSING_TIMESTAMP'].max() | |
| st.session_state['last_upload_time'] = latest_time | |
| # Process the data | |
| df["DISCO_NAME"] = df["DISCO_CODE"].map(DISCO_MAP).fillna("UNKNOWN") | |
| df["BATCH_NO"] = df["REF_DIGITS"].str[:2] | |
| df["SUB_DIV"] = df["REF_DIGITS"].str[:5] | |
| df["BATCH_ID"] = df["BATCH_NO"] + "-" + df["DISCO_CODE"] | |
| df["PROCESSING_STATUS"] = df["IMAGE_VERIFY_CODE_PITC"].apply( | |
| lambda x: "Processed" if pd.notna(x) else "Pending" | |
| ) | |
| df["ACCURACY_CATEGORY"] = df["IMAGE_VERIFY_CODE_PITC"].apply( | |
| lambda x: "Success (A,C,D)" if x in ['A', 'C', 'D'] else | |
| "Images Not Available (E)" if x == 'E' else | |
| "Reading Not Matched (N)" if x == 'N' else | |
| "Not Processed" if pd.isna(x) else "Other" | |
| ) | |
| # Convert BILMONTH to datetime if needed | |
| if not pd.api.types.is_datetime64_any_dtype(df["BILMONTH"]): | |
| df["BILMONTH"] = pd.to_datetime(df["BILMONTH"]) | |
| return df | |
| except Exception as e: | |
| st.sidebar.error(f"β Error loading file: {str(e)}") | |
| return None | |
| def create_sample_data(): | |
| """Create sample data for demonstration with timestamps""" | |
| sample_size = 5000 | |
| codes = list(DISCO_MAP.keys())[:5] | |
| data = { | |
| 'REF_DIGITS': [], | |
| 'DISCO_CODE': [], | |
| 'BILMONTH': [], | |
| 'IMAGE_VERIFY_CODE_PITC': [] | |
| } | |
| for i in range(sample_size): | |
| disco = np.random.choice(codes) | |
| batch_no = f"{np.random.randint(1, 50):02d}" | |
| sub_div = f"{batch_no}{disco}{np.random.randint(1, 5)}" | |
| ref_digits = f"{sub_div}{np.random.randint(100000000, 999999999)}" | |
| data['REF_DIGITS'].append(ref_digits) | |
| data['DISCO_CODE'].append(disco) | |
| data['BILMONTH'].append(np.random.choice(pd.date_range('2024-01-01', periods=6, freq='MS'))) | |
| data['IMAGE_VERIFY_CODE_PITC'].append(np.random.choice(['A', 'C', 'D', 'E', 'N', None], | |
| p=[0.35, 0.2, 0.15, 0.1, 0.05, 0.15])) | |
| df = pd.DataFrame(data) | |
| # ========================= | |
| # ADD TIMESTAMPS TO SAMPLE DATA | |
| # ========================= | |
| # Simulate records processed at different times | |
| current_time = datetime.now() | |
| # Create varying timestamps for the last 7 days | |
| time_stamps = [] | |
| for i in range(len(df)): | |
| # 70% processed today, 20% yesterday, 10% older | |
| rand = np.random.random() | |
| if rand < 0.7: | |
| # Today with varying hours | |
| hour = np.random.randint(0, 24) | |
| delta = timedelta(hours=hour, minutes=np.random.randint(0, 60)) | |
| time_stamps.append(current_time - delta) | |
| elif rand < 0.9: | |
| # Yesterday | |
| delta = timedelta(days=1, hours=np.random.randint(0, 24)) | |
| time_stamps.append(current_time - delta) | |
| else: | |
| # 2-7 days ago | |
| days = np.random.randint(2, 8) | |
| delta = timedelta(days=days, hours=np.random.randint(0, 24)) | |
| time_stamps.append(current_time - delta) | |
| df['PROCESSING_TIMESTAMP'] = time_stamps | |
| df['PROCESSING_DATE'] = pd.to_datetime(df['PROCESSING_TIMESTAMP']).dt.date | |
| df['PROCESSING_HOUR'] = pd.to_datetime(df['PROCESSING_TIMESTAMP']).dt.hour | |
| df['UPLOAD_TIMESTAMP'] = current_time | |
| # Process the data | |
| df["DISCO_NAME"] = df["DISCO_CODE"].map(DISCO_MAP) | |
| df["BATCH_NO"] = df["REF_DIGITS"].str[:2] | |
| df["SUB_DIV"] = df["REF_DIGITS"].str[:5] | |
| df["BATCH_ID"] = df["BATCH_NO"] + "-" + df["DISCO_CODE"] | |
| df["PROCESSING_STATUS"] = df["IMAGE_VERIFY_CODE_PITC"].apply( | |
| lambda x: "Processed" if pd.notna(x) else "Pending" | |
| ) | |
| df["ACCURACY_CATEGORY"] = df["IMAGE_VERIFY_CODE_PITC"].apply( | |
| lambda x: "Success (A,C,D)" if x in ['A', 'C', 'D'] else | |
| "Images Not Available (E)" if x == 'E' else | |
| "Reading Not Matched (N)" if x == 'N' else | |
| "Not Processed" if pd.isna(x) else "Other" | |
| ) | |
| # Store the latest timestamp | |
| st.session_state['last_upload_time'] = df['PROCESSING_TIMESTAMP'].max() | |
| return df | |
| # ========================= | |
| # HELPER FUNCTIONS | |
| # ========================= | |
| def get_batch_statistics(df): | |
| """Get statistics for all batches""" | |
| batch_stats = [] | |
| for batch_id in sorted(df["BATCH_ID"].unique()): | |
| batch_df = df[df["BATCH_ID"] == batch_id] | |
| total_records = len(batch_df) | |
| processed_records = batch_df["IMAGE_VERIFY_CODE_PITC"].notna().sum() | |
| flag_counts = {} | |
| for flag in ['A', 'C', 'D', 'E', 'N']: | |
| flag_counts[flag] = (batch_df["IMAGE_VERIFY_CODE_PITC"] == flag).sum() | |
| successful = flag_counts['A'] + flag_counts['C'] + flag_counts['D'] | |
| total_an = flag_counts['A'] + flag_counts['N'] | |
| ocr_accuracy = (flag_counts['A'] / total_an * 100) if total_an > 0 else 0 | |
| processing_rate = (processed_records / total_records * 100) if total_records > 0 else 0 | |
| success_rate = (successful / processed_records * 100) if processed_records > 0 else 0 | |
| # Get latest processing time for this batch | |
| latest_processing = batch_df['PROCESSING_TIMESTAMP'].max() if 'PROCESSING_TIMESTAMP' in batch_df.columns else None | |
| batch_stats.append({ | |
| "Batch ID": batch_id, | |
| "Batch No": batch_df["BATCH_NO"].iloc[0], | |
| "DISCO": batch_df["DISCO_NAME"].iloc[0], | |
| "Total Records": total_records, | |
| "Processed": processed_records, | |
| "Successful (A,C,D)": successful, | |
| "Images Not Available (E)": flag_counts['E'], | |
| "Reading Not Matched (N)": flag_counts['N'], | |
| "Success Rate (A,C,D)": success_rate, | |
| "Processing Rate": processing_rate, | |
| "OCR Model Accuracy (A vs N)": ocr_accuracy, | |
| "Flag A": flag_counts['A'], | |
| "Flag C": flag_counts['C'], | |
| "Flag D": flag_counts['D'], | |
| "Flag E": flag_counts['E'], | |
| "Flag N": flag_counts['N'], | |
| "Latest Processing Time": latest_processing, | |
| }) | |
| return pd.DataFrame(batch_stats) | |
| def calculate_ocr_model_accuracy(df): | |
| """Calculate OCR model accuracy based on A vs N flags only""" | |
| count_a = (df["IMAGE_VERIFY_CODE_PITC"] == 'A').sum() | |
| count_n = (df["IMAGE_VERIFY_CODE_PITC"] == 'N').sum() | |
| total_an = count_a + count_n | |
| if total_an > 0: | |
| accuracy = (count_a / total_an * 100) | |
| else: | |
| accuracy = 0 | |
| return { | |
| "count_a": count_a, | |
| "count_n": count_n, | |
| "total_an": total_an, | |
| "accuracy": accuracy | |
| } | |
| def get_recently_processed(df, hours=24): | |
| """Get records processed in the last N hours""" | |
| if 'PROCESSING_TIMESTAMP' not in df.columns: | |
| return pd.DataFrame() | |
| cutoff_time = datetime.now() - timedelta(hours=hours) | |
| recent_df = df[df['PROCESSING_TIMESTAMP'] > cutoff_time].copy() | |
| return recent_df | |
| def get_processing_trend(df): | |
| """Get processing trend by date/hour""" | |
| if 'PROCESSING_DATE' not in df.columns: | |
| return pd.DataFrame() | |
| # Group by processing date | |
| daily_trend = df.groupby('PROCESSING_DATE').size().reset_index(name='Records_Processed') | |
| daily_trend = daily_trend.sort_values('PROCESSING_DATE') | |
| # Add running total | |
| daily_trend['Cumulative_Total'] = daily_trend['Records_Processed'].cumsum() | |
| return daily_trend | |
| # ========================= | |
| # SIDEBAR | |
| # ========================= | |
| with st.sidebar: | |
| st.markdown('<div class="enhanced-card">', unsafe_allow_html=True) | |
| st.markdown("### π DATA SOURCE") | |
| data_source = st.radio( | |
| "Select data source:", | |
| ["Upload File", "Sample Data"], | |
| index=0 | |
| ) | |
| df = None | |
| if data_source == "Upload File": | |
| uploaded_file = st.file_uploader( | |
| "Choose a file", | |
| type=['parquet', 'feather', 'csv', 'xlsx', 'xls'], | |
| help="Upload Parquet, Feather, CSV, or Excel file" | |
| ) | |
| if uploaded_file is not None: | |
| with st.spinner(f"Loading {uploaded_file.name}..."): | |
| df = load_data_from_file(uploaded_file) | |
| if df is not None: | |
| st.info(f"**File:** {uploaded_file.name}") | |
| st.info(f"**Records:** {len(df):,}") | |
| st.info(f"**DISCOs:** {df['DISCO_NAME'].nunique()}") | |
| # Show timestamp info | |
| if 'PROCESSING_TIMESTAMP' in df.columns: | |
| latest_time = df['PROCESSING_TIMESTAMP'].max() | |
| st.info(f"**Latest Processing:** {latest_time.strftime('%Y-%m-%d %H:%M:%S')}") | |
| else: # Sample Data | |
| if st.button("Load Sample Data", use_container_width=True): | |
| with st.spinner("Creating sample data..."): | |
| df = create_sample_data() | |
| st.success(f"β Loaded {len(df):,} sample records") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| if df is not None: | |
| st.markdown('<div class="enhanced-card">', unsafe_allow_html=True) | |
| st.markdown("### π― ANALYSIS SETTINGS") | |
| # DISCO selection | |
| selection_mode = st.radio( | |
| "Analysis Mode:", | |
| ["Single DISCO", "All DISCOS"], | |
| index=0 | |
| ) | |
| if selection_mode == "Single DISCO": | |
| disco_choice = st.selectbox( | |
| "Select DISCO:", | |
| options=sorted(df["DISCO_NAME"].unique().tolist()), | |
| index=0 | |
| ) | |
| disco_code = None | |
| for code, name in DISCO_MAP.items(): | |
| if name == disco_choice: | |
| disco_code = code | |
| break | |
| else: | |
| disco_code = None | |
| disco_choice = "ALL DISCOS" | |
| # Filters | |
| st.markdown("#### βοΈ FILTERS") | |
| date_filter = st.checkbox("Apply Date Filter", value=False) | |
| if date_filter: | |
| min_date = df["BILMONTH"].min().date() | |
| max_date = df["BILMONTH"].max().date() | |
| start_date, end_date = st.date_input( | |
| "Select Date Range:", | |
| value=(min_date, max_date), | |
| min_value=min_date, | |
| max_value=max_date | |
| ) | |
| # NEW: Processing Time Filter | |
| st.markdown("#### β° PROCESSING TIME") | |
| show_recent = st.checkbox("Show only recently processed records", value=False) | |
| if show_recent: | |
| recent_hours = st.slider( | |
| "Last N hours:", | |
| min_value=1, | |
| max_value=168, | |
| value=24, | |
| help="Show records processed in the last N hours" | |
| ) | |
| show_only_processed = st.checkbox("Show only processed records", value=False) | |
| # Flag filters | |
| selected_flags = st.multiselect( | |
| "Filter by Flag:", | |
| ["A", "C", "D", "E", "N", "Unprocessed"], | |
| default=["A", "C", "D", "E", "N", "Unprocessed"] | |
| ) | |
| # Batch filter | |
| batch_filter = st.text_input( | |
| "Filter by Batch ID:", | |
| placeholder="e.g., 01-15, 02-15" | |
| ) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # Refresh button | |
| if st.button("π Refresh Dashboard", use_container_width=True, type="primary"): | |
| st.cache_data.clear() | |
| st.rerun() | |
| # ========================= | |
| # MAIN DASHBOARD | |
| # ========================= | |
| if df is None: | |
| # Show welcome screen | |
| st.markdown(f""" | |
| <div class="enhanced-header"> | |
| <h1 style="margin: 0; font-size: 42px; font-weight: 800;"> | |
| π OCR BATCH INTELLIGENCE DASHBOARD | |
| </h1> | |
| <p style="margin: 10px 0 0 0; font-size: 18px; opacity: 0.95;"> | |
| Upload your data file to get started | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.markdown('<div class="enhanced-card">', unsafe_allow_html=True) | |
| st.markdown("### π HOW TO USE") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.markdown("#### π₯ **Step 1: Extract Data**") | |
| st.markdown(""" | |
| 1. Run the extraction script on your local machine | |
| 2. Connect to your Oracle database | |
| 3. Export data as **Parquet file** (recommended) | |
| 4. The script will create `ocr_data_YYYYMMDD_HHMMSS.parquet` | |
| **Add timestamp during extraction:** | |
| - Add a timestamp column during Oracle extraction | |
| - This helps track when each record was processed | |
| """) | |
| with col2: | |
| st.markdown("#### π€ **Step 2: Upload Here**") | |
| st.markdown(""" | |
| 1. Go to sidebar β **Upload File** | |
| 2. Select your Parquet file | |
| 3. Wait for data to load | |
| 4. Start analyzing! | |
| **Automatic timestamp:** | |
| - If no timestamp exists, one will be added | |
| - You can filter by recent processing time | |
| - Track new records easily | |
| """) | |
| st.markdown("#### β° **Time Tracking Features:**") | |
| st.markdown(""" | |
| β **Processing Timestamp** - When each record was processed | |
| β **Recent Records Filter** - View records from last N hours | |
| β **Processing Trend** - See daily processing volume | |
| β **New Records Alert** - Identify newly added records | |
| """) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| st.stop() | |
| # ========================= | |
| # APPLY FILTERS | |
| # ========================= | |
| # Apply DISCO filter | |
| if disco_code: | |
| filtered_df = df[df["DISCO_CODE"] == disco_code].copy() | |
| else: | |
| filtered_df = df.copy() | |
| # Apply date filter | |
| if 'date_filter' in locals() and date_filter: | |
| filtered_df = filtered_df[ | |
| (filtered_df["BILMONTH"] >= pd.Timestamp(start_date)) & | |
| (filtered_df["BILMONTH"] <= pd.Timestamp(end_date)) | |
| ] | |
| # NEW: Apply recent processing filter | |
| if 'show_recent' in locals() and show_recent: | |
| recent_df = get_recently_processed(filtered_df, hours=recent_hours) | |
| if not recent_df.empty: | |
| filtered_df = recent_df | |
| else: | |
| st.warning(f"No records processed in the last {recent_hours} hours") | |
| # Apply processed filter | |
| if 'show_only_processed' in locals() and show_only_processed: | |
| filtered_df = filtered_df[filtered_df["IMAGE_VERIFY_CODE_PITC"].notna()] | |
| # Apply flag filter | |
| if 'selected_flags' in locals() and selected_flags: | |
| if "Unprocessed" in selected_flags: | |
| flag_filter = filtered_df["IMAGE_VERIFY_CODE_PITC"].isna() | |
| else: | |
| flag_filter = filtered_df["IMAGE_VERIFY_CODE_PITC"].isin([f for f in selected_flags if f != "Unprocessed"]) | |
| if "Unprocessed" in selected_flags and len(selected_flags) > 1: | |
| flag_filter = flag_filter | filtered_df["IMAGE_VERIFY_CODE_PITC"].isin([f for f in selected_flags if f != "Unprocessed"]) | |
| filtered_df = filtered_df[flag_filter] | |
| # Apply batch filter | |
| if 'batch_filter' in locals() and batch_filter: | |
| filtered_df = filtered_df[filtered_df["BATCH_ID"].str.contains(batch_filter, case=False, na=False)] | |
| # Calculate statistics | |
| batch_stats = get_batch_statistics(filtered_df) | |
| ocr_accuracy = calculate_ocr_model_accuracy(filtered_df) | |
| # ========================= | |
| # HEADER WITH TIME INFO | |
| # ========================= | |
| # Get timestamp info | |
| if 'PROCESSING_TIMESTAMP' in filtered_df.columns and not filtered_df.empty: | |
| latest_time = filtered_df['PROCESSING_TIMESTAMP'].max() | |
| earliest_time = filtered_df['PROCESSING_TIMESTAMP'].min() | |
| time_range = f"{earliest_time.strftime('%Y-%m-%d')} to {latest_time.strftime('%Y-%m-%d')}" | |
| else: | |
| latest_time = "N/A" | |
| time_range = "N/A" | |
| st.markdown(f""" | |
| <div class="enhanced-header"> | |
| <h1 style="margin: 0; font-size: 42px; font-weight: 800;"> | |
| π OCR BATCH INTELLIGENCE DASHBOARD | |
| </h1> | |
| <p style="margin: 10px 0 0 0; font-size: 18px; opacity: 0.95;"> | |
| Advanced Analytics | Time-Based Tracking | OCR Model Performance | |
| </p> | |
| <div style="margin-top: 20px; display: flex; justify-content: center; gap: 20px; flex-wrap: wrap;"> | |
| <span class="status-indicator status-success">π File Data</span> | |
| <span class="status-indicator status-info">π Current: {datetime.now().strftime('%Y-%m-%d %H:%M')}</span> | |
| <span class="status-indicator status-info">π’ {disco_choice}</span> | |
| <span class="status-indicator status-info">π {len(filtered_df):,} Records</span> | |
| <span class="status-indicator status-info">β° Latest: {latest_time.strftime('%Y-%m-%d %H:%M') if isinstance(latest_time, datetime) else 'N/A'}</span> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # ========================= | |
| # NEW: RECENT PROCESSING SECTION | |
| # ========================= | |
| st.markdown('<div class="enhanced-card">', unsafe_allow_html=True) | |
| st.markdown("### β° RECENT PROCESSING ACTIVITY") | |
| # Calculate recent records | |
| current_time = datetime.now() | |
| if 'PROCESSING_TIMESTAMP' in df.columns: | |
| # Records from last 24 hours | |
| last_24h = get_recently_processed(df, hours=24) | |
| last_7d = get_recently_processed(df, hours=168) | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| recent_1h = get_recently_processed(df, hours=1) | |
| st.metric( | |
| label="Last 1 Hour", | |
| value=f"{len(recent_1h):,}", | |
| delta=f"{len(recent_1h)} new" if len(recent_1h) > 0 else "0 new" | |
| ) | |
| with col2: | |
| recent_24h = get_recently_processed(df, hours=24) | |
| st.metric( | |
| label="Last 24 Hours", | |
| value=f"{len(recent_24h):,}", | |
| delta=f"{len(recent_24h)} processed" | |
| ) | |
| with col3: | |
| recent_7d = get_recently_processed(df, hours=168) | |
| st.metric( | |
| label="Last 7 Days", | |
| value=f"{len(recent_7d):,}", | |
| delta=f"{len(recent_7d)} processed" | |
| ) | |
| with col4: | |
| # Calculate processing rate per day | |
| if len(recent_7d) > 0: | |
| avg_per_day = len(recent_7d) / 7 | |
| st.metric( | |
| label="Avg/Day (7D)", | |
| value=f"{avg_per_day:,.0f}", | |
| delta="" | |
| ) | |
| else: | |
| st.metric("Avg/Day (7D)", "0", delta="") | |
| # Show processing trend chart | |
| processing_trend = get_processing_trend(df) | |
| if not processing_trend.empty: | |
| fig = px.line( | |
| processing_trend, | |
| x='PROCESSING_DATE', | |
| y='Records_Processed', | |
| title='Daily Processing Trend', | |
| markers=True, | |
| color_discrete_sequence=['#0078D4'] | |
| ) | |
| fig.update_layout( | |
| xaxis_title='Date', | |
| yaxis_title='Records Processed', | |
| hovermode='x unified' | |
| ) | |
| st.plotly_chart(fig, use_container_width=True) | |
| else: | |
| st.info("No processing timestamp data available. Timestamps will be added on next upload.") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # ========================= | |
| # EXECUTIVE KPIs | |
| # ========================= | |
| st.markdown('<div class="enhanced-card">', unsafe_allow_html=True) | |
| st.markdown("### π EXECUTIVE DASHBOARD") | |
| total_records = len(filtered_df) | |
| processed_records = filtered_df["IMAGE_VERIFY_CODE_PITC"].notna().sum() | |
| successful_records = filtered_df["IMAGE_VERIFY_CODE_PITC"].isin(['A', 'C', 'D']).sum() | |
| processing_rate = (processed_records / total_records * 100) if total_records > 0 else 0 | |
| success_rate = (successful_records / processed_records * 100) if processed_records > 0 else 0 | |
| kpi_cols = st.columns(5) | |
| with kpi_cols[0]: | |
| st.markdown(f""" | |
| <div class="enhanced-kpi blue"> | |
| <div class="kpi-icon">π</div> | |
| <div class="kpi-value">{total_records:,}</div> | |
| <div class="kpi-label">Total Records</div> | |
| <div style="margin-top: 10px; font-size: 14px; opacity: 0.8;"> | |
| {processing_rate:.1f}% processed | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with kpi_cols[1]: | |
| st.markdown(f""" | |
| <div class="enhanced-kpi green"> | |
| <div class="kpi-icon">β </div> | |
| <div class="kpi-value">{success_rate:.1f}%</div> | |
| <div class="kpi-label">Success Rate (A,C,D)</div> | |
| <div style="margin-top: 10px; font-size: 14px; opacity: 0.8;"> | |
| {successful_records:,} successful | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with kpi_cols[2]: | |
| st.markdown(f""" | |
| <div class="enhanced-kpi purple"> | |
| <div class="kpi-icon">π€</div> | |
| <div class="kpi-value">{ocr_accuracy['accuracy']:.1f}%</div> | |
| <div class="kpi-label">OCR Model Accuracy</div> | |
| <div style="margin-top: 10px; font-size: 14px; opacity: 0.8;"> | |
| A vs N only | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with kpi_cols[3]: | |
| perfect_matches = (filtered_df["IMAGE_VERIFY_CODE_PITC"] == 'A').sum() | |
| perfect_match_rate = (perfect_matches / processed_records * 100) if processed_records > 0 else 0 | |
| st.markdown(f""" | |
| <div class="enhanced-kpi orange"> | |
| <div class="kpi-icon">β</div> | |
| <div class="kpi-value">{perfect_match_rate:.1f}%</div> | |
| <div class="kpi-label">Perfect Match (A)</div> | |
| <div style="margin-top: 10px; font-size: 14px; opacity: 0.8;"> | |
| {perfect_matches:,} perfect | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with kpi_cols[4]: | |
| image_issues = (filtered_df["IMAGE_VERIFY_CODE_PITC"] == 'E').sum() | |
| image_issue_rate = (image_issues / processed_records * 100) if processed_records > 0 else 0 | |
| st.markdown(f""" | |
| <div class="enhanced-kpi red"> | |
| <div class="kpi-icon">πΌοΈ</div> | |
| <div class="kpi-value">{image_issue_rate:.1f}%</div> | |
| <div class="kpi-label">Image Issues (E)</div> | |
| <div style="margin-top: 10px; font-size: 14px; opacity: 0.8;"> | |
| {image_issues:,} records | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # ========================= | |
| # FLAG DISTRIBUTION | |
| # ========================= | |
| st.markdown('<div class="enhanced-card">', unsafe_allow_html=True) | |
| st.markdown("### π·οΈ FLAG CLASSIFICATION DISTRIBUTION") | |
| flag_counts = filtered_df["IMAGE_VERIFY_CODE_PITC"].value_counts() | |
| flag_cols = st.columns(5) | |
| for idx, flag in enumerate(['A', 'C', 'D', 'E', 'N']): | |
| count = flag_counts.get(flag, 0) | |
| with flag_cols[idx]: | |
| percentage = (count / processed_records * 100) if processed_records > 0 else 0 | |
| flag_labels = { | |
| 'A': 'β PERFECT MATCH', | |
| 'C': 'β οΈ FLAG C', | |
| 'D': 'β οΈ FLAG D', | |
| 'E': 'β IMAGE ISSUE', | |
| 'N': 'β READING ISSUE' | |
| } | |
| flag_descs = { | |
| 'A': 'Successful with perfect reading (80%+)', | |
| 'C': 'Successful with minor issues', | |
| 'D': 'Successful but needs review', | |
| 'E': 'Image not available', | |
| 'N': 'Reading not matched (<80%)' | |
| } | |
| st.markdown(f""" | |
| <div class="flag-card flag-{flag}"> | |
| <div style="font-size: 32px; font-weight: 800;">{flag}</div> | |
| <div style="font-size: 28px; font-weight: 900;">{count:,}</div> | |
| <div style="font-size: 12px; font-weight: 600; margin: 5px 0;"> | |
| {flag_labels.get(flag, f'FLAG {flag}')} | |
| </div> | |
| <div style="font-size: 14px; color: #666; margin: 5px 0;"> | |
| {percentage:.1f}% of processed | |
| </div> | |
| <div style="font-size: 12px; color: #888; font-style: italic; margin-top: 5px;"> | |
| {flag_descs.get(flag, 'Unknown flag')} | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # ========================= | |
| # DATA PREVIEW WITH TIMESTAMP | |
| # ========================= | |
| st.markdown('<div class="enhanced-card">', unsafe_allow_html=True) | |
| st.markdown("### π DATA OVERVIEW") | |
| tab1, tab2, tab3 = st.tabs(["π Records", "π Batch Statistics", "β° Recent Activity"]) | |
| with tab1: | |
| display_columns = ["REF_DIGITS", "DISCO_NAME", "BATCH_ID", | |
| "IMAGE_VERIFY_CODE_PITC", "ACCURACY_CATEGORY"] | |
| # Add timestamp if available | |
| if 'PROCESSING_TIMESTAMP' in filtered_df.columns: | |
| display_columns.append("PROCESSING_TIMESTAMP") | |
| st.dataframe( | |
| filtered_df[display_columns].head(100), | |
| use_container_width=True, | |
| column_config={ | |
| "PROCESSING_TIMESTAMP": st.column_config.DatetimeColumn( | |
| "Processing Time", | |
| format="YYYY-MM-DD HH:mm:ss" | |
| ) | |
| } if 'PROCESSING_TIMESTAMP' in display_columns else None | |
| ) | |
| with tab2: | |
| if not batch_stats.empty: | |
| display_stats = batch_stats[["Batch ID", "DISCO", "Total Records", "Processed", | |
| "Success Rate (A,C,D)", "OCR Model Accuracy (A vs N)", | |
| "Latest Processing Time"]] | |
| st.dataframe(display_stats, use_container_width=True) | |
| else: | |
| st.info("No batch statistics available") | |
| with tab3: | |
| # Show recently processed records | |
| if 'PROCESSING_TIMESTAMP' in df.columns: | |
| recent_records = get_recently_processed(df, hours=24) | |
| if not recent_records.empty: | |
| st.markdown(f"### π Records Processed in Last 24 Hours: {len(recent_records):,}") | |
| # Group by hour | |
| recent_records['PROCESSING_HOUR'] = pd.to_datetime(recent_records['PROCESSING_TIMESTAMP']).dt.floor('H') | |
| hourly_counts = recent_records.groupby('PROCESSING_HOUR').size().reset_index(name='Count') | |
| hourly_counts = hourly_counts.sort_values('PROCESSING_HOUR', ascending=False) | |
| # Show hourly breakdown | |
| col1, col2 = st.columns([2, 1]) | |
| with col1: | |
| st.dataframe( | |
| recent_records[["REF_DIGITS", "DISCO_NAME", "BATCH_ID", | |
| "IMAGE_VERIFY_CODE_PITC", "PROCESSING_TIMESTAMP"]] | |
| .sort_values('PROCESSING_TIMESTAMP', ascending=False) | |
| .head(50), | |
| use_container_width=True | |
| ) | |
| with col2: | |
| st.markdown("#### π By Hour") | |
| st.dataframe(hourly_counts, use_container_width=True) | |
| # Show latest batch | |
| if not recent_records.empty: | |
| latest_batch = recent_records.loc[recent_records['PROCESSING_TIMESTAMP'].idxmax()] | |
| st.markdown("#### π Latest Record") | |
| st.write(f"**REF:** {latest_batch['REF_DIGITS']}") | |
| st.write(f"**DISCO:** {latest_batch['DISCO_NAME']}") | |
| st.write(f"**Batch:** {latest_batch['BATCH_ID']}") | |
| st.write(f"**Flag:** {latest_batch['IMAGE_VERIFY_CODE_PITC']}") | |
| st.write(f"**Time:** {latest_batch['PROCESSING_TIMESTAMP']}") | |
| else: | |
| st.info("No records processed in the last 24 hours") | |
| else: | |
| st.info("No timestamp data available") | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # ========================= | |
| # FOOTER WITH ENHANCED TIME INFO | |
| # ========================= | |
| # Calculate time-based metrics | |
| if 'PROCESSING_TIMESTAMP' in filtered_df.columns and not filtered_df.empty: | |
| time_metrics = { | |
| "latest_time": filtered_df['PROCESSING_TIMESTAMP'].max(), | |
| "earliest_time": filtered_df['PROCESSING_TIMESTAMP'].min(), | |
| "records_today": len(get_recently_processed(df, hours=24)), | |
| "unique_dates": filtered_df['PROCESSING_DATE'].nunique() if 'PROCESSING_DATE' in filtered_df.columns else 0 | |
| } | |
| else: | |
| time_metrics = { | |
| "latest_time": "N/A", | |
| "earliest_time": "N/A", | |
| "records_today": 0, | |
| "unique_dates": 0 | |
| } | |
| st.markdown(f""" | |
| <div style=" | |
| background: linear-gradient(135deg, var(--dark-gray), #1a1a1a); | |
| color: white; | |
| padding: 20px; | |
| border-radius: 12px; | |
| margin-top: 30px; | |
| text-align: center; | |
| font-size: 12px; | |
| "> | |
| <div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 10px;"> | |
| <div> | |
| π <strong>Last Updated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | |
| </div> | |
| <div> | |
| π’ <strong>DISCO:</strong> {disco_choice} | |
| </div> | |
| <div> | |
| π <strong>Records:</strong> {len(filtered_df):,} | |
| </div> | |
| <div> | |
| β <strong>Success Rate:</strong> {success_rate:.1f}% | |
| </div> | |
| <div> | |
| π€ <strong>OCR Accuracy:</strong> {ocr_accuracy['accuracy']:.1f}% | |
| </div> | |
| <div> | |
| β° <strong>Latest Processing:</strong> {time_metrics['latest_time'].strftime('%Y-%m-%d %H:%M') if isinstance(time_metrics['latest_time'], datetime) else 'N/A'} | |
| </div> | |
| </div> | |
| <div style="margin-top: 10px; font-size: 11px; opacity: 0.7;"> | |
| π OCR Intelligence Dashboard | Time-Based Tracking | Parquet Format Recommended | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) |