Spaces:

tyasnk
/

OCR_Dashboard

Sleeping

App Files Files Community

tyasnk committed on Dec 3, 2025

Commit

34844c6

verified ·

1 Parent(s): f111ecc

Upload folder using huggingface_hub

Browse files

Files changed (16) hide show

.gitkeep +0 -0
README.md +3 -9
calculate_ocr_accuracy.py +729 -0
data.sql +140 -0
ocr_accuracy_dashboard.py +238 -0
results.json +514 -0
results.md +290 -0
unused/label.sql +0 -0
unused/page1_label.sql +29 -0
unused/page1_prediction.sql +36 -0
unused/page2_label.sql +40 -0
unused/page2_prediction.sql +47 -0
unused/page3_label.sql +0 -0
unused/page3_prediction.sql +90 -0
unused/prediction.sql +68 -0
unused/sample_accuracy_output.json +719 -0

.gitkeep ADDED Viewed

File without changes

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: OCR Dashboard
-emoji: 🔥
-colorFrom: yellow
-colorTo: purple
 sdk: gradio
-sdk_version: 6.0.2
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: OCR_Dashboard
+app_file: ocr_accuracy_dashboard.py
 sdk: gradio
+sdk_version: 6.0.1
 ---

calculate_ocr_accuracy.py ADDED Viewed

	@@ -0,0 +1,729 @@

+#!/usr/bin/env python3
+"""
+Calculate OCR accuracy using edit distance (Levenshtein distance)
+between label and predict data from JSON files or BigQuery SQL queries.
+Supports two modes:
+1. Separate files: Provide --label and --predict files (records matched by keys)
+2. Combined data: Provide --data file with label fields having a suffix (e.g., _label)
+Example with combined data (data.sql):
+    python calculate_ocr_accuracy.py --data scripts/data.sql --normalize
+Example with separate files:
+    python calculate_ocr_accuracy.py --label labels.json --predict predicts.json --normalize
+"""
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional, Any
+from collections import defaultdict
+try:
+    from google.cloud import bigquery
+    BIGQUERY_AVAILABLE = True
+except ImportError:
+    BIGQUERY_AVAILABLE = False
+    print("Warning: google-cloud-bigquery not installed. SQL query support disabled.")
+def levenshtein_distance(s1: str, s2: str) -> int:
+    """
+    Compute the Levenshtein (edit) distance between two strings.
+    Args:
+        s1: First string
+        s2: Second string
+    Returns:
+        Minimum number of single-character insertions, deletions and
+        substitutions needed to turn one string into the other
+    """
+    # Keep the shorter string as the inner (column) dimension so the two
+    # rolling rows of the DP table stay as small as possible.
+    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)
+    if not shorter:
+        return len(longer)
+    prev_costs = list(range(len(shorter) + 1))
+    for row_no, ch_long in enumerate(longer, start=1):
+        costs = [row_no]
+        for col, ch_short in enumerate(shorter):
+            step_from_above = prev_costs[col + 1] + 1
+            step_from_left = costs[col] + 1
+            # Diagonal step is free when the two characters agree.
+            step_diagonal = prev_costs[col] + (ch_long != ch_short)
+            costs.append(min(step_from_above, step_from_left, step_diagonal))
+        prev_costs = costs
+    return prev_costs[-1]
+def normalize_string(s: str) -> str:
+    """
+    Prepare a string for case- and padding-insensitive comparison.
+    Args:
+        s: Input string
+    Returns:
+        The lowercased string with leading and trailing whitespace removed
+    """
+    lowered = s.lower()
+    return lowered.strip()
+def calculate_field_accuracy(
+    label_value: str,
+    predict_value: str,
+    normalize: bool = False
+) -> Tuple[int, float, bool]:
+    """
+    Calculate accuracy metrics for a single field.
+    Args:
+        label_value: Ground truth value
+        predict_value: Predicted value
+        normalize: Whether to normalize strings (lowercase, strip) before comparison
+    Returns:
+        Tuple of (edit_distance, normalized_accuracy, exact_match) where
+        normalized_accuracy = 1 - edit_distance / length of the longer string
+    """
+    if normalize:
+        label_value = normalize_string(label_value)
+        predict_value = normalize_string(predict_value)
+    if label_value == predict_value:
+        # Identical strings (including two empty ones) have distance 0 and
+        # accuracy 1.0; no need to fill the edit-distance table.
+        return 0, 1.0, True
+    edit_dist = levenshtein_distance(label_value, predict_value)
+    # The strings differ, so at least one is non-empty and the divisor is >= 1.
+    longest = max(len(label_value), len(predict_value))
+    return edit_dist, 1.0 - (edit_dist / longest), False
+def execute_bigquery_query(query_file: str, project_id: Optional[str] = None) -> List[Dict[str, Any]]:
+    """
+    Run the SQL stored in a file against BigQuery and collect the rows.
+    Args:
+        query_file: Path to SQL query file
+        project_id: GCP project ID (if None, uses default from environment)
+    Returns:
+        List of dictionaries, one per row; every value is converted with
+        str() and NULL values become ''
+    Raises:
+        ImportError: google-cloud-bigquery could not be imported
+        FileNotFoundError: query_file does not exist
+    """
+    if not BIGQUERY_AVAILABLE:
+        raise ImportError("google-cloud-bigquery is required for SQL query support. "
+                         "Install it with: poetry add google-cloud-bigquery")
+    sql_path = Path(query_file)
+    if not sql_path.exists():
+        raise FileNotFoundError(f"Query file not found: {query_file}")
+    sql_text = sql_path.read_text(encoding='utf-8')
+    # Pass `project` only when one was supplied; otherwise construct the
+    # client with no arguments.
+    client_kwargs = {'project': project_id} if project_id else {}
+    client = bigquery.Client(**client_kwargs)
+    print(f"Executing BigQuery query from: {query_file}")
+    rows = client.query(sql_text).result()
+    # Stringify every cell so downstream comparisons always see text.
+    records = [
+        {column: '' if cell is None else str(cell) for column, cell in row.items()}
+        for row in rows
+    ]
+    print(f"Retrieved {len(records)} records from BigQuery")
+    return records
+def load_data_from_file(file_path: str, project_id: Optional[str] = None) -> List[Dict[str, Any]]:
+    """
+    Load records from a JSON file, or by running a `.sql` file on BigQuery.
+    Args:
+        file_path: Path to JSON file or SQL query file
+        project_id: GCP project ID for BigQuery queries (optional)
+    Returns:
+        List of dictionaries
+    Raises:
+        FileNotFoundError: file_path does not exist
+    """
+    source = Path(file_path)
+    if not source.exists():
+        raise FileNotFoundError(f"File not found: {file_path}")
+    is_sql = source.suffix.lower() == '.sql'
+    if is_sql:
+        return execute_bigquery_query(file_path, project_id)
+    # Anything that is not a .sql file is parsed as JSON.
+    with source.open('r', encoding='utf-8') as handle:
+        return json.load(handle)
+def match_records(
+    labels: List[Dict[str, Any]],
+    predicts: List[Dict[str, Any]],
+    match_keys: Optional[List[str]] = None
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """
+    Pair label and predict records on a single shared key.
+    The key is the first entry of match_keys that is present in the first
+    record of both lists. When no such key exists, records are paired by
+    position and the longer list is truncated.
+    Args:
+        labels: List of label records
+        predicts: List of predict records
+        match_keys: Candidate field names, in priority order
+            (default: ['filename', 'project_id_mother'])
+    Returns:
+        Tuple of (matched_labels, matched_predicts) with same length and order
+    """
+    if match_keys is None:
+        match_keys = ['filename', 'project_id_mother']
+    # Only the first record of each side is inspected to decide the key.
+    match_key = None
+    if labels and predicts:
+        shared_fields = labels[0].keys() & predicts[0].keys()
+        match_key = next((key for key in match_keys if key in shared_fields), None)
+    if match_key is None:
+        print("Warning: No matching keys found. Matching by index order.")
+        min_len = min(len(labels), len(predicts))
+        return labels[:min_len], predicts[:min_len]
+    print(f"Matching records by key: {match_key}")
+    # Index predictions by key value (compared as text). Records with an empty
+    # key are not indexed; on duplicate keys the last record wins.
+    predict_index = {}
+    for predict_record in predicts:
+        key_value = str(predict_record.get(match_key, ''))
+        if key_value:
+            predict_index[key_value] = predict_record
+    matched_labels = []
+    matched_predicts = []
+    for label_record in labels:
+        key_value = str(label_record.get(match_key, ''))
+        if key_value in predict_index:
+            matched_labels.append(label_record)
+            matched_predicts.append(predict_index[key_value])
+    print(f"Matched {len(matched_labels)} records out of {len(labels)} labels and {len(predicts)} predicts")
+    return matched_labels, matched_predicts
+def _cell_text(value: Any) -> str:
+    """Render a record value as text, mapping None to '' (the rule execute_bigquery_query applies to NULLs)."""
+    return '' if value is None else str(value)
+def _new_field_stats() -> Dict[str, Any]:
+    """Return a zeroed statistics accumulator for a single field."""
+    return {
+        'total_records': 0,
+        'exact_matches': 0,
+        'total_edit_distance': 0,
+        'total_normalized_accuracy': 0.0,
+        'min_accuracy': 1.0,
+        'max_edit_distance': 0,
+        'examples': []  # First few mismatches, kept for the report
+    }
+def _accumulate_comparison(
+    stats: Dict[str, Any],
+    record_idx: int,
+    label_value: str,
+    predict_value: str,
+    normalize: bool,
+    max_examples: int = 3
+) -> None:
+    """Score one label/predict pair and fold it into `stats`; pairs that are both blank are ignored."""
+    if not label_value.strip() and not predict_value.strip():
+        return
+    edit_dist, normalized_acc, exact_match = calculate_field_accuracy(
+        label_value, predict_value, normalize
+    )
+    stats['total_records'] += 1
+    stats['total_edit_distance'] += edit_dist
+    stats['total_normalized_accuracy'] += normalized_acc
+    stats['max_edit_distance'] = max(stats['max_edit_distance'], edit_dist)
+    stats['min_accuracy'] = min(stats['min_accuracy'], normalized_acc)
+    if exact_match:
+        stats['exact_matches'] += 1
+    elif len(stats['examples']) < max_examples:
+        stats['examples'].append({
+            'record_idx': record_idx,
+            'label': label_value,
+            'predict': predict_value,
+            'edit_distance': edit_dist,
+            'accuracy': normalized_acc
+        })
+def _summarize_field_stats(field_stats: Dict[str, Dict[str, Any]], field_order: List[str]) -> Dict:
+    """Convert accumulators into per-field results in `field_order`, omitting fields with no scored records."""
+    results = {}
+    for field_name in field_order:
+        stats = field_stats.get(field_name)
+        if not stats or stats['total_records'] == 0:
+            continue
+        total = stats['total_records']
+        results[field_name] = {
+            'exact_match_rate': stats['exact_matches'] / total,
+            'average_edit_distance': stats['total_edit_distance'] / total,
+            'average_normalized_accuracy': stats['total_normalized_accuracy'] / total,
+            'min_accuracy': stats['min_accuracy'],
+            'max_edit_distance': stats['max_edit_distance'],
+            'exact_matches': stats['exact_matches'],
+            'total_records': total,
+            'examples': stats['examples']
+        }
+    return results
+def calculate_ocr_accuracy_from_combined_data(
+    data_file: str,
+    normalize: bool = False,
+    project_id: Optional[str] = None,
+    label_suffix: str = '_label'
+) -> Dict:
+    """
+    Calculate OCR accuracy from a single data source where label and predict
+    fields are in the same records (labels have a suffix like '_label').
+    Field pairs are discovered from the keys of the first record only. A
+    value of None is treated as an empty string, and a pair whose label and
+    prediction are both blank is not counted.
+    Args:
+        data_file: Path to JSON file or SQL query file containing combined data
+        normalize: Whether to normalize strings before comparison
+        project_id: GCP project ID for BigQuery queries (optional)
+        label_suffix: Suffix used to identify label fields (default: '_label');
+            an empty suffix yields no field pairs
+    Returns:
+        Dictionary containing accuracy metrics per field (sorted by field
+        name); empty when there are no records or no field pairs
+    """
+    records = load_data_from_file(data_file, project_id)
+    if not records:
+        print("Warning: No records found in data file")
+        return {}
+    all_fields = set(records[0].keys())
+    # Metadata columns are never scored, even if a suffixed twin exists.
+    exclude_fields = {'filename', 'project_id_mother', 'id', 'rn'}
+    field_pairs = {}  # predict_field_name -> label_field_name
+    if label_suffix:
+        for field in all_fields:
+            if not field.endswith(label_suffix):
+                continue
+            predict_field = field[:-len(label_suffix)]
+            if predict_field in all_fields and predict_field not in exclude_fields:
+                field_pairs[predict_field] = field
+    if not field_pairs:
+        print("Warning: No matching field pairs found. "
+              f"Looking for fields with '{label_suffix}' suffix.")
+        return {}
+    predict_fields = sorted(field_pairs)
+    print(f"Found {len(predict_fields)} field pairs to compare")
+    print(f"Fields: {', '.join(predict_fields)}")
+    field_stats = defaultdict(_new_field_stats)
+    for record_idx, record in enumerate(records):
+        for predict_field in predict_fields:
+            _accumulate_comparison(
+                field_stats[predict_field],
+                record_idx,
+                _cell_text(record.get(field_pairs[predict_field])),
+                _cell_text(record.get(predict_field)),
+                normalize
+            )
+    return _summarize_field_stats(field_stats, predict_fields)
+def calculate_ocr_accuracy(
+    label_file: str,
+    predict_file: str,
+    normalize: bool = False,
+    match_keys: Optional[List[str]] = None,
+    project_id: Optional[str] = None
+) -> Dict:
+    """
+    Calculate OCR accuracy per field between label and predict data.
+    Supports both JSON files and BigQuery SQL query files. The fields scored
+    are the keys of the first matched label record, minus the metadata
+    columns. A value of None is treated as an empty string, and a pair whose
+    label and prediction are both blank is not counted.
+    Args:
+        label_file: Path to label JSON file or SQL query file
+        predict_file: Path to predict JSON file or SQL query file
+        normalize: Whether to normalize strings before comparison
+        match_keys: List of field names to use for matching records
+        project_id: GCP project ID for BigQuery queries (optional)
+    Returns:
+        Dictionary containing accuracy metrics per field; empty when no
+        records could be matched or every field is excluded
+    """
+    labels = load_data_from_file(label_file, project_id)
+    predicts = load_data_from_file(predict_file, project_id)
+    # match_records returns two lists of equal length in the same order, so
+    # the pairs can be zipped directly.
+    labels, predicts = match_records(labels, predicts, match_keys)
+    if not labels:
+        return {}
+    exclude_fields = {'filename', 'project_id_mother'}
+    field_names = [f for f in labels[0].keys() if f not in exclude_fields]
+    if not field_names:
+        print("Warning: No fields to calculate accuracy for (all fields excluded)")
+        return {}
+    # Sorted so the message does not depend on set iteration order.
+    print(f"Calculating accuracy for {len(field_names)} fields (excluding: {', '.join(sorted(exclude_fields))})")
+    field_stats = defaultdict(_new_field_stats)
+    for record_idx, (label_record, predict_record) in enumerate(zip(labels, predicts)):
+        for field_name in field_names:
+            _accumulate_comparison(
+                field_stats[field_name],
+                record_idx,
+                _cell_text(label_record.get(field_name)),
+                _cell_text(predict_record.get(field_name)),
+                normalize
+            )
+    return _summarize_field_stats(field_stats, field_names)
+def print_results(results: Dict, output_file: Optional[str] = None):
+    """
+    Print accuracy results as a formatted text report, optionally saving it.
+    Args:
+        results: Dictionary containing accuracy metrics per field; may be empty
+        output_file: Optional file path to save the report to
+    """
+    header = "=" * 100
+    rule = "-" * 100
+    output_lines = [header, "OCR ACCURACY REPORT (Edit Distance Analysis)", header, ""]
+    if not results:
+        # An empty result set has no fields to average over; report that
+        # instead of dividing by zero below.
+        output_lines.append("No fields with comparable records were found; nothing to report.")
+    else:
+        # Sort fields by average normalized accuracy (descending)
+        sorted_fields = sorted(
+            results.items(),
+            key=lambda x: x[1]['average_normalized_accuracy'],
+            reverse=True
+        )
+        # Summary statistics are unweighted means over fields.
+        total_fields = len(results)
+        avg_exact_match = sum(r['exact_match_rate'] for r in results.values()) / total_fields
+        avg_normalized_acc = sum(r['average_normalized_accuracy'] for r in results.values()) / total_fields
+        output_lines.append("SUMMARY STATISTICS")
+        output_lines.append(rule)
+        output_lines.append(f"Total Fields Analyzed: {total_fields}")
+        output_lines.append(f"Overall Exact Match Rate: {avg_exact_match:.2%}")
+        output_lines.append(f"Overall Average Normalized Accuracy: {avg_normalized_acc:.2%}")
+        output_lines.append("")
+        # Per-field statistics
+        output_lines.append("PER-FIELD STATISTICS")
+        output_lines.append(rule)
+        output_lines.append(f"{'Field Name':<50} {'Exact Match':<15} {'Avg Accuracy':<15} {'Avg Edit Dist':<15}")
+        output_lines.append(rule)
+        for field_name, stats in sorted_fields:
+            exact_match_pct = stats['exact_match_rate'] * 100
+            avg_acc = stats['average_normalized_accuracy'] * 100
+            avg_edit = stats['average_edit_distance']
+            output_lines.append(
+                f"{field_name:<50} {exact_match_pct:>6.2f}% ({stats['exact_matches']}/{stats['total_records']}) "
+                f"{avg_acc:>6.2f}% {avg_edit:>6.2f}"
+            )
+        output_lines.append("")
+        output_lines.append("")
+        # Detailed examples for fields with errors
+        output_lines.append("EXAMPLES OF MISMATCHES (Top 3 per field)")
+        output_lines.append(rule)
+        for field_name, stats in sorted_fields:
+            if stats['exact_matches'] < stats['total_records'] and stats['examples']:
+                output_lines.append(f"\nField: {field_name}")
+                output_lines.append(f"  Exact Match Rate: {stats['exact_match_rate']:.2%}")
+                output_lines.append(f"  Average Accuracy: {stats['average_normalized_accuracy']:.2%}")
+                for example in stats['examples']:
+                    output_lines.append(f"  Record {example['record_idx']}:")
+                    output_lines.append(f"    Label:   '{example['label']}'")
+                    output_lines.append(f"    Predict: '{example['predict']}'")
+                    output_lines.append(f"    Edit Distance: {example['edit_distance']}, "
+                                      f"Accuracy: {example['accuracy']:.2%}")
+    # Print to console
+    output_text = "\n".join(output_lines)
+    print(output_text)
+    # Save to file if specified
+    if output_file:
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(output_text)
+        print(f"\nResults saved to: {output_file}")
+def main():
+    """
+    Command-line entry point: parse arguments, compute accuracy, emit reports.
+    Exits with status 1 when an input file is missing or the calculation
+    raises; argument errors exit through argparse.
+    """
+    import argparse
+    parser = argparse.ArgumentParser(
+        description='Calculate OCR accuracy using edit distance between label and predict data. '
+                   'Supports JSON files and BigQuery SQL query files (.sql). '
+                   'Can use either two separate files (--label and --predict) or a single '
+                   'combined file (--data) with label fields having a suffix (e.g., _label).'
+    )
+    # Mode selection: either combined data or separate label/predict files
+    input_group = parser.add_mutually_exclusive_group(required=True)
+    input_group.add_argument(
+        '--data',
+        type=str,
+        help='Path to combined data file (JSON or SQL) with label and predict fields in same records. '
+             'Label fields should have a suffix (default: _label). Use this for queries like data.sql'
+    )
+    input_group.add_argument(
+        '--label',
+        type=str,
+        help='Path to label JSON file or SQL query file (use with --predict)'
+    )
+    parser.add_argument(
+        '--predict',
+        type=str,
+        help='Path to predict JSON file or SQL query file (use with --label)'
+    )
+    parser.add_argument(
+        '--output',
+        type=str,
+        default=None,
+        help='Path to output file for results (optional)'
+    )
+    parser.add_argument(
+        '--normalize',
+        action='store_true',
+        help='Normalize strings (lowercase, strip) before comparison'
+    )
+    parser.add_argument(
+        '--json-output',
+        type=str,
+        default=None,
+        help='Path to save results as JSON (optional)'
+    )
+    parser.add_argument(
+        '--match-keys',
+        type=str,
+        nargs='+',
+        default=['filename', 'project_id_mother'],
+        help='Field names to use for matching records (default: filename project_id_mother). '
+             'Only used with --label/--predict mode.'
+    )
+    parser.add_argument(
+        '--project-id',
+        type=str,
+        default=None,
+        help='GCP project ID for BigQuery (default: uses default from environment)'
+    )
+    parser.add_argument(
+        '--label-suffix',
+        type=str,
+        default='_label',
+        help='Suffix used to identify label fields in combined data mode (default: _label)'
+    )
+    args = parser.parse_args()
+    combined_mode = bool(args.data)
+    if combined_mode:
+        if args.predict:
+            # --predict sits outside the exclusive group, so argparse lets it through.
+            print("Warning: --predict is ignored when --data is given", file=sys.stderr)
+        inputs = [('Data', Path(args.data))]
+    else:
+        if not args.label or not args.predict:
+            parser.error("--label and --predict are required when not using --data mode")
+        inputs = [('Label', Path(args.label)), ('Predict', Path(args.predict))]
+    # Fail early, on stderr like every other error, if an input is missing.
+    for kind, path in inputs:
+        if not path.exists():
+            print(f"Error: {kind} file not found: {path}", file=sys.stderr)
+            sys.exit(1)
+    if combined_mode:
+        print("Calculating OCR accuracy from combined data...")
+        print(f"Data file: {inputs[0][1]}")
+        print(f"Label suffix: {args.label_suffix}")
+        print(f"Normalize: {args.normalize}")
+    else:
+        print("Calculating OCR accuracy...")
+        print(f"Label file: {inputs[0][1]}")
+        print(f"Predict file: {inputs[1][1]}")
+        print(f"Normalize: {args.normalize}")
+        print(f"Match keys: {args.match_keys}")
+    print()
+    try:
+        if combined_mode:
+            results = calculate_ocr_accuracy_from_combined_data(
+                str(inputs[0][1]),
+                normalize=args.normalize,
+                project_id=args.project_id,
+                label_suffix=args.label_suffix
+            )
+        else:
+            results = calculate_ocr_accuracy(
+                str(inputs[0][1]),
+                str(inputs[1][1]),
+                normalize=args.normalize,
+                match_keys=args.match_keys,
+                project_id=args.project_id
+            )
+        # Print results
+        print_results(results, args.output)
+        # Save JSON output if requested
+        if args.json_output:
+            with open(args.json_output, 'w', encoding='utf-8') as f:
+                json.dump(results, f, indent=2, ensure_ascii=False)
+            print(f"\nJSON results saved to: {args.json_output}")
+    except Exception as e:
+        # Top-level boundary: report any failure and signal it via exit status.
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+if __name__ == '__main__':
+    main()

data.sql ADDED Viewed

	@@ -0,0 +1,140 @@

+-- Combined OCR predictions and ground-truth labels, one row per cleaned
+-- filename. Every ground-truth column is aliased with a `_label` suffix on
+-- the base name of the prediction column it belongs to.
+WITH dedup_master_files AS (
+  -- Newest master_files row per original_filename; cleaned_filename is the
+  -- lowercased name with any leading "Copy of " prefixes removed.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      LOWER(
+        REGEXP_REPLACE(original_filename, r'^(Copy of\s*)+', '')
+      ) AS cleaned_filename,
+      ROW_NUMBER() OVER (
+        PARTITION BY original_filename
+        ORDER BY created_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.master_files`
+  )
+  WHERE rn = 1
+),
+dedup_page1_cover AS (
+  -- Newest page1_cover row per file_id.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY file_id
+        ORDER BY created_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page1_cover`
+  )
+  WHERE rn = 1
+),
+dedup_page2_identitas AS (
+  -- Newest page2_identitas row per file_id (this table orders by inserted_at).
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY file_id
+        ORDER BY inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page2_identitas`
+  )
+  WHERE rn = 1
+),
+label AS (
+  -- Latest ground-truth entry per filename (by `end`). The mother's name is
+  -- first_name + ' ' + last_name, collapsed to first_name when both are equal.
+  SELECT
+    filename AS filename_label,
+    project_id_mother AS project_id_mother_label,
+    CASE
+      WHEN first_name IS NOT NULL
+          AND last_name IS NOT NULL
+          AND first_name = last_name
+        THEN first_name
+      ELSE CONCAT(
+            IFNULL(first_name, ''),
+            IFNULL(
+              CONCAT(' ', last_name),
+              ''
+            )
+          )
+    END AS nama_ibu_cover_label,
+    puskesmas_name AS dikeluarkan_oleh_fasilitas_kesehatan_label,
+    regency_name AS kabupaten_kota_label,
+    CASE
+      WHEN first_name IS NOT NULL
+          AND last_name IS NOT NULL
+          AND first_name = last_name
+        THEN first_name
+      ELSE CONCAT(
+            IFNULL(first_name, ''),
+            IFNULL(
+              CONCAT(' ', last_name),
+              ''
+            )
+          )
+    -- Suffixed like every other label column so it no longer shares a name
+    -- with P2.nama_ibu in `main`.
+    END AS nama_ibu_label,
+    nik_mother AS nik_ibu_label,
+    birth_date AS tempat_tanggal_lahir_ibu_label,
+    address_street AS alamat_rumah_ibu_label,
+    contact_number AS telepon_ibu_label,
+    bpjs_mother AS no_jkn_ibu_label,
+    education_level AS pendidikan_ibu_label,
+    occupation AS pekerjaan_ibu_label,
+    blood_type_result AS golongan_darah_ibu_label,
+    age AS usia_ibu_label,
+    pregnancy_number AS kehamilan_ke_label,
+    number_live_birth AS jumlah_anak_lahir_hidup_label,
+    number_birth_lost AS riwayat_keguguran_label,
+    previous_preg_issue AS riwayat_penyakit_ibu_label
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY `filename`
+        ORDER BY `end` DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.ocr_bukukia_ocr_sid.List Correct Entry 2025`
+  )
+  WHERE
+    rn = 1
+)
+,
+main AS (
+SELECT
+  MF.id,
+  MF.cleaned_filename AS `filename`,
+  REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS project_id_mother,
+  P1.nama_ibu_cover,
+  P1.dikeluarkan_oleh_fasilitas_kesehatan,
+  P1.kabupaten_kota,
+  P2.nama_ibu,
+  P2.nik_ibu,
+  P2.tempat_tanggal_lahir_ibu,
+  P2.alamat_rumah_ibu,
+  P2.telepon_ibu,
+  P2.no_jkn_ibu,
+  P2.pendidikan_ibu,
+  P2.pekerjaan_ibu,
+  P2.golongan_darah_ibu,
+  P2.usia_ibu,
+  P2.kehamilan_ke,
+  P2.jumlah_anak_lahir_hidup,
+  P2.riwayat_keguguran,
+  P2.riwayat_penyakit_ibu,
+  L.*,
+  -- Several original filenames can clean to the same name ("Copy of ...").
+  -- Keep the newest; ordering by the partition key itself would leave the
+  -- surviving row arbitrary.
+  ROW_NUMBER() OVER (
+    PARTITION BY MF.`cleaned_filename`
+    ORDER BY MF.created_at DESC
+  ) AS rn
+FROM dedup_master_files MF
+-- NOTE(review): P1 and P2 are deduplicated per file_id but joined on id.
+-- Confirm that `id` (not `file_id`) is the column referencing master_files.id.
+LEFT JOIN dedup_page1_cover P1 ON MF.id = P1.id
+LEFT JOIN dedup_page2_identitas P2 ON MF.id = P2.id
+-- Labels are matched on the project id (7000 followed by 4+ digits) embedded in the filename.
+LEFT JOIN label L ON CAST(REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS INTEGER) = L.project_id_mother_label
+)
+SELECT *
+FROM main
+WHERE
+  rn = 1 and
+  id != "9a21ef00-6b02-4b04-81c3-68e25e2c8b7f" --duplicated

ocr_accuracy_dashboard.py ADDED Viewed

	@@ -0,0 +1,238 @@

+"""Gradio dashboard for visualizing OCR accuracy results."""
+import json
+from pathlib import Path
+from typing import Any
+import gradio as gr
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+def load_accuracy_data(json_path: str | Path) -> dict[str, Any]:
+    """Read the UTF-8 JSON file at `json_path` and return its parsed content."""
+    with Path(json_path).open("r", encoding="utf-8") as handle:
+        payload = json.load(handle)
+    return payload
+def create_summary_stats(data: dict[str, Any]) -> pd.DataFrame:
+    """Build the per-field summary table, best average accuracy first.
+    The three rate columns (Exact Match Rate, Average Accuracy, Min Accuracy)
+    are returned as percent strings with two decimals (e.g. "97.50%"); the
+    remaining numeric columns are left as numbers. Empty `data` yields an
+    empty table that still has all columns.
+    """
+    columns = [
+        "Field",
+        "Exact Match Rate",
+        "Average Accuracy",
+        "Min Accuracy",
+        "Average Edit Distance",
+        "Total Records",
+        "Exact Matches",
+    ]
+    summary_data = [
+        {
+            "Field": field_name.replace("_", " ").title(),
+            "Exact Match Rate": metrics['exact_match_rate'],
+            "Average Accuracy": metrics['average_normalized_accuracy'],
+            "Min Accuracy": metrics['min_accuracy'],
+            "Average Edit Distance": metrics['average_edit_distance'],
+            "Total Records": metrics['total_records'],
+            "Exact Matches": metrics['exact_matches'],
+        }
+        for field_name, metrics in data.items()
+    ]
+    # Naming the columns explicitly keeps them present when there are no
+    # rows, so the sort below cannot fail on a missing column.
+    df = pd.DataFrame(summary_data, columns=columns)
+    # Sort while the values are still numeric, then format for display.
+    df = df.sort_values("Average Accuracy", ascending=False)
+    for rate_column in ("Average Accuracy", "Exact Match Rate", "Min Accuracy"):
+        df[rate_column] = df[rate_column].apply(lambda x: f"{x:.2%}")
+    return df
+def create_accuracy_chart(data: dict[str, Any]) -> go.Figure:
+    """Create bar chart of average normalized accuracy by field."""
+    # Sort by average normalized accuracy (descending - best first)
+    sorted_items = sorted(
+        data.items(),
+        key=lambda x: x[1]["average_normalized_accuracy"],
+        reverse=True,
+    )
+    fields = []
+    accuracies = []
+    exact_match_rates = []
+    for field_name, metrics in sorted_items:
+        fields.append(field_name.replace("_", " ").title())
+        accuracies.append(metrics["average_normalized_accuracy"])
+        exact_match_rates.append(metrics["exact_match_rate"])
+    fig = go.Figure()
+    fig.add_trace(
+        go.Bar(
+            name="Average Normalized Accuracy",
+            x=fields,
+            y=accuracies,
+            marker_color="lightblue",
+        )
+    )
+    fig.add_trace(
+        go.Bar(
+            name="Exact Match Rate",
+            x=fields,
+            y=exact_match_rates,
+            marker_color="lightcoral",
+        )
+    )
+    fig.update_layout(
+        title="OCR Accuracy Metrics by Field",
+        xaxis_title="Field",
+        yaxis_title="Accuracy Rate",
+        barmode="group",
+        height=600,
+        xaxis={"tickangle": -45},
+    )
+    return fig
+def create_edit_distance_chart(data: dict[str, Any]) -> go.Figure:
+    """Create bar chart of average edit distance by field."""
+    # Sort by average edit distance (ascending - best first, lower is better)
+    sorted_items = sorted(
+        data.items(),
+        key=lambda x: x[1]["average_edit_distance"],
+    )
+    fields = []
+    edit_distances = []
+    for field_name, metrics in sorted_items:
+        fields.append(field_name.replace("_", " ").title())
+        edit_distances.append(metrics["average_edit_distance"])
+    fig = px.bar(
+        x=fields,
+        y=edit_distances,
+        title="Average Edit Distance by Field",
+        labels={"x": "Field", "y": "Average Edit Distance"},
+        color=edit_distances,
+        color_continuous_scale="Reds",
+    )
+    fig.update_layout(height=600, xaxis={"tickangle": -45})
+    return fig
+def get_field_examples(data: dict[str, Any], field_name: str) -> pd.DataFrame:
+    """Get examples for a specific field."""
+    if field_name not in data:
+        return pd.DataFrame()
+    examples = data[field_name].get("examples", [])
+    if not examples:
+        return pd.DataFrame(
+            columns=["Record Index", "Label", "Prediction", "Edit Distance", "Accuracy"]
+        )
+    example_data = []
+    for ex in examples:
+        example_data.append(
+            {
+                "Record Index": ex["record_idx"],
+                "Label": ex["label"],
+                "Prediction": ex["predict"],
+                "Edit Distance": ex["edit_distance"],
+                "Accuracy": f"{ex['accuracy']:.2%}",
+            }
+        )
+    return pd.DataFrame(example_data)
+def create_dashboard(json_path: str | Path):
+    """Create and launch Gradio dashboard."""
+    # Load data
+    data = load_accuracy_data(json_path)
+    # Create components
+    with gr.Blocks(title="OCR Accuracy Dashboard") as demo:
+        gr.Markdown(
+            """
+            # 📊 OCR Accuracy Dashboard
+            Visualize and analyze OCR accuracy results from KIA document processing.
+            """
+        )
+        with gr.Tabs():
+            # Summary Tab
+            with gr.Tab("📈 Summary"):
+                gr.Markdown("### Overall Statistics")
+                summary_df = create_summary_stats(data)
+                summary_table = gr.Dataframe(
+                    value=summary_df,
+                    interactive=False,
+                    wrap=True,
+                )
+                gr.Markdown("### Accuracy Metrics Comparison")
+                accuracy_chart = gr.Plot(
+                    value=create_accuracy_chart(data),
+                    label="Accuracy Metrics",
+                )
+                gr.Markdown("### Edit Distance Analysis")
+                edit_distance_chart = gr.Plot(
+                    value=create_edit_distance_chart(data),
+                    label="Edit Distance",
+                )
+            # Field Details Tab
+            with gr.Tab("🔍 Field Details"):
+                gr.Markdown("### Select a field to view detailed examples")
+                field_dropdown = gr.Dropdown(
+                    choices=[name.replace("_", " ").title() for name in data.keys()],
+                    label="Select Field",
+                    value=list(data.keys())[0].replace("_", " ").title(),
+                )
+                def update_field_details(field_display_name: str):
+                    # Find the original field name
+                    field_name = None
+                    for name in data.keys():
+                        if name.replace("_", " ").title() == field_display_name:
+                            field_name = name
+                            break
+                    if not field_name:
+                        return "", pd.DataFrame()
+                    metrics = data[field_name]
+                    metrics_text = f"""
+                    ### {field_display_name}
+                    - **Exact Match Rate**: {metrics['exact_match_rate']:.2%}
+                    - **Average Normalized Accuracy**: {metrics['average_normalized_accuracy']:.2%}
+                    - **Min Accuracy**: {metrics['min_accuracy']:.2%}
+                    - **Average Edit Distance**: {metrics['average_edit_distance']:.2f}
+                    - **Max Edit Distance**: {metrics['max_edit_distance']}
+                    - **Exact Matches**: {metrics['exact_matches']} / {metrics['total_records']}
+                    """
+                    examples_df = get_field_examples(data, field_name)
+                    return metrics_text, examples_df
+                # Initialize with first field
+                first_field = list(data.keys())[0].replace("_", " ").title()
+                initial_metrics, initial_examples = update_field_details(first_field)
+                field_metrics = gr.Markdown(value=initial_metrics)
+                examples_table = gr.Dataframe(
+                    value=initial_examples,
+                    interactive=False,
+                    wrap=True,
+                )
+                field_dropdown.change(
+                    fn=update_field_details,
+                    inputs=field_dropdown,
+                    outputs=[field_metrics, examples_table],
+                )
+    return demo
+if __name__ == "__main__":
+    # Default path to the JSON file
+    json_file = Path(__file__).parent / "results.json"
+    # Create and launch dashboard
+    demo = create_dashboard(json_file)
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

results.json ADDED Viewed

	@@ -0,0 +1,514 @@

+{
+  "alamat_rumah_ibu": {
+    "exact_match_rate": 0.0,
+    "average_edit_distance": 11.180851063829786,
+    "average_normalized_accuracy": 0.13064494103170882,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 33,
+    "exact_matches": 0,
+    "total_records": 94,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "",
+        "predict": "Tebon Selatan",
+        "edit_distance": 13,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 1,
+        "label": "",
+        "predict": "PANGKALAN BUNTE.",
+        "edit_distance": 16,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 2,
+        "label": "",
+        "predict": "BENDUNG SELATAN, KILANG",
+        "edit_distance": 23,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "dikeluarkan_oleh_fasilitas_kesehatan": {
+    "exact_match_rate": 0.05,
+    "average_edit_distance": 7.98,
+    "average_normalized_accuracy": 0.34242116726433275,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 20,
+    "exact_matches": 5,
+    "total_records": 100,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Keruak",
+        "predict": "kruak",
+        "edit_distance": 1,
+        "accuracy": 0.8333333333333334
+      },
+      {
+        "record_idx": 1,
+        "label": "Lenek",
+        "predict": "LETIEK",
+        "edit_distance": 2,
+        "accuracy": 0.6666666666666667
+      },
+      {
+        "record_idx": 2,
+        "label": "Montong Betok",
+        "predict": "PUSKESDES KILANG",
+        "edit_distance": 15,
+        "accuracy": 0.0625
+      }
+    ]
+  },
+  "golongan_darah_ibu": {
+    "exact_match_rate": 0.2839506172839506,
+    "average_edit_distance": 0.9506172839506173,
+    "average_normalized_accuracy": 0.35020576131687237,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 4,
+    "exact_matches": 23,
+    "total_records": 81,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "o",
+        "predict": "",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 1,
+        "label": "b",
+        "predict": "B +",
+        "edit_distance": 2,
+        "accuracy": 0.33333333333333337
+      },
+      {
+        "record_idx": 4,
+        "label": "a",
+        "predict": "B A",
+        "edit_distance": 2,
+        "accuracy": 0.33333333333333337
+      }
+    ]
+  },
+  "jumlah_anak_lahir_hidup": {
+    "exact_match_rate": 0.40860215053763443,
+    "average_edit_distance": 0.6559139784946236,
+    "average_normalized_accuracy": 0.4155145929339477,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 6,
+    "exact_matches": 38,
+    "total_records": 93,
+    "examples": [
+      {
+        "record_idx": 3,
+        "label": "",
+        "predict": "-",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 4,
+        "label": "2",
+        "predict": "",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 7,
+        "label": "1",
+        "predict": "",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "kabupaten_kota": {
+    "exact_match_rate": 0.18,
+    "average_edit_distance": 6.73,
+    "average_normalized_accuracy": 0.44309218559218544,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 14,
+    "exact_matches": 18,
+    "total_records": 100,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Lombok Timur",
+        "predict": "LOTIM",
+        "edit_distance": 7,
+        "accuracy": 0.41666666666666663
+      },
+      {
+        "record_idx": 1,
+        "label": "Lombok Timur",
+        "predict": "LOTIM",
+        "edit_distance": 7,
+        "accuracy": 0.41666666666666663
+      },
+      {
+        "record_idx": 3,
+        "label": "Lombok Timur",
+        "predict": "LOTIM",
+        "edit_distance": 7,
+        "accuracy": 0.41666666666666663
+      }
+    ]
+  },
+  "kehamilan_ke": {
+    "exact_match_rate": 0.3978494623655914,
+    "average_edit_distance": 0.946236559139785,
+    "average_normalized_accuracy": 0.44871138419525514,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 12,
+    "exact_matches": 37,
+    "total_records": 93,
+    "examples": [
+      {
+        "record_idx": 2,
+        "label": "2",
+        "predict": "6219041",
+        "edit_distance": 6,
+        "accuracy": 0.1428571428571429
+      },
+      {
+        "record_idx": 3,
+        "label": "",
+        "predict": "1",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 4,
+        "label": "3",
+        "predict": "",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "nama_ibu_cover": {
+    "exact_match_rate": 0.56,
+    "average_edit_distance": 1.68,
+    "average_normalized_accuracy": 0.8923876665746872,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 15,
+    "exact_matches": 56,
+    "total_records": 100,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "Asmini Wati",
+        "predict": "ASMIHI WATI",
+        "edit_distance": 1,
+        "accuracy": 0.9090909090909091
+      },
+      {
+        "record_idx": 3,
+        "label": "Depa Apriani",
+        "predict": "PEPA APRIANI",
+        "edit_distance": 1,
+        "accuracy": 0.9166666666666666
+      },
+      {
+        "record_idx": 4,
+        "label": "ENI NURIANA FATHURRAHMAN",
+        "predict": "ERI HURIANA P",
+        "edit_distance": 14,
+        "accuracy": 0.41666666666666663
+      }
+    ]
+  },
+  "nik_ibu": {
+    "exact_match_rate": 0.15053763440860216,
+    "average_edit_distance": 3.7204301075268815,
+    "average_normalized_accuracy": 0.7661248784584818,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 18,
+    "exact_matches": 14,
+    "total_records": 93,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "5203014210030001",
+        "predict": "S 003019216830001",
+        "edit_distance": 6,
+        "accuracy": 0.6470588235294117
+      },
+      {
+        "record_idx": 2,
+        "label": "5203035202980001",
+        "predict": "S20303S20298 0001",
+        "edit_distance": 3,
+        "accuracy": 0.8235294117647058
+      },
+      {
+        "record_idx": 3,
+        "label": "5203015901070003",
+        "predict": "S203015901090003",
+        "edit_distance": 2,
+        "accuracy": 0.875
+      }
+    ]
+  },
+  "no_jkn_ibu": {
+    "exact_match_rate": 0.0,
+    "average_edit_distance": 7.730769230769231,
+    "average_normalized_accuracy": 0.19083130293537534,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 16,
+    "exact_matches": 0,
+    "total_records": 26,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "",
+        "predict": "- 17 1 568 5 NA",
+        "edit_distance": 15,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 4,
+        "label": "2190703127",
+        "predict": "0002190703127",
+        "edit_distance": 3,
+        "accuracy": 0.7692307692307692
+      },
+      {
+        "record_idx": 5,
+        "label": "",
+        "predict": "Q - ",
+        "edit_distance": 3,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "pekerjaan_ibu": {
+    "exact_match_rate": 0.0,
+    "average_edit_distance": 12.646464646464647,
+    "average_normalized_accuracy": 0.17792207792207793,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 16,
+    "exact_matches": 0,
+    "total_records": 99,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "ibu_rumah_tangga",
+        "predict": "IRT",
+        "edit_distance": 13,
+        "accuracy": 0.1875
+      },
+      {
+        "record_idx": 1,
+        "label": "ibu_rumah_tangga",
+        "predict": "Irt.",
+        "edit_distance": 13,
+        "accuracy": 0.1875
+      },
+      {
+        "record_idx": 2,
+        "label": "ibu_rumah_tangga",
+        "predict": "IIT",
+        "edit_distance": 14,
+        "accuracy": 0.125
+      }
+    ]
+  },
+  "pendidikan_ibu": {
+    "exact_match_rate": 0.58,
+    "average_edit_distance": 2.32,
+    "average_normalized_accuracy": 0.7334166666666667,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 16,
+    "exact_matches": 58,
+    "total_records": 100,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "sd",
+        "predict": "SD.",
+        "edit_distance": 1,
+        "accuracy": 0.6666666666666667
+      },
+      {
+        "record_idx": 5,
+        "label": "smp",
+        "predict": "SMP.",
+        "edit_distance": 1,
+        "accuracy": 0.75
+      },
+      {
+        "record_idx": 8,
+        "label": "sma",
+        "predict": "SLTA",
+        "edit_distance": 2,
+        "accuracy": 0.5
+      }
+    ]
+  },
+  "riwayat_keguguran": {
+    "exact_match_rate": 0.3118279569892473,
+    "average_edit_distance": 0.6989247311827957,
+    "average_normalized_accuracy": 0.3118279569892473,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 2,
+    "exact_matches": 29,
+    "total_records": 93,
+    "examples": [
+      {
+        "record_idx": 2,
+        "label": "0",
+        "predict": "-",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 3,
+        "label": "",
+        "predict": "-",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 4,
+        "label": "0",
+        "predict": "",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "riwayat_penyakit_ibu": {
+    "exact_match_rate": 0.0,
+    "average_edit_distance": 18.66,
+    "average_normalized_accuracy": 0.01044047619047619,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 30,
+    "exact_matches": 0,
+    "total_records": 100,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "other",
+        "predict": "-",
+        "edit_distance": 5,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 1,
+        "label": "no_past_complications",
+        "predict": "-",
+        "edit_distance": 21,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 2,
+        "label": "do_not_know_past_complications",
+        "predict": "-",
+        "edit_distance": 30,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "telepon_ibu": {
+    "exact_match_rate": 0.0,
+    "average_edit_distance": 5.385714285714286,
+    "average_normalized_accuracy": 0.6077884615384617,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 16,
+    "exact_matches": 0,
+    "total_records": 70,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "6281929813493",
+        "predict": "081 929 813 493.",
+        "edit_distance": 6,
+        "accuracy": 0.625
+      },
+      {
+        "record_idx": 1,
+        "label": "",
+        "predict": "087 894 63 251",
+        "edit_distance": 14,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 3,
+        "label": "6282340578115",
+        "predict": "082340578115",
+        "edit_distance": 2,
+        "accuracy": 0.8461538461538461
+      }
+    ]
+  },
+  "tempat_tanggal_lahir_ibu": {
+    "exact_match_rate": 0.0,
+    "average_edit_distance": 10.747474747474747,
+    "average_normalized_accuracy": 0.23632201183326582,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 17,
+    "exact_matches": 0,
+    "total_records": 99,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Oct 2, 03",
+        "predict": "TJ. Luar 0/10 2003",
+        "edit_distance": 14,
+        "accuracy": 0.2222222222222222
+      },
+      {
+        "record_idx": 1,
+        "label": "Feb 27, 91",
+        "predict": "27 -2 -1991",
+        "edit_distance": 8,
+        "accuracy": 0.2727272727272727
+      },
+      {
+        "record_idx": 2,
+        "label": "Feb 12, 98",
+        "predict": "sukadana 12-02-1998",
+        "edit_distance": 14,
+        "accuracy": 0.26315789473684215
+      }
+    ]
+  },
+  "usia_ibu": {
+    "exact_match_rate": 0.0,
+    "average_edit_distance": 5.172413793103448,
+    "average_normalized_accuracy": 0.0,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 8,
+    "exact_matches": 0,
+    "total_records": 58,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "",
+        "predict": "31 Th",
+        "edit_distance": 5,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 1,
+        "label": "",
+        "predict": "34 thn",
+        "edit_distance": 6,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 2,
+        "label": "",
+        "predict": "27th",
+        "edit_distance": 4,
+        "accuracy": 0.0
+      }
+    ]
+  }
+}

results.md ADDED Viewed

	@@ -0,0 +1,290 @@

+====================================================================================================
+OCR ACCURACY REPORT (Edit Distance Analysis)
+====================================================================================================
+SUMMARY STATISTICS
+----------------------------------------------------------------------------------------------------
+Total Fields Analyzed: 16
+Overall Exact Match Rate: 18.27%
+Overall Average Normalized Accuracy: 37.86%
+PER-FIELD STATISTICS
+----------------------------------------------------------------------------------------------------
+Field Name                                         Exact Match     Avg Accuracy    Avg Edit Dist
+----------------------------------------------------------------------------------------------------
+nama_ibu_cover                                      56.00% (56/100)  89.24%   1.68
+nik_ibu                                             15.05% (14/93)  76.61%   3.72
+pendidikan_ibu                                      58.00% (58/100)  73.34%   2.32
+telepon_ibu                                          0.00% (0/70)  60.78%   5.39
+kehamilan_ke                                        39.78% (37/93)  44.87%   0.95
+kabupaten_kota                                      18.00% (18/100)  44.31%   6.73
+jumlah_anak_lahir_hidup                             40.86% (38/93)  41.55%   0.66
+golongan_darah_ibu                                  28.40% (23/81)  35.02%   0.95
+dikeluarkan_oleh_fasilitas_kesehatan                 5.00% (5/100)  34.24%   7.98
+riwayat_keguguran                                   31.18% (29/93)  31.18%   0.70
+tempat_tanggal_lahir_ibu                             0.00% (0/99)  23.63%  10.75
+no_jkn_ibu                                           0.00% (0/26)  19.08%   7.73
+pekerjaan_ibu                                        0.00% (0/99)  17.79%  12.65
+alamat_rumah_ibu                                     0.00% (0/94)  13.06%  11.18
+riwayat_penyakit_ibu                                 0.00% (0/100)   1.04%  18.66
+usia_ibu                                             0.00% (0/58)   0.00%   5.17
+EXAMPLES OF MISMATCHES (Top 3 per field)
+----------------------------------------------------------------------------------------------------
+Field: nama_ibu_cover
+  Exact Match Rate: 56.00%
+  Average Accuracy: 89.24%
+  Record 1:
+    Label:   'Asmini Wati'
+    Predict: 'ASMIHI WATI'
+    Edit Distance: 1, Accuracy: 90.91%
+  Record 3:
+    Label:   'Depa Apriani'
+    Predict: 'PEPA APRIANI'
+    Edit Distance: 1, Accuracy: 91.67%
+  Record 4:
+    Label:   'ENI NURIANA FATHURRAHMAN'
+    Predict: 'ERI HURIANA P'
+    Edit Distance: 14, Accuracy: 41.67%
+Field: nik_ibu
+  Exact Match Rate: 15.05%
+  Average Accuracy: 76.61%
+  Record 0:
+    Label:   '5203014210030001'
+    Predict: 'S 003019216830001'
+    Edit Distance: 6, Accuracy: 64.71%
+  Record 2:
+    Label:   '5203035202980001'
+    Predict: 'S20303S20298 0001'
+    Edit Distance: 3, Accuracy: 82.35%
+  Record 3:
+    Label:   '5203015901070003'
+    Predict: 'S203015901090003'
+    Edit Distance: 2, Accuracy: 87.50%
+Field: pendidikan_ibu
+  Exact Match Rate: 58.00%
+  Average Accuracy: 73.34%
+  Record 1:
+    Label:   'sd'
+    Predict: 'SD.'
+    Edit Distance: 1, Accuracy: 66.67%
+  Record 5:
+    Label:   'smp'
+    Predict: 'SMP.'
+    Edit Distance: 1, Accuracy: 75.00%
+  Record 8:
+    Label:   'sma'
+    Predict: 'SLTA'
+    Edit Distance: 2, Accuracy: 50.00%
+Field: telepon_ibu
+  Exact Match Rate: 0.00%
+  Average Accuracy: 60.78%
+  Record 0:
+    Label:   '6281929813493'
+    Predict: '081 929 813 493.'
+    Edit Distance: 6, Accuracy: 62.50%
+  Record 1:
+    Label:   ''
+    Predict: '087 894 63 251'
+    Edit Distance: 14, Accuracy: 0.00%
+  Record 3:
+    Label:   '6282340578115'
+    Predict: '082340578115'
+    Edit Distance: 2, Accuracy: 84.62%
+Field: kehamilan_ke
+  Exact Match Rate: 39.78%
+  Average Accuracy: 44.87%
+  Record 2:
+    Label:   '2'
+    Predict: '6219041'
+    Edit Distance: 6, Accuracy: 14.29%
+  Record 3:
+    Label:   ''
+    Predict: '1'
+    Edit Distance: 1, Accuracy: 0.00%
+  Record 4:
+    Label:   '3'
+    Predict: ''
+    Edit Distance: 1, Accuracy: 0.00%
+Field: kabupaten_kota
+  Exact Match Rate: 18.00%
+  Average Accuracy: 44.31%
+  Record 0:
+    Label:   'Lombok Timur'
+    Predict: 'LOTIM'
+    Edit Distance: 7, Accuracy: 41.67%
+  Record 1:
+    Label:   'Lombok Timur'
+    Predict: 'LOTIM'
+    Edit Distance: 7, Accuracy: 41.67%
+  Record 3:
+    Label:   'Lombok Timur'
+    Predict: 'LOTIM'
+    Edit Distance: 7, Accuracy: 41.67%
+Field: jumlah_anak_lahir_hidup
+  Exact Match Rate: 40.86%
+  Average Accuracy: 41.55%
+  Record 3:
+    Label:   ''
+    Predict: '-'
+    Edit Distance: 1, Accuracy: 0.00%
+  Record 4:
+    Label:   '2'
+    Predict: ''
+    Edit Distance: 1, Accuracy: 0.00%
+  Record 7:
+    Label:   '1'
+    Predict: ''
+    Edit Distance: 1, Accuracy: 0.00%
+Field: golongan_darah_ibu
+  Exact Match Rate: 28.40%
+  Average Accuracy: 35.02%
+  Record 0:
+    Label:   'o'
+    Predict: ''
+    Edit Distance: 1, Accuracy: 0.00%
+  Record 1:
+    Label:   'b'
+    Predict: 'B +'
+    Edit Distance: 2, Accuracy: 33.33%
+  Record 4:
+    Label:   'a'
+    Predict: 'B A'
+    Edit Distance: 2, Accuracy: 33.33%
+Field: dikeluarkan_oleh_fasilitas_kesehatan
+  Exact Match Rate: 5.00%
+  Average Accuracy: 34.24%
+  Record 0:
+    Label:   'Keruak'
+    Predict: 'kruak'
+    Edit Distance: 1, Accuracy: 83.33%
+  Record 1:
+    Label:   'Lenek'
+    Predict: 'LETIEK'
+    Edit Distance: 2, Accuracy: 66.67%
+  Record 2:
+    Label:   'Montong Betok'
+    Predict: 'PUSKESDES KILANG'
+    Edit Distance: 15, Accuracy: 6.25%
+Field: riwayat_keguguran
+  Exact Match Rate: 31.18%
+  Average Accuracy: 31.18%
+  Record 2:
+    Label:   '0'
+    Predict: '-'
+    Edit Distance: 1, Accuracy: 0.00%
+  Record 3:
+    Label:   ''
+    Predict: '-'
+    Edit Distance: 1, Accuracy: 0.00%
+  Record 4:
+    Label:   '0'
+    Predict: ''
+    Edit Distance: 1, Accuracy: 0.00%
+Field: tempat_tanggal_lahir_ibu
+  Exact Match Rate: 0.00%
+  Average Accuracy: 23.63%
+  Record 0:
+    Label:   'Oct 2, 03'
+    Predict: 'TJ. Luar 0/10 2003'
+    Edit Distance: 14, Accuracy: 22.22%
+  Record 1:
+    Label:   'Feb 27, 91'
+    Predict: '27 -2 -1991'
+    Edit Distance: 8, Accuracy: 27.27%
+  Record 2:
+    Label:   'Feb 12, 98'
+    Predict: 'sukadana 12-02-1998'
+    Edit Distance: 14, Accuracy: 26.32%
+Field: no_jkn_ibu
+  Exact Match Rate: 0.00%
+  Average Accuracy: 19.08%
+  Record 1:
+    Label:   ''
+    Predict: '- 17 1 568 5 NA'
+    Edit Distance: 15, Accuracy: 0.00%
+  Record 4:
+    Label:   '2190703127'
+    Predict: '0002190703127'
+    Edit Distance: 3, Accuracy: 76.92%
+  Record 5:
+    Label:   ''
+    Predict: 'Q - '
+    Edit Distance: 3, Accuracy: 0.00%
+Field: pekerjaan_ibu
+  Exact Match Rate: 0.00%
+  Average Accuracy: 17.79%
+  Record 0:
+    Label:   'ibu_rumah_tangga'
+    Predict: 'IRT'
+    Edit Distance: 13, Accuracy: 18.75%
+  Record 1:
+    Label:   'ibu_rumah_tangga'
+    Predict: 'Irt.'
+    Edit Distance: 13, Accuracy: 18.75%
+  Record 2:
+    Label:   'ibu_rumah_tangga'
+    Predict: 'IIT'
+    Edit Distance: 14, Accuracy: 12.50%
+Field: alamat_rumah_ibu
+  Exact Match Rate: 0.00%
+  Average Accuracy: 13.06%
+  Record 0:
+    Label:   ''
+    Predict: 'Tebon Selatan'
+    Edit Distance: 13, Accuracy: 0.00%
+  Record 1:
+    Label:   ''
+    Predict: 'PANGKALAN BUNTE.'
+    Edit Distance: 16, Accuracy: 0.00%
+  Record 2:
+    Label:   ''
+    Predict: 'BENDUNG SELATAN, KILANG'
+    Edit Distance: 23, Accuracy: 0.00%
+Field: riwayat_penyakit_ibu
+  Exact Match Rate: 0.00%
+  Average Accuracy: 1.04%
+  Record 0:
+    Label:   'other'
+    Predict: '-'
+    Edit Distance: 5, Accuracy: 0.00%
+  Record 1:
+    Label:   'no_past_complications'
+    Predict: '-'
+    Edit Distance: 21, Accuracy: 0.00%
+  Record 2:
+    Label:   'do_not_know_past_complications'
+    Predict: '-'
+    Edit Distance: 30, Accuracy: 0.00%
+Field: usia_ibu
+  Exact Match Rate: 0.00%
+  Average Accuracy: 0.00%
+  Record 0:
+    Label:   ''
+    Predict: '31 Th'
+    Edit Distance: 5, Accuracy: 0.00%
+  Record 1:
+    Label:   ''
+    Predict: '34 thn'
+    Edit Distance: 6, Accuracy: 0.00%
+  Record 2:
+    Label:   ''
+    Predict: '27th'
+    Edit Distance: 4, Accuracy: 0.00%

unused/label.sql ADDED Viewed

File without changes

unused/page1_label.sql ADDED Viewed

	@@ -0,0 +1,29 @@

+-- Page-1 (cover) label fields: one row per project_id_mother, renamed to the
+-- column names used for the OCR output.
+SELECT
+  filename,
+  project_id_mother,
+  -- Full name: when first and last name are identical use it once; otherwise
+  -- join them with a space. CONCAT returns NULL if any argument is NULL, so
+  -- the inner IFNULL drops the separator when last_name is missing.
+  CASE
+    WHEN first_name IS NOT NULL
+        AND last_name IS NOT NULL
+        AND first_name = last_name
+      THEN first_name
+    ELSE CONCAT(
+          IFNULL(first_name, ''),
+          IFNULL(
+            CONCAT(' ', last_name),
+            ''
+          )
+        )
+  END AS nama_ibu_cover,
+  puskesmas_name AS dikeluarkan_oleh_fasilitas_kesehatan,
+  regency_name AS kabupaten_kota
+FROM (
+  SELECT
+    *,
+    -- Number the rows of each project_id_mother by `end`, greatest first.
+    ROW_NUMBER() OVER (
+      PARTITION BY project_id_mother
+      ORDER BY `end` DESC
+    ) AS rn
+  FROM `stellar-orb-451904-d9.ocr_bukukia_ocr_sid.List Correct Entry 2025`
+)
+WHERE
+  -- Keep only the row with the greatest `end` per project_id_mother.
+  rn = 1

unused/page1_prediction.sql ADDED Viewed

	@@ -0,0 +1,36 @@

+-- Page-1 (cover) OCR predictions joined to the name of their source file.
+WITH dedup_master_files AS (
+  -- Keep the most recently inserted row per original_filename.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY original_filename
+        ORDER BY inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.master_files`
+  )
+  WHERE rn = 1
+),
+dedup_page1_cover AS (
+  -- Keep the most recently inserted row per id. Without a PARTITION BY the
+  -- numbering spans the whole table and rn = 1 keeps a single row overall.
+  -- NOTE(review): id is assumed to be the dedup key because the join below
+  -- matches on it; confirm against the table definition.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY id
+        ORDER BY inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page1_cover`
+  )
+  WHERE rn = 1
+)
+SELECT
+--   MF.id,
+--   MF.file_id,
+  MF.original_filename AS filename,
+  -- First run of "7000" followed by at least four digits in the file name.
+  REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS project_id_mother,
+  P1.nama_ibu_cover,
+  P1.dikeluarkan_oleh_fasilitas_kesehatan,
+  P1.kabupaten_kota
+FROM dedup_page1_cover P1
+LEFT JOIN dedup_master_files MF ON P1.id = MF.id

unused/page2_label.sql ADDED Viewed

	@@ -0,0 +1,40 @@

+-- Page-2 (identity) label fields: one row per project_id_mother, renamed to
+-- the column names used for the OCR output.
+SELECT
+  filename,
+  project_id_mother,
+  -- Full name: when first and last name are identical use it once; otherwise
+  -- join them with a space. CONCAT returns NULL if any argument is NULL, so
+  -- the inner IFNULL drops the separator when last_name is missing.
+  CASE
+  WHEN first_name IS NOT NULL
+      AND last_name IS NOT NULL
+      AND first_name = last_name
+    THEN first_name
+  ELSE CONCAT(
+        IFNULL(first_name, ''),
+        IFNULL(
+          CONCAT(' ', last_name),
+          ''
+        )
+      )
+  END AS nama_ibu,
+  nik_mother AS nik_ibu,
+  birth_date AS tempat_tanggal_lahir_ibu,
+  address_street AS alamat_rumah_ibu,
+  contact_number AS telepon_ibu,
+  bpjs_mother AS no_jkn_ibu,
+  education_level AS pendidikan_ibu,
+  occupation AS pekerjaan_ibu,
+  blood_type_result AS golongan_darah_ibu,
+  age AS usia_ibu,
+  pregnancy_number AS kehamilan_ke,
+  number_live_birth AS jumlah_anak_lahir_hidup,
+  number_birth_lost AS riwayat_keguguran,
+  previous_preg_issue AS riwayat_penyakit_ibu
+FROM (
+  SELECT
+    *,
+    -- Number the rows of each project_id_mother by `end`, greatest first.
+    ROW_NUMBER() OVER (
+      PARTITION BY project_id_mother
+      ORDER BY `end` DESC
+    ) AS rn
+  FROM `stellar-orb-451904-d9.ocr_bukukia_ocr_sid.List Correct Entry 2025`
+)
+WHERE
+  -- Keep only the row with the greatest `end` per project_id_mother.
+  rn = 1

unused/page2_prediction.sql ADDED Viewed

	@@ -0,0 +1,47 @@

+-- Page-2 (identity) OCR predictions joined to the name of their source file.
+WITH dedup_master_files AS (
+  -- Keep the most recently inserted row per original_filename.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY original_filename
+        ORDER BY inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.master_files`
+  )
+  WHERE rn = 1
+),
+dedup_page2_identitas AS (
+  -- Keep the most recently inserted row per id. Without a PARTITION BY the
+  -- numbering spans the whole table and rn = 1 keeps a single row overall.
+  -- NOTE(review): id is assumed to be the dedup key because the join below
+  -- matches on it; confirm against the table definition.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY id
+        ORDER BY inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page2_identitas`
+  )
+  WHERE rn = 1
+)
+SELECT
+  -- MF.id,
+  -- MF.file_id,
+  MF.original_filename AS filename,
+  -- First run of "7000" followed by at least four digits in the file name.
+  REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS project_id_mother,
+  P2.nama_ibu,
+  P2.nik_ibu,
+  P2.tempat_tanggal_lahir_ibu,
+  P2.alamat_rumah_ibu,
+  P2.telepon_ibu,
+  P2.no_jkn_ibu,
+  P2.pendidikan_ibu,
+  P2.pekerjaan_ibu,
+  P2.golongan_darah_ibu,
+  P2.usia_ibu,
+  P2.kehamilan_ke,
+  P2.jumlah_anak_lahir_hidup,
+  P2.riwayat_keguguran,
+  P2.riwayat_penyakit_ibu
+FROM dedup_page2_identitas P2
+LEFT JOIN dedup_master_files MF ON P2.id = MF.id

unused/page3_label.sql ADDED Viewed

File without changes

unused/page3_prediction.sql ADDED Viewed

	@@ -0,0 +1,90 @@

+-- Page 3 ("pengukuran kunjungan") OCR predictions joined to their master
+-- file record. Output: master file identifiers, the project_id_mother parsed
+-- from the filename, and the per-visit measurement fields for visits 1-3.
+WITH dedup_master_files AS (
+  -- Keep only the most recently inserted row for each original_filename.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY original_filename
+        ORDER BY inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.master_files`
+  )
+  WHERE rn = 1
+),
+dedup_page3_pengukuran_kunjungan AS (
+  -- Keep the most recently inserted page-3 row for each file.
+  -- The window must be partitioned: without PARTITION BY, ROW_NUMBER() ranks
+  -- the whole table and `rn = 1` keeps a single row overall instead of one
+  -- row per file.
+  -- NOTE(review): assumes page3_pengukuran_kunjungan exposes a file_id column
+  -- like the page1/page2 tables deduplicated in unused/prediction.sql; confirm.
+  SELECT *
+  FROM (
+    SELECT
+      *,
+      ROW_NUMBER() OVER (
+        PARTITION BY file_id
+        ORDER BY inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page3_pengukuran_kunjungan`
+  )
+  WHERE rn = 1
+)
+SELECT
+  MF.id,
+  MF.file_id,
+  MF.original_filename,
+  -- First run of "7000" followed by at least four digits in the filename.
+  REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS project_id_mother,
+  -- Visit 1
+  tanggal_periksa_kunjungan_ke_1,
+  tempat_periksa_kunjungan_ke_1,
+  berat_badan_kunjungan_ke_1,
+  tinggi_badan_kunjungan_ke_1,
+  lingkar_lengan_atas_kunjungan_ke_1,
+  tekanan_darah_kunjungan_ke_1,
+  tinggi_rahim_kunjungan_ke_1,
+  letak_dan_denyut_jantung_bayi_kunjungan_ke_1,
+  status_dan_imunisasi_tetanus_kunjungan_ke_1,
+  konseling_kunjungan_ke_1,
+  skrining_dokter_kunjungan_ke_1,
+  tablet_tambah_darah_kunjungan_ke_1,
+  tes_lab_hemoglobin_hb_kunjungan_ke_1,
+  tes_golongan_darah_kunjungan_ke_1,
+  tes_lab_protein_urine_kunjungan_ke_1,
+  tes_lab_gula_darah_kunjungan_ke_1,
+  usg_kunjungan_ke_1,
+  tripel_eliminasi_h_s_hep_b_kunjungan_ke_1,
+  tata_laksana_kasus_kunjungan_ke_1,
+  -- Visit 2
+  tanggal_periksa_kunjungan_ke_2,
+  tempat_periksa_kunjungan_ke_2,
+  berat_badan_kunjungan_ke_2,
+  tinggi_badan_kunjungan_ke_2,
+  lingkar_lengan_atas_kunjungan_ke_2,
+  tekanan_darah_kunjungan_ke_2,
+  tinggi_rahim_kunjungan_ke_2,
+  letak_dan_denyut_jantung_bayi_kunjungan_ke_2,
+  status_dan_imunisasi_tetanus_kunjungan_ke_2,
+  konseling_kunjungan_ke_2,
+  skrining_dokter_kunjungan_ke_2,
+  tablet_tambah_darah_kunjungan_ke_2,
+  tes_lab_hemoglobin_hb_kunjungan_ke_2,
+  tes_golongan_darah_kunjungan_ke_2,
+  tes_lab_protein_urine_kunjungan_ke_2,
+  tes_lab_gula_darah_kunjungan_ke_2,
+  usg_kunjungan_ke_2,
+  tripel_eliminasi_h_s_hep_b_kunjungan_ke_2,
+  tata_laksana_kasus_kunjungan_ke_2,
+  -- Visit 3
+  tanggal_periksa_kunjungan_ke_3,
+  tempat_periksa_kunjungan_ke_3,
+  berat_badan_kunjungan_ke_3,
+  tinggi_badan_kunjungan_ke_3,
+  lingkar_lengan_atas_kunjungan_ke_3,
+  tekanan_darah_kunjungan_ke_3,
+  tinggi_rahim_kunjungan_ke_3,
+  letak_dan_denyut_jantung_bayi_kunjungan_ke_3,
+  status_dan_imunisasi_tetanus_kunjungan_ke_3,
+  konseling_kunjungan_ke_3,
+  skrining_dokter_kunjungan_ke_3,
+  tablet_tambah_darah_kunjungan_ke_3,
+  tes_lab_hemoglobin_hb_kunjungan_ke_3,
+  tes_golongan_darah_kunjungan_ke_3,
+  tes_lab_protein_urine_kunjungan_ke_3,
+  tes_lab_gula_darah_kunjungan_ke_3,
+  usg_kunjungan_ke_3,
+  tripel_eliminasi_h_s_hep_b_kunjungan_ke_3,
+  tata_laksana_kasus_kunjungan_ke_3
+FROM dedup_page3_pengukuran_kunjungan P3
+-- NOTE(review): rows are deduplicated per file_id but joined on id; confirm
+-- that id is the key shared between master_files and the page-3 table.
+LEFT JOIN dedup_master_files MF ON P3.id = MF.id

unused/prediction.sql ADDED Viewed

	@@ -0,0 +1,68 @@

+-- Combined OCR predictions: one row per deduplicated master file, with the
+-- page-1 (cover) and page-2 (identitas) fields attached via LEFT JOINs.
+WITH dedup_master_files AS (
+  -- Latest row (by created_at) for each original_filename.
+  SELECT ranked.*
+  FROM (
+    SELECT
+      src.*,
+      -- Lower-cased filename with any leading "Copy of " prefixes stripped.
+      LOWER(
+        REGEXP_REPLACE(src.original_filename, r'^(Copy of\s*)+', '')
+      ) AS cleaned_filename,
+      ROW_NUMBER() OVER (
+        PARTITION BY src.original_filename
+        ORDER BY src.created_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.master_files` AS src
+  ) AS ranked
+  WHERE ranked.rn = 1
+),
+dedup_page1_cover AS (
+  -- Latest row (by created_at) for each file_id.
+  SELECT ranked.*
+  FROM (
+    SELECT
+      src.*,
+      ROW_NUMBER() OVER (
+        PARTITION BY src.file_id
+        ORDER BY src.created_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page1_cover` AS src
+  ) AS ranked
+  WHERE ranked.rn = 1
+),
+dedup_page2_identitas AS (
+  -- Latest row (by inserted_at) for each file_id.
+  SELECT ranked.*
+  FROM (
+    SELECT
+      src.*,
+      ROW_NUMBER() OVER (
+        PARTITION BY src.file_id
+        ORDER BY src.inserted_at DESC
+      ) AS rn
+    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page2_identitas` AS src
+  ) AS ranked
+  WHERE ranked.rn = 1
+)
+SELECT
+  files.original_filename AS `filename`,
+  -- First run of "7000" followed by at least four digits in the filename.
+  REGEXP_EXTRACT(files.original_filename, r'(7000\d{4,})') AS project_id_mother,
+  -- Page 1 (cover) fields
+  cover.nama_ibu_cover,
+  cover.dikeluarkan_oleh_fasilitas_kesehatan,
+  cover.kabupaten_kota,
+  -- Page 2 (identitas) fields
+  identitas.nama_ibu,
+  identitas.nik_ibu,
+  identitas.tempat_tanggal_lahir_ibu,
+  identitas.alamat_rumah_ibu,
+  identitas.telepon_ibu,
+  identitas.no_jkn_ibu,
+  identitas.pendidikan_ibu,
+  identitas.pekerjaan_ibu,
+  identitas.golongan_darah_ibu,
+  identitas.usia_ibu,
+  identitas.kehamilan_ke,
+  identitas.jumlah_anak_lahir_hidup,
+  identitas.riwayat_keguguran,
+  identitas.riwayat_penyakit_ibu
+FROM dedup_master_files AS files
+LEFT JOIN dedup_page1_cover AS cover
+  ON files.id = cover.id
+LEFT JOIN dedup_page2_identitas AS identitas
+  ON files.id = identitas.id

unused/sample_accuracy_output.json ADDED Viewed

	@@ -0,0 +1,719 @@

+{
+  "nama_ibu_cover": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 1.6666666666666667,
+    "average_normalized_accuracy": 0.9333333333333332,
+    "min_accuracy": 0.8,
+    "max_edit_distance": 5,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Sonika Laksono Putri, Ny",
+        "predict": "SoniKa Laksana Putri, NY.",
+        "edit_distance": 5,
+        "accuracy": 0.8
+      }
+    ]
+  },
+  "dikeluarkan_oleh_fasilitas_kesehatan": {
+    "exact_match_rate": 0.3333333333333333,
+    "average_edit_distance": 1.3333333333333333,
+    "average_normalized_accuracy": 0.6388888888888888,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 3,
+    "exact_matches": 1,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "DWB",
+        "predict": "PUS",
+        "edit_distance": 3,
+        "accuracy": 0.0
+      },
+      {
+        "record_idx": 2,
+        "label": "UTD PKM KMK",
+        "predict": "UPTD PKM KMK",
+        "edit_distance": 1,
+        "accuracy": 0.9166666666666666
+      }
+    ]
+  },
+  "kabupaten_kota": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "tanggal_dikeluarkannya_buku": {
+    "exact_match_rate": 0.3333333333333333,
+    "average_edit_distance": 1.6666666666666667,
+    "average_normalized_accuracy": 0.8250000000000001,
+    "min_accuracy": 0.6,
+    "max_edit_distance": 4,
+    "exact_matches": 1,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "8-9-2025",
+        "predict": "8-9-2024",
+        "edit_distance": 1,
+        "accuracy": 0.875
+      },
+      {
+        "record_idx": 2,
+        "label": "30-09-2025",
+        "predict": "30/09/25",
+        "edit_distance": 4,
+        "accuracy": 0.6
+      }
+    ]
+  },
+  "provinsi": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "nama_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 2.0,
+    "average_normalized_accuracy": 0.9,
+    "min_accuracy": 0.7,
+    "max_edit_distance": 6,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Sonika Laksono P",
+        "predict": "Sonika Laksana Putri",
+        "edit_distance": 6,
+        "accuracy": 0.7
+      }
+    ]
+  },
+  "nik_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9791666666666666,
+    "min_accuracy": 0.9375,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "3303071004010001",
+        "predict": "3303271004010001",
+        "edit_distance": 1,
+        "accuracy": 0.9375
+      }
+    ]
+  },
+  "no_jkn_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.6666666666666666,
+    "average_normalized_accuracy": 0.9333333333333332,
+    "min_accuracy": 0.8,
+    "max_edit_distance": 2,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "0001389281",
+        "predict": "8801389281",
+        "edit_distance": 2,
+        "accuracy": 0.8
+      }
+    ]
+  },
+  "fasilitas_kesehatan_tk1_ibu": {
+    "exact_match_rate": 0.3333333333333333,
+    "average_edit_distance": 0.6666666666666666,
+    "average_normalized_accuracy": 0.9388888888888888,
+    "min_accuracy": 0.9,
+    "max_edit_distance": 1,
+    "exact_matches": 1,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Klinik PMI",
+        "predict": "Klinik PMU",
+        "edit_distance": 1,
+        "accuracy": 0.9
+      },
+      {
+        "record_idx": 1,
+        "label": "Pusk Bojong",
+        "predict": "Pusk. Bojong",
+        "edit_distance": 1,
+        "accuracy": 0.9166666666666666
+      }
+    ]
+  },
+  "fasilitas_kesehatan_rujukan_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.6666666666666666,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "-",
+        "predict": "",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "tempat_tanggal_lahir_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9848484848484849,
+    "min_accuracy": 0.9545454545454546,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Purbalingga 17/6/2000",
+        "predict": "Purbalingga, 17/6/2000",
+        "edit_distance": 1,
+        "accuracy": 0.9545454545454546
+      }
+    ]
+  },
+  "pendidikan_ibu": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "pekerjaan_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9333333333333332,
+    "min_accuracy": 0.8,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "Buruh",
+        "predict": "Bunuh",
+        "edit_distance": 1,
+        "accuracy": 0.8
+      }
+    ]
+  },
+  "alamat_rumah_ibu": {
+    "exact_match_rate": 0.3333333333333333,
+    "average_edit_distance": 4.666666666666667,
+    "average_normalized_accuracy": 0.7333333333333334,
+    "min_accuracy": 0.44999999999999996,
+    "max_edit_distance": 11,
+    "exact_matches": 1,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Kd. mjgn 2/1",
+        "predict": "Kd. Myun 2/1",
+        "edit_distance": 3,
+        "accuracy": 0.75
+      },
+      {
+        "record_idx": 1,
+        "label": "Kedung Menjangan 1/1",
+        "predict": "Wedy Wijayars 1/1",
+        "edit_distance": 11,
+        "accuracy": 0.44999999999999996
+      }
+    ]
+  },
+  "telepon_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9722222222222222,
+    "min_accuracy": 0.9166666666666666,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "089682122648",
+        "predict": "089682122698",
+        "edit_distance": 1,
+        "accuracy": 0.9166666666666666
+      }
+    ]
+  },
+  "anak_ke_ibu": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "nomor_akta_kelahiran_ibu": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "golongan_darah_ibu": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "nama_suami_keluarga": {
+    "exact_match_rate": 0.3333333333333333,
+    "average_edit_distance": 3.0,
+    "average_normalized_accuracy": 0.7678571428571429,
+    "min_accuracy": 0.4285714285714286,
+    "max_edit_distance": 8,
+    "exact_matches": 1,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Safrudin",
+        "predict": "Saprudin",
+        "edit_distance": 1,
+        "accuracy": 0.875
+      },
+      {
+        "record_idx": 1,
+        "label": "Sugiato Sujono",
+        "predict": "Sugrah Fuzmi",
+        "edit_distance": 8,
+        "accuracy": 0.4285714285714286
+      }
+    ]
+  },
+  "nik_suami_keluarga": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9791666666666666,
+    "min_accuracy": 0.9375,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "3303050711800001",
+        "predict": "3363050711800001",
+        "edit_distance": 1,
+        "accuracy": 0.9375
+      }
+    ]
+  },
+  "no_jkn_suami_keluarga": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "fasilitas_kesehatan_tk1_suami_keluarga": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 3.0,
+    "average_normalized_accuracy": 0.7692307692307692,
+    "min_accuracy": 0.3076923076923077,
+    "max_edit_distance": 9,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Pusk. Bojong",
+        "predict": "PUSK. BOJONG.",
+        "edit_distance": 9,
+        "accuracy": 0.3076923076923077
+      }
+    ]
+  },
+  "fasilitas_kesehatan_rujukan_suami_keluarga": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.6666666666666666,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "-",
+        "predict": "",
+        "edit_distance": 1,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "tempat_tanggal_lahir_suami_keluarga": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9777777777777779,
+    "min_accuracy": 0.9333333333333333,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "Pbgg, 10/4/2001",
+        "predict": "Pbg, 10/4/2001",
+        "edit_distance": 1,
+        "accuracy": 0.9333333333333333
+      }
+    ]
+  },
+  "pendidikan_suami_keluarga": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "pekerjaan_suami_keluarga": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 2.3333333333333335,
+    "average_normalized_accuracy": 0.7407407407407408,
+    "min_accuracy": 0.2222222222222222,
+    "max_edit_distance": 7,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "Buruh",
+        "predict": "Kary Bumi",
+        "edit_distance": 7,
+        "accuracy": 0.2222222222222222
+      }
+    ]
+  },
+  "alamat_rumah_suami_keluarga": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 4.0,
+    "average_normalized_accuracy": 0.6666666666666666,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 12,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "",
+        "predict": "Kd. Myun 2/1",
+        "edit_distance": 12,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "telepon_suami_keluarga": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 4.0,
+    "average_normalized_accuracy": 0.6666666666666666,
+    "min_accuracy": 0.0,
+    "max_edit_distance": 12,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 0,
+        "label": "",
+        "predict": "085776323896",
+        "edit_distance": 12,
+        "accuracy": 0.0
+      }
+    ]
+  },
+  "anak_ke_suami_keluarga": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "nomor_akta_kelahiran_suami_keluarga": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "golongan_darah_suami_keluarga": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "nama_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "nik_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "no_jkn_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "fasilitas_kesehatan_tk1_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "fasilitas_kesehatan_rujukan_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "tempat_tanggal_lahir_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "pendidikan_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "pekerjaan_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "alamat_rumah_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "telepon_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "anak_ke_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "nomor_akta_kelahiran_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "golongan_darah_anak": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "usia_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9444444444444445,
+    "min_accuracy": 0.8333333333333334,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "35 th",
+        "predict": "35 th.",
+        "edit_distance": 1,
+        "accuracy": 0.8333333333333334
+      }
+    ]
+  },
+  "kehamilan_ke": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 2.6666666666666665,
+    "average_normalized_accuracy": 0.8431372549019608,
+    "min_accuracy": 0.5294117647058824,
+    "max_edit_distance": 8,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "GV P2 A2 (26 th)",
+        "predict": "6E PI A2Y (26ln).",
+        "edit_distance": 8,
+        "accuracy": 0.5294117647058824
+      }
+    ]
+  },
+  "jumlah_anak_lahir_hidup": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 2.6666666666666665,
+    "average_normalized_accuracy": 0.8333333333333334,
+    "min_accuracy": 0.5,
+    "max_edit_distance": 8,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "2 (17 th, 11 th)",
+        "predict": "2. (17R, 11R).",
+        "edit_distance": 8,
+        "accuracy": 0.5
+      }
+    ]
+  },
+  "riwayat_keguguran": {
+    "exact_match_rate": 1.0,
+    "average_edit_distance": 0.0,
+    "average_normalized_accuracy": 1.0,
+    "min_accuracy": 1.0,
+    "max_edit_distance": 0,
+    "exact_matches": 3,
+    "total_records": 3,
+    "examples": []
+  },
+  "riwayat_penyakit_ibu": {
+    "exact_match_rate": 0.6666666666666666,
+    "average_edit_distance": 0.3333333333333333,
+    "average_normalized_accuracy": 0.9166666666666666,
+    "min_accuracy": 0.75,
+    "max_edit_distance": 1,
+    "exact_matches": 2,
+    "total_records": 3,
+    "examples": [
+      {
+        "record_idx": 1,
+        "label": "Maag",
+        "predict": "Maas",
+        "edit_distance": 1,
+        "accuracy": 0.75
+      }
+    ]
+  }
+}