Spaces:

curtizz
/

show_eval_result

Sleeping

App Files Files Community

curtizz committed on Jun 3, 2025

Commit

4598d9f

verified ·

1 Parent(s): 834bce8

Update app.py

Browse files

Files changed (1) hide show

app.py +502 -0

app.py CHANGED Viewed

	@@ -0,0 +1,502 @@

+import gradio as gr
+import json
+import pandas as pd
+import html
+import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Columns shown in the summary table, in display order.
_TABLE_COLUMNS = ['User ID', 'Question', 'Confidence', 'Send to Human', 'Time Spent', 'Eval Scores', 'Details']

# Exact metric name -> (label used for logging/highlighting, threshold).
# A row is highlighted when the metric's score is strictly below the threshold.
_HIGHLIGHT_THRESHOLDS = {
    'Input Validity (GEval)': ('Input Validity', 0.8),
    'Correctness (GEval)': ('Correctness', 0.6),
}

# Static stylesheet, colour legend and Escape-key handler emitted before the table.
_PAGE_HEADER_HTML = """
        <style>
            table {
                width: 100%;
                border-collapse: collapse;
                margin-bottom: 20px;
                font-family: Arial, sans-serif;
            }
            th, td {
                border: 1px solid #e0e0e0;
                padding: 12px;
                text-align: left;
                font-size: 14px;
                vertical-align: top;
            }
            th {
                background-color: #4CAF50;
                color: white;
                font-weight: bold;
            }
            tr:nth-child(even) {
                background-color: #f9f9f9;
            }
            .send-to-human {
                background-color: #ffcccc !important;
            }
            .low-validity {
                background-color: #fff2cc !important;
            }
            .low-correctness {
                background-color: #dddddd !important;
            }
            .expandable {
                cursor: pointer;
                color: #1a73e8;
                font-weight: bold;
                text-decoration: none;
                display: inline-block;
                padding: 8px;
                transition: color 0.2s;
                background-color: #e8f0fe;
                border-radius: 4px;
                border: 1px solid #c6dafc;
                position: relative;
            }
            .expandable:hover {
                color: #1557b0;
                background-color: #d4e6fc;
            }
            .details {
                display: none;
                padding: 20px;
                background-color: #ffffff;
                border: 1px solid #e0e0e0;
                border-radius: 5px;
                margin-top: 10px;
                box-shadow: 0 4px 8px rgba(0,0,0,0.2);
                position: fixed;
                z-index: 1000;
                width: 80%;
                min-width: 600px;
                max-width: 1200px;
                height: auto;
                min-height: 400px;
                max-height: 85vh;
                overflow-y: auto;
                left: 50%;
                top: 50%;
                transform: translate(-50%, -50%);
            }
            .human-message-popup {
                display: none;
                padding: 20px;
                background-color: #ffffff;
                border: 1px solid #e0e0e0;
                border-radius: 5px;
                box-shadow: 0 4px 8px rgba(0,0,0,0.2);
                position: fixed;
                z-index: 1000;
                width: 70%;
                min-width: 500px;
                max-width: 1000px;
                height: auto;
                min-height: 200px;
                max-height: 80vh;
                overflow-y: auto;
                left: 50%;
                top: 50%;
                transform: translate(-50%, -50%);
                background-color: #fff9f9;
                border: 1px solid #d32f2f;
            }
            input[type="checkbox"] {
                display: none !important;
                appearance: none;
                -webkit-appearance: none;
                -moz-appearance: none;
            }
            input[type="checkbox"]:checked ~ .details {
                display: block;
            }
            input[type="checkbox"]:checked ~ .human-message-popup {
                display: block;
            }
            input[type="checkbox"]:checked + .expandable::after {
                content: " (Close)";
            }
            .details strong {
                color: #333;
                font-size: 16px;
                display: block;
                margin-bottom: 5px;
            }
            .details p {
                margin: 10px 0;
                line-height: 1.5;
            }
            .json-viewer {
                background-color: #f5f5f5;
                padding: 10px;
                border-radius: 5px;
                font-family: monospace;
                font-size: 13px;
                overflow-x: auto;
                white-space: pre-wrap;
            }
            pre {
                white-space: pre-wrap;
                word-wrap: break-word;
                margin: 0;
            }
            .color-legend {
                margin: 20px 0;
                padding: 15px;
                border: 1px solid #e0e0e0;
                border-radius: 5px;
                background-color: #f9f9f9;
            }
            .legend-item {
                display: flex;
                align-items: center;
                margin-bottom: 10px;
            }
            .color-box {
                width: 20px;
                height: 20px;
                margin-right: 10px;
                border: 1px solid #ccc;
            }
            .red-box {
                background-color: #ffcccc;
            }
            .yellow-box {
                background-color: #fff2cc;
            }
            .gray-box {
                background-color: #dddddd;
            }
            .detail-container {
                position: relative;
            }
            .close-details {
                position: absolute;
                top: 5px;
                right: 5px;
                cursor: pointer;
                background-color: #f44336;
                color: white;
                border: none;
                border-radius: 50%;
                width: 24px;
                height: 24px;
                display: flex;
                align-items: center;
                justify-content: center;
                font-weight: bold;
            }
            .overlay {
                display: none;
                position: fixed;
                top: 0;
                left: 0;
                width: 100%;
                height: 100%;
                background-color: rgba(0,0,0,0.5);
                z-index: 900;
            }
            input[type="checkbox"]:checked ~ .overlay {
                display: block;
            }
            /* Column width adjustments */
            table th:nth-child(1),
            table td:nth-child(1) {
                width: 15%;
                white-space: nowrap;
                overflow: hidden;
                text-overflow: ellipsis;
            }
            table th:nth-child(2),
            table td:nth-child(2) {
                width: 25%;
                max-width: 350px;
                white-space: nowrap;
                overflow: hidden;
                text-overflow: ellipsis;
            }
            table th:nth-child(3),
            table td:nth-child(3),
            table th:nth-child(4),
            table td:nth-child(4),
            table th:nth-child(5),
            table td:nth-child(5) {
                width: 8%;
                min-width: 70px;
                text-align: center;
            }
            table th:nth-child(6),
            table td:nth-child(6) {
                width: 21%;
            }
            table th:nth-child(7),
            table td:nth-child(7) {
                width: 15%;
                text-align: center;
            }
            /* Add tooltips for truncated content */
            table td:nth-child(1),
            table td:nth-child(2) {
                position: relative;
            }
            table td:nth-child(1):hover::after,
            table td:nth-child(2):hover::after {
                content: attr(title);
                position: absolute;
                left: 0;
                top: 100%;
                z-index: 500;
                background-color: #333;
                color: #fff;
                padding: 5px 10px;
                border-radius: 4px;
                white-space: pre-wrap;
                max-width: 400px;
                box-shadow: 0 2px 5px rgba(0,0,0,0.2);
            }
        </style>
        <div class="color-legend">
            <h3>Row Color Legend</h3>
            <div class="legend-item">
                <div class="color-box red-box"></div>
                <div>Red: "Send to Human" is true - The system flagged this query to be sent to a human operator</div>
            </div>
            <div class="legend-item">
                <div class="color-box yellow-box"></div>
                <div>Yellow: Low Input Validity - Input Validity score is below 0.8</div>
            </div>
            <div class="legend-item">
                <div class="color-box gray-box"></div>
                <div>Gray: Low Correctness - Correctness score is below 0.6</div>
            </div>
        </div>
        <script>
            document.addEventListener('keydown', function(event) {
                if (event.key === 'Escape') {
                    // Find all checked checkboxes and uncheck them
                    document.querySelectorAll('input[type="checkbox"]:checked').forEach(function(checkbox) {
                        checkbox.checked = false;
                    });
                }
            });
        </script>
        """


def _format_score(score):
    """Return a numeric score as a 3-decimal string; pass anything else through unchanged."""
    if isinstance(score, (int, float)):
        return format(float(score), '.3f')
    return score


def _to_display_text(value):
    """Pretty-print dicts/lists as indented JSON; fall back to ``str`` for everything else."""
    if isinstance(value, (dict, list)):
        try:
            return json.dumps(value, indent=2, ensure_ascii=False)
        except (TypeError, ValueError):
            # Non-serialisable or circular content: show the plain repr instead of failing.
            return str(value)
    return str(value)


def _format_tools(tools):
    """Render the ``tools`` field as a comma-separated string, tolerating non-list values."""
    if tools is None:
        return ''
    if isinstance(tools, (list, tuple)):
        return ', '.join(str(tool) for tool in tools)
    # A bare string (or any scalar) is shown as-is rather than joined character by character.
    return str(tools)


def _metric_dicts(raw_metrics, idx):
    """Return the dict entries of ``evaluation_metrics``; log and skip anything malformed."""
    if not raw_metrics:
        return []
    if not isinstance(raw_metrics, list):
        logger.warning("Row %s: 'evaluation_metrics' is not a list; ignoring it", idx)
        return []
    metrics = [metric for metric in raw_metrics if isinstance(metric, dict)]
    if len(metrics) != len(raw_metrics):
        logger.warning("Row %s: skipped %d non-dict evaluation metric(s)", idx, len(raw_metrics) - len(metrics))
    return metrics


def _low_score_labels(metrics, idx):
    """Return the labels of the highlight metrics whose score is below its threshold."""
    low_labels = set()
    if metrics:
        logger.info("Checking evaluation metrics for row %s", idx)
    for metric in metrics:
        metric_name = metric.get('name', '')
        if not isinstance(metric_name, str) or metric_name not in _HIGHLIGHT_THRESHOLDS:
            continue
        label, threshold = _HIGHLIGHT_THRESHOLDS[metric_name]
        value = metric.get('score')
        logger.info("Found %s score: %s (type: %s)", label, value, type(value).__name__)
        if value is None:
            continue
        try:
            float_value = float(value)
        except (ValueError, TypeError) as e:
            logger.warning("Could not convert %s to float: %s", value, e)
            continue
        logger.info("Converted to float: %s", float_value)
        if float_value < threshold:
            low_labels.add(label)
            logger.info("Low %s detected: %s", label, float_value)
    return low_labels


def _row_class_attr(idx, send_to_human, low_labels):
    """Pick the row's CSS class attribute (priority: send_to_human, low validity, low correctness)."""
    if send_to_human:
        logger.info("Row %s marked as 'Send to Human'", idx)
        return " class='send-to-human'"
    if 'Input Validity' in low_labels:
        logger.info("Row %s marked as 'Low Validity'", idx)
        return " class='low-validity'"
    if 'Correctness' in low_labels:
        logger.info("Row %s marked as 'Low Correctness'", idx)
        return " class='low-correctness'"
    return ""


def _render_row(idx, item):
    """Render one record as a ``<tr>`` element; every data value is HTML-escaped."""
    metrics = _metric_dicts(item.get('evaluation_metrics'), idx)
    # Only the JSON literal ``true`` counts; strings such as "true" do not.
    send_to_human = item.get('send_to_human', 'N/A') is True

    if metrics:
        # One "name: score" line per metric; escape each line, then join with real <br> tags.
        eval_scores = "<br>".join(
            html.escape(f"{metric.get('name', 'Unknown')}: {_format_score(metric.get('score', 'N/A'))}")
            for metric in metrics
        )
        detail_metrics = [
            {**metric, 'score': _format_score(metric['score'])} if 'score' in metric else metric
            for metric in metrics
        ]
        eval_metrics_json = json.dumps(detail_metrics, indent=2, ensure_ascii=False)
    else:
        eval_scores = "N/A"
        eval_metrics_json = "N/A"

    time_spent = item.get('time_spent', 'N/A')
    if isinstance(time_spent, (int, float)):
        time_spent = f"{time_spent:.2f}s"

    user_id = html.escape(str(item.get('user_id', 'N/A')))
    question = html.escape(str(item.get('question', 'N/A')))
    row_class = _row_class_attr(idx, send_to_human, _low_score_labels(metrics, idx))

    cells = [
        f"<tr{row_class}>",
        f"<td title=\"{user_id}\">{user_id}</td>",
        f"<td title=\"{question}\">{question}</td>",
        f"<td>{html.escape(str(item.get('confidence_score', 'N/A')))}</td>",
    ]

    human_message_detail = ""
    if send_to_human:
        message = html.escape(_to_display_text(item.get('call_human_message', 'N/A')))
        cells.append(f"""<td>
                    <span style='font-weight: bold; color: #d32f2f;'>True</span>
                    <input type='checkbox' id='message_toggle_{idx}'>
                    <label for='message_toggle_{idx}' class='expandable' style='margin-top: 5px; background-color: #d32f2f; color: white; border: none; border-radius: 4px; padding: 5px 10px; cursor: pointer; font-size: 12px; display: block; width: calc(100% - 16px); text-align: center;'>Show Message</label>
                    <div class='overlay' onclick="document.getElementById('message_toggle_{idx}').checked = false;"></div>
                    <div class='human-message-popup' onclick="event.stopPropagation();">
                        <button class="close-details" onclick="document.getElementById('message_toggle_{idx}').checked = false;">×</button>
                        <strong style='color: #d32f2f; font-size: 16px; margin-bottom: 10px;'>Call Human Message:</strong>
                        <div style='font-size: 14px; color: #000; white-space: pre-wrap; overflow-x: auto; background-color: #f5f5f5; padding: 10px; border-radius: 5px; border: 1px solid #ddd;'>
                            {message}
                        </div>
                    </div>
                </td>""")
        human_message_detail = (
            "<strong style='color: #d32f2f;'>Call Human Message:</strong>"
            f"<p style='color: #d32f2f; white-space: pre-wrap;'>{message}</p>"
        )
    else:
        cells.append("<td><span style='color: #555;'>False</span></td>")

    cells.append(f"<td>{html.escape(str(time_spent))}</td>")
    cells.append(f"<td>{eval_scores}</td>")

    retrieval_context = item.get('retrieval_context', 'N/A')
    retrieval_context_text = html.escape(_to_display_text(retrieval_context)) if retrieval_context else "N/A"

    cells.append(f"""
                <td class="detail-container">
                    <input type='checkbox' id='toggle_{idx}'>
                    <label for='toggle_{idx}' class='expandable'>Show Details</label>
                    <div class='overlay' onclick="document.getElementById('toggle_{idx}').checked = false;"></div>
                    <div class='details' onclick="event.stopPropagation();">
                        <button class="close-details" onclick="document.getElementById('toggle_{idx}').checked = false;">×</button>
                        <strong>Ground Truth:</strong>
                        <p>{html.escape(str(item.get('ground_truth', 'N/A')))}</p>
                        <strong>Response:</strong>
                        <p>{html.escape(str(item.get('chat_response', 'N/A')))}</p>
                        <strong>Source:</strong>
                        <p>{html.escape(str(item.get('source', 'N/A')))}</p>
                        <strong>Tools:</strong>
                        <p>{html.escape(_format_tools(item.get('tools', [])))}</p>
                        {human_message_detail}
                        <strong>Evaluation Metrics:</strong>
                        <div class='json-viewer'><pre>{html.escape(eval_metrics_json)}</pre></div>
                        <strong>Retrieval Context:</strong>
                        <div class='json-viewer'><pre>{retrieval_context_text}</pre></div>
                    </div>
                </td>
            """)
    cells.append("</tr>")
    return "".join(cells)


def load_and_display_json(file):
    """Render an uploaded JSON evaluation file as an HTML report.

    Args:
        file: The value delivered by the upload widget: an object whose
            ``name`` attribute is the file path, a plain path string, or
            ``None`` when the upload has been cleared.

    Returns:
        An HTML string containing the stylesheet, colour legend and one table
        row per record. Rows are highlighted when ``send_to_human`` is true,
        when the 'Input Validity (GEval)' score is below 0.8, or when the
        'Correctness (GEval)' score is below 0.6 (in that priority order).
        Returns an empty string when ``file`` is ``None`` and a short error
        message when the file cannot be read or is not a list of dictionaries.
        Nothing is raised to the caller.
    """
    if file is None:
        # Clearing the upload fires the change event with None: just clear the output.
        return ""
    try:
        # Read the uploaded JSON file (accept both file-like wrappers and plain paths)
        path = getattr(file, 'name', file)
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Validate that the data is a list of dictionaries
        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            return "Error: JSON file must contain a list of dictionaries."
        parts = [
            _PAGE_HEADER_HTML,
            "<table>",
            "<tr>" + "".join(f"<th>{col}</th>" for col in _TABLE_COLUMNS) + "</tr>",
        ]
        parts.extend(_render_row(idx, item) for idx, item in enumerate(data))
        parts.append("</table>")
        return "".join(parts)
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing the app,
        # but keep the traceback in the log.
        logger.exception("Failed to render uploaded JSON file")
        return f"Error processing JSON file: {html.escape(str(e))}"
# Assemble the Gradio page: title, usage hint, colour legend, upload widget and
# the HTML pane that shows the rendered report. Component creation order is the
# on-page order, so it is kept exactly as before.
with gr.Blocks() as demo:
    gr.Markdown(value="# JSON Data Visualization")
    gr.Markdown(value="Upload a JSON file containing a list of dictionaries to visualize the data.")
    with gr.Accordion(label="Row Color Legend", open=True):
        gr.Markdown(value="""
        * **Red rows**: "Send to Human" is true - The system flagged this query to be sent to a human operator
        * **Yellow rows**: Low Input Validity - Input Validity score is below 0.8
        * **Gray rows**: Low Correctness - Correctness score is below 0.6
        """)
    file_input = gr.File(file_types=[".json"], label="Upload JSON File")
    output = gr.HTML(label="Data Visualization")
    # Re-render the report every time the uploaded file changes.
    file_input.change(fn=load_and_display_json, inputs=file_input, outputs=output)
# Start serving the interface
demo.launch()