Spaces:

YChang1112
/

Titan_Project_Dashboard

Sleeping

App Files Files Community

YChang1112 committed on Apr 4, 2025

Commit

f5fb58c

verified ·

1 Parent(s): 21d363b

Initial prototype

Browse files

Files changed (1) hide show

app.py +227 -0

app.py ADDED Viewed

	@@ -0,0 +1,227 @@

+import streamlit as st
+import requests
+import pandas as pd
+import json
+import os
+from datasets import load_dataset
+# Configure the browser tab and the overall page layout
+_page_options = {
+    "page_title": "Huggingface Repository Explorer",
+    "page_icon": "🤗",
+    "layout": "wide",
+    "initial_sidebar_state": "expanded",
+}
+st.set_page_config(**_page_options)
+# Page heading followed by a short introduction
+st.title("🤗 Huggingface Repository Explorer")
+_intro_text = """
+This dashboard showcases our models and datasets on Huggingface.
+Select a dataset to view sample data.
+"""
+st.markdown(_intro_text)
+# Base URL of the Hugging Face Hub REST API
+HF_API_BASE = "https://huggingface.co/api"
+# The access token is read from the HF_TOKEN environment variable of the
+# Space, so it never appears in the code and users are not asked to enter it.
+# Falls back to an empty string when the variable is not set.
+AUTH_TOKEN = os.environ.get("HF_TOKEN", "")
+def fetch_dataset_samples(dataset_id, n=10, split="train"):
+    """Return up to ``n`` examples from one split of a Hub dataset.
+
+    The dataset is opened in streaming mode, so only the requested examples
+    are pulled rather than the whole dataset.
+
+    Args:
+        dataset_id: Repository ID of the dataset, e.g. ``"user/name"``.
+        n: Maximum number of examples to return. Values below zero are
+            treated as zero.
+        split: Name of the split to read from. Defaults to ``"train"``.
+
+    Returns:
+        A list with at most ``n`` examples, or ``None`` if loading failed
+        (the error is shown in the Streamlit page in that case).
+    """
+    from itertools import islice
+
+    try:
+        # Load the dataset in streaming mode. An unset token is passed as
+        # None rather than as an empty string.
+        dataset = load_dataset(dataset_id,
+                              split=split,
+                              streaming=True,
+                              token=AUTH_TOKEN or None)
+        # islice stops after n examples without fetching an extra one
+        return list(islice(dataset, max(n, 0)))
+    except Exception as e:
+        # Best-effort UI helper: report the problem and let the caller
+        # decide what to show instead.
+        st.error(f"Error loading dataset samples: {e}")
+        return None
+# Hard-coded model list, stored column-wise: position i in every list
+# describes the same model.
+model_data = {
+    "Model Name": [
+        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
+        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
+        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
+        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
+        "TitanCAProject/CodeBERT-javascript"
+    ],
+    "Description": [
+        "Qwen2.5 model for the Csharp language",
+        "Qwen2.5 model for the Python language",
+        "Qwen2.5 model for the C language",
+        "Qwen2.5 model for the Java language",
+        "CodeBERT model for the Javascript language"
+    ],
+    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
+    "Last Updated": [
+        "2024-11-15",
+        "2024-10-30",
+        "2024-12-05",
+        "2024-11-20",
+        "2024-12-10"
+    ]
+}
+# Convert to a DataFrame (one row per model) for display in the Models tab
+df_models = pd.DataFrame(model_data)
+def fetch_dataset_info(dataset_id):
+    """Collect display metadata for one dataset on the Hugging Face Hub.
+
+    Two endpoints are queried: the datasets-server ``/size`` endpoint for the
+    byte size and row count, and the Hub API ``/datasets/<id>`` endpoint for
+    the description and last-modified timestamp.
+
+    Args:
+        dataset_id: Repository ID of the dataset, e.g. ``"user/name"``.
+
+    Returns:
+        A dict with the keys ``id``, ``description``, ``size_mb``,
+        ``sample_count`` and ``last_modified``, or ``None`` when either
+        request fails (a Streamlit warning or error is shown in that case).
+        ``size_mb`` is ``None`` when the size is missing or zero.
+    """
+    # Only send an Authorization header when a token is configured, so an
+    # empty token does not produce a bare "Bearer " header.
+    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"} if AUTH_TOKEN else {}
+    size_url = "https://datasets-server.huggingface.co/size"
+    info_url = f"{HF_API_BASE}/datasets/{dataset_id}"
+    # Seconds to wait per request so a stalled connection cannot hang the page
+    timeout = 10
+    try:
+        # Passing the ID through ``params`` lets requests URL-encode it
+        size_response = requests.get(size_url,
+                                     params={"dataset": dataset_id},
+                                     headers=headers,
+                                     timeout=timeout)
+        if size_response.status_code != 200:
+            st.warning(f"Error fetching dataset size info: {size_response.status_code}")
+            return None
+        # Tolerate a missing or null "size"/"dataset" section instead of
+        # failing with an opaque KeyError
+        size_stats = (size_response.json().get('size') or {}).get('dataset') or {}
+        size_bytes = size_stats.get('num_bytes_original_files', 0)
+        # Convert to MB for display
+        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None
+        # A null row count becomes 0 so callers can always format it as a number
+        sample_count = size_stats.get('num_rows') or 0
+        info_response = requests.get(info_url, headers=headers, timeout=timeout)
+        if info_response.status_code != 200:
+            st.warning(f"Error fetching dataset info: {info_response.status_code}")
+            return None
+        dataset_info = info_response.json()
+        return {
+            'id': dataset_id,
+            'description': dataset_info.get('description', 'No description available'),
+            'size_mb': size_mb,
+            'sample_count': sample_count,
+            'last_modified': dataset_info.get('lastModified', 'Unknown')
+        }
+    except Exception as e:
+        # Best-effort UI helper: network errors, timeouts and malformed JSON
+        # are reported in the page rather than crashing the app.
+        st.error(f"Error processing dataset info: {e}")
+        return None
+# Top-level navigation: one tab for models, one for datasets
+tab1, tab2 = st.tabs(["Models", "Datasets"])
+# Models Tab
+with tab1:
+    st.header("Models")
+    # Full table of all known models
+    st.dataframe(df_models, use_container_width=True)
+    # Detail view for the single model picked by the user
+    st.subheader("Model Details")
+    model_names = df_models["Model Name"]
+    selected_model = st.selectbox("Select a model for details", model_names, key="model_select")
+    if selected_model:
+        # First row whose name matches the selection
+        matching_rows = df_models.loc[model_names == selected_model]
+        model_details = matching_rows.iloc[0]
+        detail_lines = [
+            f"### {model_details['Model Name']}",
+            f"**Description**: {model_details['Description']}",
+            f"**Size**: {model_details['Size (GB)']} GB",
+            f"**Last Updated**: {model_details['Last Updated']}",
+        ]
+        for detail_line in detail_lines:
+            st.markdown(detail_line)
+# Datasets Tab
+with tab2:
+    st.header("Datasets")
+    # List of dataset IDs to display
+    dataset_ids = [
+        "YChang1112/test-dataset",
+        "Anthropic/EconomicIndex"
+    ]
+    # Get actual dataset info from the API. IDs whose lookup returns nothing
+    # are left out of the table.
+    dataset_info_list = []
+    if AUTH_TOKEN:
+        with st.spinner("Loading dataset information..."):
+            for dataset_id in dataset_ids:
+                info = fetch_dataset_info(dataset_id)
+                if info:
+                    dataset_info_list.append(info)
+    else:
+        st.warning("Authentication token not configured. Unable to fetch dataset information.")
+    # Create a DataFrame from the collected information
+    if dataset_info_list:
+        df_datasets = pd.DataFrame({
+            "Dataset Name": [info['id'] for info in dataset_info_list],
+            "Description": [info['description'] for info in dataset_info_list],
+            "Size (MB)": [info['size_mb'] for info in dataset_info_list],
+            "Samples": [info['sample_count'] for info in dataset_info_list],
+            "Last Modified": [info['last_modified'] for info in dataset_info_list]
+        })
+        # Display datasets table
+        st.dataframe(df_datasets, use_container_width=True)
+    else:
+        st.error("No dataset information available. Please check your dataset IDs and authentication token.")
+    # Dataset details with sample preview
+    st.subheader("Dataset Preview")
+    if dataset_info_list:
+        selected_dataset = st.selectbox("Select a dataset to preview",
+                                      [info['id'] for info in dataset_info_list],
+                                      key="dataset_select")
+        # Find the info record for the selection (None if nothing is selected)
+        dataset_info = next((info for info in dataset_info_list if info['id'] == selected_dataset), None)
+        if dataset_info:
+            # size_mb is None when the size is unknown; avoid printing "None MB"
+            if dataset_info['size_mb'] is not None:
+                size_text = f"{dataset_info['size_mb']} MB"
+            else:
+                size_text = "Unknown"
+            st.markdown(f"### {dataset_info['id']}")
+            st.markdown(f"**Description**: {dataset_info['description']}")
+            st.markdown(f"**Size**: {size_text}")
+            st.markdown(f"**Total Samples**: {dataset_info['sample_count']:,}")
+            st.markdown(f"**Last Modified**: {dataset_info['last_modified']}")
+            # Show dataset samples
+            st.markdown("### Sample Train Data")
+            with st.spinner("Fetching dataset samples..."):
+                # fetch_dataset_samples returns a list of examples or None
+                samples = fetch_dataset_samples(selected_dataset)
+                if samples:
+                    try:
+                        # Normalize to flatten nested structures into columns
+                        df_sample = pd.json_normalize(samples)
+                        st.dataframe(df_sample, use_container_width=True)
+                    except Exception as e:
+                        st.error(f"Error displaying samples: {e}")
+                        st.json(samples)  # Fallback to raw JSON display
+                else:
+                    st.warning("Could not fetch dataset samples.")
+# Footer
+st.markdown("---")
+st.markdown("Repository Explorer | Last updated: April 2025")