Spaces:
Sleeping
Sleeping
import difflib
import html
import json
import os

import requests
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
| FIREBASE_URL = os.getenv("FIREBASE_URL") | |
def fetch_from_firebase(model_id, data_type):
    """Fetch a cached entry for *model_id* under *data_type* from Firebase.

    Returns the decoded JSON payload, or ``None`` when the entry does not
    exist (any non-200 response is treated as a cache miss).

    NOTE(review): HF model ids usually contain a "/" (e.g. "org/name"),
    which nests the Firebase path one level deeper — confirm this matches
    how entries are written.
    """
    # Timeout so a slow/unreachable Firebase cannot hang the app forever.
    response = requests.get(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json", timeout=30
    )
    if response.status_code == 200:
        return response.json()
    return None
def save_to_firebase(model_id, data, data_type):
    """Store *data* for *model_id* under *data_type* in Firebase.

    Returns ``True`` when Firebase acknowledged the write (HTTP 200),
    ``False`` otherwise. Failures are non-fatal: callers treat the cache
    as best-effort.
    """
    # Timeout for the same reason as fetch_from_firebase: never block the
    # UI thread indefinitely on a network write.
    response = requests.put(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json",
        data=json.dumps(data),
        timeout=30,
    )
    return response.status_code == 200
def get_model_structure(model_id) -> list[str]:
    """Return ``"name: shape"`` lines for every tensor in the model.

    Results are cached in Firebase; on a cache miss the checkpoint is
    loaded on CPU in bfloat16 purely to read the state-dict shapes, and
    the computed listing is written back to the cache.
    """
    struct_lines = fetch_from_firebase(model_id, "model_structures")
    if struct_lines:
        return struct_lines
    # SECURITY NOTE: trust_remote_code executes code shipped with the
    # checkpoint — acceptable for this comparison tool, but worth knowing.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    struct_lines = [f"{k}: {v.shape}" for k, v in model.state_dict().items()]
    # Release the weights promptly: loading two LLMs back-to-back can
    # otherwise exhaust memory on small hosting hardware.
    del model
    save_to_firebase(model_id, struct_lines, "model_structures")
    return struct_lines
def get_tokenizer_vocab_size(model_id) -> int:
    """Return the tokenizer's base vocab size for *model_id*, cached in Firebase."""
    vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
    # Bug fix: the previous truthiness check (`if vocab_size:`) treated a
    # cached value of 0 as a miss and reloaded the tokenizer every time.
    if vocab_size is not None:
        return vocab_size
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    vocab_size = tokenizer.vocab_size
    save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
    return vocab_size
def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
    """Yield an ndiff delta between two model-structure listings.

    The result is a lazy iterator of difflib-style lines ("  ", "- ",
    "+ ", "? " prefixes), suitable for feeding straight into display_diff.
    """
    return difflib.ndiff(struct1_lines, struct2_lines)
def display_diff(diff):
    """Render an ndiff stream as two aligned HTML columns.

    Returns ``(left_html, right_html, diff_found)``: the left column shows
    deletions highlighted red, the right shows additions highlighted green,
    and unchanged lines appear in both. Rows stay aligned by inserting an
    empty cell opposite each deletion/addition. difflib's "? " intraline
    hint lines are dropped.
    """
    left_lines = []
    right_lines = []
    diff_found = False
    for line in diff:
        # Escape the payload: it is later rendered with
        # unsafe_allow_html=True, so raw '<' / '&' in tensor names would
        # inject markup or break the page.
        text = html.escape(line[2:])
        if line.startswith("- "):
            left_lines.append(
                f'<span style="background-color: #ffdddd;">{text}</span>'
            )
            right_lines.append("")
            diff_found = True
        elif line.startswith("+ "):
            right_lines.append(
                f'<span style="background-color: #ddffdd;">{text}</span>'
            )
            left_lines.append("")
            diff_found = True
        elif line.startswith("  "):
            left_lines.append(text)
            right_lines.append(text)
        # else: "? " hint lines — intentionally ignored.
    left_html = "<br>".join(left_lines)
    right_html = "<br>".join(right_lines)
    return left_html, right_html, diff_found
# --- Page configuration -------------------------------------------------
# Wide layout so the two structure columns get the full viewport width.
st.set_page_config(layout="wide")

# Custom CSS: stretch the main block container and keep whitespace in
# markdown output so structure lines line up.
st.markdown(
    """
    <style>
    .reportview-container .main .block-container {
        max-width: 100%;
        padding-left: 10%;
        padding-right: 10%;
    }
    .stMarkdown {
        white-space: pre-wrap;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
# --- Main UI ------------------------------------------------------------
st.title("Model Structure Comparison Tool")

model_id1 = st.text_input("Enter the first HuggingFace Model ID")
model_id2 = st.text_input("Enter the second HuggingFace Model ID")

if st.button("Compare Models"):
    with st.spinner("Comparing models and loading tokenizers..."):
        # Guard clause: both ids are required before any work starts.
        if not (model_id1 and model_id2):
            st.error("Please enter both model IDs.")
        else:
            # Fetch (or compute) both structure listings and diff them.
            lines_a = get_model_structure(model_id1)
            lines_b = get_model_structure(model_id2)
            delta = compare_structures(lines_a, lines_b)
            left_html, right_html, diff_found = display_diff(delta)

            st.write("### Comparison Result")
            if not diff_found:
                st.success("The model structures are identical.")

            # Two equal-width columns for the side-by-side view.
            col1, col2 = st.columns([1.5, 1.5])
            with col1:
                st.write(f"### Model 1: {model_id1}")
                st.markdown(left_html, unsafe_allow_html=True)
            with col2:
                st.write(f"### Model 2: {model_id2}")
                st.markdown(right_html, unsafe_allow_html=True)

            # Tokenizer check is separate: tokenizer loading can fail
            # independently of reading the model structure.
            try:
                vocab_a = get_tokenizer_vocab_size(model_id1)
                vocab_b = get_tokenizer_vocab_size(model_id2)
                if vocab_a == vocab_b:
                    st.success("The tokenizer vocab sizes are identical.")
                else:
                    st.warning("The tokenizer vocab sizes are different.")
                st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_a}")
                st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_b}")
            except Exception as e:
                st.error(f"Error loading tokenizers: {e}")