Spaces:

SaProtHub
/

SaprotHub-search

Running

App Files Files Community

LTEnjoy committed on Sep 3, 2025

Commit

f0b07c6

verified ·

1 Parent(s): 3f7cec9

Upload 3 files

Browse files

Files changed (3) hide show

app.py +83 -0
loop_retrieve_cards.py +56 -0
utils.py +89 -0

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import sys
+# Strip the last two "/"-separated components (this file's name and its
+# containing directory) from the file path and make the remainder importable.
+# NOTE(review): the sibling modules imported below are added next to this file
+# in the same commit, not in root_dir - confirm this path entry is still needed.
+root_dir = __file__.rsplit("/", 2)[0]
+if root_dir not in sys.path:
+    sys.path.append(root_dir)
+import gradio as gr
+from utils import set_text_bg_color
+from loop_retrieve_cards import get_models, get_datasets, get_readme_dict
+def match_card(input: str, card_id: str, card_type: str) -> str:
+    """
+    Search the input in a card. If the input string is contained in the card_id or its README, display this card.
+    Args:
+        input:  Input string
+        card_id: HuggingFace card id
+        card_type: Type of card, either "model" or "dataset"
+    """
+    display_str = ""
+    readme_dict = get_readme_dict()
+    if input.lower() in card_id.lower() or input.lower() in readme_dict[card_id].lower():
+        # Add card id
+        if card_type == "model":
+            display_str += f"## [{set_text_bg_color(input, card_id)}](https://huggingface.co/{card_id})\n\n"
+        else:
+            display_str += f"## [{set_text_bg_color(input, card_id)}](https://huggingface.co/datasets/{card_id})\n\n"
+        # Highlight lines that contain the input string
+        show_lines = []
+        for line in readme_dict[card_id].split("\n"):
+            if input.lower() in line.lower() and "<!--" not in line:
+                show_lines.append(set_text_bg_color(input, line))
+        # Add README
+        display_str += "\n\n".join(show_lines)
+        # Add a separator
+        display_str = f"\n\n{display_str}\n\n---\n\n"
+        # In case that the keyword is only contained in comments
+        if input.lower() not in card_id.lower() and len(show_lines) == 0:
+            display_str = ""
+    return display_str
+def show_card_info(input: str):
+    retrieval_str = ""
+    if input != "":
+        # Search models
+        retrieval_str += "# Models\n\n"
+        for model in get_models():
+            retrieval_str += match_card(input, model, "model")
+        # Search datasets
+        retrieval_str += "# Datasets\n\n"
+        for dataset in get_datasets():
+            retrieval_str += match_card(input, dataset, "dataset")
+    return gr.Markdown(retrieval_str, visible=True)
+# Build demo
+# Components are created in display order inside the Blocks context.
+with gr.Blocks(title="SaprotHub", fill_width=True) as demo:
+    gr.Label("SaprotHub search", visible=True, show_label=False)
+    search_box = gr.Textbox(label="Search box", placeholder="Input keywords to search", interactive=True, scale=0, container=True)
+    # Display search results
+    search_hint = gr.Markdown("# Search results:", visible=True)
+    # Created hidden; show_card_info returns a visible Markdown for this output
+    items = gr.Markdown(visible=False)
+    # Set events
+    # Re-run the search whenever the content of the search box changes
+    search_box.change(show_card_info, inputs=[search_box], outputs=[items])
+if __name__ == '__main__':
+    # Run the demo
+    # "0.0.0.0" binds the server to all network interfaces
+    demo.launch(server_name="0.0.0.0")

loop_retrieve_cards.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import threading
+import time
+from utils import fetch_models, fetch_datasets, fetch_readme
+from tqdm import tqdm
+# Define global variables
+# Cached card ids and READMEs; run() rebinds all three after a successful refresh.
+# models and datasets stay None until the first refresh has completed.
+models = None
+datasets = None
+# Maps a card id to the README text returned by fetch_readme
+readme_dict = {}
+# Provide an API to get models
+def get_models():
+    """
+    Return the current value of the module-level ``models``.
+    The global is read at call time, so callers see the value run() stored last.
+    It is None until run() has completed a refresh.
+    """
+    return models
+# Provide an API to get datasets
+def get_datasets():
+    """
+    Return the current value of the module-level ``datasets``.
+    The global is read at call time, so callers see the value run() stored last.
+    It is None until run() has completed a refresh.
+    """
+    return datasets
+# Provide an API to get READMEs
+def get_readme_dict():
+    """
+    Return the current value of the module-level ``readme_dict`` (card id -> README text).
+    The global is read at call time, so callers see the dict run() stored last.
+    It is an empty dict until run() has completed a refresh.
+    """
+    return readme_dict
+# Start a thread to continuously update cards
+def run():
+    global models, datasets, readme_dict, cnt
+    while True:
+        try:
+            new_models = fetch_models()
+            new_datasets = fetch_datasets()
+            # Add READMEs
+            new_readme_dict = {}
+            for model in new_models:
+                new_readme_dict[model] = fetch_readme(model, "model")
+            for dataset in new_datasets:
+                new_readme_dict[dataset] = fetch_readme(dataset, "dataset")
+            # Update global variables
+            models = new_models
+            datasets = new_datasets
+            readme_dict = new_readme_dict
+        except Exception as e:
+            print(e)
+t = threading.Thread(target=run)
+t.start()

utils.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import requests
+import re
+def fetch_models(author: str = "SaProtHub") -> list:
+    """
+    Retrieve models belonging to a specific author
+    Args:
+        author: Author name
+    Returns:
+        models: List of models
+    """
+    url = f"https://hf-mirror.com/api/models?author={author}"
+    response = requests.get(url)
+    models_dict = response.json()
+    models = [item["id"] for item in models_dict]
+    return models
+def fetch_datasets(author: str = "SaProtHub") -> list:
+    """
+    Retrieve datasets belonging to a specific author
+    Args:
+        author: Author name
+    Returns:
+        datasets: List of datasets
+    """
+    url = f"https://hf-mirror.com/api/datasets?author={author}"
+    response = requests.get(url)
+    datasets_dict = response.json()
+    datasets = [item["id"] for item in datasets_dict]
+    return datasets
+def fetch_readme(card_id: str, card_type: str) -> str:
+    """
+    Retrieve the README file of a model or dataset
+    Args:
+        card_id: Model or dataset ID
+        card_type: Type of card, either "model" or "dataset"
+    Returns:
+        readme: README text
+    """
+    if card_type == "model":
+        url = f"https://hf-mirror.com/{card_id}/raw/main/README.md"
+    else:
+        url = f"https://hf-mirror.com/datasets/{card_id}/raw/main/README.md"
+    response = requests.get(url)
+    readme = response.text.split("---")[-1]
+    return readme
+def set_text_bg_color(pattern: str, text: str, color: str = "yellow") -> str:
+    """
+    Set the background color of a pattern in a text
+    Args:
+        pattern: Pattern to highlight
+        text: Text to search
+        color: Background color
+    Returns:
+        text: Text with highlighted pattern
+    """
+    # Find all matches
+    matches = set(re.findall(re.escape(pattern), text, flags=re.IGNORECASE))
+    if len(matches) == 0:
+        # No matches found
+        return text
+    replace_dict = {re.escape(m): f'<span style="background-color:{color}">{m}</span>' for m in matches}
+    pattern = re.compile("|".join(replace_dict.keys()))
+    text = pattern.sub(lambda m: replace_dict[re.escape(m.group(0))], text)
+    return text