Spaces:

librarian-bots
/

collection-reading-list-generator

Runtime error

App Files Files Community

davanstrien HF Staff committed on Sep 29, 2023

Commit

32c6187

1 Parent(s): 5112501

basic version

Browse files

Files changed (3) hide show

app.py +174 -0
requirements.in +4 -0
requirements.txt +184 -0

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import gradio as gr
+from gradio_client import Client
+import json
+from cachetools import cached, TTLCache
+from typing import Optional, Any, List, Union, Dict
+import httpx
+import requests
+from typing import Dict, Any
+from toolz import groupby
+# Time-to-live in seconds; used below as the TTL of the Semantic Scholar recommendations cache.
+CACHE_TIME = 60 * 60 * 1  # 1 hour
+# Gradio client for the collection-papers-extractor Space; its "/predict" endpoint is
+# called with a collection slug by get_arxiv_ids_from_slug.
+client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")
+@cached(cache=TTLCache(maxsize=500, ttl=10))
+def get_arxiv_ids_from_slug(
+    slug: str,
+) -> Dict[str, Union[None, Dict[str, Dict[str, Union[List[str], List[str]]]]]]:
+    """Fetch the arXiv IDs associated with the items of a collection.
+
+    Args:
+        slug: Collection slug passed to the extractor Space's "/predict" endpoint.
+
+    Returns:
+        The parsed JSON payload. Per the annotation and ``format_ids``, each
+        top-level key maps either to ``None`` or to a dict of items that each
+        hold an ``"arxiv_ids"`` list.
+
+    Results are cached for 10 seconds per slug, so repeated calls share the
+    same dict object; callers should not mutate it.
+    """
+    # The endpoint result is used as a path to a JSON file, not as the payload itself.
+    result_path = client.predict(slug, api_name="/predict")
+    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
+    with open(result_path, encoding="utf-8") as f:
+        return json.load(f)
+def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
+    """Prefix a bare arXiv identifier with ``ArXiv:`` (``2309.00001`` -> ``ArXiv:2309.00001``)."""
+    return "ArXiv:{}".format(arxiv_id)
+def format_ids(data, exclude_keys: Optional[list[str]] = None) -> list[str]:
+    """Flatten the extractor payload into Semantic Scholar-style arXiv IDs.
+
+    Args:
+        data: Mapping of repo-type key to either ``None`` or a dict of items,
+            each item holding an ``"arxiv_ids"`` list.
+        exclude_keys: Top-level keys of ``data`` to leave out; ``None`` keeps all.
+
+    Returns:
+        Every remaining arXiv ID, in encounter order, formatted with
+        ``format_arxiv_id_for_semantic_scholar``. Empty when nothing is left.
+    """
+    kept = data
+    if exclude_keys is not None:
+        kept = {key: repo for key, repo in data.items() if key not in exclude_keys}
+    formatted = []
+    for repo in kept.values():
+        # A repo type with no entries is represented as None in the payload.
+        if repo is not None:
+            for item in repo.values():
+                formatted.extend(
+                    format_arxiv_id_for_semantic_scholar(arxiv_id)
+                    for arxiv_id in item["arxiv_ids"]
+                )
+    return formatted
+@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
+def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
+    """Request up to 10 recommendations from the Semantic Scholar API.
+
+    Args:
+        paper_ids: Seed paper IDs (e.g. ``"ArXiv:2309.00001"``). A tuple, because
+            the arguments are used as the cache key and must be hashable.
+
+    Returns:
+        The decoded JSON response body.
+
+    Raises:
+        gr.Error: If the API answers with an HTTP error status. Raising, rather
+            than returning the error body, keeps the failure out of the
+            ``CACHE_TIME``-long cache so the next request retries.
+    """
+    paper_ids = list(paper_ids)
+    print(paper_ids)
+    r = httpx.post(
+        "https://api.semanticscholar.org/recommendations/v1/papers/",
+        json={
+            "positivePaperIds": paper_ids,
+        },
+        params={"fields": "externalIds,title,year", "limit": 10},
+        timeout=30,
+    )
+    print(r.text)
+    try:
+        r.raise_for_status()
+    except httpx.HTTPStatusError as err:
+        # Same user-facing message the caller shows when "recommendedPapers" is missing.
+        raise gr.Error("Something went wrong with the Semantic Scholar API") from err
+    return r.json()
+def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
+    """Return True when the recommendation has a non-null ``ArXiv`` entry in ``externalIds``.
+
+    A missing or null ``externalIds`` is treated as "not an arXiv paper" rather
+    than raising, so one incomplete record cannot break the whole grouping.
+    """
+    external_ids = recommendation.get("externalIds") or {}
+    return external_ids.get("ArXiv") is not None
+def group_by_is_arxiv_paper(
+    recommendations: List[Dict[str, Any]]
+) -> Dict[bool, List[Dict[str, Any]]]:
+    """Partition recommendations by the result of ``is_arxiv_paper``.
+
+    Returns a dict keyed by ``True`` (has an arXiv ID) and ``False`` (does not).
+    A key is only present when at least one recommendation falls in that group,
+    which is why the formatter reads the groups with ``.get``.
+    """
+    return groupby(is_arxiv_paper, recommendations)
+def format_recommendation_into_markdown(
+    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
+):
+    """Render grouped recommendations as a markdown reading list.
+
+    Args:
+        grouped_recommendations: Recommendations keyed by ``True`` (have an arXiv
+            ID) and ``False`` (do not); either key may be absent. Each paper
+            needs ``title`` and ``year``; arXiv papers also need
+            ``externalIds["ArXiv"]``.
+
+    Returns:
+        Markdown text: an intro line, then a linked section for arXiv papers
+        and a plain section for the rest, each only if non-empty.
+    """
+    parts = ["The following papers were recommended by the Semantic Scholar API \n\n"]
+    on_arxiv = grouped_recommendations.get(True)
+    if on_arxiv:
+        parts.append("## Papers available on Hugging Face Papers:\n\n")
+        for paper in on_arxiv:
+            paper_url = f"https://huggingface.co/papers/{paper['externalIds']['ArXiv']}"
+            parts.append(f"* [{paper['title']}]({paper_url}) ({paper['year']})\n")
+    elsewhere = grouped_recommendations.get(False)
+    if elsewhere:
+        parts.append("\n\n## Other papers:\n\n")
+        for paper in elsewhere:
+            parts.append(f"* {paper['title']} ({paper['year']})\n")
+    return "".join(parts)
+def map_repo_name_to_api_key(repo_name: str) -> str:
+    """Translate a UI repo-type name into the matching key of the extractor payload.
+
+    Raises:
+        KeyError: If ``repo_name`` is not one of "datasets", "models" or "papers".
+    """
+    api_key_by_repo_name = {
+        "datasets": "dataset papers",
+        "models": "model papers",
+        "papers": "papers",
+    }
+    return api_key_by_repo_name[repo_name]
+def get_recommendations_from_slug(
+    slug: str, excluded_repo_types: Optional[list[str]] = None
+):
+    """Gradio entry point: build the markdown reading list for a collection.
+
+    Args:
+        slug: Collection slug.
+        excluded_repo_types: Repo types ("datasets", "models", "papers") whose
+            papers should not seed the recommendations; ``None`` or an empty
+            list excludes nothing.
+
+    Returns:
+        Whatever ``_get_recommendations_from_slug`` returns (markdown text).
+    """
+    # The cached helper keys its cache on its arguments, so the list must become a
+    # hashable tuple. `or ()` keeps the default None from crashing in tuple(None).
+    excluded = tuple(excluded_repo_types or ())
+    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded)
+@cached(cache=TTLCache(maxsize=500, ttl=60))
+def _get_recommendations_from_slug(
+    slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
+):
+    """Build the markdown reading list for a collection (cached for 60 seconds).
+
+    Args:
+        slug: Collection slug.
+        excluded_repo_types: UI repo-type names to exclude. A tuple (not a list)
+            so the arguments are hashable for the cache; ``None`` or empty
+            excludes nothing.
+
+    Returns:
+        Markdown with the recommendations, or an explanatory message when the
+        collection yields no arXiv IDs after exclusions.
+
+    Raises:
+        KeyError: If an excluded repo type is not a known name.
+        gr.Error: If the Semantic Scholar response has no "recommendedPapers".
+    """
+    data = get_arxiv_ids_from_slug(slug)
+    if excluded_repo_types:
+        # Translate UI names ("models") into the extractor's keys ("model papers").
+        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
+        print(f"excluded_repo_types_remapped={excluded_repo_types}")
+    ids = format_ids(data, exclude_keys=excluded_repo_types)
+    if not ids:
+        # `or ()` guards the None default, which previously crashed str.join.
+        return (
+            "Based on your collection and exclusions"
+            f" ({','.join(excluded_repo_types or ())}), there are no papers to recommend. Try removing some excluded repo types or adding more items to your collection."
+        )
+    # Tuple, because the Semantic Scholar helper is cached on its arguments.
+    ids = tuple(ids)
+    recommendations = get_recommendations_from_semantic_scholar(ids)
+    recommendations = recommendations.get("recommendedPapers")
+    if recommendations is None:
+        raise gr.Error("Something went wrong with the Semantic Scholar API")
+    grouped = group_by_is_arxiv_paper(recommendations)
+    return format_recommendation_into_markdown(grouped)
+# Heading and HTML/markdown blurb passed to gr.Interface as `title` and `description`.
+title = """📚 Collections Reading List Generator                 📚"""
+description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
+alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
+\n\n
+Hugging Face Collections allow you to curate models, datasets, spaces,
+and papers from the Hugging Face Hub.
+This Space will generate a reading list based on the items in your collection.
+This can be a great way to find related papers to the models and datasets in your collection and dive more deeply into a topic!
+The Space works by:
+- finding any papers in your collection
+- finding papers related to the models and datasets in your collection
+- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers.
+You can optionally exclude certain repo types from consideration when generating the reading list.
+"""
+# Single-line text input for the collection slug; the placeholder is an example slug.
+slug_input = gr.Textbox(
+    lines=1,
+    label="Collection Slug",
+    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
+)
+# Example rows for the interface: [slug, excluded repo types], one value per input below.
+example_slugs = [
+    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
+    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
+    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264",[]]
+]
+# Wire the slug textbox and the exclusion dropdown to get_recommendations_from_slug and
+# render its return value as markdown. The interface is launched at module top level,
+# so running (or importing) this file starts the app.
+gr.Interface(
+    get_recommendations_from_slug,
+    inputs=[
+        slug_input,
+        gr.Dropdown(
+            label="Repos to exclude from contributing to recommendations",
+            # These names must stay in sync with the keys of map_repo_name_to_api_key.
+            choices=["datasets", "models", "papers"],
+            multiselect=True,
+        ),
+    ],
+    outputs="markdown",
+    description=description,
+    title=title,
+    allow_flagging="never",
+    examples=example_slugs,
+).launch(debug=True)

requirements.in ADDED Viewed

	@@ -0,0 +1,4 @@

+cachetools
+gradio
+gradio_client
+httpx

requirements.txt ADDED Viewed

	@@ -0,0 +1,184 @@

+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile
+#
+aiofiles==23.2.1
+    # via gradio
+altair==5.1.1
+    # via gradio
+annotated-types==0.5.0
+    # via pydantic
+anyio==3.7.1
+    # via
+    #   fastapi
+    #   httpcore
+    #   starlette
+attrs==23.1.0
+    # via
+    #   jsonschema
+    #   referencing
+cachetools==5.3.1
+    # via -r requirements.in
+certifi==2023.7.22
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.2.0
+    # via requests
+click==8.1.7
+    # via uvicorn
+contourpy==1.1.1
+    # via matplotlib
+cycler==0.11.0
+    # via matplotlib
+fastapi==0.103.1
+    # via gradio
+ffmpy==0.3.1
+    # via gradio
+filelock==3.12.4
+    # via huggingface-hub
+fonttools==4.42.1
+    # via matplotlib
+fsspec==2023.9.2
+    # via
+    #   gradio-client
+    #   huggingface-hub
+gradio==3.45.1
+    # via -r requirements.in
+gradio-client==0.5.2
+    # via
+    #   -r requirements.in
+    #   gradio
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+httpcore==0.18.0
+    # via httpx
+httpx==0.25.0
+    # via
+    #   -r requirements.in
+    #   gradio
+    #   gradio-client
+huggingface-hub==0.17.3
+    # via
+    #   gradio
+    #   gradio-client
+idna==3.4
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+importlib-resources==6.1.0
+    # via gradio
+jinja2==3.1.2
+    # via
+    #   altair
+    #   gradio
+jsonschema==4.19.1
+    # via altair
+jsonschema-specifications==2023.7.1
+    # via jsonschema
+kiwisolver==1.4.5
+    # via matplotlib
+markupsafe==2.1.3
+    # via
+    #   gradio
+    #   jinja2
+matplotlib==3.8.0
+    # via gradio
+numpy==1.26.0
+    # via
+    #   altair
+    #   contourpy
+    #   gradio
+    #   matplotlib
+    #   pandas
+orjson==3.9.7
+    # via gradio
+packaging==23.1
+    # via
+    #   altair
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   matplotlib
+pandas==2.1.1
+    # via
+    #   altair
+    #   gradio
+pillow==10.0.1
+    # via
+    #   gradio
+    #   matplotlib
+pydantic==2.4.2
+    # via
+    #   fastapi
+    #   gradio
+pydantic-core==2.10.1
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pyparsing==3.1.1
+    # via matplotlib
+python-dateutil==2.8.2
+    # via
+    #   matplotlib
+    #   pandas
+python-multipart==0.0.6
+    # via gradio
+pytz==2023.3.post1
+    # via pandas
+pyyaml==6.0.1
+    # via
+    #   gradio
+    #   huggingface-hub
+referencing==0.30.2
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+requests==2.31.0
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+rpds-py==0.10.3
+    # via
+    #   jsonschema
+    #   referencing
+semantic-version==2.10.0
+    # via gradio
+six==1.16.0
+    # via python-dateutil
+sniffio==1.3.0
+    # via
+    #   anyio
+    #   httpcore
+    #   httpx
+starlette==0.27.0
+    # via fastapi
+toolz==0.12.0
+    # via altair
+tqdm==4.66.1
+    # via huggingface-hub
+typing-extensions==4.8.0
+    # via
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+tzdata==2023.3
+    # via pandas
+urllib3==2.0.5
+    # via requests
+uvicorn==0.23.2
+    # via gradio
+websockets==11.0.3
+    # via
+    #   gradio
+    #   gradio-client