nbconvert

Sleeping

App Files Files Community

davanstrien HF Staff committed on Mar 10, 2023

Commit

14c6f3f

1 Parent(s): 49ef0df

cache hub checks

Browse files

Files changed (1) hide show

app.py +42 -22

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import contextlib
-from typing import Literal, Tuple, Dict, List
 import httpx
 import nbformat
 from nbformat import NotebookNode, ValidationError
@@ -14,26 +14,45 @@ import re
 from traitlets.config import Config
 from huggingface_hub import model_info, dataset_info
 from huggingface_hub.utils import RepositoryNotFoundError
 hub_id_regex = re.compile(r"[^\w]([a-zA-Z\d-]{3,32}\/[\w\-._]{3,64})[^\w/]")
 class HubIDCell(Preprocessor):  # records Hub repo ids referenced in code cells into `resources`
     def preprocess_cell(self, cell, resources, index):  # returns (cell, resources); the cell is not modified here
         if cell["cell_type"] == "code":  # cells of any other type are passed through unscanned
-            resources.setdefault("dataset_matches", [])  # lists: an id seen in several cells is appended each time
-            resources.setdefault("model_matches", [])
-            match = re.search(hub_id_regex, cell["source"])  # re.search yields only the first candidate per cell
-            if match:
                 hub_id_match = match.groups(0)[0]  # first capture group: the `xxx/yyy` text matched by hub_id_regex
-                print(hub_id_match)  # debug output of every candidate id
-                try:  # try the candidate as a model repo first
-                    model = model_info(hub_id_match)
-                    resources["model_matches"].append(model.modelId)
-                except RepositoryNotFoundError:  # not a model repo: retry as a dataset repo
-                    with contextlib.suppress(RepositoryNotFoundError):  # not a dataset either: record nothing
-                        dataset = dataset_info(hub_id_match)
-                        resources["dataset_matches"].append(dataset.id)
         return cell, resources
@@ -52,6 +71,7 @@ async def healthz(_):
     return JSONResponse({"success": True})
 def convert(
     s: str, theme: Literal["light", "dark"], debug_info: str
 ) -> Tuple[str, List[str], List[str]]:
@@ -63,7 +83,7 @@ def convert(
         )
     except nbformat.reader.NotJSONError:
         print(400, f"Notebook is not JSON. {debug_info}")
-        raise HTTPException(400, f"Notebook is not JSON.")
     except ValidationError as e:
         print(
             400,
@@ -117,14 +137,14 @@ async def convert_from_url(req: Request):
     html_text, model_matches, dataset_matches = convert(
         r.text, theme=theme, debug_info=f"url={url}"
     )
-    return HTMLResponse(content=html_text)
-    # return JSONResponse(
-    #     content={
-    #         "html": html_text,
-    #         "model_matches": model_matches,
-    #         "dataset_matches": dataset_matches,
-    #     }
-    # )
 async def convert_from_upload(req: Request):

 import contextlib
+from typing import Literal, Tuple, List
 import httpx
 import nbformat
 from nbformat import NotebookNode, ValidationError
 from traitlets.config import Config
 from huggingface_hub import model_info, dataset_info
 from huggingface_hub.utils import RepositoryNotFoundError
+from functools import lru_cache
 hub_id_regex = re.compile(r"[^\w]([a-zA-Z\d-]{3,32}\/[\w\-._]{3,64})[^\w/]")
+@lru_cache(  # memoises the result per hub id, including the implicit None returned when nothing is found
+    maxsize=4096
+)  # TODO possibly make async but might be tricky to call inside PreProcessor
+def check_hub_item(hub_id_match):  # returns (hub_id_match, "model"), (hub_id_match, "dataset"), or None implicitly
+    with contextlib.suppress(RepositoryNotFoundError):  # not a model repo: fall through to the dataset check
+        model_info(hub_id_match)
+        return hub_id_match, "model"
+    with contextlib.suppress(RepositoryNotFoundError):  # not a dataset repo either: fall off the end (None)
+        dataset_info(hub_id_match)
+        return hub_id_match, "dataset"
+# async def check_repo_exists(regex_hub_id_match: str) -> Optional[Tuple[str, str]]:
+#     r = await client.get(f"https://huggingface.co/api/models/{regex_hub_id_match}")
+#     if r.status_code == 200:
+#         return regex_hub_id_match, 'model'
+#     r = await client.get(f"https://huggingface.co/api/datasets/{regex_hub_id_match}")
+#     if r.status_code == 200:
+#         return regex_hub_id_match, 'dataset'
 class HubIDCell(Preprocessor):  # records Hub repo ids referenced in code cells into `resources`
     def preprocess_cell(self, cell, resources, index):  # returns (cell, resources); the cell is not modified here
         if cell["cell_type"] == "code":  # cells of any other type are passed through unscanned
+            resources.setdefault("dataset_matches", set())  # sets: an id repeated across cells is stored once
+            resources.setdefault("model_matches", set())
+            if match := re.search(hub_id_regex, cell["source"]):  # re.search yields only the first candidate per cell
                 hub_id_match = match.groups(0)[0]  # first capture group: the `xxx/yyy` text matched by hub_id_regex
+                if hub_check := check_hub_item(hub_id_match):  # None (falsy) when neither Hub lookup succeeds
+                    hub_id_match, repo_item_type = hub_check  # unpack the (id, "model" | "dataset") pair
+                    if repo_item_type == "model":
+                        resources["model_matches"].add(hub_id_match)
+                    if repo_item_type == "dataset":
+                        resources["dataset_matches"].add(hub_id_match)
         return cell, resources
     return JSONResponse({"success": True})
+@lru_cache(maxsize=2048)
 def convert(
     s: str, theme: Literal["light", "dark"], debug_info: str
 ) -> Tuple[str, List[str], List[str]]:
         )
     except nbformat.reader.NotJSONError:
         print(400, f"Notebook is not JSON. {debug_info}")
+        raise HTTPException(400, "Notebook is not JSON.")
     except ValidationError as e:
         print(
             400,
     html_text, model_matches, dataset_matches = convert(
         r.text, theme=theme, debug_info=f"url={url}"
     )
+    # return HTMLResponse(content=html_text)
+    return JSONResponse(
+        content={
+            "html": html_text,
+            "model_matches": list(model_matches),
+            "dataset_matches": list(dataset_matches),
+        }
+    )
 async def convert_from_upload(req: Request):