Spaces:

monetjoe
/

cv_backbones

Running

App Files Files

admin committed on Apr 17, 2025

Commit

4ab8727

1 Parent(s): 6ab8ddf

sync ms

Browse files

Files changed (2) hide show

app.py +68 -51
requirements.txt +1 -3

app.py CHANGED Viewed

@@ -7,18 +7,17 @@ import pandas as pd
 from tqdm import tqdm
 from bs4 import BeautifulSoup
-cache_json = "cv_backbones.json"
-def parse_url(url):
     response = requests.get(url)
     html = response.text
     return BeautifulSoup(html, "html.parser")
-def special_type(m_ver):
     m_type = re.search("[a-zA-Z]+", m_ver).group(0)
     if m_type == "wide" or m_type == "resnext":
         return "resnet"
@@ -31,7 +30,7 @@ def special_type(m_ver):
     return m_type
-def info_on_dataset(m_ver, m_type, in1k_span):
     url_span = in1k_span.find_next_sibling("span", {"class": "s2"})
     size_span = url_span.find_next_sibling("span", {"class": "mi"})
     m_url = str(url_span.text[1:-1])
@@ -45,94 +44,112 @@ def gen_dataframe(url="https://pytorch.org/vision/main/_modules/"):
     article = torch_page.find("article", {"id": "pytorch-article"})
     ul = article.find("ul").find("ul")
     in1k_v1, in1k_v2 = [], []
     for li in tqdm(ul.find_all("li"), desc="Crawling cv backbone info..."):
         name = str(li.text)
         if name.__contains__("torchvision.models.") and len(name.split(".")) == 3:
-            if (
-                name.__contains__("_api")
-                or name.__contains__("feature_extraction")
-                or name.__contains__("maxvit")
-            ):
                 continue
             href = li.find("a").get("href")
             model_page = parse_url(url + href)
             divs = model_page.select("div.viewcode-block")
             for div in divs:
                 div_id = str(div["id"])
                 if div_id.__contains__("_Weights"):
                     m_ver = div_id.split("_Weight")[0].lower()
-                    if m_ver.__contains__("swin_v2_"):
-                        continue
                     m_type = special_type(m_ver)
                     in1k_v1_span = div.find(
-                        name="span", attrs={"class": "n"}, string="IMAGENET1K_V1"
                     )
                     if not in1k_v1_span:
                         continue
                     m_dict, size_span = info_on_dataset(m_ver, m_type, in1k_v1_span)
                     in1k_v1.append(m_dict)
                     in1k_v2_span = size_span.find_next_sibling(
-                        name="span", attrs={"class": "n"}, string="IMAGENET1K_V2"
                     )
                     if in1k_v2_span:
                         m_dict, _ = info_on_dataset(m_ver, m_type, in1k_v2_span)
                         in1k_v2.append(m_dict)
     dataset = {"IMAGENET1K_V1": in1k_v1, "IMAGENET1K_V2": in1k_v2}
-    with open("IMAGENET1K_V1.jsonl", "w", encoding="utf-8") as jsonl_file:
         for item in in1k_v1:
             jsonl_file.write(json.dumps(item) + "\n")
-    with open("IMAGENET1K_V2.jsonl", "w", encoding="utf-8") as jsonl_file:
         for item in in1k_v2:
             jsonl_file.write(json.dumps(item) + "\n")
     return dataset
-def inference(subset):
-    cache_json = f"{subset}.jsonl"
-    if os.path.exists(cache_json):
-        with open(cache_json, "r", encoding="utf-8") as jsonl_file:
-            dataset = [json.loads(line) for line in jsonl_file]
-    else:
-        dataset = gen_dataframe()[subset]
-    return pd.DataFrame(dataset), cache_json
-def sync(subset):
-    cache_json = f"{subset}.jsonl"
-    if os.path.exists(cache_json):
-        os.remove(cache_json)
-    return None
-with gr.Blocks() as demo:
-    with gr.Row():
-        subset_opt = gr.Dropdown(
-            choices=["IMAGENET1K_V1", "IMAGENET1K_V2"], value="IMAGENET1K_V1"
-        )
-        sync_btn = gr.Button("Clean cache")
-        dld_file = gr.components.File(label="Download JSON lines")
-    with gr.Row():
-        data_frame = gr.Dataframe(headers=["ver", "type", "input_size", "url"])
-    subset_opt.change(inference, inputs=subset_opt, outputs=[data_frame, dld_file])
-    sync_btn.click(sync, inputs=subset_opt, outputs=dld_file)
-demo.launch()

 from tqdm import tqdm
 from bs4 import BeautifulSoup
+V_TO_SPLIT = {"IMAGENET1K_V1": "train", "IMAGENET1K_V2": "test"}
+def parse_url(url: str):
     response = requests.get(url)
     html = response.text
     return BeautifulSoup(html, "html.parser")
+def special_type(m_ver: str):
     m_type = re.search("[a-zA-Z]+", m_ver).group(0)
     if m_type == "wide" or m_type == "resnext":
         return "resnet"
     return m_type
+def info_on_dataset(m_ver: str, m_type: str, in1k_span):
     url_span = in1k_span.find_next_sibling("span", {"class": "s2"})
     size_span = url_span.find_next_sibling("span", {"class": "mi"})
     m_url = str(url_span.text[1:-1])
     article = torch_page.find("article", {"id": "pytorch-article"})
     ul = article.find("ul").find("ul")
     in1k_v1, in1k_v2 = [], []
     for li in tqdm(ul.find_all("li"), desc="Crawling cv backbone info..."):
         name = str(li.text)
         if name.__contains__("torchvision.models.") and len(name.split(".")) == 3:
+            if name.__contains__("_api") or name.__contains__("feature_extraction"):
                 continue
             href = li.find("a").get("href")
             model_page = parse_url(url + href)
             divs = model_page.select("div.viewcode-block")
             for div in divs:
                 div_id = str(div["id"])
                 if div_id.__contains__("_Weights"):
                     m_ver = div_id.split("_Weight")[0].lower()
                     m_type = special_type(m_ver)
                     in1k_v1_span = div.find(
+                        name="span",
+                        attrs={"class": "n"},
+                        string="IMAGENET1K_V1",
                     )
                     if not in1k_v1_span:
                         continue
                     m_dict, size_span = info_on_dataset(m_ver, m_type, in1k_v1_span)
                     in1k_v1.append(m_dict)
                     in1k_v2_span = size_span.find_next_sibling(
+                        name="span",
+                        attrs={"class": "n"},
+                        string="IMAGENET1K_V2",
                     )
                     if in1k_v2_span:
                         m_dict, _ = info_on_dataset(m_ver, m_type, in1k_v2_span)
                         in1k_v2.append(m_dict)
     dataset = {"IMAGENET1K_V1": in1k_v1, "IMAGENET1K_V2": in1k_v2}
+    with open("train.jsonl", "w", encoding="utf-8") as jsonl_file:
         for item in in1k_v1:
             jsonl_file.write(json.dumps(item) + "\n")
+    with open("test.jsonl", "w", encoding="utf-8") as jsonl_file:
         for item in in1k_v2:
             jsonl_file.write(json.dumps(item) + "\n")
     return dataset
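+# Gradio callback: load the selected subset from its cached .jsonl (crawling it if the cache is missing) and return status, preview table, and file path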
+def infer(subset: str):
+    status = "Success"
+    prewiew = out_json = None
+    try:
+        cache_json = f"{V_TO_SPLIT[subset]}.jsonl"
+        if os.path.exists(cache_json):
+            with open(cache_json, "r", encoding="utf-8") as jsonl_file:
+                dataset = [json.loads(line) for line in jsonl_file]
+        else:
+            dataset = gen_dataframe()[subset]
+        prewiew = pd.DataFrame(dataset)
+        out_json = cache_json
+    except Exception as e:
+        status = f"{e}"
+    return status, prewiew, out_json
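+# Gradio callback: delete the selected subset's cached .jsonl file and return the status plus a cleared download slot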
+def sync(subset: str):
+    status = "Success"
+    try:
+        cache_json = f"{V_TO_SPLIT[subset]}.jsonl"
+        if os.path.exists(cache_json):
+            os.remove(cache_json)
+        if os.path.exists(cache_json):
+            raise Exception(f"Failed to clean {cache_json}")
+    except Exception as e:
+        status = f"{e}"
+    return status, None
+if __name__ == "__main__":
+    with gr.Blocks() as demo:
+        with gr.Row():
+            with gr.Column():
+                subset_opt = gr.Dropdown(
+                    choices=["IMAGENET1K_V1", "IMAGENET1K_V2"],
+                    value="IMAGENET1K_V1",
+                )
+                sync_btn = gr.Button("Clean cache")
+            with gr.Column():
+                status_bar = gr.Textbox(label="Status", show_copy_button=True)
+                dld_file = gr.File(label="Download JSON lines")
+        with gr.Row():
+            data_frame = gr.Dataframe(headers=["ver", "type", "input_size", "url"])
+        subset_opt.change(
+            infer,
+            inputs=subset_opt,
+            outputs=[status_bar, data_frame, dld_file],
+        )
+        sync_btn.click(sync, inputs=subset_opt, outputs=[status_bar, dld_file])
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,2 @@
-pandas
-tqdm
 bs4
-requests




1	bs4
2	+ pandas