Spaces:

Hacker1337
/

article_classifier

Sleeping

App Files Files Community

Hacker1337 committed on Jun 22, 2025

Commit

7c398ad

1 Parent(s): 0c21c10

Debugged app locally and customized it.

Browse files

Files changed (2) hide show

app.py +65 -46
dataset.py +12 -6

app.py CHANGED Viewed

@@ -1,15 +1,13 @@
 import os
-from transformers import AutoModelForSequenceClassification
 import gradio as gr
-import numpy as np
-import random
 # import spaces #[uncomment to use ZeroGPU]
 # from diffusers import DiffusionPipeline
 import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"
 if torch.cuda.is_available():
     torch_dtype = torch.float16
@@ -17,49 +15,61 @@ else:
     torch_dtype = torch.float32
-from dataset import labels, id2label, label2id, categorie2human
-# model_path = "distilbert/distilbert-base-cased" # todo, replace with hacker1337/article-classifier
-# model_path = os.path.expanduser(r"~\cache\huggingface\checkpoints\distilbert-arxiv\runs\Jun21_00-41-18_amir-xp")
-model_path = os.path.expanduser("~/.cache/huggingface/checkpoints/distilbert-arxiv")
-model = AutoModelForSequenceClassification.from_pretrained(
-    model_path,
-    num_labels=len(id2label),
-    id2label=id2label,
-    label2id=label2id,
-    problem_type="multi_label_classification",
 )
 # @spaces.GPU #[uncomment to use ZeroGPU]
 def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
     progress=gr.Progress(track_tqdm=True),
 ):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    # image = pipe(
-    #     prompt=prompt,
-    #     negative_prompt=negative_prompt,
-    #     guidance_scale=guidance_scale,
-    #     num_inference_steps=num_inference_steps,
-    #     width=width,
-    #     height=height,
-    #     generator=generator,
-    # ).images[0]
-    # return image, seed
 examples_titles = [
@@ -79,11 +89,6 @@ a comparison among the state of art networks both in terms of accuracy and in
 terms of speed which are of higher importance in real-time applications.""",
 ]
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
 css = """
 #col-container {
@@ -95,6 +100,13 @@ css = """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(" # Text-to-Image Gradio Template")
         title_prompt = gr.Text(
             label="Title Prompt",
@@ -111,10 +123,17 @@ with gr.Blocks(css=css) as demo:
             container=False,
         )
         run_button = gr.Button("Run", scale=0, variant="primary")
-        result = gr.Image(label="Result", show_label=False)
         # with gr.Accordion("Advanced Settings", open=False):
@@ -127,7 +146,7 @@ with gr.Blocks(css=css) as demo:
             title_prompt,
             summary_prompt,
         ],
-        outputs=[result],
     )
 if __name__ == "__main__":

 import os
+import socket
+import pandas as pd
 import gradio as gr
 # import spaces #[uncomment to use ZeroGPU]
 # from diffusers import DiffusionPipeline
 import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
 if torch.cuda.is_available():
     torch_dtype = torch.float16
     torch_dtype = torch.float32
+from dataset import category2human, create_prompt
+LOCAL_COMPUTER_NAMES = ["amir-xps"]
+def is_local_machine():
+    return socket.gethostname().lower() in [
+        name.lower() for name in LOCAL_COMPUTER_NAMES
+    ]
+if is_local_machine():
+    model_path = os.path.expanduser("~/.cache/huggingface/checkpoints/distilbert-arxiv")
+else:
+    model_path = "Hacker1337/distilbert-arxiv-checkpoint"
+from transformers import pipeline
+classifier = pipeline(
+    "text-classification",
+    model=model_path,
+    tokenizer=model_path,
 )
 # @spaces.GPU #[uncomment to use ZeroGPU]
 def infer(
+    title_prompt,
+    summary_prompt,
     progress=gr.Progress(track_tqdm=True),
 ):
+    sample_prompt_full = create_prompt(
+        title_prompt,
+        summary_prompt,
+    )
+    predictions = classifier(sample_prompt_full, top_k=None)
+    target_probs_sum = 0.95
+    print(predictions)
+    df = pd.DataFrame(predictions)
+    df["label"] = df["label"].apply(lambda x: category2human[x])
+    label_dict = {}
+    bar_plot_dict = {}
+    total_prop = sum([prediction["score"] for prediction in predictions])
+    gained_prob = 0
+    for prediction in predictions:
+        bar_plot_dict[prediction["label"]] = prediction["score"]
+        if (gained_prob + prediction["score"]) / total_prop < target_probs_sum:
+            label_dict[category2human[prediction["label"]]] = (
+                prediction["score"] / total_prop
+            )
+            gained_prob += prediction["score"]
+    if gained_prob < total_prop:
+        label_dict["Other"] = (total_prop - gained_prob) / total_prop
+    return df, label_dict
 examples_titles = [
 terms of speed which are of higher importance in real-time applications.""",
 ]
 css = """
 #col-container {
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(" # Text-to-Image Gradio Template")
+        gr.Markdown(
+            "This space classifies scientific machine learning papers into categories based on their title and abstract."
+            "Bar plot shows probabilities of belonging to each category."
+        )
+        gr.Markdown(
+            "Second thing predicts most probable single class classification. It shows only first 95\% of categories."
+        )
         title_prompt = gr.Text(
             label="Title Prompt",
             container=False,
         )
         run_button = gr.Button("Run", scale=0, variant="primary")
+        result_bar = gr.BarPlot(
+            label="Multi class classification",
+            show_label=True,
+            x="label",
+            y="score",
+            x_label_angle=30,
+        )
+        result_label = gr.Label(label="Single class selection")
         # with gr.Accordion("Advanced Settings", open=False):
             title_prompt,
             summary_prompt,
         ],
+        outputs=[result_bar, result_label],
     )
 if __name__ == "__main__":

dataset.py CHANGED Viewed

@@ -1,13 +1,14 @@
 labels = ["CV", "AI", "ML", "NE", "CL"]
 id2label = {i: label for i, label in enumerate(labels)}
 label2id = {label: i for i, label in enumerate(labels)}
-categorie2human = {
     "CV": "Computer Vision",
     "AI": "Artificial Intelligence",
     "ML": "Machine Learning",
     "NE": "Neural and Evolutionary Computing",
-    "CL": "Computation and Language"
 }
@@ -19,13 +20,11 @@ def load_arxiv_dataset():
     # Download latest version
     path = kagglehub.dataset_download("spsayakpaul/arxiv-paper-abstracts")
     dataset = load_dataset(
         "csv",
         data_files=os.path.join(path, "arxiv_data.csv"),
         encoding="utf-8",
-        split="train"
     )
     # convert string to lists
@@ -37,4 +36,11 @@ def load_arxiv_dataset():
     dataset = dataset.map(parse_terms)
-    return dataset

+# Short category codes; list position is the integer class id.
 labels = ["CV", "AI", "ML", "NE", "CL"]
+# Lookup tables between integer class ids and category codes.
 id2label = {i: label for i, label in enumerate(labels)}
 label2id = {label: i for i, label in enumerate(labels)}
+# Human-readable name for each category code.
+category2human = {
     "CV": "Computer Vision",
     "AI": "Artificial Intelligence",
     "ML": "Machine Learning",
     "NE": "Neural and Evolutionary Computing",
+    "CL": "Computation and Language",
 }
     # Download latest version
     path = kagglehub.dataset_download("spsayakpaul/arxiv-paper-abstracts")
     dataset = load_dataset(
         "csv",
         data_files=os.path.join(path, "arxiv_data.csv"),
         encoding="utf-8",
+        split="train",
     )
     # convert string to lists
     dataset = dataset.map(parse_terms)
+    return dataset
+def create_prompt(title, summary):
+    """
+    Build the single text prompt for the model from a title and a summary.
+
+    The result is four lines: a "# title:" header, the title, an
+    "# abstract:" header, and the summary.
+    """
+    return f"# title:\n{title}\n# abstract:\n{summary}"