asjc-classification
/

scibert_multilabel_asjc_classifier

@@ -78,32 +78,80 @@ For **26 parent subjects**, F1-score improves to **0.934** with full metadata.
 ## 🔍 Example Usage
 ```python
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import torch
-import json
 # Load model and tokenizer
-model_name = "your-hf-username/open-asjc-multilabel"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name)
-# Load sample input
-with open("small_example.json") as f:
-    data = json.load(f)
-text = data["title"] + " " + data.get("abstract", "")
-inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-# Predict
-with torch.no_grad():
-    outputs = model(**inputs)
-    probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]
-# Apply threshold
-threshold = 0.3
-predicted_labels = [label for label, prob in zip(model.config.id2label.values(), probs) if prob >= threshold]
 ```
 ## 📖 Citation
 If you use this work, please cite:

 ## 🔍 Example Usage
 ```python
+# Load packages
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import pandas as pd
+import json     # for json files
+import torch    # for tensor computation and deep learning
 # Load model and tokenizer
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoModelForSequenceClassification.from_pretrained("asjc-classification/scibert_multilabel_asjc_classifier")
+model.to(device)
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained("asjc-classification/scibert_multilabel_asjc_classifier", do_lower_case=True)
+# Load the JSON file
+with open("small_example.json", "r") as file:
+    data = json.load(file)
+# Access the "all" array
+all_articles = data["all"]
+# Load the categories from the CSV file
+path = 'Categories.csv'
+# Read the CSV file into a pandas DataFrame
+df_categories = pd.read_csv(path, delimiter=';')
+# Extract the 'SUBJECT TERM' column as a list of class names
+classes = df_categories['SUBJECT TERM'].tolist()
+# Create mappings from class names to integer IDs and vice versa
+class2id = {class_: id for id, class_ in enumerate(classes)}
+id2class = {id: class_ for class_, id in class2id.items()}
+# Iterate over each example in the "all" array
+for example_data in all_articles:
+    # Extract the text and labels
+    example_article = example_data["string"]
+    true_labels_example = json.loads(example_data["subject"])  # Load the labels as a list
+    # Tokenize the article metadata
+    inputs = tokenizer(example_article, return_tensors='pt', truncation=True, padding=True, max_length=512)
+    # Move inputs to the correct device (CPU or GPU)
+    inputs = {key: value.to(device) for key, value in inputs.items()}
+    # Make predictions with the model
+    model.eval()            # Set the model to evaluation mode
+    with torch.no_grad():   # No gradient computation
+        outs = model(**inputs)
+        logits = outs[0]    # Raw predictions (logits)
+        pred_probs = torch.sigmoid(logits)  # Convert to probabilities using Sigmoid
+    # Convert probabilities to NumPy array
+    pred_probs = pred_probs.cpu().numpy().flatten()
+    # Create a DataFrame with probabilities and label names
+    df_probs = pd.DataFrame([pred_probs], columns=classes)
+    # Sort by highest probabilities and output the top 5 labels
+    top_5_predictions = df_probs.iloc[0].sort_values(ascending=False).head(5)
+    print(f"\n Text: {example_article}")
+    print(f"\n🔹 **Top 5 predicted labels for the example:**")
+    for label, prob in top_5_predictions.items():
+        print(f"   - {label}: {prob:.4f}")
+    # Display the actual labels of the example (True Labels)
+    print("\n✅ **Actual labels for the example:**")
+    for label in true_labels_example:
+        print(f"   - {label}")
 ```
+---
 ## 📖 Citation
 If you use this work, please cite: