Spaces:

peterkros
/

COFOG-Bert-AutoClassifier

Build error

App Files Files Community

peterkros committed on Dec 13, 2023

Commit

f9b0725

1 Parent(s): ae85134

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -3

app.py CHANGED Viewed

@@ -3,6 +3,98 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
 import pickle
 # Model names for level1 and level2
 model_name_level1 = "peterkros/COFOG-bert2"
@@ -37,14 +129,29 @@ def predict(text):
     predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
     predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]
-    # Predict Level2 (assuming level2 model uses both text and predicted level1 label)
     combined_input = text + " " + predicted_label_level1
     inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
     with torch.no_grad():
         outputs_level2 = model_level2(**inputs_level2)
     probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)
-    predicted_class_level2 = torch.argmax(probs_level2, dim=-1).item()
-    predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
     combined_prediction = f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
     return combined_prediction

 import torch
 import pickle
# Lookup table from a level-1 category label to the list of level-2 category
# labels that belong under it. The prediction code further down reads it with
# `.get(predicted_label_level1, [])` to restrict the level-2 choice to these
# candidates, so a key only takes effect when it equals the decoded level-1
# label exactly; candidates missing from `label_encoder_level2.classes_` are
# filtered out at that point.
# NOTE(review): the labels look like COFOG division/group names -- confirm the
# spellings against the classes stored in both label encoders.
+level1_to_level2_mapping = {
+    "General public services": [
+        "Executive and legislative organs, financial and fiscal affairs, external affairs",
+        "Foreign economic aid",
+        "General services",
+        "Basic research",
+        "R&D General public services",
+        "General public services n.e.c.",
+        "Public debt transactions",
+        "Transfers of a general character between different levels of government"
+    ],
+    "Defence": [
+        "Military defence",
+        "Civil defence",
+        "Foreign military aid",
+        "R&D Defence",
+        "Defence n.e.c."
+    ],
+    "Public order and safety": [
+        "Police services",
+        "Fire-protection services",
+        "Law courts",
+        "Prisons",
+        "R&D Public order and safety",
+        "Public order and safety n.e.c."
+    ],
+    "Economic affairs": [
+        "General economic, commercial and labour affairs",
+        "Agriculture, forestry, fishing and hunting",
+        "Fuel and energy",
+        "Mining, manufacturing and construction",
+        "Transport",
+        "Communication",
+        "Other industries",
+        "R&D Economic affairs",
+        "Economic affairs n.e.c."
+    ],
+    "Environmental protection": [
+        "Waste management",
+        "Waste water management",
+        "Pollution abatement",
+        "Protection of biodiversity and landscape",
+        "R&D Environmental protection",
+        "Environmental protection n.e.c."
+    ],
+    "Housing and community amenities": [
+        "Housing development",
+        "Community development",
+        "Water supply",
+        "Street lighting",
+        "R&D Housing and community amenities",
+        "Housing and community amenities n.e.c."
+    ],
+    "Health": [
+        "Medical products, appliances and equipment",
+        "Outpatient services",
+        "Hospital services",
+        "Public health services",
+        "R&D Health",
+        "Health n.e.c."
+    ],
+    "Recreation, culture and religion": [
+        "Recreational and sporting services",
+        "Cultural services",
+        "Broadcasting and publishing services",
+        "Religious and other community services",
+        "R&D Recreation, culture and religion",
+        "Recreation, culture and religion n.e.c."
+    ],
+    "Education": [
+        "Pre-primary and primary education",
+        "Secondary education",
+        "Post-secondary non-tertiary education",
+        "Tertiary education",
+        "Education not definable by level",
+        "Subsidiary services to education",
+        "R&D Education",
+        "Education n.e.c."
+    ],
+    "Social protection": [
+        "Sickness and disability",
+        "Old age",
+        "Survivors",
+        "Family and children",
+        "Unemployment",
+        "Housing",
+        "Social exclusion n.e.c.",
+        "R&D Social protection",
+        "Social protection n.e.c."
+    ]
+}
 # Model names for level1 and level2
 model_name_level1 = "peterkros/COFOG-bert2"
     predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
     predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]
+   # Predict Level2 (assuming level2 model uses both text and predicted level1 label)
     combined_input = text + " " + predicted_label_level1
     inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
     with torch.no_grad():
         outputs_level2 = model_level2(**inputs_level2)
     probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)
+    # Extract the probabilities for the candidate level2 categories
+    level2_candidates = level1_to_level2_mapping.get(predicted_label_level1, [])
+    candidate_indices = [label_encoder_level2.transform([candidate])[0] for candidate in level2_candidates if candidate in label_encoder_level2.classes_]
+    # Filter the probabilities
+    filtered_probs = probs_level2[0, candidate_indices]
+    # Get the highest probability label from the filtered list
+    if len(filtered_probs) > 0:
+        highest_prob_index = torch.argmax(filtered_probs).item()
+        predicted_class_level2 = candidate_indices[highest_prob_index]
+        predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
+    else:
+        predicted_label_level2 = "n.e.c"
     combined_prediction = f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
     return combined_prediction