Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| import torch | |
| import pickle | |
| from dotenv import load_dotenv | |
| import os | |
| import pandas as pd | |
# Read variables from a local .env file into the process environment so that
# HF_TOKEN can be looked up below.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

# Hugging Face dataset repo that flagged predictions are written to.
HF_dataset = "peterkros/COFOG-feedback"  # <-- Replace with your dataset repo ID

# Flagging callback; it is passed to the single-line interface as
# `flagging_callback`.
hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, HF_dataset)
# Level-1 label -> list of level-2 labels that belong to it.
# `predict` looks up the predicted level-1 label here and only considers the
# listed level-2 labels when choosing the level-2 prediction.
level1_to_level2_mapping = {
    "General public services": [
        "Executive and legislative organs, financial and fiscal affairs, external affairs",
        "Foreign economic aid",
        "General services",
        "Basic research",
        "R&D General public services",
        "General public services n.e.c.",
        "Public debt transactions",
        "Transfers of a general character between different levels of government",
    ],
    "Defence": [
        "Military defence",
        "Civil defence",
        "Foreign military aid",
        "R&D Defence",
        "Defence n.e.c.",
    ],
    "Public order and safety": [
        "Police services",
        "Fire-protection services",
        "Law courts",
        "Prisons",
        "R&D Public order and safety",
        "Public order and safety n.e.c.",
    ],
    "Economic affairs": [
        "General economic, commercial and labour affairs",
        "Agriculture, forestry, fishing and hunting",
        "Fuel and energy",
        "Mining, manufacturing and construction",
        "Transport",
        "Communication",
        "Other industries",
        "R&D Economic affairs",
        "Economic affairs n.e.c.",
    ],
    "Environmental protection": [
        "Waste management",
        "Waste water management",
        "Pollution abatement",
        "Protection of biodiversity and landscape",
        "R&D Environmental protection",
        "Environmental protection n.e.c.",
    ],
    "Housing and community amenities": [
        "Housing development",
        "Community development",
        "Water supply",
        "Street lighting",
        "R&D Housing and community amenities",
        "Housing and community amenities n.e.c.",
    ],
    "Health": [
        "Medical products, appliances and equipment",
        "Outpatient services",
        "Hospital services",
        "Public health services",
        "R&D Health",
        "Health n.e.c.",
    ],
    "Recreation, culture and religion": [
        "Recreational and sporting services",
        "Cultural services",
        "Broadcasting and publishing services",
        "Religious and other community services",
        "R&D Recreation, culture and religion",
        "Recreation, culture and religion n.e.c.",
    ],
    "Education": [
        "Pre-primary and primary education",
        "Secondary education",
        "Post-secondary non-tertiary education",
        "Tertiary education",
        "Education not definable by level",
        "Subsidiary services to education",
        "R&D Education",
        "Education n.e.c.",
    ],
    "Social protection": [
        "Sickness and disability",
        "Old age",
        "Survivors",
        "Family and children",
        "Unemployment",
        "Housing",
        "Social exclusion n.e.c.",
        "R&D Social protection",
        "Social protection n.e.c.",
    ],
}
# Checkpoint identifiers: one sequence classifier per classification level.
model_name_level1 = "peterkros/COFOG-bert2"
model_name_level2 = "peterkros/COFOG-bert-level2"

# Tokenizer and classification model for each level, loaded once at import
# time and shared by every prediction call.
tokenizer_level1 = AutoTokenizer.from_pretrained(model_name_level1)
model_level1 = AutoModelForSequenceClassification.from_pretrained(model_name_level1)
tokenizer_level2 = AutoTokenizer.from_pretrained(model_name_level2)
model_level2 = AutoModelForSequenceClassification.from_pretrained(model_name_level2)

# Label encoders used by `predict` to convert between class indices and label
# names. They are unpickled from files in the working directory.
# NOTE: pickle executes code on load; only ship encoder files you trust.
with open('label_encoder_level1.pkl', 'rb') as encoder_file:
    label_encoder_level1 = pickle.load(encoder_file)

with open('label_encoder_level2.pkl', 'rb') as encoder_file:
    label_encoder_level2 = pickle.load(encoder_file)
def _class_probabilities(text, tokenizer, model):
    """Return the softmax class probabilities that *model* assigns to *text*.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    the model without gradient tracking. The returned tensor is the softmax of
    the model's logits over the last dimension.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    return torch.nn.functional.softmax(logits, dim=-1)


def predict(text):
    """Classify one budget line into a level-1 and a level-2 label.

    The level-1 model is run on *text*. The predicted level-1 label is then
    appended to the text and the level-2 model is run on the combination. Only
    level-2 labels listed under the predicted level-1 label in
    ``level1_to_level2_mapping`` (and known to the level-2 label encoder) are
    eligible; the most probable of those is returned.

    Args:
        text: The budget line to classify. Must contain at least two
            whitespace-separated words.

    Returns:
        ``"Level1: <label> - Level2: <label>"`` on success, or a validation
        message when the input is missing or has fewer than two words. The
        level-2 part is ``"n.e.c"`` when no eligible level-2 label exists.
    """
    # `not text` covers None and "" so a missing value yields the validation
    # message instead of an AttributeError on `.split()`.
    if not text or len(text.split()) < 2:
        return "Input must have at least two words."

    # Level 1: plain argmax over all classes.
    probs_level1 = _class_probabilities(text, tokenizer_level1, model_level1)
    predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
    predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]

    # Level 2: the level-2 model receives the text plus the level-1 label.
    combined_input = text + " " + predicted_label_level1
    probs_level2 = _class_probabilities(combined_input, tokenizer_level2, model_level2)

    # Keep only the level-2 labels that belong to the predicted level-1 label
    # and that the level-2 encoder actually knows.
    known_level2_labels = set(label_encoder_level2.classes_)
    level2_candidates = [
        candidate
        for candidate in level1_to_level2_mapping.get(predicted_label_level1, [])
        if candidate in known_level2_labels
    ]

    if level2_candidates:
        # One encoder call for all candidates; plain ints keep the tensor
        # indexing below independent of the encoder's integer dtype.
        candidate_indices = [int(index) for index in label_encoder_level2.transform(level2_candidates)]
        filtered_probs = probs_level2[0, candidate_indices]
        highest_prob_index = torch.argmax(filtered_probs).item()
        predicted_class_level2 = candidate_indices[highest_prob_index]
        predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
    else:
        # Nothing eligible under this level-1 label.
        predicted_label_level2 = "n.e.c"

    return f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
def classify_csv(file_obj):
    """Classify every row of an uploaded CSV file.

    The CSV must contain a ``text`` column. Each row is classified with
    ``predict`` using the row's text preceded by up to five earlier rows as
    context.

    Args:
        file_obj: The uploaded CSV, in any form ``pandas.read_csv`` accepts.

    Returns:
        A DataFrame with columns ``"Budget Line"`` (the original cell value)
        and ``"Prediction"``. If the ``text`` column is missing, a one-row
        DataFrame with a single ``"Error"`` column holding the message.
    """
    df = pd.read_csv(file_obj)

    if 'text' not in df.columns:
        # The result feeds a DataFrame output component, so the message is
        # wrapped in a DataFrame rather than returned as a bare string.
        # NOTE(review): Gradio's Dataframe component is understood to treat a
        # returned str as a CSV file path - confirm for the pinned version.
        return pd.DataFrame({"Error": ["There is no column named 'text' in the file."]})

    # Empty cells are read as NaN and numeric cells as numbers; normalise to
    # strings once so the context join below cannot raise TypeError.
    lines = df['text'].fillna("").astype(str).tolist()

    results = []
    for i, original_value in enumerate(df['text']):
        # Current line plus up to 5 preceding lines, oldest first.
        context = " ".join(lines[max(0, i - 5):i + 1])

        # Tokenize with truncation so the context fits the model's limit.
        # NOTE(review): if truncation drops tokens from the end, an overlong
        # context would lose the current line first - confirm the tokenizer's
        # truncation side.
        inputs = tokenizer_level1(context, truncation=True, max_length=512, return_tensors="pt")

        # Decode back to text without special tokens; otherwise the literal
        # "[CLS]"/"[SEP]" markers would be fed to `predict` as input text.
        truncated_context = tokenizer_level1.decode(inputs['input_ids'][0], skip_special_tokens=True)

        prediction = predict(truncated_context)
        results.append((original_value, prediction))

    return pd.DataFrame(results, columns=["Budget Line", "Prediction"])
# Markdown shown as the description of the single-line tab.
markdown_text = """
- Trained with ~1500 rows of data on bert-base-uncased, English.
- Input one budget line per time with min 2 words.
- Accuracy of the model is ~88%.
"""

# Markdown shown as the description of the batch (CSV upload) tab.
markdown_text_file_upload = """
- Trained with ~1500 rows of data on bert-base-uncased, English.
- Upload CSV ONLY and name your column with budget line item as **text**.
- Using RAG (Retrieval-augmented generation) approach to feed context into classifier using preceding lines of budget.
- Accuracy of the model is ~88%.
"""

# HTML shown below both tabs as the "article". It contains one <h2> and one
# <p>; every tag opened here is also closed here.
html_table = """
<h2 style="text-align: center;">COFOG Budget AutoClassification</h2>
<p style="text-align: justify; margin-left: 30px; margin-right: 30px;">
This classifier was developed utilizing the pre-trained BERT
(Bidirectional Encoder Representations from Transformers) model
with an uncased configuration, with over 1500 manually
labeled dataset comprising budget line items extracted from
various budgetary documents. To balance the data, additional data
was generated using GPT-4 where categories were not available
in budget documents. The model training was executed
on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
The model is designed to predict the primary classification level
of the Classification of the Functions of Government (COFOG),
with the predictions from the first level serving as contextual
input for subsequent second-level classification. The project
is conducted with an exclusive focus on academic and research
objectives.<br>For batch prediction we integrated Retrieval-Augmented Generation (RAG)
approach. This approach enriches the prediction process
by incorporating contextual information from up to 5 preceding
lines in the dataset, significantly enhancing the model's
ability to understand and classify each entry in the context
of related data.<br>Detailed metrics of the training process are as follows:
<code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
metrics={'train_runtime': 650.0119, 'train_samples_per_second':
9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>.
</p>
"""
# Tab 1: classify a single budget line typed into a textbox.
iface1 = gr.Interface(
    title="COFOG AutoClassification - Single Line",
    description=markdown_text,
    article=html_table,
    fn=predict,
    inputs=gr.components.Textbox(
        label="Budget Input",
        placeholder="Enter Budget line here...",
        lines=1,
    ),
    outputs=gr.components.Label(label="Classification Output"),
    # Manual flagging: flagged samples are handed to `hf_writer`. The option
    # strings are stored with the flagged data, so they are kept verbatim.
    allow_flagging="manual",
    flagging_options=["Incorect Level1", "Incorect Level2"],
    flagging_callback=hf_writer,
)
# Tab 2: classify every row of an uploaded CSV and show the results as a table.
iface2 = gr.Interface(
    title="COFOG AutoClassification - Batch Classification",
    description=markdown_text_file_upload,
    article=html_table,
    fn=classify_csv,
    inputs=gr.components.File(label="Upload CSV File"),
    outputs=gr.components.DataFrame(label="Classification Results"),
)
# Present both interfaces as tabs of one app; tab names are matched to the
# interfaces by position.
tabbed_interface = gr.TabbedInterface(
    interface_list=[iface1, iface2],
    tab_names=["Single Prediction", "Batch Prediction"],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    tabbed_interface.launch()