Commit ·
d802797
1
Parent(s): b52b33f
Initial commit: NFQA multilingual question classifier
Browse files
- XLM-RoBERTa fine-tuned for 8-way question classification
- 88.6% accuracy, 86.7% macro F1 on 49 languages
- 62,932 examples: LLM-annotated WebFAQ + generated data
- Ensemble annotation: LLaMA 3.1, Gemma 2, Qwen 2.5
- Quality threshold: 0.6 confidence (at least 2 of 3 models agree)
- Includes model weights, tokenizer, metrics, and documentation
- Ready for Hugging Face deployment
- .gitattributes +1 -0
- .gitignore +44 -0
- README.md +438 -3
- classification_report.txt +14 -0
- config.json +48 -0
- confusion_matrix.png +3 -0
- model.safetensors +3 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +15 -0
- test_results.json +22 -0
- tokenizer.json +3 -0
- tokenizer_config.json +55 -0
- training_curves.png +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
build/
|
| 11 |
+
develop-eggs/
|
| 12 |
+
dist/
|
| 13 |
+
downloads/
|
| 14 |
+
eggs/
|
| 15 |
+
.eggs/
|
| 16 |
+
lib/
|
| 17 |
+
lib64/
|
| 18 |
+
parts/
|
| 19 |
+
sdist/
|
| 20 |
+
var/
|
| 21 |
+
wheels/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# Jupyter Notebook
|
| 27 |
+
.ipynb_checkpoints
|
| 28 |
+
|
| 29 |
+
# macOS
|
| 30 |
+
.DS_Store
|
| 31 |
+
.AppleDouble
|
| 32 |
+
.LSOverride
|
| 33 |
+
|
| 34 |
+
# Editor
|
| 35 |
+
.vscode/
|
| 36 |
+
.idea/
|
| 37 |
+
*.swp
|
| 38 |
+
*.swo
|
| 39 |
+
*~
|
| 40 |
+
|
| 41 |
+
# Temporary files
|
| 42 |
+
*.tmp
|
| 43 |
+
*.bak
|
| 44 |
+
*.log
|
README.md
CHANGED
|
@@ -1,3 +1,438 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NFQA Multilingual Question Classifier
|
| 2 |
+
|
| 3 |
+
A multilingual question classification model that categorizes questions into 8 distinct types based on the Non-Factoid Question Answering (NFQA) taxonomy.
|
| 4 |
+
|
| 5 |
+
## Model Description
|
| 6 |
+
|
| 7 |
+
This model classifies questions across **49 languages** into **8 categories** of question types, enabling better understanding of user intent and question characteristics for information retrieval and question answering systems.
|
| 8 |
+
|
| 9 |
+
### Model Details
|
| 10 |
+
|
| 11 |
+
- **Model Type**: Multilingual Text Classification
|
| 12 |
+
- **Base Model**: [xlm-roberta-base](https://huggingface.co/xlm-roberta-base)
|
| 13 |
+
- **Languages**: 49 languages (European, Asian, and Middle Eastern languages)
|
| 14 |
+
- **Categories**: 8 NFQA question types
|
| 15 |
+
- **Parameters**: ~278M parameters
|
| 16 |
+
- **Training Date**: January 2026
|
| 17 |
+
- **License**: [Specify your license]
|
| 18 |
+
|
| 19 |
+
### Developers
|
| 20 |
+
|
| 21 |
+
Developed at the University of Passau (see Model Card Contact) for research in multilingual question understanding and classification.
|
| 22 |
+
|
| 23 |
+
## Intended Use
|
| 24 |
+
|
| 25 |
+
### Primary Use Cases
|
| 26 |
+
|
| 27 |
+
- **Question Type Classification**: Automatically categorize user questions to route them to appropriate answering systems
|
| 28 |
+
- **Search Intent Understanding**: Enhance search engines by understanding the type of information users seek
|
| 29 |
+
- **Chatbot Development**: Improve conversational AI by identifying question types
|
| 30 |
+
- **FAQ Organization**: Automatically organize FAQ databases by question type
|
| 31 |
+
- **Content Recommendation**: Suggest relevant content based on question type
|
| 32 |
+
|
| 33 |
+
### Out-of-Scope Use
|
| 34 |
+
|
| 35 |
+
- This model is NOT designed for content moderation or filtering
|
| 36 |
+
- Should not be used as the sole decision-maker in high-stakes applications
|
| 37 |
+
- Not suitable for detecting malicious intent or harmful content
|
| 38 |
+
|
| 39 |
+
## Training Data
|
| 40 |
+
|
| 41 |
+
### Dataset Composition
|
| 42 |
+
|
| 43 |
+
The model was trained on a carefully curated and balanced multilingual dataset:
|
| 44 |
+
|
| 45 |
+
- **Total Examples**: 62,932 question-label pairs
|
| 46 |
+
- **Source Data**:
|
| 47 |
+
- ~49,000 examples from the WebFAQ dataset (LLM-annotated with ensemble voting)
|
| 48 |
+
- ~14,000 examples generated and validated using LLMs to balance categories and languages
|
| 49 |
+
- **Data Split**:
|
| 50 |
+
- Training: 44,051 examples (70%)
|
| 51 |
+
- Validation: 6,294 examples (10%)
|
| 52 |
+
- Test: 12,587 examples (20%)
|
| 53 |
+
|
| 54 |
+
### Data Annotation & Balancing Process
|
| 55 |
+
|
| 56 |
+
The dataset was created through a rigorous multi-step process combining LLM annotation and validation:
|
| 57 |
+
|
| 58 |
+
**Phase 1: LLM Ensemble Annotation**
|
| 59 |
+
- The original ~49,000 WebFAQ question-answer pairs were annotated using an ensemble of three language models:
|
| 60 |
+
- **LLaMA 3.1**
|
| 61 |
+
- **Gemma 2**
|
| 62 |
+
- **Qwen 2.5**
|
| 63 |
+
|
| 64 |
+
**Phase 2: Quality Filtering**
|
| 65 |
+
- Only high-quality annotations were retained using ensemble voting with a minimum confidence threshold of **0.6**:
|
| 66 |
+
- **Confidence 1.0**: All 3 models agree on the same label (unanimous)
|
| 67 |
+
- **Confidence 0.67**: Exactly 2 out of 3 models agree (majority vote)
|
| 68 |
+
- Annotations below 0.6 confidence were excluded to ensure label reliability
|
| 69 |
+
|
| 70 |
+
**Phase 3: Gap Analysis**
|
| 71 |
+
- After filtering, gaps were identified across language-category combinations
|
| 72 |
+
- Target: 125 questions per category per language (1,000 per language total)
|
| 73 |
+
|
| 74 |
+
**Phase 4: Synthetic Data Generation**
|
| 75 |
+
- Missing question-answer pairs were generated using **LLaMA 3.1** to fill identified gaps
|
| 76 |
+
- Generation followed category-specific templates and linguistic patterns
|
| 77 |
+
|
| 78 |
+
**Phase 5: Validation**
|
| 79 |
+
- All generated pairs underwent the same ensemble validation process (LLaMA 3.1, Gemma 2, Qwen 2.5)
|
| 80 |
+
- Applied the same 0.6 confidence threshold to ensure quality consistency
|
| 81 |
+
|
| 82 |
+
**Phase 6: Final Dataset**
|
| 83 |
+
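- Combined high-quality annotated and validated data, targeting balanced representation (the final class distribution remains skewed toward FACTOID; see the per-category support under Model Performance):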
|
| 84 |
+
- Each language: ~1,000 questions
|
| 85 |
+
- Each category per language: ~125 questions
|
| 86 |
+
- Diverse coverage across all 49 languages and 8 categories
|
| 87 |
+
|
| 88 |
+
### Languages Supported
|
| 89 |
+
|
| 90 |
+
**European Languages** (29): English (en), German (de), French (fr), Spanish (es), Italian (it), Portuguese (pt), Dutch (nl), Polish (pl), Romanian (ro), Czech (cs), Slovak (sk), Bulgarian (bg), Croatian (hr), Serbian (sr), Slovenian (sl), Albanian (sq), Estonian (et), Latvian (lv), Lithuanian (lt), Danish (da), Norwegian (no), Swedish (sv), Finnish (fi), Icelandic (is), Greek (el), Turkish (tr), Ukrainian (uk), Russian (ru), Hungarian (hu)
|
| 91 |
+
|
| 92 |
+
**Asian Languages** (12): Chinese (zh), Japanese (ja), Korean (ko), Hindi (hi), Bengali (bn), Marathi (mr), Thai (th), Vietnamese (vi), Indonesian (id), Malay (ms), Tagalog/Filipino (tl), Urdu (ur)
|
| 93 |
+
|
| 94 |
+
**Middle Eastern Languages** (8): Arabic (ar), Persian/Farsi (fa), Hebrew (he), Georgian (ka), Azerbaijani (az), Kazakh (kk), Uzbek (uz) *(note: only 7 of the 8 languages are listed here; one language is missing from this list)*
|
| 95 |
+
|
| 96 |
+
## Classification Categories
|
| 97 |
+
|
| 98 |
+
The model classifies questions into 8 distinct categories:
|
| 99 |
+
|
| 100 |
+
### 1. NOT-A-QUESTION (Label 0)
|
| 101 |
+
Statements or phrases that are not actual questions.
|
| 102 |
+
|
| 103 |
+
**Examples:**
|
| 104 |
+
- "Price of dental treatment"
|
| 105 |
+
- "Best restaurants nearby"
|
| 106 |
+
- "Weather today"
|
| 107 |
+
|
| 108 |
+
### 2. FACTOID (Label 1)
|
| 109 |
+
Questions seeking factual, objective answers (who, what, when, where).
|
| 110 |
+
|
| 111 |
+
**Examples:**
|
| 112 |
+
- "What is the capital of France?"
|
| 113 |
+
- "When was the Eiffel Tower built?"
|
| 114 |
+
- "Who invented the telephone?"
|
| 115 |
+
|
| 116 |
+
### 3. INSTRUCTION (Label 2)
|
| 117 |
+
How-to questions requiring step-by-step procedural answers.
|
| 118 |
+
|
| 119 |
+
**Examples:**
|
| 120 |
+
- "How do I reset my password?"
|
| 121 |
+
- "How to bake chocolate chip cookies?"
|
| 122 |
+
- "How can I install Python on Windows?"
|
| 123 |
+
|
| 124 |
+
### 4. REASON (Label 3)
|
| 125 |
+
Why/how questions seeking explanations or reasoning.
|
| 126 |
+
|
| 127 |
+
**Examples:**
|
| 128 |
+
- "Why is the sky blue?"
|
| 129 |
+
- "How does photosynthesis work?"
|
| 130 |
+
- "Why do birds migrate?"
|
| 131 |
+
|
| 132 |
+
### 5. EVIDENCE-BASED (Label 4)
|
| 133 |
+
Questions about definitions, features, or characteristics.
|
| 134 |
+
|
| 135 |
+
**Examples:**
|
| 136 |
+
- "What are the symptoms of flu?"
|
| 137 |
+
- "What features does this phone have?"
|
| 138 |
+
- "What is machine learning?"
|
| 139 |
+
|
| 140 |
+
### 6. COMPARISON (Label 5)
|
| 141 |
+
Questions comparing two or more options.
|
| 142 |
+
|
| 143 |
+
**Examples:**
|
| 144 |
+
- "iPhone vs Android: which is better?"
|
| 145 |
+
- "What's the difference between RNA and DNA?"
|
| 146 |
+
- "Compare electric and gas cars"
|
| 147 |
+
|
| 148 |
+
### 7. EXPERIENCE (Label 6)
|
| 149 |
+
Questions seeking personal experiences, recommendations, or advice.
|
| 150 |
+
|
| 151 |
+
**Examples:**
|
| 152 |
+
- "What's the best laptop for students?"
|
| 153 |
+
- "Has anyone tried this restaurant?"
|
| 154 |
+
- "Which hotel would you recommend?"
|
| 155 |
+
|
| 156 |
+
### 8. DEBATE (Label 7)
|
| 157 |
+
Hypothetical, opinion-based, or debatable questions.
|
| 158 |
+
|
| 159 |
+
**Examples:**
|
| 160 |
+
- "Is artificial intelligence dangerous?"
|
| 161 |
+
- "Should we colonize Mars?"
|
| 162 |
+
- "Is remote work better than office work?"
|
| 163 |
+
|
| 164 |
+
## Model Performance
|
| 165 |
+
|
| 166 |
+
### Test Set Results (12,587 examples)
|
| 167 |
+
|
| 168 |
+
- **Overall Accuracy**: 88.6%
|
| 169 |
+
- **Macro-Average F1**: 86.7%
|
| 170 |
+
- **Best Validation F1**: 86.8% (achieved at epoch 27)
|
| 171 |
+
|
| 172 |
+
### Per-Category Performance
|
| 173 |
+
|
| 174 |
+
| Category | Precision | Recall | F1-Score | Support |
|
| 175 |
+
|----------|-----------|--------|----------|---------|
|
| 176 |
+
| NOT-A-QUESTION | 0.92 | 0.91 | 0.92 | 957 |
|
| 177 |
+
| FACTOID | 0.92 | 0.92 | 0.92 | 5,679 |
|
| 178 |
+
| INSTRUCTION | 0.89 | 0.87 | 0.88 | 295 |
|
| 179 |
+
| REASON | 0.77 | 0.82 | 0.80 | 664 |
|
| 180 |
+
| EVIDENCE-BASED | 0.85 | 0.92 | 0.88 | 1,466 |
|
| 181 |
+
| COMPARISON | 0.84 | 0.83 | 0.83 | 885 |
|
| 182 |
+
| EXPERIENCE | 0.82 | 0.76 | 0.79 | 1,556 |
|
| 183 |
+
| DEBATE | 0.92 | 0.92 | 0.92 | 1,085 |
|
| 184 |
+
|
| 185 |
+
### Key Observations
|
| 186 |
+
|
| 187 |
+
- **Strongest Performance**: FACTOID, DEBATE, and NOT-A-QUESTION categories (F1 ≥ 0.92)
|
| 188 |
+
- **Good Performance**: INSTRUCTION, EVIDENCE-BASED, and COMPARISON categories (F1 ≥ 0.83)
|
| 189 |
+
- **Moderate Performance**: REASON and EXPERIENCE categories (F1 ~ 0.79-0.80)
|
| 190 |
+
- The model generalizes well across all 49 languages despite language imbalance in real-world data
|
| 191 |
+
|
| 192 |
+
## Training Procedure
|
| 193 |
+
|
| 194 |
+
### Hardware
|
| 195 |
+
|
| 196 |
+
- Training Device: CUDA-enabled GPU (NVIDIA)
|
| 197 |
+
- Training Duration: 30 epochs in total; the best checkpoint was reached at epoch 27
|
| 198 |
+
|
| 199 |
+
### Hyperparameters
|
| 200 |
+
|
| 201 |
+
```python
|
| 202 |
+
{
|
| 203 |
+
"model_name": "xlm-roberta-base",
|
| 204 |
+
"max_length": 128, # Maximum sequence length
|
| 205 |
+
"batch_size": 16, # Training batch size
|
| 206 |
+
"learning_rate": 2e-5, # AdamW learning rate
|
| 207 |
+
"num_epochs": 30, # Total epochs trained
|
| 208 |
+
"warmup_steps": 500, # Linear warmup steps
|
| 209 |
+
"weight_decay": 0.01, # Decoupled weight decay (AdamW)
|
| 210 |
+
"optimizer": "AdamW", # Optimizer
|
| 211 |
+
"scheduler": "linear_warmup", # Learning rate scheduler
|
| 212 |
+
"gradient_clipping": 1.0, # Max gradient norm
|
| 213 |
+
"test_size": 0.2, # 20% test split
|
| 214 |
+
"val_size": 0.1, # 10% validation split
|
| 215 |
+
"random_seed": 42 # Reproducibility
|
| 216 |
+
}
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
### Training Process
|
| 220 |
+
|
| 221 |
+
1. **Data Preparation**: Balanced dataset across 49 languages and 8 categories
|
| 222 |
+
2. **Preprocessing**: Tokenization using XLM-RoBERTa tokenizer (max length: 128 tokens)
|
| 223 |
+
3. **Training Strategy**: Supervised fine-tuning with stratified train/val/test splits
|
| 224 |
+
4. **Optimization**: AdamW optimizer with linear warmup and gradient clipping
|
| 225 |
+
5. **Best Model Selection**: Model checkpoint with highest validation F1 score (epoch 27)
|
| 226 |
+
6. **Evaluation**: Comprehensive testing on held-out test set
|
| 227 |
+
|
| 228 |
+
## Usage
|
| 229 |
+
|
| 230 |
+
### Installation
|
| 231 |
+
|
| 232 |
+
```bash
|
| 233 |
+
pip install transformers torch
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
### Quick Start
|
| 237 |
+
|
| 238 |
+
```python
|
| 239 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 240 |
+
import torch
|
| 241 |
+
|
| 242 |
+
# Load model and tokenizer
|
| 243 |
+
model_name = "AliSalman29/nfqa-multilingual-classifier"
|
| 244 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 245 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 246 |
+
|
| 247 |
+
# Example questions in different languages
|
| 248 |
+
questions = [
|
| 249 |
+
"What is the capital of France?", # English - FACTOID
|
| 250 |
+
"¿Cómo hacer una tortilla española?", # Spanish - INSTRUCTION
|
| 251 |
+
"Warum ist der Himmel blau?", # German - REASON
|
| 252 |
+
"iPhone還是Android更好?", # Chinese - COMPARISON
|
| 253 |
+
]
|
| 254 |
+
|
| 255 |
+
# Classify questions
|
| 256 |
+
for question in questions:
|
| 257 |
+
inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=128)
|
| 258 |
+
|
| 259 |
+
with torch.no_grad():
|
| 260 |
+
outputs = model(**inputs)
|
| 261 |
+
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
|
| 262 |
+
predicted_class = torch.argmax(predictions, dim=-1).item()
|
| 263 |
+
confidence = predictions[0][predicted_class].item()
|
| 264 |
+
|
| 265 |
+
# Get category name
|
| 266 |
+
category = model.config.id2label[predicted_class]
|
| 267 |
+
|
| 268 |
+
print(f"Question: {question}")
|
| 269 |
+
print(f"Category: {category}")
|
| 270 |
+
print(f"Confidence: {confidence:.2%}\n")
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
### Output Example
|
| 274 |
+
|
| 275 |
+
```
|
| 276 |
+
Question: What is the capital of France?
|
| 277 |
+
Category: FACTOID
|
| 278 |
+
Confidence: 94.32%
|
| 279 |
+
|
| 280 |
+
Question: ¿Cómo hacer una tortilla española?
|
| 281 |
+
Category: INSTRUCTION
|
| 282 |
+
Confidence: 89.17%
|
| 283 |
+
|
| 284 |
+
Question: Warum ist der Himmel blau?
|
| 285 |
+
Category: REASON
|
| 286 |
+
Confidence: 85.63%
|
| 287 |
+
|
| 288 |
+
Question: iPhone還是Android更好?
|
| 289 |
+
Category: COMPARISON
|
| 290 |
+
Confidence: 91.24%
|
| 291 |
+
```
|
| 292 |
+
|
| 293 |
+
### Batch Processing
|
| 294 |
+
|
| 295 |
+
```python
|
| 296 |
+
def classify_questions_batch(questions, model, tokenizer, batch_size=32):
|
| 297 |
+
"""Classify multiple questions efficiently"""
|
| 298 |
+
model.eval()
|
| 299 |
+
results = []
|
| 300 |
+
|
| 301 |
+
for i in range(0, len(questions), batch_size):
|
| 302 |
+
batch = questions[i:i+batch_size]
|
| 303 |
+
|
| 304 |
+
# Tokenize batch
|
| 305 |
+
inputs = tokenizer(
|
| 306 |
+
batch,
|
| 307 |
+
return_tensors="pt",
|
| 308 |
+
truncation=True,
|
| 309 |
+
max_length=128,
|
| 310 |
+
padding=True
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
# Get predictions
|
| 314 |
+
with torch.no_grad():
|
| 315 |
+
outputs = model(**inputs)
|
| 316 |
+
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
|
| 317 |
+
predicted_classes = torch.argmax(predictions, dim=-1)
|
| 318 |
+
confidences = predictions[range(len(batch)), predicted_classes]
|
| 319 |
+
|
| 320 |
+
# Store results
|
| 321 |
+
for j, question in enumerate(batch):
|
| 322 |
+
results.append({
|
| 323 |
+
'question': question,
|
| 324 |
+
'category': model.config.id2label[predicted_classes[j].item()],
|
| 325 |
+
'label_id': predicted_classes[j].item(),
|
| 326 |
+
'confidence': confidences[j].item()
|
| 327 |
+
})
|
| 328 |
+
|
| 329 |
+
return results
|
| 330 |
+
|
| 331 |
+
# Usage
|
| 332 |
+
questions = ["Question 1", "Question 2"]  # extend with your own questions
|
| 333 |
+
results = classify_questions_batch(questions, model, tokenizer)
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
### Integration with Pipelines
|
| 337 |
+
|
| 338 |
+
```python
|
| 339 |
+
from transformers import pipeline
|
| 340 |
+
|
| 341 |
+
# Create classification pipeline
|
| 342 |
+
classifier = pipeline(
|
| 343 |
+
"text-classification",
|
| 344 |
+
model="AliSalman29/nfqa-multilingual-classifier",
|
| 345 |
+
tokenizer="AliSalman29/nfqa-multilingual-classifier",
|
| 346 |
+
device=0 # GPU index 0; set to -1 to run on CPU when no GPU is available
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
# Classify single question
|
| 350 |
+
result = classifier("How do I learn Python?", truncation=True, max_length=128)
|
| 351 |
+
print(result)
|
| 352 |
+
# Output: [{'label': 'INSTRUCTION', 'score': 0.91}]
|
| 353 |
+
|
| 354 |
+
# Classify multiple questions
|
| 355 |
+
results = classifier(
|
| 356 |
+
["What is AI?", "Why do cats purr?", "Best pizza in town?"],
|
| 357 |
+
truncation=True,
|
| 358 |
+
max_length=128
|
| 359 |
+
)
|
| 360 |
+
for r in results:
|
| 361 |
+
print(f"{r['label']}: {r['score']:.2%}")
|
| 362 |
+
```
|
| 363 |
+
|
| 364 |
+
## Limitations and Biases
|
| 365 |
+
|
| 366 |
+
### Known Limitations
|
| 367 |
+
|
| 368 |
+
1. **Language Imbalance**: While supporting 49 languages, the model may perform better on high-resource languages (English, Spanish, French) compared to low-resource languages
|
| 369 |
+
2. **Domain Specificity**: Trained primarily on FAQ-style questions; may not generalize perfectly to other question formats (e.g., academic questions, technical queries)
|
| 370 |
+
3. **Category Overlap**: Some questions may legitimately belong to multiple categories, but the model outputs a single prediction
|
| 371 |
+
4. **Short Questions**: Very short questions (1-2 words) may lack sufficient context for accurate classification
|
| 372 |
+
5. **Context Dependency**: The model analyzes questions in isolation without conversational context
|
| 373 |
+
|
| 374 |
+
### Potential Biases
|
| 375 |
+
|
| 376 |
+
- **Annotation Bias**: Labels are based on LLM ensemble predictions (LLaMA 3.1, Gemma 2, Qwen 2.5) rather than human annotations, which may introduce systematic biases from these underlying models
|
| 377 |
+
- **Training Data Bias**: The model inherits biases from the LLM-annotated WebFAQ dataset and LLM-generated examples
|
| 378 |
+
- **Language Representation**: European languages are better represented than other language families in the original WebFAQ data
|
| 379 |
+
- **Category Distribution**: FACTOID questions are more prevalent in training data, which may affect classification thresholds
|
| 380 |
+
- **LLM Consensus Bias**: The 0.6 confidence threshold favors categories where LLMs show higher agreement, potentially underrepresenting ambiguous or nuanced question types
|
| 381 |
+
|
| 382 |
+
### Recommendations for Use
|
| 383 |
+
|
| 384 |
+
- Use confidence scores to identify uncertain predictions
|
| 385 |
+
- Consider ensemble approaches for critical applications
|
| 386 |
+
- Validate performance on your specific domain and languages before production deployment
|
| 387 |
+
- Implement human review for high-stakes decisions
|
| 388 |
+
- Monitor performance across different language groups in your application
|
| 389 |
+
|
| 390 |
+
## Ethical Considerations
|
| 391 |
+
|
| 392 |
+
- **Transparency**: Users should be informed when interacting with automated classification systems
|
| 393 |
+
- **Privacy**: When run locally, the model does not store or transmit user queries; if you serve it through a hosted inference endpoint, that service's data handling applies
|
| 394 |
+
- **Fairness**: Regular audits should be conducted to ensure equitable performance across languages and user groups
|
| 395 |
+
- **Accountability**: Human oversight is recommended for applications affecting user experience or decisions
|
| 396 |
+
|
| 397 |
+
## Citation
|
| 398 |
+
|
| 399 |
+
If you use this model in your research, please cite:
|
| 400 |
+
|
| 401 |
+
```bibtex
|
| 402 |
+
@misc{nfqa-multilingual-2026,
|
| 403 |
+
author = {[Your Name]},
|
| 404 |
+
title = {NFQA Multilingual Question Classifier},
|
| 405 |
+
year = {2026},
|
| 406 |
+
publisher = {HuggingFace},
|
| 407 |
+
journal = {HuggingFace Model Hub},
|
| 408 |
+
howpublished = {\url{https://huggingface.co/AliSalman29/nfqa-multilingual-classifier}}
|
| 409 |
+
}
|
| 410 |
+
```
|
| 411 |
+
|
| 412 |
+
## Related Resources
|
| 413 |
+
|
| 414 |
+
- **WebFAQ Dataset**: https://huggingface.co/datasets/PaDaS-Lab/webfaq
|
| 415 |
+
- **XLM-RoBERTa**: https://huggingface.co/xlm-roberta-base
|
| 416 |
+
- **Paper**: [Link to your paper if published]
|
| 417 |
+
- **GitHub Repository**: [Link to your code repository]
|
| 418 |
+
|
| 419 |
+
## Model Card Contact
|
| 420 |
+
|
| 421 |
+
For questions, feedback, or issues:
|
| 422 |
+
- **GitHub Issues**: https://github.com/Ali-Salman29/nfqa-multilingual-classifier
|
| 423 |
+
- **Email**: salman.khuwaja29@gmail.com
|
| 424 |
+
- **Organization**: University of Passau
|
| 425 |
+
|
| 426 |
+
## Acknowledgments
|
| 427 |
+
|
| 428 |
+
- Training data sourced from the WebFAQ dataset
|
| 429 |
+
- LLM annotation ensemble: LLaMA 3.1, Gemma 2, and Qwen 2.5
|
| 430 |
+
- Balanced data generation using LLaMA 3.1
|
| 431 |
+
- Built on the XLM-RoBERTa foundation model by Facebook AI (now Meta AI)
|
| 432 |
+
- Training infrastructure provided by University of Passau LLM inference server
|
| 433 |
+
|
| 434 |
+
---
|
| 435 |
+
|
| 436 |
+
**Model Version**: 1.0
|
| 437 |
+
**Last Updated**: January 2026
|
| 438 |
+
**Status**: Production Ready
|
classification_report.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
precision recall f1-score support
|
| 2 |
+
|
| 3 |
+
NOT-A-QUESTION 0.92 0.91 0.92 957
|
| 4 |
+
FACTOID 0.92 0.92 0.92 5679
|
| 5 |
+
INSTRUCTION 0.89 0.87 0.88 295
|
| 6 |
+
REASON 0.77 0.82 0.80 664
|
| 7 |
+
EVIDENCE-BASED 0.85 0.92 0.88 1466
|
| 8 |
+
COMPARISON 0.84 0.83 0.83 885
|
| 9 |
+
EXPERIENCE 0.82 0.76 0.79 1556
|
| 10 |
+
DEBATE 0.92 0.92 0.92 1085
|
| 11 |
+
|
| 12 |
+
accuracy 0.89 12587
|
| 13 |
+
macro avg 0.87 0.87 0.87 12587
|
| 14 |
+
weighted avg 0.89 0.89 0.89 12587
|
config.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"XLMRobertaForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"bos_token_id": 0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"id2label": {
|
| 13 |
+
"0": "NOT-A-QUESTION",
|
| 14 |
+
"1": "FACTOID",
|
| 15 |
+
"2": "INSTRUCTION",
|
| 16 |
+
"3": "REASON",
|
| 17 |
+
"4": "EVIDENCE-BASED",
|
| 18 |
+
"5": "COMPARISON",
|
| 19 |
+
"6": "EXPERIENCE",
|
| 20 |
+
"7": "DEBATE"
|
| 21 |
+
},
|
| 22 |
+
"initializer_range": 0.02,
|
| 23 |
+
"intermediate_size": 3072,
|
| 24 |
+
"label2id": {
|
| 25 |
+
"COMPARISON": 5,
|
| 26 |
+
"DEBATE": 7,
|
| 27 |
+
"EVIDENCE-BASED": 4,
|
| 28 |
+
"EXPERIENCE": 6,
|
| 29 |
+
"FACTOID": 1,
|
| 30 |
+
"INSTRUCTION": 2,
|
| 31 |
+
"NOT-A-QUESTION": 0,
|
| 32 |
+
"REASON": 3
|
| 33 |
+
},
|
| 34 |
+
"layer_norm_eps": 1e-05,
|
| 35 |
+
"max_position_embeddings": 514,
|
| 36 |
+
"model_type": "xlm-roberta",
|
| 37 |
+
"num_attention_heads": 12,
|
| 38 |
+
"num_hidden_layers": 12,
|
| 39 |
+
"output_past": true,
|
| 40 |
+
"pad_token_id": 1,
|
| 41 |
+
"position_embedding_type": "absolute",
|
| 42 |
+
"problem_type": "single_label_classification",
|
| 43 |
+
"torch_dtype": "float32",
|
| 44 |
+
"transformers_version": "4.50.3",
|
| 45 |
+
"type_vocab_size": 1,
|
| 46 |
+
"use_cache": true,
|
| 47 |
+
"vocab_size": 250002
|
| 48 |
+
}
|
confusion_matrix.png
ADDED
|
Git LFS Details
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fcf678e92084d404bd4a8055450df84c09e126dfa9d6b2869745895fecd450ff
|
| 3 |
+
size 1112223464
|
sentencepiece.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
| 3 |
+
size 5069051
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"cls_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"mask_token": {
|
| 6 |
+
"content": "<mask>",
|
| 7 |
+
"lstrip": true,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false
|
| 11 |
+
},
|
| 12 |
+
"pad_token": "<pad>",
|
| 13 |
+
"sep_token": "</s>",
|
| 14 |
+
"unk_token": "<unk>"
|
| 15 |
+
}
|
test_results.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"test_loss": 1.1942661555863936,
|
| 3 |
+
"test_accuracy": 0.8855168030507666,
|
| 4 |
+
"test_f1_macro": 0.8672321792444992,
|
| 5 |
+
"best_epoch": 27,
|
| 6 |
+
"best_val_f1": 0.8676620754981998,
|
| 7 |
+
"num_train_examples": 44051,
|
| 8 |
+
"num_val_examples": 6294,
|
| 9 |
+
"num_test_examples": 12587,
|
| 10 |
+
"config": {
|
| 11 |
+
"model_name": "xlm-roberta-base",
|
| 12 |
+
"max_length": 128,
|
| 13 |
+
"batch_size": 16,
|
| 14 |
+
"learning_rate": 2e-05,
|
| 15 |
+
"num_epochs": 30,
|
| 16 |
+
"warmup_steps": 500,
|
| 17 |
+
"weight_decay": 0.01,
|
| 18 |
+
"test_size": 0.2,
|
| 19 |
+
"val_size": 0.1
|
| 20 |
+
},
|
| 21 |
+
"timestamp": "2026-01-16T19:09:44.473503"
|
| 22 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c088c06cf975b7097e469bd69630cdb0d675c6db1ce3af1042b6e19c6d01f22
|
| 3 |
+
size 17082999
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"250001": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": false,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "<mask>",
|
| 50 |
+
"model_max_length": 512,
|
| 51 |
+
"pad_token": "<pad>",
|
| 52 |
+
"sep_token": "</s>",
|
| 53 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 54 |
+
"unk_token": "<unk>"
|
| 55 |
+
}
|
training_curves.png
ADDED
|
Git LFS Details
|