Deploy Mbaza Legal AI Model with inference endpoint
Browse files- README.md +295 -1
- assistant.py +199 -0
- config.py +340 -0
- dataset-all.csv +0 -0
- greetings.csv +26 -0
- inference.py +73 -0
- law_embeddings.npy +3 -0
- law_meta.json +0 -0
- penal_code.csv +0 -0
- requirements.txt +7 -0
- retriever.py +207 -0
README.md
CHANGED
|
@@ -1,3 +1,297 @@
|
|
| 1 |
---
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Mbaza Legal AI Model
|
| 3 |
+
language:
|
| 4 |
+
- rw
|
| 5 |
+
- en
|
| 6 |
+
- fr
|
| 7 |
+
tags:
|
| 8 |
+
- legal
|
| 9 |
+
- kinyarwanda
|
| 10 |
+
- nlp
|
| 11 |
+
- sentence-transformers
|
| 12 |
+
- question-answering
|
| 13 |
+
license: mit
|
| 14 |
---
|
| 15 |
+
|
| 16 |
+
# 🇷🇼 Mbaza Legal AI Model
|
| 17 |
+
|
| 18 |
+
**Multilingual Legal Assistant for Rwandan Laws** - Supporting Kinyarwanda, English, and French.
|
| 19 |
+
|
| 20 |
+
## Model Description
|
| 21 |
+
|
| 22 |
+
This model provides intelligent legal assistance for Rwandan laws, punishments, and legal procedures. It uses semantic search with sentence embeddings to match user queries with relevant legal articles and punishment information.
|
| 23 |
+
|
| 24 |
+
### Key Features
|
| 25 |
+
|
| 26 |
+
- **Multilingual Support**: Kinyarwanda, English, and French
|
| 27 |
+
- **Greeting Detection**: Natural conversation in multiple languages
|
| 28 |
+
- **Legal Article Retrieval**: Semantic search across Rwandan legal code
|
| 29 |
+
- **Punishment Information**: Detailed penalty and sentencing information
|
| 30 |
+
- **Context Tracking**: Maintains conversation history
|
| 31 |
+
|
| 32 |
+
### Technical Details
|
| 33 |
+
|
| 34 |
+
- **Base Model**: `sentence-transformers/all-MiniLM-L6-v2`
|
| 35 |
+
- **Embedding Dimension**: 384
|
| 36 |
+
- **Similarity Metric**: Cosine similarity
|
| 37 |
+
- **Framework**: PyTorch + SentenceTransformers
|
| 38 |
+
|
| 39 |
+
## Usage
|
| 40 |
+
|
| 41 |
+
### Via Hugging Face Inference API
|
| 42 |
+
|
| 43 |
+
```python
|
| 44 |
+
import requests
|
| 45 |
+
|
| 46 |
+
API_URL = "https://api-inference.huggingface.co/models/mugwaneza/mbaza-model"
|
YOUR_HF_TOKEN = "your_token_here"  # your Hugging Face access token
headers = {"Authorization": f"Bearer {YOUR_HF_TOKEN}"}
|
| 48 |
+
|
| 49 |
+
def query(prompt):
|
| 50 |
+
response = requests.post(API_URL, headers=headers, json={"inputs": [prompt]})
|
| 51 |
+
return response.json()
|
| 52 |
+
|
| 53 |
+
# Example usage
|
| 54 |
+
result = query("Ibihano by'ubujura ni ibihe?")
|
| 55 |
+
print(result["text"])
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Via Python Client
|
| 59 |
+
|
| 60 |
+
```python
|
| 61 |
+
from huggingface_hub import InferenceClient
|
| 62 |
+
|
| 63 |
+
client = InferenceClient(token=YOUR_HF_TOKEN)
|
| 64 |
+
|
| 65 |
+
response = client.post(
|
| 66 |
+
"mugwaneza/mbaza-model",
|
| 67 |
+
json={"inputs": ["Kwinjira aho umuntu atuye bitemewe namategeko"]}
|
| 68 |
+
)
|
| 69 |
+
print(response)
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
### Via cURL
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
curl -X POST \
|
| 76 |
+
-H "Authorization: Bearer YOUR_HF_TOKEN" \
|
| 77 |
+
-H "Content-Type: application/json" \
|
| 78 |
+
-d '{"inputs":["Mwaramutse neza"]}' \
|
| 79 |
+
https://api-inference.huggingface.co/models/mugwaneza/mbaza-model
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
### For Mobile Apps (React Native, Flutter, Swift)
|
| 83 |
+
|
| 84 |
+
```javascript
|
| 85 |
+
// React Native / JavaScript
|
| 86 |
+
const API_URL = "https://api-inference.huggingface.co/models/mugwaneza/mbaza-model";
|
| 87 |
+
const HF_TOKEN = "your_token_here";
|
| 88 |
+
|
| 89 |
+
async function queryLegalAI(prompt) {
|
| 90 |
+
const response = await fetch(API_URL, {
|
| 91 |
+
method: "POST",
|
| 92 |
+
headers: {
|
| 93 |
+
"Authorization": `Bearer ${HF_TOKEN}`,
|
| 94 |
+
"Content-Type": "application/json"
|
| 95 |
+
},
|
| 96 |
+
body: JSON.stringify({ inputs: [prompt] })
|
| 97 |
+
});
|
| 98 |
+
|
| 99 |
+
return await response.json();
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
// Usage
|
| 103 |
+
const result = await queryLegalAI("What is the punishment for theft?");
|
| 104 |
+
console.log(result.text);
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
### For Laravel Backend
|
| 108 |
+
|
| 109 |
+
```php
|
| 110 |
+
<?php
|
| 111 |
+
|
| 112 |
+
namespace App\Services;
|
| 113 |
+
|
| 114 |
+
use Illuminate\Support\Facades\Http;
|
| 115 |
+
|
| 116 |
+
class MbazaLegalAI
|
| 117 |
+
{
|
| 118 |
+
protected $apiUrl = 'https://api-inference.huggingface.co/models/mugwaneza/mbaza-model';
|
| 119 |
+
protected $token;
|
| 120 |
+
|
| 121 |
+
public function __construct()
|
| 122 |
+
{
|
| 123 |
+
$this->token = config('services.huggingface.token');
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
public function query($prompt, $userId = 'web_user')
|
| 127 |
+
{
|
| 128 |
+
$response = Http::withHeaders([
|
| 129 |
+
'Authorization' => "Bearer {$this->token}",
|
| 130 |
+
'Content-Type' => 'application/json'
|
| 131 |
+
])->post($this->apiUrl, [
|
| 132 |
+
'inputs' => [$prompt, $userId]
|
| 133 |
+
]);
|
| 134 |
+
|
| 135 |
+
return $response->json();
|
| 136 |
+
}
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
// Usage in Controller
|
| 140 |
+
$ai = new MbazaLegalAI();
|
| 141 |
+
$result = $ai->query("Ibihano by'ubujura ni ibihe?");
|
| 142 |
+
return response()->json($result);
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
## Example Queries
|
| 146 |
+
|
| 147 |
+
### Greetings (Multilingual)
|
| 148 |
+
|
| 149 |
+
```python
|
| 150 |
+
# Kinyarwanda
|
| 151 |
+
query("Mwaramutse neza")
|
| 152 |
+
# Response: Mwaramutse neza, amakuru yawe?
|
| 153 |
+
|
| 154 |
+
# English
|
| 155 |
+
query("Good morning")
|
| 156 |
+
# Response: Good morning, how can I help you with legal matters?
|
| 157 |
+
|
| 158 |
+
# French
|
| 159 |
+
query("Bonjour")
|
| 160 |
+
# Response: Bonjour, comment puis-je vous aider?
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
### Legal Questions
|
| 164 |
+
|
| 165 |
+
```python
|
| 166 |
+
# Kinyarwanda
|
| 167 |
+
query("Ibihano by'ubujura ni ibihe?")
|
| 168 |
+
# Returns: Punishment information for theft
|
| 169 |
+
|
| 170 |
+
# English
|
| 171 |
+
query("What are the laws about corruption in Rwanda?")
|
| 172 |
+
# Returns: Relevant legal articles on corruption
|
| 173 |
+
|
| 174 |
+
# Mixed
|
| 175 |
+
query("Kwinjira aho umuntu atuye bitemewe namategeko")
|
| 176 |
+
# Returns: Laws about trespassing and unauthorized entry
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
### Punishment Queries
|
| 180 |
+
|
| 181 |
+
```python
|
| 182 |
+
query("Igihano cy'umuntu wakubise undi")
|
| 183 |
+
# Returns: Punishment for assault
|
| 184 |
+
|
| 185 |
+
query("What is the penalty for fraud?")
|
| 186 |
+
# Returns: Detailed penalty information
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
## Response Format
|
| 190 |
+
|
| 191 |
+
```json
|
| 192 |
+
{
|
| 193 |
+
"text": "Main response text (formatted for display)",
|
| 194 |
+
"intent": "greeting|law|punishment|fallback",
|
| 195 |
+
"laws": [
|
| 196 |
+
{
|
| 197 |
+
"article": "Article 166",
|
| 198 |
+
"description": "...",
|
| 199 |
+
"punishment": "...",
|
| 200 |
+
"similarity": 0.85
|
| 201 |
+
}
|
| 202 |
+
],
|
| 203 |
+
"punishments": [
|
| 204 |
+
{
|
| 205 |
+
"crime": "Theft",
|
| 206 |
+
"category": "Property crimes",
|
| 207 |
+
"penalty": "..."
|
| 208 |
+
}
|
| 209 |
+
]
|
| 210 |
+
}
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
## Datasets
|
| 214 |
+
|
| 215 |
+
The model uses the following datasets:
|
| 216 |
+
|
| 217 |
+
1. **Legal Code Dataset** (`dataset-all.csv`)
|
| 218 |
+
- Rwandan laws and articles
|
| 219 |
+
- Descriptions in Kinyarwanda, English, and French
|
| 220 |
+
- Article numbers, chapters, and categories
|
| 221 |
+
|
| 222 |
+
2. **Penal Code** (`penal_code.csv`)
|
| 223 |
+
- Crime categories
|
| 224 |
+
- Punishment details
|
| 225 |
+
- Sentencing guidelines
|
| 226 |
+
|
| 227 |
+
3. **Greetings** (`greetings.csv`)
|
| 228 |
+
- Multilingual greetings and responses
|
| 229 |
+
- Conversational patterns
|
| 230 |
+
|
| 231 |
+
## Model Files
|
| 232 |
+
|
| 233 |
+
- `inference.py` - Main inference endpoint
|
| 234 |
+
- `assistant.py` - Core assistant logic
|
| 235 |
+
- `retriever.py` - Semantic search and embedding management
|
| 236 |
+
- `config.py` - Configuration and utilities
|
| 237 |
+
- `law_embeddings.npy` - Precomputed embeddings (384-dim vectors)
|
| 238 |
+
- `law_meta.json` - Metadata for legal articles
|
| 239 |
+
- `conversation_contexts.json` - Context tracking storage
|
| 240 |
+
|
| 241 |
+
## Installation (Self-Hosted)
|
| 242 |
+
|
| 243 |
+
```bash
|
| 244 |
+
# Clone the model repository
|
| 245 |
+
git clone https://huggingface.co/mugwaneza/mbaza-model
|
| 246 |
+
cd mbaza-model
|
| 247 |
+
|
| 248 |
+
# Install dependencies
|
| 249 |
+
pip install -r requirements.txt
|
| 250 |
+
|
| 251 |
+
# Test locally
|
| 252 |
+
python inference.py
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
## API Limits
|
| 256 |
+
|
| 257 |
+
- **Free Tier**: ~30,000 requests/month
|
| 258 |
+
- **Rate Limiting**: ~100 requests/hour (burst)
|
| 259 |
+
- **Cold Start**: First request may take 10-30 seconds
|
| 260 |
+
- **Warm**: Subsequent requests ~1-3 seconds
|
| 261 |
+
|
| 262 |
+
For production with higher limits, consider:
|
| 263 |
+
1. Upgrading to Hugging Face PRO ($9/month)
|
| 264 |
+
2. Self-hosting with this model
|
| 265 |
+
3. Using dedicated inference endpoints
|
| 266 |
+
|
| 267 |
+
## License
|
| 268 |
+
|
| 269 |
+
MIT License - See LICENSE file for details
|
| 270 |
+
|
| 271 |
+
## Citation
|
| 272 |
+
|
| 273 |
+
```bibtex
|
| 274 |
+
@misc{mbaza-legal-ai,
|
| 275 |
+
author = {Mugwaneza Manzi},
|
| 276 |
+
title = {Mbaza Legal AI: Multilingual Legal Assistant for Rwanda},
|
| 277 |
+
year = {2025},
|
| 278 |
+
publisher = {Hugging Face},
|
| 279 |
+
url = {https://huggingface.co/mugwaneza/mbaza-model}
|
| 280 |
+
}
|
| 281 |
+
```
|
| 282 |
+
|
| 283 |
+
## Related Resources
|
| 284 |
+
|
| 285 |
+
- **Demo Space**: https://huggingface.co/spaces/mugwaneza/mbaza
|
| 286 |
+
- **GitHub**: https://github.com/MUGWANEZAMANZI/Model
|
| 287 |
+
- **Documentation**: See DEPLOYMENT.md in repository
|
| 288 |
+
|
| 289 |
+
## Contact
|
| 290 |
+
|
| 291 |
+
For questions, issues, or collaboration:
|
| 292 |
+
- Create an issue on the [GitHub repository](https://github.com/MUGWANEZAMANZI/Model)
|
- Email: mugwaneza@example.com
|
| 294 |
+
|
| 295 |
+
---
|
| 296 |
+
|
| 297 |
+
**Note**: This model is designed for informational purposes. Always consult with a qualified legal professional for official legal advice.
|
assistant.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from typing import Dict, Any, List, Optional
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
from retriever import get_retriever
|
| 10 |
+
from config import KINYARWANDA_STOPWORDS, calculate_similarity_score
|
| 11 |
+
|
| 12 |
+
ROOT = Path(__file__).parent
|
| 13 |
+
CONTEXT_PATH = ROOT / 'conversation_contexts.json'
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Assistant:
    """Rule-plus-retrieval assistant for Rwandan legal Q&A.

    Each message is routed through (1) language-aware greeting detection,
    (2) keyword-based intent detection ('law' / 'punishment'), falling back
    to an 'unclear' reply.  Semantic law search is delegated to the shared
    retriever; per-user conversation history is persisted to
    ``conversation_contexts.json``.
    """

    def __init__(self):
        self.retriever = get_retriever()
        # Datasets are filled in by load_datasets(); None means "unavailable".
        self.laws = None
        self.punishments = None
        self.greetings = None
        # user_id -> list of {'role': ..., 'text': ...} entries.
        self.contexts: Dict[str, List[Dict[str, Any]]] = {}
        # Keyword cues per intent, mixed Kinyarwanda/English.  (Duplicates
        # removed from the 'law' list: scores are only compared against 0,
        # so repeated keywords never changed the outcome.)
        self.intent_keywords = {
            'greeting': ['mwaramutse', 'muraho', 'amakuru', 'bite', 'mwiriwe', 'urabeho', 'urakomeye', 'ndabona'],
            'law': ['itegeko', 'ingingo', 'article', 'ingingo ya', 'itegeko rya', 'law'],
            'punishment': ['igihano', 'ibihano', 'ihazabu', 'igifungo', 'fine', 'imyaka', 'years', 'punishment'],
        }
        self.load_datasets()
        self._load_contexts()

    def load_datasets(self):
        """Load laws (via the retriever), penal code, and greetings datasets.

        All loads are best-effort: on any failure the corresponding
        attribute is left as None and callers must handle that.
        """
        # Laws are owned by the retriever; make sure they are loaded.
        try:
            if self.retriever.laws_df is None:
                self.retriever.load_laws()
            self.laws = self.retriever.laws_df
        except Exception:
            self.laws = None

        # Punishments come from penal_code.csv next to this module.
        ppath = ROOT / 'penal_code.csv'
        if ppath.exists():
            try:
                self.punishments = pd.read_csv(ppath).fillna('')
            except Exception:
                self.punishments = None
        else:
            self.punishments = None

        # Greetings: prefer the local CSV, else reuse the retriever's copy.
        gpath = ROOT / 'greetings.csv'
        if gpath.exists():
            try:
                self.greetings = pd.read_csv(gpath).fillna('')
            except Exception:
                self.greetings = None
        else:
            try:
                self.greetings = self.retriever.greetings_df
            except Exception:
                self.greetings = None

    def _load_contexts(self):
        """Restore persisted conversation contexts; start empty on failure."""
        if CONTEXT_PATH.exists():
            try:
                with open(CONTEXT_PATH, 'r', encoding='utf-8') as f:
                    self.contexts = json.load(f)
            except Exception:
                # Corrupt/unreadable file: degrade to a fresh history.
                self.contexts = {}

    def _save_contexts(self):
        """Persist conversation contexts; persistence is best-effort only."""
        try:
            with open(CONTEXT_PATH, 'w', encoding='utf-8') as f:
                json.dump(self.contexts, f, ensure_ascii=False, indent=2)
        except Exception:
            pass  # deliberate: losing history must never crash a reply

    def tokenize(self, text: str) -> List[str]:
        """Lowercase, strip punctuation, and drop Kinyarwanda stopwords."""
        if not text:
            return []
        txt = str(text).lower()
        txt = re.sub(r"[\r\n]+", " ", txt)
        # Keep word chars, whitespace, and Latin-Extended letters.
        txt = re.sub(r"[^\w\s\u00C0-\u017F]", " ", txt)
        return [t for t in txt.split() if t and t not in KINYARWANDA_STOPWORDS]

    def detect_intent(self, text: str) -> str:
        """Classify ``text`` as 'greeting', 'punishment', 'law', or 'unclear'.

        Priority order matters: greeting beats punishment, which beats law.
        """
        t = str(text).lower()
        toks = set(self.tokenize(t))
        scores = {k: 0 for k in self.intent_keywords}
        for intent, keywords in self.intent_keywords.items():
            for kw in keywords:
                # Substring match catches multi-word cues ('ingingo ya');
                # token match catches exact single words.
                if kw in t or kw in toks:
                    scores[intent] += 1

        if scores.get('greeting', 0) > 0:
            return 'greeting'
        if scores.get('punishment', 0) > 0:
            return 'punishment'
        if scores.get('law', 0) > 0:
            return 'law'

        # Last resort: explicit mentions of articles/laws.
        if re.search(r'\bingingo\b|\bingingo ya\b|\barticle\b|\bitegeko\b', t):
            return 'law'

        return 'unclear'

    def _update_context(self, user_id: str, entry: Dict[str, Any]):
        """Append ``entry`` to the user's history (capped at 50) and persist."""
        self.contexts.setdefault(user_id, []).append(entry)
        if len(self.contexts[user_id]) > 50:
            self.contexts[user_id] = self.contexts[user_id][-50:]
        self._save_contexts()

    def handle_query(self, user_id: str, text: str) -> Dict[str, Any]:
        """Answer one user message and record the exchange in context.

        Returns a dict whose 'type' key is one of 'greeting', 'law',
        'punishment', or 'unclear'.
        """
        # Record the raw user message exactly once.  (Bug fix: it used to
        # be appended a second time after intent detection, duplicating
        # every non-greeting message in the history.)
        self._update_context(user_id, {'role': 'user', 'text': text})

        # Language-aware greeting detection first (handles rw/en/fr).
        try:
            reply = self.retriever.detect_and_reply_greeting(text)
        except Exception:
            reply = None

        if reply:
            out = {'type': 'greeting', 'response': reply.get('response', ''), 'followup': reply.get('followup', '')}
            self._update_context(user_id, {'role': 'assistant', 'text': out})
            return out

        intent = self.detect_intent(text)
        # Annotate the already-recorded user entry instead of appending a
        # duplicate, then persist the annotation.
        if self.contexts.get(user_id):
            self.contexts[user_id][-1]['intent'] = intent
            self._save_contexts()

        if intent == 'law':
            try:
                # Ensure embeddings exist before semantic search.
                self.retriever.build_or_load_embeddings()
                results = self.retriever.find_similar(text, top_k=1)
            except Exception:
                results = []

            if results:
                score, meta = results[0]
                out = {'type': 'law', 'score': score, 'law': meta.get('row')}
                self._update_context(user_id, {'role': 'assistant', 'text': out})
                return out

            out = {'type': 'unclear', 'text': "I couldn't find a matching law. Can you be more specific?"}
            # Bug fix: fallback replies are now recorded in context too,
            # consistent with every other branch.
            self._update_context(user_id, {'role': 'assistant', 'text': out})
            return out

        if intent == 'punishment':
            if self.punishments is not None:
                # Pick the penal-code row with the highest word-overlap score.
                best = None
                best_score = 0.0
                for _, row in self.punishments.iterrows():
                    desc = ' '.join([str(row.get(c, '')) for c in row.index])
                    s = calculate_similarity_score(text, desc)
                    if s > best_score:
                        best_score = s
                        best = row.to_dict()

                if best is not None and best_score > 0:
                    out = {'type': 'punishment', 'score': best_score, 'punishment_row': best}
                    self._update_context(user_id, {'role': 'assistant', 'text': out})
                    return out

            out = {'type': 'unclear', 'text': "I couldn't find a matching punishment. Can you provide more detail?"}
            self._update_context(user_id, {'role': 'assistant', 'text': out})
            return out

        out = {'type': 'unclear', 'text': "Can you please try a legal question? I'm here to assist you."}
        self._update_context(user_id, {'role': 'assistant', 'text': out})
        return out
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# Module-level singleton: every caller shares one Assistant (and therefore
# one loaded copy of the datasets, embeddings, and conversation contexts).
_ASSISTANT: Optional[Assistant] = None


def get_assistant() -> Assistant:
    """Return the process-wide Assistant, constructing it lazily on first use."""
    global _ASSISTANT
    if _ASSISTANT is None:
        _ASSISTANT = Assistant()
    return _ASSISTANT
config.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration and utility functions for Rwanda Legal NLP System
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from typing import Dict, Any
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
|
| 12 |
+
# Configure logging
|
| 13 |
+
logging.basicConfig(
|
| 14 |
+
level=logging.INFO,
|
| 15 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
@dataclass
class ModelConfig:
    """Generation settings for the base language model."""
    model_name: str = "EleutherAI/gpt-j-6B"  # Hugging Face model identifier
    max_length: int = 512                    # maximum generated sequence length
    temperature: float = 0.7                 # sampling temperature
    top_p: float = 0.9                       # nucleus-sampling cutoff
    top_k: int = 50                          # top-k sampling cutoff
    do_sample: bool = True                   # sample instead of greedy decoding
    num_return_sequences: int = 1            # candidates returned per prompt
|
| 28 |
+
|
| 29 |
+
@dataclass
class TrainingConfig:
    """Hyperparameters for fine-tuning runs."""
    output_dir: str = "./trained_legal_model"  # where checkpoints are written
    num_epochs: int = 3
    batch_size: int = 2
    learning_rate: float = 5e-5
    warmup_steps: int = 500
    save_steps: int = 1000                     # checkpoint interval (steps)
    eval_steps: int = 1000                     # evaluation interval (steps)
    max_grad_norm: float = 1.0                 # gradient-clipping threshold
    gradient_accumulation_steps: int = 4
    weight_decay: float = 0.01
|
| 42 |
+
|
| 43 |
+
@dataclass
class DataConfig:
    """Settings for loading and splitting the training data."""
    dataset_path: str = "dataset-all.csv"  # CSV of laws/articles
    max_text_length: int = 512             # truncation length for text fields
    test_size: float = 0.1                 # held-out fraction for evaluation
    random_state: int = 42                 # seed for reproducible splits
|
| 50 |
+
|
| 51 |
+
class ConfigManager:
    """Load and persist model/training/data configuration via a JSON file.

    Missing files or unreadable JSON are tolerated: defaults from the three
    dataclasses remain in effect and a warning is logged.
    """

    def __init__(self, config_path: str = "config.json"):
        self.config_path = config_path
        self.model_config = ModelConfig()
        self.training_config = TrainingConfig()
        self.data_config = DataConfig()
        self.load_config()

    def _sections(self):
        # JSON section name -> the dataclass instance it populates.
        return {
            'model': self.model_config,
            'training': self.training_config,
            'data': self.data_config,
        }

    def load_config(self):
        """Overlay values from the JSON file (if present) onto the defaults.

        Unknown keys are ignored; only attributes that already exist on the
        target dataclass are overwritten.
        """
        if not os.path.exists(self.config_path):
            return
        try:
            with open(self.config_path, 'r') as f:
                config_data = json.load(f)

            for section, target in self._sections().items():
                for key, value in config_data.get(section, {}).items():
                    if hasattr(target, key):
                        setattr(target, key, value)

            logging.info(f"Configuration loaded from {self.config_path}")

        except Exception as e:
            logging.warning(f"Could not load config from {self.config_path}: {e}")

    def save_config(self):
        """Write the current configuration back to the JSON file."""
        config_data = {name: cfg.__dict__ for name, cfg in self._sections().items()}

        try:
            with open(self.config_path, 'w') as f:
                json.dump(config_data, f, indent=2)

            logging.info(f"Configuration saved to {self.config_path}")

        except Exception as e:
            logging.error(f"Could not save config to {self.config_path}: {e}")
|
| 106 |
+
|
| 107 |
+
# --- Kinyarwanda language utilities -----------------------------------------

# Function words filtered out before keyword extraction and similarity scoring.
KINYARWANDA_STOPWORDS = {
    'ni', 'na', 'ku', 'mu', 'nk', 'no', 'cyangwa', 'ariko', 'naho', 'none',
    'kandi', 'rero', 'ubwo', 'uko', 'ubu', 'aha', 'aho', 'iyo', 'ese',
    'nta', 'nti', 'nte', 'nto', 'ntu', 'ntw', 'aba', 'ari', 'hari',
    'kuri', 'muri', 'buri', 'abantu', 'umuntu', 'ibintu', 'ikintu',
}

# Kinyarwanda legal term -> English gloss, used for cross-lingual matching.
KINYARWANDA_LEGAL_TERMS = {
    'gusambanya': 'sexual defilement',
    'kwiba': 'theft',
    'gukoresha_imbaraga': 'use of force/violence',
    'kwinjira': 'enter/trespass',
    'kwica': 'kill/murder',
    'gukubita': 'assault/beat',
    'uburiganya': 'fraud/deception',
    'ubuhemu': 'embezzlement',
    'igifungo': 'imprisonment',
    'ihazabu': 'fine',
    'igihano': 'punishment',
    'ingingo': 'article',
    'itegeko': 'law',
    'umwana': 'child',
    'imyaka': 'years',
    'amezi': 'months',
    'burundu': 'life (imprisonment)',
    'gahato': 'force/violence',
    'imibonano_mpuzabitsina': 'sexual intercourse',
    'inyamaswa': 'animals',
    'rugo': 'home/house',
}
|
| 139 |
+
|
| 140 |
+
def clean_kinyarwanda_text(text: str) -> str:
    """Clean and normalize Kinyarwanda text for matching.

    Collapses whitespace, strips characters outside the accepted set, and
    replaces monetary amounts ("FRW 5,000"), numeric ranges ("10-20"), and
    bare numbers with the placeholder tokens AMOUNT, RANGE, and NUMBER.

    Returns "" for None/NaN/empty input.
    """
    import re

    if not text or pd.isna(text):
        return ""

    text = str(text)

    # Collapse runs of whitespace so the patterns below see single spaces.
    text = re.sub(r'\s+', ' ', text)

    # Keep word chars, basic punctuation, and the Latin-1/Latin-Extended
    # letters used by Kinyarwanda.
    text = re.sub(r'[^\w\s\-\.\,\;\:\!\?\'\"\u00C0-\u017F]', '', text)

    # Placeholder substitution.  Order matters (bug fix): the bare-number
    # pattern must run LAST, otherwise it consumes the digits inside
    # "FRW 5,000" and "10-20" and AMOUNT/RANGE can never match.
    text = re.sub(r'\bFRW\s*[0-9,\.]+\b', ' AMOUNT ', text)
    text = re.sub(r'\b[0-9]+-[0-9]+\b', ' RANGE ', text)
    text = re.sub(r'\b[0-9]+\b', ' NUMBER ', text)

    # The padded placeholders can introduce double spaces; normalize again.
    return re.sub(r'\s+', ' ', text).strip()
|
| 161 |
+
|
| 162 |
+
def extract_keywords_kinyarwanda(text: str, max_keywords: int = 10) -> list:
    """Return up to ``max_keywords`` frequent content words from ``text``.

    The text is cleaned and lowercased, stopwords and words of two or fewer
    characters are dropped, and the remaining words are ranked by frequency
    (most common first).
    """
    from collections import Counter

    if not text:
        return []

    tokens = clean_kinyarwanda_text(text).lower().split()

    # Count only content-bearing tokens.
    counts = Counter(
        tok for tok in tokens
        if len(tok) > 2 and tok not in KINYARWANDA_STOPWORDS
    )

    return [word for word, _ in counts.most_common(max_keywords)]
|
| 181 |
+
|
| 182 |
+
# Legal category -> keyword cues, in both English and Kinyarwanda, used by
# categorize_case() below.
LEGAL_CATEGORIES = {
    'sexual_offence': {
        'english': ['sexual', 'rape', 'assault', 'child', 'defilement'],
        'kinyarwanda': ['gukoresha', 'gusambanya', 'igitsina', 'umwana', 'imibonano'],
    },
    'theft': {
        'english': ['theft', 'robbery', 'stealing', 'property'],
        'kinyarwanda': ['kwiba', 'gufata', 'umutungo', 'imbaraga'],
    },
    'privacy': {
        'english': ['privacy', 'domicile', 'recording', 'entry'],
        'kinyarwanda': ['kwinjira', 'kumviriza', 'rugo', 'ubuzima_bwite'],
    },
    'morality': {
        'english': ['adultery', 'bigamy', 'concubinage', 'marriage'],
        'kinyarwanda': ['ubusambanyi', 'ubushoreke', 'gushyingirwa', 'guta_urugo'],
    },
    'violence': {
        'english': ['violence', 'murder', 'genocide', 'torture', 'assault'],
        'kinyarwanda': ['kwica', 'gukubita', 'jenoside', 'ihohotera', 'imbaraga'],
    },
    'fraud': {
        'english': ['fraud', 'forgery', 'deception', 'embezzlement'],
        'kinyarwanda': ['uburiganya', 'kwigana', 'ubuhemu', 'kwibeshya'],
    },
}
|
| 209 |
+
|
| 210 |
+
def categorize_case(description: str) -> str:
    """Categorize a case based on description keywords (English and Kinyarwanda).

    Returns "unknown" for empty input, the highest-scoring category name from
    LEGAL_CATEGORIES when any keyword matches, otherwise "general".
    """
    if not description:
        return "unknown"

    haystack = description.lower()
    scores = {}

    for name, terms in LEGAL_CATEGORIES.items():
        # Kinyarwanda hits carry a higher weight (3) than English hits (2).
        total = 2 * sum(1 for kw in terms['english'] if kw in haystack)
        total += 3 * sum(1 for kw in terms['kinyarwanda'] if kw in haystack)
        if total > 0:
            scores[name] = total

    if not scores:
        return "general"
    return max(scores, key=scores.get)
|
| 238 |
+
|
| 239 |
+
# Evaluation metrics
def calculate_similarity_score(text1: str, text2: str) -> float:
    """Jaccard similarity between the word sets of two texts (0.0 to 1.0).

    Both texts are normalized with clean_kinyarwanda_text and lowercased;
    empty inputs (or inputs that clean down to nothing) score 0.0.
    """
    if not text1 or not text2:
        return 0.0

    vocab_a = set(clean_kinyarwanda_text(text1).lower().split())
    vocab_b = set(clean_kinyarwanda_text(text2).lower().split())

    if not vocab_a or not vocab_b:
        return 0.0

    # |A ∩ B| / |A ∪ B|
    shared = vocab_a & vocab_b
    combined = vocab_a | vocab_b
    return len(shared) / len(combined) if combined else 0.0
|
| 255 |
+
|
| 256 |
+
# Utility functions
def save_predictions(predictions: list, output_path: str):
    """Write predictions to ``output_path`` as pretty-printed UTF-8 JSON.

    Best-effort: failures are logged via logging.error rather than raised.
    """
    try:
        with open(output_path, 'w', encoding='utf-8') as fh:
            # ensure_ascii=False keeps Kinyarwanda text human-readable.
            json.dump(predictions, fh, indent=2, ensure_ascii=False)
        logging.info(f"Predictions saved to {output_path}")
    except Exception as e:
        logging.error(f"Could not save predictions: {e}")
|
| 267 |
+
|
| 268 |
+
def load_predictions(input_path: str) -> list:
    """Read predictions back from a UTF-8 JSON file.

    Returns the parsed list, or [] if the file is missing/unreadable
    (the failure is logged, not raised).
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as fh:
            data = json.load(fh)
        logging.info(f"Predictions loaded from {input_path}")
        return data
    except Exception as e:
        logging.error(f"Could not load predictions: {e}")
        return []
|
| 280 |
+
|
| 281 |
+
def format_punishment(punishment: str) -> dict:
    """Parse and format punishment information from free text.

    Recognizes imprisonment terms in both English ("10-15 years") and
    Kinyarwanda ("imyaka 10-15") word order, life imprisonment, FRW fine
    ranges, and community-service mentions.

    Returns a dict with keys:
        type: "imprisonment", "life_imprisonment", "fine",
              "community_service" or "unknown"
        imprisonment: e.g. "10-15 years" / "Life imprisonment" / None
        fine: e.g. "FRW 1,000,000-2,000,000" or None
        community_service: "Yes" or None
        details: the normalized (lowercased) input text
    """
    if not punishment or pd.isna(punishment):
        return {"type": "unknown", "details": ""}

    punishment = str(punishment).lower()

    result = {
        "type": "unknown",
        "imprisonment": None,
        "fine": None,
        "community_service": None,
        "details": punishment
    }

    import re

    # Imprisonment in years. The original pattern only matched numbers
    # *followed* by the unit ("10-15 years"); Kinyarwanda puts the unit
    # first ("imyaka 10-15"), so fall back to that order too.
    year_match = re.search(r'(\d+)(?:\s*-\s*(\d+))?\s*(?:years?|imyaka)', punishment)
    if not year_match:
        year_match = re.search(r'imyaka\s*(\d+)(?:\s*-\s*(\d+))?', punishment)
    if year_match:
        min_years = int(year_match.group(1))
        max_years = int(year_match.group(2)) if year_match.group(2) else min_years
        result["imprisonment"] = f"{min_years}-{max_years} years"
        result["type"] = "imprisonment"

    # Life imprisonment (Kinyarwanda: "burundu") overrides a fixed term.
    if any(term in punishment for term in ['life', 'burundu', 'cya burundu']):
        result["imprisonment"] = "Life imprisonment"
        result["type"] = "life_imprisonment"

    # Fine amounts in Rwandan francs, optionally a range.
    fine_pattern = r'frw\s*([0-9,\.]+)(?:\s*-\s*([0-9,\.]+))?'
    fine_match = re.search(fine_pattern, punishment)
    if fine_match:
        min_fine = fine_match.group(1)
        max_fine = fine_match.group(2) if fine_match.group(2) else min_fine
        result["fine"] = f"FRW {min_fine}-{max_fine}"
        if result["type"] == "unknown":
            # Fine-only sentences previously stayed "unknown".
            result["type"] = "fine"

    # Community service (Kinyarwanda: "inyungu rusange").
    if 'community service' in punishment or 'inyungu rusange' in punishment:
        result["community_service"] = "Yes"
        if result["type"] == "unknown":
            result["type"] = "community_service"

    return result
|
| 326 |
+
|
| 327 |
+
# Default configuration instance
# Shared, module-level ConfigManager used by importers of this module.
config_manager = ConfigManager()

if __name__ == "__main__":
    # Test utilities
    # Smoke-test the Kinyarwanda text helpers defined above with a sample
    # sentence and a mixed punishment string (years range + FRW fine range).
    sample_text = "Umuntu yakoreye umwana igikorwa gishingiye ku gitsina"

    print("Sample text:", sample_text)
    print("Cleaned:", clean_kinyarwanda_text(sample_text))
    print("Keywords:", extract_keywords_kinyarwanda(sample_text))
    print("Category:", categorize_case(sample_text))

    sample_punishment = "Igifungo cy'imyaka 10-15 + ihazabu FRW 1,000,000-2,000,000"
    print("Formatted punishment:", format_punishment(sample_punishment))
|
dataset-all.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
greetings.csv
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
greeting,response,followup,greeting_en,response_en,followup_en,greeting_fr,response_fr,followup_fr
|
| 2 |
+
Mwaramutse,"Mwaramutse neza, amakuru yawe? (Ndaganira gusa mu Kinyarwanda)","Ndishimye ko umeze neza, mbwira icyo ngufasha mu mategeko cg urubanza witegura. (Ndaganira gusa mu Kinyarwanda)","Good morning","Good morning, how are you? (I only speak Kinyarwanda)","I'm glad you're well, tell me how I can help you with legal matters or your upcoming case. (I only speak Kinyarwanda)","Bonjour","Bonjour, comment ça va ? (Je ne parle que le kinyarwanda)","Je suis content que tu ailles bien, dis-moi comment je peux t'aider avec des questions juridiques ou ton prochain procès. (Je ne parle que le kinyarwanda)"
|
| 3 |
+
Bite,"Ni byiza, uraho nawe? (Ndaganira gusa mu Kinyarwanda)","Ndishimye ko umeze neza, hari ikibazo cy'amategeko ushaka ko tuganiraho? (Ndaganira gusa mu Kinyarwanda)","Hi","I'm good, how about you? (I only speak Kinyarwanda)","I'm glad you're well, is there a legal issue you'd like to discuss? (I only speak Kinyarwanda)","Salut","Je vais bien, et toi ? (Je ne parle que le kinyarwanda)","Je suis content que tu ailles bien, y a-t-il un problème juridique dont tu veux parler ? (Je ne parle que le kinyarwanda)"
|
| 4 |
+
Amakuru,"Ni meza cyane, wowe se? (Ndaganira gusa mu Kinyarwanda)","Ndanezerewe ko wowe nawe umeze neza, hari icyo wifuza kumenya ku mategeko? (Ndaganira gusa mu Kinyarwanda)","How are you?","I'm very well, and you? (I only speak Kinyarwanda)","I'm happy that you are also well, is there anything you want to know about the law? (I only speak Kinyarwanda)","Comment vas-tu ?","Je vais très bien, et toi ? (Je ne parle que le kinyarwanda)","Je suis heureux que tu ailles bien aussi, y a-t-il quelque chose que tu veux savoir sur la loi ? (Je ne parle que le kinyarwanda)"
|
| 5 |
+
Bite se,"Ni fresh, uraho neza!","Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?","Hey there","I'm great, how are you?","I'm glad you're well, is there anything you want to know or ask advice about?","Coucou","Je vais super bien, comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?"
|
| 6 |
+
Mwiriwe,"Mwiriwe neza, amakuru yawe?","Ndishimye ko umeze neza, mbwira icyo ngufasha mu mategeko cg urubanza witegura.","Good afternoon","Good afternoon, how are you?","I'm glad you're well, tell me how I can help you with legal matters or your upcoming case.","Bon après-midi","Bon après-midi, comment ça va ?","Je suis content que tu ailles bien, dis-moi comment je peux t'aider avec des questions juridiques ou ton prochain procès."
|
| 7 |
+
Muraho,"Muraho neza, uraho ute?","Ndishimye ko umeze neza, hari ikibazo cy'amategeko ushaka ko tuganiraho?","Hello","Hello, how are you?","I'm glad you're well, is there a legal issue you'd like to discuss?","Bonjour","Bonjour, comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il un problème juridique dont tu veux parler ?"
|
| 8 |
+
Umeze ute,"Meze neza cyane, urakoze kubaza.","Ndanezerewe ko umeze neza, hari icyo wifuza kumenya ku mategeko?","How are you feeling?","I'm very well, thank you for asking.","I'm happy that you are well, is there anything you want to know about the law?","Comment te sens-tu ?","Je vais très bien, merci de demander.","Je suis heureux que tu ailles bien, y a-t-il quelque chose que tu veux savoir sur la loi ?"
|
| 9 |
+
Urakomeye,"Ndakomeye, nawe uraho neza?","Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?","Are you strong?","I'm strong, how are you?","I'm glad you're well, is there anything you want to know or ask advice about?","Tu es fort ?","Je suis fort, et toi comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?"
|
| 10 |
+
Urabeho,"Urabeho, tuzabonana ubutaha.","Niba hari ikibazo cy'amategeko uzagaruke tuganireho.","Goodbye","Goodbye, see you next time.","If you have a legal issue, come back and we'll discuss it.","Au revoir","Au revoir, à la prochaine.","Si tu as un problème juridique, reviens et nous en discuterons."
|
| 11 |
+
Ndagushimiye,"Nta kibazo, nanjye ndagushimiye.","Niba hari ikindi ushaka kumenya, ntuzazuyaze kumbaza.","Thank you","No problem, thank you too.","If there's anything else you want to know, don't hesitate to ask.","Merci","Pas de problème, merci à toi aussi.","Si tu veux savoir autre chose, n'hésite pas à demander."
|
| 12 |
+
Urakoze,"Urakoze nawe, ni byiza kugufasha.","Niba hari ikindi kibazo cy'amategeko, ndahari kugufasha.","Thanks","Thanks to you too, it's good to help you.","If you have any other legal questions, I'm here to help.","Merci","Merci à toi aussi, c'est un plaisir de t'aider.","Si tu as d'autres questions juridiques, je suis là pour t'aider."
|
| 13 |
+
Wiriwe,"Wiriwe neza, tugire umunsi mwiza.","Niba hari icyo wifuza kumenya, ndahari kugufasha.","Good evening","Good evening, have a nice day.","If there's anything you want to know, I'm here to help.","Bonsoir","Bonsoir, passe une bonne journée.","Si tu veux savoir quelque chose, je suis là pour t'aider."
|
| 14 |
+
Ijoro ryiza,"Ijoro ryiza nawe, uryame rwose.","Niba hari ikibazo cy'amategeko, uzambwire ejo.","Good night","Good night to you too, sleep well.","If you have a legal issue, let me know tomorrow.","Bonne nuit","Bonne nuit à toi aussi, dors bien.","Si tu as un problème juridique, dis-le-moi demain."
|
| 15 |
+
Naho,"Ni byiza cyane, uraho neza!","Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?","Hey","That's great, how are you?","I'm glad you're well, is there anything you want to know or ask advice about?","Salut","C'est super, comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?"
|
| 16 |
+
Murabeho,"Murabeho, mukomeze kugira umugisha.","Niba hari ikibazo cy'amategeko uzagaruke tuganireho.","Farewell","Farewell, may you continue to be blessed.","If you have a legal issue, come back and we'll discuss it.","Adieu","Adieu, que tu continues à être béni.","Si tu as un problème juridique, reviens et nous en discuterons."
|
| 17 |
+
Ndishimye ko umeze neza, mbwira icyo ngufasha mu mategeko cg urubanza witegura.,"Nditeguye kugufasha, sobanura ikibazo cyawe.","Wakenera ubundi bufasha bw'amategeko?","I'm glad you're well, tell me how I can help you with legal matters or your upcoming case.","I'm ready to help you, explain your issue.","Would you need any other legal assistance?","Je suis content que tu ailles bien, dis-moi comment je peux t'aider avec des questions juridiques ou ton prochain procès.","Je suis prêt à t'aider, explique ton problème.","Aurais-tu besoin d'une autre assistance juridique ?"
|
| 18 |
+
Ndishimye ko umeze neza, hari ikibazo cy'amategeko ushaka ko tuganiraho?,"Mbwira ikibazo cyawe, ndagufasha uko nshoboye.","Hari ibindi wifuza kumenya?","I'm glad you're well, is there a legal issue you'd like to discuss?","Tell me your issue, I'll help as much as I can.","Is there anything else you want to know?","Je suis content que tu ailles bien, y a-t-il un problème juridique dont tu veux parler ?","Dis-moi ton problème, je t'aiderai autant que possible.","Y a-t-il autre chose que tu veux savoir ?"
|
| 19 |
+
Ndanezerewe ko wowe nawe umeze neza, hari icyo wifuza kumenya ku mategeko?,"Niba hari ikibazo cyihariye, nyibwira.","Ndashobora kugufasha no mu bindi bibazo by'amategeko.","I'm happy that you are also well, is there anything you want to know about the law?","If you have a specific issue, tell me.","I can also help you with other legal issues.","Je suis heureux que tu ailles bien aussi, y a-t-il quelque chose que tu veux savoir sur la loi ?","Si tu as un problème spécifique, dis-le-moi.","Je peux aussi t'aider avec d'autres questions juridiques."
|
| 20 |
+
Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?,"Sobanura ikibazo cyawe, nditeguye kugufasha.","Hari indi nama ushaka?","I'm glad you're well, is there anything you want to know or ask advice about?","Explain your issue, I'm ready to help.","Do you want any other advice?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?","Explique ton problème, je suis prêt à t'aider.","Veux-tu un autre conseil ?"
|
| 21 |
+
Ndanezerewe ko umeze neza, hari icyo wifuza kumenya ku mategeko?,"Mbwira ikibazo cyawe cy'amategeko.","Niba hari ibindi bibazo, ndahari.","I'm happy that you are well, is there anything you want to know about the law?","Tell me your legal issue.","If there are other questions, I'm here.","Je suis heureux que tu ailles bien, y a-t-il quelque chose que tu veux savoir sur la loi ?","Dis-moi ton problème juridique.","S'il y a d'autres questions, je suis là."
|
| 22 |
+
Niba hari ikibazo cy'amategeko uzagaruke tuganireho.,"Uzanyandikire igihe cyose ukeneye ubufasha.","Igihe cyose ukeneye inama, ndahari.","If you have a legal issue, come back and we'll discuss it.","Write to me anytime you need help.","Whenever you need advice, I'm here.","Si tu as un problème juridique, reviens et nous en discuterons.","Écris-moi quand tu as besoin d'aide.","Quand tu as besoin de conseils, je suis là."
|
| 23 |
+
Niba hari ikindi ushaka kumenya, ntuzazuyaze kumbaza.,"Nditeguye kugufasha igihe cyose.","Wakenera ubundi bufasha?","If there's anything else you want to know, don't hesitate to ask.","I'm ready to help you anytime.","Would you need any other assistance?","Si tu veux savoir autre chose, n'hésite pas à demander.","Je suis prêt à t'aider à tout moment.","Aurais-tu besoin d'une autre assistance ?"
|
| 24 |
+
Niba hari ikindi kibazo cy'amategeko, ndahari kugufasha.,"Mbwira ikibazo cyawe igihe cyose.","Ndashobora kugufasha no mu bindi.","If you have any other legal questions, I'm here to help.","Tell me your issue anytime.","I can also help you with other things.","Si tu as d'autres questions juridiques, je suis là pour t'aider.","Dis-moi ton problème à tout moment.","Je peux aussi t'aider avec d'autres choses."
|
| 25 |
+
Niba hari icyo wifuza kumenya, ndahari kugufasha.,"Sobanura icyo wifuza kumenya.","Nditeguye kugufasha igihe cyose.","If there's anything you want to know, I'm here to help.","Explain what you want to know.","I'm ready to help you anytime.","Si tu veux savoir quelque chose, je suis là pour t'aider.","Explique ce que tu veux savoir.","Je suis prêt à t'aider à tout moment."
|
| 26 |
+
Niba hari ikibazo cy'amategeko, uzambwire ejo.,"Uzanyandikire igihe cyose ukeneye ubufasha.","Ndahari igihe cyose ukeneye inama.","If you have a legal issue, let me know tomorrow.","Write to me anytime you need help.","I'm here whenever you need advice.","Si tu as un problème juridique, dis-le-moi demain.","Écris-moi quand tu as besoin d'aide.","Je suis là chaque fois que tu as besoin de conseils."
|
inference.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hugging Face Inference API endpoint for mbaza-model
|
| 3 |
+
This file exposes the model for API calls via HF Inference API.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Dict, List, Any
|
| 6 |
+
from assistant import get_assistant
|
| 7 |
+
|
| 8 |
+
# Initialize the assistant once
|
| 9 |
+
assistant = get_assistant()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def model(inputs: List[str]) -> Dict[str, Any]:
    """
    Main inference function called by Hugging Face Inference API.

    Args:
        inputs: List of input strings. inputs[0] is the prompt/query;
            inputs[1], if present, is a user identifier for context tracking.

    Returns:
        Dict with model response, or a dict with an "error" key when the
        input is malformed or processing fails.
    """
    if not inputs or not isinstance(inputs, list):
        return {"error": "Invalid input format. Expected list of strings."}

    prompt = inputs[0]
    user_id = inputs[1] if len(inputs) > 1 else "api_user"

    # Delegate to predict() so that query handling and error reporting live
    # in exactly one place instead of being duplicated in both entry points.
    return predict(prompt, user_id)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def predict(prompt: str, user_id: str = "api_user") -> Dict[str, Any]:
    """
    Alternative prediction function with explicit parameters.

    Args:
        prompt: User query or greeting
        user_id: Optional user identifier for context tracking

    Returns:
        Dict with response, intent, and any matched data; on failure, a dict
        with "error" and a generic "text" message.
    """
    try:
        return assistant.handle_query(user_id, prompt)
    except Exception as e:
        return {
            "error": f"Processing failed: {str(e)}",
            "text": "An error occurred while processing your request."
        }
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# For testing locally
if __name__ == "__main__":
    # Mix of a greeting, Kinyarwanda legal questions, and an English query —
    # exercises both the greeting path and the retrieval path of the assistant.
    test_queries = [
        "Mwaramutse neza",
        "Ibihano by'ubujura ni ibihe?",
        "Kwinjira aho umuntu atuye bitemewe namategeko",
        "What are the laws about corruption?"
    ]

    print("Testing inference.py locally...\n")
    for query in test_queries:
        print(f"Query: {query}")
        response = model([query])
        # Responses normally carry a 'text' field; fall back to the raw dict.
        print(f"Response: {response.get('text', response)}")
        print("-" * 80)
|
law_embeddings.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e537738aac9aa323758cde5c61b7ab843ff3b1ab6189a5e63b405d11574ec432
|
| 3 |
+
size 3090560
|
law_meta.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
penal_code.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pandas
numpy
sentence-transformers
torch
nltk
flask
gunicorn
gradio
|
retriever.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
import json
|
| 6 |
+
import random
|
| 7 |
+
from typing import List, Dict, Any, Tuple
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from sentence_transformers import SentenceTransformer
|
| 11 |
+
except Exception:
|
| 12 |
+
SentenceTransformer = None
|
| 13 |
+
|
| 14 |
+
# Directory containing this module; data files are resolved relative to it.
ROOT = Path(__file__).parent
# Sentence-transformers model used to embed law texts and queries.
MODEL_NAME = 'all-MiniLM-L6-v2'
# On-disk cache for the precomputed law embeddings and their metadata.
EMBED_PATH = ROOT / 'law_embeddings.npy'
META_PATH = ROOT / 'law_meta.json'
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _combine_law_text(row: pd.Series) -> str:
|
| 21 |
+
parts = []
|
| 22 |
+
for col in ['Law-Name', 'Article-Description', 'Punishment', 'Category', 'Chapter', 'Article-Number']:
|
| 23 |
+
if col in row and pd.notna(row[col]):
|
| 24 |
+
parts.append(str(row[col]))
|
| 25 |
+
return ' '.join(parts)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class LawRetriever:
    """Semantic retriever over the laws CSV, plus a rule-based greeting matcher.

    Law texts are embedded with a sentence-transformers model (lazily loaded)
    and cached on disk at EMBED_PATH / META_PATH so restarts avoid re-encoding.
    Similarity is cosine over L2-normalized vectors.
    """

    def __init__(self, model_name: str = MODEL_NAME):
        self.model_name = model_name
        self.model = None  # SentenceTransformer, loaded on first use
        self.laws_df: pd.DataFrame | None = None
        self.embeddings: np.ndarray | None = None  # (n_laws, dim) float32, L2-normalized
        self.meta: List[dict] = []  # one {'index': int, 'text': str} per law row
        self.greetings_df: pd.DataFrame | None = None

    def ensure_model(self):
        """Lazily load the sentence-transformer; raise if the package is absent."""
        if self.model is None:
            if SentenceTransformer is None:
                raise RuntimeError('sentence-transformers not available in environment')
            self.model = SentenceTransformer(self.model_name)

    def load_laws(self, laws_path: Path | str = None):
        """Load the laws CSV (default: dataset-all.csv next to this module).

        Populates self.laws_df and rebuilds self.meta (combined search text
        per row). Raises FileNotFoundError if the CSV is missing.
        """
        path = Path(laws_path) if laws_path else ROOT / 'dataset-all.csv'
        if not path.exists():
            raise FileNotFoundError(f'Laws CSV not found at {path}')
        df = pd.read_csv(path)
        # ensure consistent columns
        df = df.fillna('')
        self.laws_df = df
        # build metadata
        self.meta = []
        for i, row in df.iterrows():
            combined = _combine_law_text(row)
            self.meta.append({'index': int(i), 'text': combined})
        return df

    def build_or_load_embeddings(self, force: bool = False):
        """Return law embeddings, loading the on-disk cache or rebuilding it.

        With force=True (or a corrupt/missing cache) the embeddings are
        re-encoded, L2-normalized, and written back to EMBED_PATH/META_PATH.
        """
        # If embeddings exist and not forced, load them
        if EMBED_PATH.exists() and META_PATH.exists() and not force:
            try:
                self.embeddings = np.load(EMBED_PATH)
                with open(META_PATH, 'r', encoding='utf-8') as f:
                    self.meta = json.load(f)
                # ensure laws_df exists
                if self.laws_df is None:
                    self.load_laws()
                return self.embeddings
            except Exception:
                # fall through to rebuild
                pass

        # Build embeddings
        if self.laws_df is None:
            self.load_laws()

        self.ensure_model()
        texts = [m['text'] for m in self.meta]
        if not texts:
            # 384 is the output dimension of the default MiniLM model.
            self.embeddings = np.zeros((0, 384), dtype=np.float32)
        else:
            emb = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
            # normalize (guard against zero-norm rows to avoid division by zero)
            norms = np.linalg.norm(emb, axis=1, keepdims=True)
            norms[norms == 0] = 1
            emb = emb / norms
            self.embeddings = emb.astype(np.float32)
        np.save(EMBED_PATH, self.embeddings)
        with open(META_PATH, 'w', encoding='utf-8') as f:
            json.dump(self.meta, f, ensure_ascii=False)

        return self.embeddings

    def find_similar(self, query: str, top_k: int = 1) -> List[Tuple[float, dict]]:
        """Return the top_k laws most similar to `query` as (score, meta) pairs.

        Scores are cosine similarities; each meta dict gains 'score' and,
        when the source DataFrame is available, the original 'row' data.
        """
        if self.embeddings is None:
            self.build_or_load_embeddings()
        self.ensure_model()
        q_emb = self.model.encode([query], convert_to_numpy=True)
        q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)
        # Dot product of normalized vectors == cosine similarity.
        sims = (self.embeddings @ q_emb[0]).astype(float)
        # get top_k indices
        idx = np.argsort(-sims)[:top_k]
        results = []
        for i in idx:
            score = float(sims[i])
            meta = self.meta[i].copy()
            meta['score'] = score
            # attach original row (if available)
            if self.laws_df is not None and int(meta['index']) in self.laws_df.index:
                row = self.laws_df.loc[int(meta['index'])].to_dict()
                meta['row'] = row
            results.append((score, meta))
        return results

    def load_greetings(self, path: Path | str = None):
        """Load the multilingual greetings CSV with progressively lenient parsing.

        Tries pandas' default parser, then the tolerant python engine, then a
        crude manual split with the first line as header. Raises
        FileNotFoundError if the file does not exist.
        """
        p = Path(path) if path else ROOT / 'greetings.csv'
        if not p.exists():
            raise FileNotFoundError(f'Greetings CSV not found at {p}')
        try:
            df = pd.read_csv(p)
        except Exception:
            # sometimes the CSV has irregular quoting/commas; try a more tolerant parser
            try:
                df = pd.read_csv(p, engine='python', on_bad_lines='skip')
            except Exception:
                # final fallback: read as plain text and attempt a crude split
                text = p.read_text(encoding='utf-8', errors='ignore')
                # attempt to split lines and parse with first line as header
                lines = [l for l in text.splitlines() if l.strip()]
                if not lines:
                    raise
                header = [h.strip() for h in lines[0].split(',')]
                records = []
                for ln in lines[1:]:
                    parts = [c.strip() for c in ln.split(',')]
                    # pad/truncate to header length
                    if len(parts) < len(header):
                        parts += [''] * (len(header) - len(parts))
                    parts = parts[:len(header)]
                    records.append(dict(zip(header, parts)))
                df = pd.DataFrame.from_records(records)

        self.greetings_df = df.fillna('')
        return self.greetings_df

    def detect_and_reply_greeting(self, text: str) -> dict | None:
        """Match `text` against known greetings; return a canned reply or None.

        Matching strategy: (1) exact/substring match per language column,
        preferring Kinyarwanda over English over French; (2) token-overlap
        fallback, picking one candidate at random. Returns
        {'response': ..., 'followup': ...} on a hit, else None.
        """
        # Load greetings if needed
        if self.greetings_df is None:
            try:
                self.load_greetings()
            except FileNotFoundError:
                return None
        t = str(text).lower().strip()

        # Check exact/substring matches across language columns and prefer language-specific response
        tokens = set(t.split())

        # 1) direct matches preferring Kinyarwanda, then English, then French
        for lang_suffix in ['', '_en', '_fr']:
            gcol = f'greeting{lang_suffix}'
            rcol = f'response{lang_suffix}'
            fcol = f'followup{lang_suffix}'
            for _, row in self.greetings_df.iterrows():
                gval = str(row.get(gcol, '')).strip()
                if not gval:
                    continue
                gval_l = gval.lower()
                # substring match in either direction tolerates extra words
                if gval_l == t or gval_l in t or t in gval_l:
                    response = row.get(rcol) or row.get('response') or row.get('response_en') or row.get('response_fr') or ''
                    followup = row.get(fcol) or row.get('followup') or row.get('followup_en') or row.get('followup_fr') or ''
                    return {'response': response, 'followup': followup}

        # 2) token overlap fallback: collect candidates with token overlap across languages
        candidates = []
        for _, row in self.greetings_df.iterrows():
            for lang_suffix in ['', '_en', '_fr']:
                gcol = f'greeting{lang_suffix}'
                gval = str(row.get(gcol, '')).strip().lower()
                if not gval:
                    continue
                if any(tok in gval.split() for tok in tokens):
                    candidates.append((row, lang_suffix))

        if candidates:
            # NOTE(review): random choice makes replies non-deterministic —
            # presumably intentional for variety; confirm if determinism matters.
            row, lang_suffix = random.choice(candidates)
            rcol = f'response{lang_suffix}'
            fcol = f'followup{lang_suffix}'
            response = row.get(rcol) or row.get('response') or row.get('response_en') or row.get('response_fr') or ''
            followup = row.get(fcol) or row.get('followup') or row.get('followup_en') or row.get('followup_fr') or ''
            return {'response': response, 'followup': followup}

        return None
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# Provide a singleton retriever for the server to use
_RETRIEVER: LawRetriever | None = None


def get_retriever() -> LawRetriever:
    """Return the shared LawRetriever, creating it lazily on first call.

    Loading the laws CSV is best-effort: if the file is missing, the
    retriever is still returned (empty) instead of failing startup.
    """
    global _RETRIEVER
    if _RETRIEVER is None:
        _RETRIEVER = LawRetriever()
        try:
            _RETRIEVER.load_laws()
        except FileNotFoundError:
            pass
    return _RETRIEVER
|