mugwaneza commited on
Commit
fc1c893
·
1 Parent(s): 8905535

Deploy Mbaza Legal AI Model with inference endpoint

Browse files
Files changed (11) hide show
  1. README.md +295 -1
  2. assistant.py +199 -0
  3. config.py +340 -0
  4. dataset-all.csv +0 -0
  5. greetings.csv +26 -0
  6. inference.py +73 -0
  7. law_embeddings.npy +3 -0
  8. law_meta.json +0 -0
  9. penal_code.csv +0 -0
  10. requirements.txt +7 -0
  11. retriever.py +207 -0
README.md CHANGED
@@ -1,3 +1,297 @@
1
  ---
2
- license: unknown
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Mbaza Legal AI Model
3
+ language:
4
+ - rw
5
+ - en
6
+ - fr
7
+ tags:
8
+ - legal
9
+ - kinyarwanda
10
+ - nlp
11
+ - sentence-transformers
12
+ - question-answering
13
+ license: mit
14
  ---
15
+
16
+ # 🇷🇼 Mbaza Legal AI Model
17
+
18
+ **Multilingual Legal Assistant for Rwandan Laws** - Supporting Kinyarwanda, English, and French.
19
+
20
+ ## Model Description
21
+
22
+ This model provides intelligent legal assistance for Rwandan laws, punishments, and legal procedures. It uses semantic search with sentence embeddings to match user queries with relevant legal articles and punishment information.
23
+
24
+ ### Key Features
25
+
26
+ - **Multilingual Support**: Kinyarwanda, English, and French
27
+ - **Greeting Detection**: Natural conversation in multiple languages
28
+ - **Legal Article Retrieval**: Semantic search across Rwandan legal code
29
+ - **Punishment Information**: Detailed penalty and sentencing information
30
+ - **Context Tracking**: Maintains conversation history
31
+
32
+ ### Technical Details
33
+
34
+ - **Base Model**: `sentence-transformers/all-MiniLM-L6-v2`
35
+ - **Embedding Dimension**: 384
36
+ - **Similarity Metric**: Cosine similarity
37
+ - **Framework**: PyTorch + SentenceTransformers
38
+
39
+ ## Usage
40
+
41
+ ### Via Hugging Face Inference API
42
+
43
+ ```python
44
+ import requests
45
+
46
+ API_URL = "https://api-inference.huggingface.co/models/mugwaneza/mbaza-model"
47
+ headers = {"Authorization": f"Bearer {YOUR_HF_TOKEN}"}
48
+
49
+ def query(prompt):
50
+ response = requests.post(API_URL, headers=headers, json={"inputs": [prompt]})
51
+ return response.json()
52
+
53
+ # Example usage
54
+ result = query("Ibihano by'ubujura ni ibihe?")
55
+ print(result["text"])
56
+ ```
57
+
58
+ ### Via Python Client
59
+
60
+ ```python
61
+ from huggingface_hub import InferenceClient
62
+
63
+ client = InferenceClient(token=YOUR_HF_TOKEN)
64
+
65
+ response = client.post(
66
+ "mugwaneza/mbaza-model",
67
+ json={"inputs": ["Kwinjira aho umuntu atuye bitemewe namategeko"]}
68
+ )
69
+ print(response)
70
+ ```
71
+
72
+ ### Via cURL
73
+
74
+ ```bash
75
+ curl -X POST \
76
+ -H "Authorization: Bearer YOUR_HF_TOKEN" \
77
+ -H "Content-Type: application/json" \
78
+ -d '{"inputs":["Mwaramutse neza"]}' \
79
+ https://api-inference.huggingface.co/models/mugwaneza/mbaza-model
80
+ ```
81
+
82
+ ### For Mobile Apps (React Native, Flutter, Swift)
83
+
84
+ ```javascript
85
+ // React Native / JavaScript
86
+ const API_URL = "https://api-inference.huggingface.co/models/mugwaneza/mbaza-model";
87
+ const HF_TOKEN = "your_token_here";
88
+
89
+ async function queryLegalAI(prompt) {
90
+ const response = await fetch(API_URL, {
91
+ method: "POST",
92
+ headers: {
93
+ "Authorization": `Bearer ${HF_TOKEN}`,
94
+ "Content-Type": "application/json"
95
+ },
96
+ body: JSON.stringify({ inputs: [prompt] })
97
+ });
98
+
99
+ return await response.json();
100
+ }
101
+
102
+ // Usage
103
+ const result = await queryLegalAI("What is the punishment for theft?");
104
+ console.log(result.text);
105
+ ```
106
+
107
+ ### For Laravel Backend
108
+
109
+ ```php
110
+ <?php
111
+
112
+ namespace App\Services;
113
+
114
+ use Illuminate\Support\Facades\Http;
115
+
116
+ class MbazaLegalAI
117
+ {
118
+ protected $apiUrl = 'https://api-inference.huggingface.co/models/mugwaneza/mbaza-model';
119
+ protected $token;
120
+
121
+ public function __construct()
122
+ {
123
+ $this->token = config('services.huggingface.token');
124
+ }
125
+
126
+ public function query($prompt, $userId = 'web_user')
127
+ {
128
+ $response = Http::withHeaders([
129
+ 'Authorization' => "Bearer {$this->token}",
130
+ 'Content-Type' => 'application/json'
131
+ ])->post($this->apiUrl, [
132
+ 'inputs' => [$prompt, $userId]
133
+ ]);
134
+
135
+ return $response->json();
136
+ }
137
+ }
138
+
139
+ // Usage in Controller
140
+ $ai = new MbazaLegalAI();
141
+ $result = $ai->query("Ibihano by'ubujura ni ibihe?");
142
+ return response()->json($result);
143
+ ```
144
+
145
+ ## Example Queries
146
+
147
+ ### Greetings (Multilingual)
148
+
149
+ ```python
150
+ # Kinyarwanda
151
+ query("Mwaramutse neza")
152
+ # Response: Mwaramutse neza, amakuru yawe?
153
+
154
+ # English
155
+ query("Good morning")
156
+ # Response: Good morning, how can I help you with legal matters?
157
+
158
+ # French
159
+ query("Bonjour")
160
+ # Response: Bonjour, comment puis-je vous aider?
161
+ ```
162
+
163
+ ### Legal Questions
164
+
165
+ ```python
166
+ # Kinyarwanda
167
+ query("Ibihano by'ubujura ni ibihe?")
168
+ # Returns: Punishment information for theft
169
+
170
+ # English
171
+ query("What are the laws about corruption in Rwanda?")
172
+ # Returns: Relevant legal articles on corruption
173
+
174
+ # Kinyarwanda
175
+ query("Kwinjira aho umuntu atuye bitemewe namategeko")
176
+ # Returns: Laws about trespassing and unauthorized entry
177
+ ```
178
+
179
+ ### Punishment Queries
180
+
181
+ ```python
182
+ query("Igihano cy'umuntu wakubise undi")
183
+ # Returns: Punishment for assault
184
+
185
+ query("What is the penalty for fraud?")
186
+ # Returns: Detailed penalty information
187
+ ```
188
+
189
+ ## Response Format
190
+
191
+ ```json
192
+ {
193
+ "text": "Main response text (formatted for display)",
194
+ "intent": "greeting|law|punishment|fallback",
195
+ "laws": [
196
+ {
197
+ "article": "Article 166",
198
+ "description": "...",
199
+ "punishment": "...",
200
+ "similarity": 0.85
201
+ }
202
+ ],
203
+ "punishments": [
204
+ {
205
+ "crime": "Theft",
206
+ "category": "Property crimes",
207
+ "penalty": "..."
208
+ }
209
+ ]
210
+ }
211
+ ```
212
+
213
+ ## Datasets
214
+
215
+ The model uses the following datasets:
216
+
217
+ 1. **Legal Code Dataset** (`dataset-all.csv`)
218
+ - Rwandan laws and articles
219
+ - Descriptions in Kinyarwanda, English, and French
220
+ - Article numbers, chapters, and categories
221
+
222
+ 2. **Penal Code** (`penal_code.csv`)
223
+ - Crime categories
224
+ - Punishment details
225
+ - Sentencing guidelines
226
+
227
+ 3. **Greetings** (`greetings.csv`)
228
+ - Multilingual greetings and responses
229
+ - Conversational patterns
230
+
231
+ ## Model Files
232
+
233
+ - `inference.py` - Main inference endpoint
234
+ - `assistant.py` - Core assistant logic
235
+ - `retriever.py` - Semantic search and embedding management
236
+ - `config.py` - Configuration and utilities
237
+ - `law_embeddings.npy` - Precomputed embeddings (384-dim vectors)
238
+ - `law_meta.json` - Metadata for legal articles
239
+ - `conversation_contexts.json` - Context tracking storage
240
+
241
+ ## Installation (Self-Hosted)
242
+
243
+ ```bash
244
+ # Clone the model repository
245
+ git clone https://huggingface.co/mugwaneza/mbaza-model
246
+ cd mbaza-model
247
+
248
+ # Install dependencies
249
+ pip install -r requirements.txt
250
+
251
+ # Test locally
252
+ python inference.py
253
+ ```
254
+
255
+ ## API Limits
256
+
257
+ - **Free Tier**: ~30,000 requests/month
258
+ - **Rate Limiting**: ~100 requests/hour (burst)
259
+ - **Cold Start**: First request may take 10-30 seconds
260
+ - **Warm**: Subsequent requests ~1-3 seconds
261
+
262
+ For production with higher limits, consider:
263
+ 1. Upgrading to Hugging Face PRO ($9/month)
264
+ 2. Self-hosting with this model
265
+ 3. Using dedicated inference endpoints
266
+
267
+ ## License
268
+
269
+ MIT License - See LICENSE file for details
270
+
271
+ ## Citation
272
+
273
+ ```bibtex
274
+ @misc{mbaza-legal-ai,
275
+ author = {Mugwaneza Manzi},
276
+ title = {Mbaza Legal AI: Multilingual Legal Assistant for Rwanda},
277
+ year = {2025},
278
+ publisher = {Hugging Face},
279
+ url = {https://huggingface.co/mugwaneza/mbaza-model}
280
+ }
281
+ ```
282
+
283
+ ## Related Resources
284
+
285
+ - **Demo Space**: https://huggingface.co/spaces/mugwaneza/mbaza
286
+ - **GitHub**: https://github.com/MUGWANEZAMANZI/Model
287
+ - **Documentation**: See DEPLOYMENT.md in repository
288
+
289
+ ## Contact
290
+
291
+ For questions, issues, or collaboration:
292
+ - Create an issue on the [GitHub repository](https://github.com/MUGWANEZAMANZI/Model)
293
+ - Email: mugwaneza@example.com
294
+
295
+ ---
296
+
297
+ **Note**: This model is designed for informational purposes. Always consult with a qualified legal professional for official legal advice.
assistant.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import json
3
+ import re
4
+ from typing import Dict, Any, List, Optional
5
+ import random
6
+
7
+ import pandas as pd
8
+
9
+ from retriever import get_retriever
10
+ from config import KINYARWANDA_STOPWORDS, calculate_similarity_score
11
+
12
# Directory containing this module; used to resolve the bundled CSV/JSON data files.
ROOT = Path(__file__).parent
# JSON file that persists per-user conversation history between process runs.
CONTEXT_PATH = ROOT / 'conversation_contexts.json'
14
+
15
+
16
class Assistant:
    """Conversational legal assistant for Rwandan law.

    Each user message is routed through: (1) language-aware greeting
    detection (delegated to the retriever), (2) keyword-based intent
    detection ('punishment' beats 'law' when both match, since punishment
    questions usually also mention a law), then (3) either semantic search
    over legal articles or word-overlap matching against the penal-code
    table.  A rolling per-user history (last 50 entries) is persisted to
    CONTEXT_PATH after every update.
    """

    def __init__(self):
        self.retriever = get_retriever()
        # Datasets are filled in by load_datasets(); None means unavailable.
        self.laws = None
        self.punishments = None
        self.greetings = None
        # user_id -> list of {'role': ..., 'text': ...} history entries.
        self.contexts: Dict[str, List[Dict[str, Any]]] = {}
        # Keyword lists used for lightweight intent scoring
        # (Kinyarwanda + English; duplicates removed from the original lists —
        # scores are only ever compared against zero, so this is behavior-neutral).
        self.intent_keywords = {
            'greeting': ['mwaramutse', 'muraho', 'amakuru', 'bite', 'mwiriwe', 'urabeho', 'urakomeye', 'ndabona'],
            'law': ['itegeko', 'ingingo', 'article', 'ingingo ya', 'itegeko rya', 'law'],
            'punishment': ['igihano', 'ibihano', 'ihazabu', 'igifungo', 'fine', 'imyaka', 'years', 'punishment'],
        }
        self.load_datasets()
        self._load_contexts()

    def load_datasets(self):
        """Load laws, punishments and greetings tables (all best-effort)."""
        # Laws are owned by the retriever; make sure they are loaded.
        try:
            if self.retriever.laws_df is None:
                self.retriever.load_laws()
            self.laws = self.retriever.laws_df
        except Exception:
            self.laws = None

        # Punishments come from the bundled penal-code CSV.
        self.punishments = self._read_csv(ROOT / 'penal_code.csv')

        # Greetings: bundled CSV first, retriever's copy as a fallback.
        self.greetings = self._read_csv(ROOT / 'greetings.csv')
        if self.greetings is None:
            try:
                self.greetings = self.retriever.greetings_df
            except Exception:
                self.greetings = None

    @staticmethod
    def _read_csv(path: Path) -> Optional[pd.DataFrame]:
        """Read a CSV with NaNs blanked to ''; return None on any failure."""
        if not path.exists():
            return None
        try:
            return pd.read_csv(path).fillna('')
        except Exception:
            return None

    def _load_contexts(self):
        """Restore persisted conversation histories; start fresh on any error."""
        if CONTEXT_PATH.exists():
            try:
                with open(CONTEXT_PATH, 'r', encoding='utf-8') as f:
                    self.contexts = json.load(f)
            except Exception:
                self.contexts = {}

    def _save_contexts(self):
        """Persist conversation histories; persistence is best-effort only."""
        try:
            with open(CONTEXT_PATH, 'w', encoding='utf-8') as f:
                json.dump(self.contexts, f, ensure_ascii=False, indent=2)
        except Exception:
            pass

    def tokenize(self, text: str) -> List[str]:
        """Lower-case, strip punctuation and drop Kinyarwanda stopwords."""
        if not text:
            return []
        txt = str(text).lower()
        txt = re.sub(r"[\r\n]+", " ", txt)
        # Keep word chars, whitespace and Latin-1 Supplement / Latin
        # Extended-A letters (accented characters).
        txt = re.sub(r"[^\w\s\u00C0-\u017F]", " ", txt)
        return [t for t in txt.split() if t and t not in KINYARWANDA_STOPWORDS]

    def detect_intent(self, text: str) -> str:
        """Classify text as 'greeting', 'punishment', 'law' or 'unclear'."""
        t = str(text).lower()
        toks = set(self.tokenize(t))
        scores = {intent: 0 for intent in self.intent_keywords}
        for intent, keywords in self.intent_keywords.items():
            for kw in keywords:
                # Substring check catches multi-word keywords; the token
                # check covers stopword-stripped forms.
                if kw in t or kw in toks:
                    scores[intent] += 1

        # Priority: greeting, then punishment, then law.
        if scores['greeting'] > 0:
            return 'greeting'
        if scores['punishment'] > 0:
            return 'punishment'
        if scores['law'] > 0:
            return 'law'
        # Last chance: explicit article / law references.
        if re.search(r'\bingingo\b|\bingingo ya\b|\barticle\b|\bitegeko\b', t):
            return 'law'
        return 'unclear'

    def _update_context(self, user_id: str, entry: Dict[str, Any]):
        """Append entry to the user's history, trim to 50, and persist."""
        history = self.contexts.setdefault(user_id, [])
        history.append(entry)
        if len(history) > 50:
            self.contexts[user_id] = history[-50:]
        self._save_contexts()

    def handle_query(self, user_id: str, text: str) -> Dict[str, Any]:
        """Answer one user message and record the exchange in context.

        Returns a dict whose 'type' is one of 'greeting', 'law',
        'punishment' or 'unclear'.
        """
        # Language-aware greeting detection (Kinyarwanda/English/French)
        # is delegated to the retriever.
        try:
            reply = self.retriever.detect_and_reply_greeting(text)
        except Exception:
            reply = None

        if reply:
            self._update_context(user_id, {'role': 'user', 'text': text})
            out = {'type': 'greeting', 'response': reply.get('response', ''), 'followup': reply.get('followup', '')}
            self._update_context(user_id, {'role': 'assistant', 'text': out})
            return out

        intent = self.detect_intent(text)
        # BUGFIX: the original recorded the user message twice (once raw,
        # once again annotated with the intent); record it exactly once.
        self._update_context(user_id, {'role': 'user', 'text': text, 'intent': intent})

        if intent == 'law':
            try:
                # Make sure embeddings exist before searching.
                self.retriever.build_or_load_embeddings()
                results = self.retriever.find_similar(text, top_k=1)
            except Exception:
                results = []

            if results:
                score, meta = results[0]
                out = {'type': 'law', 'score': score, 'law': meta.get('row')}
            else:
                out = {'type': 'unclear', 'text': "I couldn't find a matching law. Can you be more specific?"}
            self._update_context(user_id, {'role': 'assistant', 'text': out})
            return out

        if intent == 'punishment':
            out = self._best_punishment_match(text)
            self._update_context(user_id, {'role': 'assistant', 'text': out})
            return out

        out = {'type': 'unclear', 'text': "Can you please try a legal question? I'm here to assist you."}
        self._update_context(user_id, {'role': 'assistant', 'text': out})
        return out

    def _best_punishment_match(self, text: str) -> Dict[str, Any]:
        """Pick the penal-code row with the highest word-overlap score."""
        if self.punishments is not None:
            best = None
            best_score = 0.0
            for _, row in self.punishments.iterrows():
                # Compare the query against every column of the row joined together.
                desc = ' '.join(str(row.get(col, '')) for col in row.index)
                score = calculate_similarity_score(text, desc)
                if score > best_score:
                    best_score = score
                    best = row.to_dict()
            if best is not None and best_score > 0:
                return {'type': 'punishment', 'score': best_score, 'punishment_row': best}
        return {'type': 'unclear', 'text': "I couldn't find a matching punishment. Can you provide more detail?"}
189
+
190
+
191
# Process-wide singleton: every import shares one Assistant (and therefore
# one loaded copy of the datasets/embeddings).
_ASSISTANT: Optional[Assistant] = None


def get_assistant() -> Assistant:
    """Return the shared Assistant instance, constructing it lazily on first use."""
    global _ASSISTANT
    if _ASSISTANT is None:
        _ASSISTANT = Assistant()
    return _ASSISTANT
config.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration and utility functions for Rwanda Legal NLP System
3
+ """
4
+
5
+ import os
6
+ import json
7
+ import logging
8
+ import pandas as pd
9
+ from typing import Dict, Any
10
+ from dataclasses import dataclass
11
+
12
+ # Configure logging
13
+ logging.basicConfig(
14
+ level=logging.INFO,
15
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
16
+ )
17
+
18
@dataclass
class ModelConfig:
    """Text-generation settings for the language model.

    NOTE(review): the README advertises sentence-transformers/all-MiniLM-L6-v2
    as this project's base model; confirm the GPT-J default below is
    intentional (it looks like a leftover from a training script).
    """
    model_name: str = "EleutherAI/gpt-j-6B"
    max_length: int = 512            # maximum sequence length for generation
    temperature: float = 0.7         # sampling temperature (higher = more random)
    top_p: float = 0.9               # nucleus-sampling probability mass
    top_k: int = 50                  # restrict sampling to the k most likely tokens
    do_sample: bool = True           # sample instead of greedy decoding
    num_return_sequences: int = 1    # number of candidate generations per prompt
28
+
29
@dataclass
class TrainingConfig:
    """Hyper-parameters for model fine-tuning runs."""
    output_dir: str = "./trained_legal_model"   # where checkpoints are written
    num_epochs: int = 3
    batch_size: int = 2
    learning_rate: float = 5e-5
    warmup_steps: int = 500                     # LR warmup before full learning rate
    save_steps: int = 1000                      # checkpoint interval (steps)
    eval_steps: int = 1000                      # evaluation interval (steps)
    max_grad_norm: float = 1.0                  # gradient-clipping threshold
    gradient_accumulation_steps: int = 4        # effective batch = batch_size * this
    weight_decay: float = 0.01
42
+
43
@dataclass
class DataConfig:
    """Configuration for dataset loading and train/test splitting."""
    dataset_path: str = "dataset-all.csv"   # CSV of laws/articles shipped with the repo
    max_text_length: int = 512              # presumably a truncation limit — confirm against consumer
    test_size: float = 0.1                  # fraction of data held out for evaluation
    random_state: int = 42                  # seed for reproducible splits
50
+
51
class ConfigManager:
    """Load/save model, training and data configuration from a JSON file.

    The JSON file is optional; a missing file or bad JSON leaves the
    built-in dataclass defaults untouched.  Expected layout:
    {"model": {...}, "training": {...}, "data": {...}} where each inner
    dict holds field overrides for the corresponding dataclass.
    """

    def __init__(self, config_path: str = "config.json"):
        self.config_path = config_path
        self.model_config = ModelConfig()
        self.training_config = TrainingConfig()
        self.data_config = DataConfig()
        self.load_config()

    @staticmethod
    def _apply_overrides(target, overrides: Dict[str, Any]):
        """Copy known keys from *overrides* onto *target*; ignore unknown keys."""
        for key, value in overrides.items():
            if hasattr(target, key):
                setattr(target, key, value)

    def load_config(self):
        """Load configuration from the JSON file, if it exists."""
        if not os.path.exists(self.config_path):
            return
        try:
            with open(self.config_path, 'r') as f:
                config_data = json.load(f)

            # Each section independently overrides its dataclass defaults.
            self._apply_overrides(self.model_config, config_data.get('model', {}))
            self._apply_overrides(self.training_config, config_data.get('training', {}))
            self._apply_overrides(self.data_config, config_data.get('data', {}))

            logging.info(f"Configuration loaded from {self.config_path}")

        except Exception as e:
            # Best effort: keep defaults rather than crash on a bad file.
            logging.warning(f"Could not load config from {self.config_path}: {e}")

    def save_config(self):
        """Save the current configuration to the JSON file."""
        config_data = {
            'model': self.model_config.__dict__,
            'training': self.training_config.__dict__,
            'data': self.data_config.__dict__
        }

        try:
            with open(self.config_path, 'w') as f:
                json.dump(config_data, f, indent=2)

            logging.info(f"Configuration saved to {self.config_path}")

        except Exception as e:
            logging.error(f"Could not save config to {self.config_path}: {e}")
106
+
107
# Kinyarwanda language utilities
# High-frequency Kinyarwanda function words (plus a few very generic nouns)
# filtered out before keyword extraction and similarity scoring.
KINYARWANDA_STOPWORDS = {
    'ni', 'na', 'ku', 'mu', 'nk', 'no', 'cyangwa', 'ariko', 'naho', 'none',
    'kandi', 'rero', 'ubwo', 'uko', 'ubu', 'aha', 'aho', 'iyo', 'ese',
    'nta', 'nti', 'nte', 'nto', 'ntu', 'ntw', 'aba', 'ari', 'hari',
    'kuri', 'muri', 'buri', 'abantu', 'umuntu', 'ibintu', 'ikintu'
}

# Kinyarwanda legal terminology
# Mapping of Kinyarwanda legal term -> English gloss.  Underscores join
# multi-word terms so they survive tokenization as a single key.
KINYARWANDA_LEGAL_TERMS = {
    'gusambanya': 'sexual defilement',
    'kwiba': 'theft',
    'gukoresha_imbaraga': 'use of force/violence',
    'kwinjira': 'enter/trespass',
    'kwica': 'kill/murder',
    'gukubita': 'assault/beat',
    'uburiganya': 'fraud/deception',
    'ubuhemu': 'embezzlement',
    'igifungo': 'imprisonment',
    'ihazabu': 'fine',
    'igihano': 'punishment',
    'ingingo': 'article',
    'itegeko': 'law',
    'umwana': 'child',
    'imyaka': 'years',
    'amezi': 'months',
    'burundu': 'life (imprisonment)',
    'gahato': 'force/violence',
    'imibonano_mpuzabitsina': 'sexual intercourse',
    'inyamaswa': 'animals',
    'rugo': 'home/house'
}
139
+
140
def clean_kinyarwanda_text(text: str) -> str:
    """Clean and normalize Kinyarwanda (or mixed-language) text.

    Collapses whitespace, strips characters outside a word/punctuation/
    Latin-Extended whitelist, and replaces numeric noise with placeholder
    tokens: AMOUNT for FRW money amounts, RANGE for "10-15"-style spans,
    NUMBER for bare integers.

    Returns "" for empty or NaN input.
    """
    import re

    if not text or pd.isna(text):
        return ""

    text = str(text)

    # Collapse runs of whitespace early so the regexes below see single spaces.
    text = re.sub(r'\s+', ' ', text)

    # Keep word characters, basic punctuation and Latin-1 Supplement /
    # Latin Extended-A letters (accented characters used in Kinyarwanda/French).
    text = re.sub(r'[^\w\s\-\.\,\;\:\!\?\'\"\u00C0-\u017F]', '', text)

    # Replace numeric noise with placeholders.  Order matters: the generic
    # NUMBER pattern would otherwise consume the digits inside amounts and
    # ranges first (the original order made AMOUNT/RANGE unreachable,
    # because \b matches at the commas and hyphens inside them).
    text = re.sub(r'\bFRW\s*[0-9,\.]+\b', ' AMOUNT ', text)
    text = re.sub(r'\b[0-9]+-[0-9]+\b', ' RANGE ', text)
    text = re.sub(r'\b[0-9]+\b', ' NUMBER ', text)

    # Placeholder insertion can introduce double spaces; normalize again.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
161
+
162
def extract_keywords_kinyarwanda(text: str, max_keywords: int = 10) -> list:
    """Return up to *max_keywords* frequent content words from *text*."""
    from collections import Counter

    if not text:
        return []

    # Normalize the text, then keep only meaningful tokens: longer than
    # two characters and not a Kinyarwanda stopword.
    tokens = clean_kinyarwanda_text(text).lower().split()
    candidates = [
        tok for tok in tokens
        if len(tok) > 2 and tok not in KINYARWANDA_STOPWORDS
    ]

    # Most frequent first; Counter breaks ties by first appearance.
    return [word for word, _ in Counter(candidates).most_common(max_keywords)]
181
+
182
# Legal category mappings with Kinyarwanda terms
# Each category lists trigger keywords in both languages; Kinyarwanda hits
# are weighted more heavily when categorizing.
LEGAL_CATEGORIES = {
    'sexual_offence': {
        'english': ['sexual', 'rape', 'assault', 'child', 'defilement'],
        'kinyarwanda': ['gukoresha', 'gusambanya', 'igitsina', 'umwana', 'imibonano']
    },
    'theft': {
        'english': ['theft', 'robbery', 'stealing', 'property'],
        'kinyarwanda': ['kwiba', 'gufata', 'umutungo', 'imbaraga']
    },
    'privacy': {
        'english': ['privacy', 'domicile', 'recording', 'entry'],
        'kinyarwanda': ['kwinjira', 'kumviriza', 'rugo', 'ubuzima_bwite']
    },
    'morality': {
        'english': ['adultery', 'bigamy', 'concubinage', 'marriage'],
        'kinyarwanda': ['ubusambanyi', 'ubushoreke', 'gushyingirwa', 'guta_urugo']
    },
    'violence': {
        'english': ['violence', 'murder', 'genocide', 'torture', 'assault'],
        'kinyarwanda': ['kwica', 'gukubita', 'jenoside', 'ihohotera', 'imbaraga']
    },
    'fraud': {
        'english': ['fraud', 'forgery', 'deception', 'embezzlement'],
        'kinyarwanda': ['uburiganya', 'kwigana', 'ubuhemu', 'kwibeshya']
    }
}

def categorize_case(description: str) -> str:
    """Categorize a case description using English and Kinyarwanda keywords.

    Returns "unknown" for empty input, "general" when nothing matches,
    otherwise the best-scoring category name.
    """
    if not description:
        return "unknown"

    haystack = description.lower()

    # English hits count 2 points; Kinyarwanda hits count 3 (stronger signal).
    category_scores = {}
    for category, terms in LEGAL_CATEGORIES.items():
        score = 2 * sum(keyword in haystack for keyword in terms['english'])
        score += 3 * sum(keyword in haystack for keyword in terms['kinyarwanda'])
        if score > 0:
            category_scores[category] = score

    return max(category_scores, key=category_scores.get) if category_scores else "general"
238
+
239
# Evaluation metrics
def calculate_similarity_score(text1: str, text2: str) -> float:
    """Jaccard word-overlap similarity between two texts, in [0.0, 1.0]."""
    if not text1 or not text2:
        return 0.0

    vocab_a = set(clean_kinyarwanda_text(text1).lower().split())
    vocab_b = set(clean_kinyarwanda_text(text2).lower().split())

    # Cleaning may have emptied either side entirely.
    if not vocab_a or not vocab_b:
        return 0.0

    overlap = vocab_a & vocab_b
    combined = vocab_a | vocab_b
    return len(overlap) / len(combined) if combined else 0.0
255
+
256
# Utility functions
def save_predictions(predictions: list, output_path: str):
    """Write *predictions* to *output_path* as pretty-printed UTF-8 JSON.

    Failures are logged rather than raised (best-effort persistence).
    """
    try:
        with open(output_path, 'w', encoding='utf-8') as fh:
            json.dump(predictions, fh, indent=2, ensure_ascii=False)
        logging.info(f"Predictions saved to {output_path}")
    except Exception as e:
        logging.error(f"Could not save predictions: {e}")
267
+
268
def load_predictions(input_path: str) -> list:
    """Read a predictions JSON file; return [] on any failure."""
    try:
        with open(input_path, 'r', encoding='utf-8') as fh:
            data = json.load(fh)
        logging.info(f"Predictions loaded from {input_path}")
        return data
    except Exception as e:
        # Missing file or malformed JSON: treat as "no predictions".
        logging.error(f"Could not load predictions: {e}")
        return []
280
+
281
def format_punishment(punishment: str) -> dict:
    """Parse free-text punishment info into a structured dict.

    Returns a dict with keys: "type" ("unknown", "imprisonment" or
    "life_imprisonment"), "imprisonment", "fine", "community_service" and
    "details" (the lower-cased input).  Fields that could not be extracted
    are None.  Empty/NaN input yields {"type": "unknown", "details": ""}.
    """
    import re

    if not punishment or pd.isna(punishment):
        return {"type": "unknown", "details": ""}

    punishment = str(punishment).lower()

    result = {
        "type": "unknown",
        "imprisonment": None,
        "fine": None,
        "community_service": None,
        "details": punishment
    }

    # Imprisonment term.  Two word orders occur in the data:
    #   English:     "10-15 years"      (number before the unit)
    #   Kinyarwanda: "imyaka 10-15"     (number after the unit)
    # The original pattern only handled the English order, so Kinyarwanda
    # punishments (including this module's own sample) never got their
    # years extracted.
    year_match = (
        re.search(r'(\d+)(?:-(\d+))?\s*(?:years?|imyaka)', punishment)
        or re.search(r'imyaka\s*(\d+)(?:-(\d+))?', punishment)
    )
    if year_match:
        min_years = int(year_match.group(1))
        max_years = int(year_match.group(2)) if year_match.group(2) else min_years
        result["imprisonment"] = f"{min_years}-{max_years} years"
        result["type"] = "imprisonment"

    # Life imprisonment overrides any fixed term found above.
    if any(term in punishment for term in ['life', 'burundu', 'cya burundu']):
        result["imprisonment"] = "Life imprisonment"
        result["type"] = "life_imprisonment"

    # Fine amounts, e.g. "frw 1,000,000-2,000,000" (input is lower-cased,
    # so the pattern matches 'frw' in lowercase).
    fine_pattern = r'frw\s*([0-9,\.]+)(?:\s*-\s*([0-9,\.]+))?'
    fine_match = re.search(fine_pattern, punishment)
    if fine_match:
        min_fine = fine_match.group(1)
        max_fine = fine_match.group(2) if fine_match.group(2) else min_fine
        result["fine"] = f"FRW {min_fine}-{max_fine}"

    # Community service may be stated in English or Kinyarwanda.
    if 'community service' in punishment or 'inyungu rusange' in punishment:
        result["community_service"] = "Yes"

    return result
326
+
327
# Default configuration instance
# NOTE: instantiating at import time means config.json (if present in the
# working directory) is read as a side effect of importing this module.
config_manager = ConfigManager()

if __name__ == "__main__":
    # Smoke-test the text utilities with a Kinyarwanda sample.
    sample_text = "Umuntu yakoreye umwana igikorwa gishingiye ku gitsina"

    print("Sample text:", sample_text)
    print("Cleaned:", clean_kinyarwanda_text(sample_text))
    print("Keywords:", extract_keywords_kinyarwanda(sample_text))
    print("Category:", categorize_case(sample_text))

    sample_punishment = "Igifungo cy'imyaka 10-15 + ihazabu FRW 1,000,000-2,000,000"
    print("Formatted punishment:", format_punishment(sample_punishment))
dataset-all.csv ADDED
The diff for this file is too large to render. See raw diff
 
greetings.csv ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ greeting,response,followup,greeting_en,response_en,followup_en,greeting_fr,response_fr,followup_fr
2
+ Mwaramutse,"Mwaramutse neza, amakuru yawe? (Ndaganira gusa mu Kinyarwanda)","Ndishimye ko umeze neza, mbwira icyo ngufasha mu mategeko cg urubanza witegura. (Ndaganira gusa mu Kinyarwanda)","Good morning","Good morning, how are you? (I only speak Kinyarwanda)","I'm glad you're well, tell me how I can help you with legal matters or your upcoming case. (I only speak Kinyarwanda)","Bonjour","Bonjour, comment ça va ? (Je ne parle que le kinyarwanda)","Je suis content que tu ailles bien, dis-moi comment je peux t'aider avec des questions juridiques ou ton prochain procès. (Je ne parle que le kinyarwanda)"
3
+ Bite,"Ni byiza, uraho nawe? (Ndaganira gusa mu Kinyarwanda)","Ndishimye ko umeze neza, hari ikibazo cy'amategeko ushaka ko tuganiraho? (Ndaganira gusa mu Kinyarwanda)","Hi","I'm good, how about you? (I only speak Kinyarwanda)","I'm glad you're well, is there a legal issue you'd like to discuss? (I only speak Kinyarwanda)","Salut","Je vais bien, et toi ? (Je ne parle que le kinyarwanda)","Je suis content que tu ailles bien, y a-t-il un problème juridique dont tu veux parler ? (Je ne parle que le kinyarwanda)"
4
+ Amakuru,"Ni meza cyane, wowe se? (Ndaganira gusa mu Kinyarwanda)","Ndanezerewe ko wowe nawe umeze neza, hari icyo wifuza kumenya ku mategeko? (Ndaganira gusa mu Kinyarwanda)","How are you?","I'm very well, and you? (I only speak Kinyarwanda)","I'm happy that you are also well, is there anything you want to know about the law? (I only speak Kinyarwanda)","Comment vas-tu ?","Je vais très bien, et toi ? (Je ne parle que le kinyarwanda)","Je suis heureux que tu ailles bien aussi, y a-t-il quelque chose que tu veux savoir sur la loi ? (Je ne parle que le kinyarwanda)"
5
+ Bite se,"Ni fresh, uraho neza!","Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?","Hey there","I'm great, how are you?","I'm glad you're well, is there anything you want to know or ask advice about?","Coucou","Je vais super bien, comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?"
6
+ Mwiriwe,"Mwiriwe neza, amakuru yawe?","Ndishimye ko umeze neza, mbwira icyo ngufasha mu mategeko cg urubanza witegura.","Good afternoon","Good afternoon, how are you?","I'm glad you're well, tell me how I can help you with legal matters or your upcoming case.","Bon après-midi","Bon après-midi, comment ça va ?","Je suis content que tu ailles bien, dis-moi comment je peux t'aider avec des questions juridiques ou ton prochain procès."
7
+ Muraho,"Muraho neza, uraho ute?","Ndishimye ko umeze neza, hari ikibazo cy'amategeko ushaka ko tuganiraho?","Hello","Hello, how are you?","I'm glad you're well, is there a legal issue you'd like to discuss?","Bonjour","Bonjour, comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il un problème juridique dont tu veux parler ?"
8
+ Umeze ute,"Meze neza cyane, urakoze kubaza.","Ndanezerewe ko umeze neza, hari icyo wifuza kumenya ku mategeko?","How are you feeling?","I'm very well, thank you for asking.","I'm happy that you are well, is there anything you want to know about the law?","Comment te sens-tu ?","Je vais très bien, merci de demander.","Je suis heureux que tu ailles bien, y a-t-il quelque chose que tu veux savoir sur la loi ?"
9
+ Urakomeye,"Ndakomeye, nawe uraho neza?","Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?","Are you strong?","I'm strong, how are you?","I'm glad you're well, is there anything you want to know or ask advice about?","Tu es fort ?","Je suis fort, et toi comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?"
10
+ Urabeho,"Urabeho, tuzabonana ubutaha.","Niba hari ikibazo cy'amategeko uzagaruke tuganireho.","Goodbye","Goodbye, see you next time.","If you have a legal issue, come back and we'll discuss it.","Au revoir","Au revoir, à la prochaine.","Si tu as un problème juridique, reviens et nous en discuterons."
11
+ Ndagushimiye,"Nta kibazo, nanjye ndagushimiye.","Niba hari ikindi ushaka kumenya, ntuzazuyaze kumbaza.","Thank you","No problem, thank you too.","If there's anything else you want to know, don't hesitate to ask.","Merci","Pas de problème, merci à toi aussi.","Si tu veux savoir autre chose, n'hésite pas à demander."
12
+ Urakoze,"Urakoze nawe, ni byiza kugufasha.","Niba hari ikindi kibazo cy'amategeko, ndahari kugufasha.","Thanks","Thanks to you too, it's good to help you.","If you have any other legal questions, I'm here to help.","Merci","Merci à toi aussi, c'est un plaisir de t'aider.","Si tu as d'autres questions juridiques, je suis là pour t'aider."
13
+ Wiriwe,"Wiriwe neza, tugire umunsi mwiza.","Niba hari icyo wifuza kumenya, ndahari kugufasha.","Good evening","Good evening, have a nice day.","If there's anything you want to know, I'm here to help.","Bonsoir","Bonsoir, passe une bonne journée.","Si tu veux savoir quelque chose, je suis là pour t'aider."
14
+ Ijoro ryiza,"Ijoro ryiza nawe, uryame rwose.","Niba hari ikibazo cy'amategeko, uzambwire ejo.","Good night","Good night to you too, sleep well.","If you have a legal issue, let me know tomorrow.","Bonne nuit","Bonne nuit à toi aussi, dors bien.","Si tu as un problème juridique, dis-le-moi demain."
15
+ Naho,"Ni byiza cyane, uraho neza!","Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?","Hey","That's great, how are you?","I'm glad you're well, is there anything you want to know or ask advice about?","Salut","C'est super, comment vas-tu ?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?"
16
+ Murabeho,"Murabeho, mukomeze kugira umugisha.","Niba hari ikibazo cy'amategeko uzagaruke tuganireho.","Farewell","Farewell, may you continue to be blessed.","If you have a legal issue, come back and we'll discuss it.","Adieu","Adieu, que tu continues à être béni.","Si tu as un problème juridique, reviens et nous en discuterons."
17
+ Ndishimye ko umeze neza, mbwira icyo ngufasha mu mategeko cg urubanza witegura.,"Nditeguye kugufasha, sobanura ikibazo cyawe.","Wakenera ubundi bufasha bw'amategeko?","I'm glad you're well, tell me how I can help you with legal matters or your upcoming case.","I'm ready to help you, explain your issue.","Would you need any other legal assistance?","Je suis content que tu ailles bien, dis-moi comment je peux t'aider avec des questions juridiques ou ton prochain procès.","Je suis prêt à t'aider, explique ton problème.","Aurais-tu besoin d'une autre assistance juridique ?"
18
+ Ndishimye ko umeze neza, hari ikibazo cy'amategeko ushaka ko tuganiraho?,"Mbwira ikibazo cyawe, ndagufasha uko nshoboye.","Hari ibindi wifuza kumenya?","I'm glad you're well, is there a legal issue you'd like to discuss?","Tell me your issue, I'll help as much as I can.","Is there anything else you want to know?","Je suis content que tu ailles bien, y a-t-il un problème juridique dont tu veux parler ?","Dis-moi ton problème, je t'aiderai autant que possible.","Y a-t-il autre chose que tu veux savoir ?"
19
+ Ndanezerewe ko wowe nawe umeze neza, hari icyo wifuza kumenya ku mategeko?,"Niba hari ikibazo cyihariye, nyibwira.","Ndashobora kugufasha no mu bindi bibazo by'amategeko.","I'm happy that you are also well, is there anything you want to know about the law?","If you have a specific issue, tell me.","I can also help you with other legal issues.","Je suis heureux que tu ailles bien aussi, y a-t-il quelque chose que tu veux savoir sur la loi ?","Si tu as un problème spécifique, dis-le-moi.","Je peux aussi t'aider avec d'autres questions juridiques."
20
+ Ndishimye ko umeze neza, hari icyo wifuza kumenya cyangwa kugisha inama?,"Sobanura ikibazo cyawe, nditeguye kugufasha.","Hari indi nama ushaka?","I'm glad you're well, is there anything you want to know or ask advice about?","Explain your issue, I'm ready to help.","Do you want any other advice?","Je suis content que tu ailles bien, y a-t-il quelque chose que tu veux savoir ou demander un conseil ?","Explique ton problème, je suis prêt à t'aider.","Veux-tu un autre conseil ?"
21
+ Ndanezerewe ko umeze neza, hari icyo wifuza kumenya ku mategeko?,"Mbwira ikibazo cyawe cy'amategeko.","Niba hari ibindi bibazo, ndahari.","I'm happy that you are well, is there anything you want to know about the law?","Tell me your legal issue.","If there are other questions, I'm here.","Je suis heureux que tu ailles bien, y a-t-il quelque chose que tu veux savoir sur la loi ?","Dis-moi ton problème juridique.","S'il y a d'autres questions, je suis là."
22
+ Niba hari ikibazo cy'amategeko uzagaruke tuganireho.,"Uzanyandikire igihe cyose ukeneye ubufasha.","Igihe cyose ukeneye inama, ndahari.","If you have a legal issue, come back and we'll discuss it.","Write to me anytime you need help.","Whenever you need advice, I'm here.","Si tu as un problème juridique, reviens et nous en discuterons.","Écris-moi quand tu as besoin d'aide.","Quand tu as besoin de conseils, je suis là."
23
+ Niba hari ikindi ushaka kumenya, ntuzazuyaze kumbaza.,"Nditeguye kugufasha igihe cyose.","Wakenera ubundi bufasha?","If there's anything else you want to know, don't hesitate to ask.","I'm ready to help you anytime.","Would you need any other assistance?","Si tu veux savoir autre chose, n'hésite pas à demander.","Je suis prêt à t'aider à tout moment.","Aurais-tu besoin d'une autre assistance ?"
24
+ Niba hari ikindi kibazo cy'amategeko, ndahari kugufasha.,"Mbwira ikibazo cyawe igihe cyose.","Ndashobora kugufasha no mu bindi.","If you have any other legal questions, I'm here to help.","Tell me your issue anytime.","I can also help you with other things.","Si tu as d'autres questions juridiques, je suis là pour t'aider.","Dis-moi ton problème à tout moment.","Je peux aussi t'aider avec d'autres choses."
25
+ Niba hari icyo wifuza kumenya, ndahari kugufasha.,"Sobanura icyo wifuza kumenya.","Nditeguye kugufasha igihe cyose.","If there's anything you want to know, I'm here to help.","Explain what you want to know.","I'm ready to help you anytime.","Si tu veux savoir quelque chose, je suis là pour t'aider.","Explique ce que tu veux savoir.","Je suis prêt à t'aider à tout moment."
26
+ Niba hari ikibazo cy'amategeko, uzambwire ejo.,"Uzanyandikire igihe cyose ukeneye ubufasha.","Ndahari igihe cyose ukeneye inama.","If you have a legal issue, let me know tomorrow.","Write to me anytime you need help.","I'm here whenever you need advice.","Si tu as un problème juridique, dis-le-moi demain.","Écris-moi quand tu as besoin d'aide.","Je suis là chaque fois que tu as besoin de conseils."
inference.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Inference API endpoint for mbaza-model
3
+ This file exposes the model for API calls via HF Inference API.
4
+ """
5
+ from typing import Dict, List, Any
6
+ from assistant import get_assistant
7
+
8
# Initialize the assistant once at module import so every API request
# served by this process reuses the same instance (avoids per-request setup).
assistant = get_assistant()
10
+
11
+
12
def model(inputs: List[str]) -> Dict[str, Any]:
    """
    Main inference function called by Hugging Face Inference API.

    Args:
        inputs: List where inputs[0] is the prompt/query and the optional
            inputs[1] is a user identifier used for context tracking.

    Returns:
        Dict with the model response, or a dict with an "error" key when the
        input envelope is malformed or processing fails.
    """
    # Check the type first: a non-list (e.g. a bare string) must be rejected
    # before truthiness is considered.
    if not isinstance(inputs, list) or not inputs:
        return {"error": "Invalid input format. Expected list of strings."}

    prompt = inputs[0]
    user_id = inputs[1] if len(inputs) > 1 else "api_user"

    # Delegate to predict() so the try/except error handling lives in exactly
    # one place instead of being duplicated across both entry points.
    return predict(prompt, user_id)
36
+
37
+
38
def predict(prompt: str, user_id: str = "api_user") -> Dict[str, Any]:
    """
    Run one query through the legal assistant.

    Args:
        prompt: User query or greeting.
        user_id: Optional user identifier for context tracking.

    Returns:
        Dict with response, intent, and any matched data; on failure a dict
        carrying "error" and a generic "text" message.
    """
    try:
        return assistant.handle_query(user_id, prompt)
    except Exception as e:
        # Never let an internal error escape to the API layer.
        failure = {
            "error": f"Processing failed: {str(e)}",
            "text": "An error occurred while processing your request.",
        }
        return failure
57
+
58
+
59
# For testing locally
if __name__ == "__main__":
    sample_prompts = (
        "Mwaramutse neza",
        "Ibihano by'ubujura ni ibihe?",
        "Kwinjira aho umuntu atuye bitemewe namategeko",
        "What are the laws about corruption?",
    )

    print("Testing inference.py locally...\n")
    for prompt in sample_prompts:
        print(f"Query: {prompt}")
        reply = model([prompt])
        # Prefer the human-readable text; fall back to the raw dict.
        print(f"Response: {reply.get('text', reply)}")
        print("-" * 80)
law_embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e537738aac9aa323758cde5c61b7ab843ff3b1ab6189a5e63b405d11574ec432
3
+ size 3090560
law_meta.json ADDED
The diff for this file is too large to render. See raw diff
 
penal_code.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas
2
+ sentence-transformers
3
+ torch
4
+ nltk
5
+ flask
6
+ gunicorn
7
+ gradio
retriever.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import pandas as pd
3
+ import numpy as np
4
+ import os
5
+ import json
6
+ import random
7
+ from typing import List, Dict, Any, Tuple
8
+
9
# sentence-transformers is optional at import time: keep this module
# importable even when the package is missing and fail lazily inside
# LawRetriever.ensure_model() instead.
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    SentenceTransformer = None

# Paths are resolved relative to this file so the retriever works regardless
# of the process working directory.
ROOT = Path(__file__).parent
MODEL_NAME = 'all-MiniLM-L6-v2'           # sentence-transformers encoder
EMBED_PATH = ROOT / 'law_embeddings.npy'  # cached, normalised law embeddings
META_PATH = ROOT / 'law_meta.json'        # per-row metadata aligned with EMBED_PATH
18
+
19
+
20
+ def _combine_law_text(row: pd.Series) -> str:
21
+ parts = []
22
+ for col in ['Law-Name', 'Article-Description', 'Punishment', 'Category', 'Chapter', 'Article-Number']:
23
+ if col in row and pd.notna(row[col]):
24
+ parts.append(str(row[col]))
25
+ return ' '.join(parts)
26
+
27
+
28
class LawRetriever:
    """Semantic retriever over the Rwandan law dataset plus greeting detection.

    Law rows are combined into text, embedded with a sentence-transformers
    model, L2-normalised, and cached on disk (EMBED_PATH / META_PATH) so that
    later runs can skip re-encoding. Queries are matched by dot product,
    which equals cosine similarity on unit vectors.
    """

    def __init__(self, model_name: str = MODEL_NAME):
        self.model_name = model_name
        self.model = None                              # lazily created SentenceTransformer
        self.laws_df: pd.DataFrame | None = None       # raw law rows from the CSV
        self.embeddings: np.ndarray | None = None      # (n_laws, dim) unit vectors
        self.meta: List[dict] = []                     # [{'index': i, 'text': combined}, ...]
        self.greetings_df: pd.DataFrame | None = None  # multilingual greeting table

    def ensure_model(self):
        """Instantiate the sentence-transformers model on first use.

        Raises:
            RuntimeError: if sentence-transformers could not be imported.
        """
        if self.model is None:
            if SentenceTransformer is None:
                raise RuntimeError('sentence-transformers not available in environment')
            self.model = SentenceTransformer(self.model_name)

    def load_laws(self, laws_path: Path | str | None = None):
        """Load the laws CSV and rebuild the combined-text metadata list.

        Args:
            laws_path: optional override; defaults to dataset-all.csv next to
                this file.

        Raises:
            FileNotFoundError: if the CSV does not exist.
        """
        path = Path(laws_path) if laws_path else ROOT / 'dataset-all.csv'
        if not path.exists():
            raise FileNotFoundError(f'Laws CSV not found at {path}')
        df = pd.read_csv(path)
        # Normalise missing values so downstream string handling is uniform.
        df = df.fillna('')
        self.laws_df = df
        # Build per-row metadata; 'index' keys back into laws_df.
        self.meta = []
        for i, row in df.iterrows():
            combined = _combine_law_text(row)
            self.meta.append({'index': int(i), 'text': combined})
        return df

    def build_or_load_embeddings(self, force: bool = False):
        """Return the law embedding matrix, using the on-disk cache when valid.

        Args:
            force: when True, always re-encode even if a cache exists.
        """
        # Load the cache unless a rebuild was requested.
        if EMBED_PATH.exists() and META_PATH.exists() and not force:
            try:
                self.embeddings = np.load(EMBED_PATH)
                with open(META_PATH, 'r', encoding='utf-8') as f:
                    self.meta = json.load(f)
                # laws_df is still needed so find_similar can attach rows.
                if self.laws_df is None:
                    self.load_laws()
                return self.embeddings
            except Exception:
                # Corrupt/partial cache: fall through and rebuild from scratch.
                pass

        # Build embeddings from the CSV.
        if self.laws_df is None:
            self.load_laws()

        self.ensure_model()
        texts = [m['text'] for m in self.meta]
        if not texts:
            # 384 matches the encoder output dimension (see the zeros fallback
            # only; real builds take their shape from the model output).
            self.embeddings = np.zeros((0, 384), dtype=np.float32)
        else:
            emb = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
            # L2-normalise so a plain dot product is cosine similarity.
            norms = np.linalg.norm(emb, axis=1, keepdims=True)
            norms[norms == 0] = 1  # guard degenerate all-zero rows
            emb = emb / norms
            self.embeddings = emb.astype(np.float32)
        np.save(EMBED_PATH, self.embeddings)
        with open(META_PATH, 'w', encoding='utf-8') as f:
            json.dump(self.meta, f, ensure_ascii=False)

        return self.embeddings

    def find_similar(self, query: str, top_k: int = 1) -> List[Tuple[float, dict]]:
        """Return the top_k (score, metadata) law matches for a free-text query."""
        if self.embeddings is None:
            self.build_or_load_embeddings()
        self.ensure_model()
        q_emb = self.model.encode([query], convert_to_numpy=True)
        # Guard the zero-norm edge case so a degenerate query embedding cannot
        # produce NaN similarities via division by zero (mirrors the guard
        # used when building the law embeddings).
        q_norms = np.linalg.norm(q_emb, axis=1, keepdims=True)
        q_norms[q_norms == 0] = 1
        q_emb = q_emb / q_norms
        sims = (self.embeddings @ q_emb[0]).astype(float)
        # Indices of the top_k highest similarities, best first.
        idx = np.argsort(-sims)[:top_k]
        results = []
        for i in idx:
            score = float(sims[i])
            meta = self.meta[i].copy()
            meta['score'] = score
            # Attach the original CSV row when it is available.
            if self.laws_df is not None and int(meta['index']) in self.laws_df.index:
                row = self.laws_df.loc[int(meta['index'])].to_dict()
                meta['row'] = row
            results.append((score, meta))
        return results

    def load_greetings(self, path: Path | str | None = None):
        """Load the greetings CSV, tolerating malformed rows where possible.

        Raises:
            FileNotFoundError: if the CSV does not exist.
        """
        p = Path(path) if path else ROOT / 'greetings.csv'
        if not p.exists():
            raise FileNotFoundError(f'Greetings CSV not found at {p}')
        try:
            df = pd.read_csv(p)
        except Exception:
            # sometimes the CSV has irregular quoting/commas; try a more tolerant parser
            try:
                df = pd.read_csv(p, engine='python', on_bad_lines='skip')
            except Exception:
                # final fallback: read as plain text and attempt a crude split
                text = p.read_text(encoding='utf-8', errors='ignore')
                # first non-empty line is treated as the header row
                lines = [l for l in text.splitlines() if l.strip()]
                if not lines:
                    raise
                header = [h.strip() for h in lines[0].split(',')]
                records = []
                for ln in lines[1:]:
                    parts = [c.strip() for c in ln.split(',')]
                    # pad/truncate each record to the header length
                    if len(parts) < len(header):
                        parts += [''] * (len(header) - len(parts))
                    parts = parts[:len(header)]
                    records.append(dict(zip(header, parts)))
                df = pd.DataFrame.from_records(records)

        self.greetings_df = df.fillna('')
        return self.greetings_df

    def detect_and_reply_greeting(self, text: str) -> dict | None:
        """Return {'response', 'followup'} if *text* matches a known greeting.

        Matching order: exact/substring match per language column (Kinyarwanda
        first, then English, then French), then a token-overlap fallback where
        one candidate is picked at random. Returns None when nothing matches
        or the greetings file is missing.
        """
        # Lazily load greetings; a missing file means "no greeting handling".
        if self.greetings_df is None:
            try:
                self.load_greetings()
            except FileNotFoundError:
                return None
        t = str(text).lower().strip()

        # Token set used by the overlap fallback below.
        tokens = set(t.split())

        # 1) direct matches preferring Kinyarwanda, then English, then French
        for lang_suffix in ['', '_en', '_fr']:
            gcol = f'greeting{lang_suffix}'
            rcol = f'response{lang_suffix}'
            fcol = f'followup{lang_suffix}'
            for _, row in self.greetings_df.iterrows():
                gval = str(row.get(gcol, '')).strip()
                if not gval:
                    continue
                gval_l = gval.lower()
                # exact match, or either string contained in the other
                if gval_l == t or gval_l in t or t in gval_l:
                    response = row.get(rcol) or row.get('response') or row.get('response_en') or row.get('response_fr') or ''
                    followup = row.get(fcol) or row.get('followup') or row.get('followup_en') or row.get('followup_fr') or ''
                    return {'response': response, 'followup': followup}

        # 2) token overlap fallback: collect candidates sharing any token
        candidates = []
        for _, row in self.greetings_df.iterrows():
            for lang_suffix in ['', '_en', '_fr']:
                gcol = f'greeting{lang_suffix}'
                gval = str(row.get(gcol, '')).strip().lower()
                if not gval:
                    continue
                if any(tok in gval.split() for tok in tokens):
                    candidates.append((row, lang_suffix))

        if candidates:
            # NOTE(review): random choice makes replies non-deterministic by
            # design (varied conversational responses).
            row, lang_suffix = random.choice(candidates)
            rcol = f'response{lang_suffix}'
            fcol = f'followup{lang_suffix}'
            response = row.get(rcol) or row.get('response') or row.get('response_en') or row.get('response_fr') or ''
            followup = row.get(fcol) or row.get('followup') or row.get('followup_en') or row.get('followup_fr') or ''
            return {'response': response, 'followup': followup}

        return None
193
+
194
+
195
# Provide a singleton retriever for the server to use
_RETRIEVER: LawRetriever | None = None


def get_retriever() -> LawRetriever:
    """Return the process-wide LawRetriever, creating it on first call.

    The laws CSV is loaded eagerly, but a missing file is tolerated so the
    process can still start without the dataset present.
    """
    global _RETRIEVER
    if _RETRIEVER is not None:
        return _RETRIEVER
    retriever = LawRetriever()
    try:
        retriever.load_laws()
    except FileNotFoundError:
        # Dataset not present yet; keep the empty retriever.
        pass
    _RETRIEVER = retriever
    return _RETRIEVER