Ara Yeroyan committed on
Commit
7c8b783
·
1 Parent(s): 72318ee

add district Metadata

Browse files
Files changed (1) hide show
  1. add_district_metadata.py +379 -0
add_district_metadata.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to add District metadata to Qdrant chunks based on filename analysis.
4
+ Handles Uganda districts and ministry mappings; LLM inference for ambiguous cases is a placeholder that is not implemented yet.
5
+ """
6
+ import re
7
+ import yaml
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from typing import Dict, List, Optional
11
+
12
+
13
+ from qdrant_client import QdrantClient
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class DistrictMapping:
    """A canonical district name plus the alternative spellings that resolve to it."""

    # Canonical name that lookups report back to the caller.
    name: str
    # Alternative names / abbreviations that should resolve to ``name``.
    aliases: List[str]
    # Defaults to True; nothing in this file reads the flag.
    is_district: bool = True
26
+
27
+
28
+ class DistrictMetadataProcessor:
29
def __init__(self, config_path: str = "src/config/settings.yaml"):
    """Load settings and build the lookup tables used for district inference.

    Args:
        config_path: Path to a YAML settings file. It must contain a
            ``qdrant`` mapping with a ``collection_name`` entry; the
            ``url`` and ``api_key`` entries are only read later, when the
            Qdrant client is first created.

    Raises:
        OSError: If the settings file cannot be opened.
        KeyError: If ``qdrant`` or ``collection_name`` is missing.
        TypeError: If the file is empty (``safe_load`` returns ``None``).
    """
    # Read as UTF-8 explicitly so parsing does not depend on the
    # platform's default locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        self.config = yaml.safe_load(f)

    # Both clients start unset; the Qdrant client is created on first use
    # by the methods that need it.
    self.llm_client = None
    self.qdrant_client = None
    self.collection_name = self.config["qdrant"]["collection_name"]

    # Lookup tables keyed by lowercase name / abbreviation.
    self.district_mappings = self._initialize_district_mappings()
    self.ministry_mappings = self._initialize_ministry_mappings()
42
+
43
def _initialize_district_mappings(self) -> Dict[str, DistrictMapping]:
    """Build the lowercase lookup table from district names and aliases.

    Every district contributes its lowercased name and each lowercased
    alias as a key; all of those keys point at the same
    ``DistrictMapping`` instance.

    Returns:
        Dict mapping lowercase name/alias -> ``DistrictMapping``. Keys are
        inserted in the order the districts are listed below.
    """
    # Districts whose alias list is something other than ``[name]``.
    alias_overrides = {
        "Kampala": ["KCCA", "Kampala Capital City Authority"],
        "Gulu": ["Gulu", "Gulu DLG"],
        "Apac": ["Apac", "Apac District"],
        "Kaabong": ["Kaabong", "Kaabong District"],
        "Kalangala": ["Kalangala", "Kalangala DLG"],
    }

    # NOTE(review): the sub-region labels are approximate and purely for
    # readability; they have no effect on matching. Each district is listed
    # exactly once.
    district_names = (
        # Buganda (central)
        "Kampala", "Wakiso", "Mukono", "Luweero", "Nakaseke", "Nakasongola",
        "Kayunga", "Buikwe", "Buvuma",
        # Acholi / Lango (north)
        "Gulu", "Kitgum", "Pader", "Agago", "Lamwo", "Nwoya", "Amuru",
        "Omoro", "Oyam", "Kole", "Apac", "Lira", "Alebtong", "Amolatar",
        "Dokolo", "Otuke", "Kwania",
        # Busoga / Bukedi / Teso (east)
        "Jinja", "Kamuli", "Iganga", "Bugiri", "Mayuge", "Namayingo",
        "Busia", "Tororo", "Pallisa", "Kumi", "Bukedea", "Soroti", "Serere",
        "Ngora", "Kaberamaido", "Kalaki", "Kapelebyong", "Amuria", "Katakwi",
        # Karamoja (north-east)
        "Kotido", "Abim", "Kaabong", "Karenga", "Moroto", "Napak",
        "Nabilatuk", "Amudat", "Nakapiripirit",
        # Sebei / Bugisu / Bukedi (east, Mt. Elgon area)
        "Bukwo", "Kween", "Kapchorwa", "Sironko", "Manafwa", "Bududa",
        "Mbale", "Butaleja", "Namisindwa", "Bulambuli",
        # Buganda (greater Masaka / Mpigi / Mubende)
        "Masaka", "Kalungu", "Bukomansimbi", "Lwengo", "Sembabule", "Rakai",
        "Kyotera", "Mpigi", "Butambala", "Gomba", "Mityana", "Mubende",
        "Kassanda", "Kiboga", "Kyankwanzi",
        # Bunyoro (mid-west)
        "Hoima", "Kikuube", "Kakumiro", "Kibaale", "Kagadi", "Buliisa",
        "Masindi", "Kiryandongo",
        # West Nile (north-west)
        "Pakwach", "Nebbi", "Zombo", "Arua", "Terego", "Madi-Okollo",
        "Obongi", "Moyo", "Yumbe", "Koboko", "Maracha", "Adjumani",
        # Ankole / Kigezi / Tooro (west and south-west)
        "Mbarara", "Ibanda", "Isingiro", "Kiruhura", "Kazo", "Ntungamo",
        "Rwampara", "Rubanda", "Rukiga", "Kanungu", "Rukungiri", "Kisoro",
        "Bundibugyo", "Ntoroko", "Kasese", "Bunyangabu",
        # NOTE(review): Fort Portal appears to be a city rather than a
        # district -- confirm it belongs in this list.
        "Fort Portal",
        "Kabarole", "Kyenjojo", "Kamwenge", "Kitagwenda", "Kyegegwa",
        "Mitooma", "Rubirizi", "Sheema", "Bushenyi",
        # Special cases
        "Kalangala",
    )

    mapping_dict: Dict[str, DistrictMapping] = {}
    for name in district_names:
        district = DistrictMapping(name, alias_overrides.get(name, [name]))
        # The canonical name is always a key, even when it is not repeated
        # in the alias list (e.g. Kampala).
        mapping_dict[name.lower()] = district
        for alias in district.aliases:
            mapping_dict[alias.lower()] = district
    return mapping_dict
193
+
194
+ def _initialize_ministry_mappings(self) -> Dict[str, str]:
195
+ """Initialize ministry and organization mappings"""
196
+ return {
197
+ "maaif": "Ministry of Agriculture, Animal Industry and Fisheries",
198
+ "mwts": "Ministry of Works and Transport",
199
+ "kcca": "Kampala Capital City Authority",
200
+ "oag": "Office of the Auditor General",
201
+ "arsdp": "Albertine Regional Sustainable Development Project",
202
+ "avcdp": "Agriculture Value Chain Development Project",
203
+ "ida": "International Development Association",
204
+ "dlg": "District Local Government",
205
+ "lg": "Local Government",
206
+ }
207
+
208
+ def _extract_district_from_filename(self, filename: str) -> Optional[str]:
209
+ """Extract district from filename using pattern matching"""
210
+ filename_lower = filename.lower()
211
+
212
+ # Check for explicit district mentions
213
+ for key, district_mapping in self.district_mappings.items():
214
+ if key in filename_lower:
215
+ return district_mapping.name
216
+
217
+ # Check for ministry/organization patterns that are NOT districts
218
+ for ministry_key in self.ministry_mappings.keys():
219
+ if ministry_key in filename_lower:
220
+ return None # This is a ministry, not a district
221
+
222
+ # Check for patterns like "District Local Government"
223
+ district_pattern = r'(\w+)\s+district\s+local\s+government'
224
+ match = re.search(district_pattern, filename_lower)
225
+ if match:
226
+ district_name = match.group(1).title()
227
+ if district_name.lower() in self.district_mappings:
228
+ return self.district_mappings[district_name.lower()].name
229
+
230
+ # Check for patterns like "DLG Report"
231
+ dlg_pattern = r'(\w+)\s+dlg\s+report'
232
+ match = re.search(dlg_pattern, filename_lower)
233
+ if match:
234
+ district_name = match.group(1).title()
235
+ if district_name.lower() in self.district_mappings:
236
+ return self.district_mappings[district_name.lower()].name
237
+
238
+ return None
239
+
240
+ def _infer_district_with_llm(self, filename: str) -> Optional[str]:
241
+ """Use LLM to infer district from filename when pattern matching fails"""
242
+ # For now, return None - LLM integration can be added later
243
+ logger.info(f"LLM inference needed for filename: {filename}")
244
+ return None
245
+
246
def infer_district(self, filename: str) -> Optional[str]:
    """Resolve the district for ``filename``.

    Pattern matching on the filename is tried first; only when it yields
    nothing is the LLM fallback consulted.

    Args:
        filename: File name to analyse.

    Returns:
        The district name, or ``None`` if neither step produced one.
    """
    matched = self._extract_district_from_filename(filename)
    # A truthy pattern match wins; otherwise defer to the LLM step.
    return matched if matched else self._infer_district_with_llm(filename)
255
+
256
def fetch_chunks_batch(self, batch_size: int = 100, offset: int = 0) -> List[Dict]:
    """Fetch one page of points (payload only, no vectors) from Qdrant.

    Args:
        batch_size: Maximum number of points requested (``limit``).
        offset: Passed unchanged as ``offset`` to ``scroll``.
            NOTE(review): Qdrant documents this value as a point-ID cursor
            rather than a row count -- confirm callers pass a suitable value.

    Returns:
        The first element of the ``scroll`` result, or an empty list when
        anything goes wrong (the error is logged, not raised).
        NOTE(review): callers in this file read ``.id`` and ``.payload`` on
        the items, so they are presumably record objects rather than dicts.
    """
    try:
        client = self.qdrant_client
        if client is None:
            # First use: build the client from the loaded settings and keep
            # it on the instance for later calls.
            qdrant_cfg = self.config["qdrant"]
            client = QdrantClient(
                url=qdrant_cfg["url"],
                api_key=qdrant_cfg["api_key"]
            )
            self.qdrant_client = client

        scroll_result = client.scroll(
            collection_name=self.collection_name,
            limit=batch_size,
            offset=offset,
            with_payload=True,
            with_vectors=False
        )
        # Only the fetched points are returned; the rest of the result is
        # discarded.
        return scroll_result[0]
    except Exception as e:
        logger.error(f"Failed to fetch batch: {e}")
        return []
279
+
280
def update_chunks_with_district(self, points: List[Dict]) -> int:
    """Write an inferred ``district`` into each point's ``metadata`` payload.

    For every point the filename is read from ``payload["metadata"]``, the
    district is inferred from it, and the whole ``metadata`` dict (with the
    new ``district`` key) is written back via ``set_payload``. A district of
    ``None`` is written as well, so such points still count as updated.

    Points without a filename (including points whose payload or metadata
    is missing/``None``) are skipped with a warning. A failure on one point
    is logged and does not stop the remaining points.

    Args:
        points: Items exposing ``.id`` and ``.payload`` (the annotation says
            dicts, but attribute access is what this method uses).

    Returns:
        Number of points for which ``set_payload`` succeeded.
    """
    updated_count = 0

    # Create the client on first use.
    if self.qdrant_client is None:
        from qdrant_client import QdrantClient
        self.qdrant_client = QdrantClient(
            url=self.config["qdrant"]["url"],
            api_key=self.config["qdrant"]["api_key"]
        )

    for point in points:
        # Reset per iteration so the error handler below never reads an
        # unbound name or the id of the previous point.
        point_id = None
        try:
            point_id = point.id
            # Treat a missing/None payload or metadata as "no metadata".
            payload = point.payload or {}
            metadata = payload.get("metadata") or {}
            filename = metadata.get("filename", "")

            if not filename:
                logger.warning(f"Point {point_id} has no filename")
                continue

            district = self.infer_district(filename)

            # Copy so the fetched point's own metadata is left untouched.
            updated_metadata = {**metadata, "district": district}

            self.qdrant_client.set_payload(
                collection_name=self.collection_name,
                payload={"metadata": updated_metadata},
                points=[point_id]
            )

            updated_count += 1
            logger.info(f"Updated point {point_id}: {filename} -> {district}")

        except Exception as e:
            logger.error(f"Failed to update point {point_id}: {e}")

    return updated_count
323
+
324
def process_all_chunks(self, batch_size: int = 100):
    """Walk the whole collection page by page and tag every chunk.

    Pagination follows the cursor that Qdrant's ``scroll`` returns next to
    each page (``next_page_offset``). ``scroll``'s ``offset`` argument is a
    point-ID cursor, not a row count, so stepping a numeric offset by
    ``batch_size`` can repeat or skip pages, or never terminate, when point
    ids are not dense sequential integers.

    A failed fetch is logged and ends the run early (best effort); the
    points updated so far are still counted.

    Args:
        batch_size: Maximum number of points requested per page.

    Returns:
        Total number of points updated across all pages.
    """
    # Create the client on first use.
    if self.qdrant_client is None:
        from qdrant_client import QdrantClient
        self.qdrant_client = QdrantClient(
            url=self.config["qdrant"]["url"],
            api_key=self.config["qdrant"]["api_key"]
        )

    total_updated = 0
    cursor = None  # None asks Qdrant to start from the first point.

    logger.info(f"Starting to process chunks in batches of {batch_size}")

    while True:
        try:
            points, next_cursor = self.qdrant_client.scroll(
                collection_name=self.collection_name,
                limit=batch_size,
                offset=cursor,
                with_payload=True,
                with_vectors=False
            )
        except Exception as e:
            logger.error(f"Failed to fetch batch: {e}")
            break

        if not points:
            break

        logger.info(f"Processing batch: {len(points)} points (offset: {cursor})")

        updated_count = self.update_chunks_with_district(points)
        total_updated += updated_count

        logger.info(f"Updated {updated_count} points in this batch")

        # A missing cursor means this was the last page.
        if next_cursor is None:
            break
        cursor = next_cursor

    logger.info(f"Total updated: {total_updated} points")
    return total_updated
350
+
351
def main():
    """Preview district inference on a small batch, then optionally run it all.

    Fetches up to 10 chunks, logs the district inferred for each filename
    (nothing is written during the preview), and asks for confirmation
    before processing the whole collection.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        processor = DistrictMetadataProcessor()

        # Test with a small batch first
        logger.info("Testing with first 10 chunks...")
        test_points = processor.fetch_chunks_batch(10, 0)

        if not test_points:
            # Say so explicitly instead of exiting without any message.
            logger.warning("No chunks fetched for the test batch; nothing to process")
            return

        logger.info("Test batch fetched successfully. Processing...")
        for point in test_points:
            # Tolerate points whose payload or metadata is missing/None.
            metadata = (point.payload or {}).get("metadata") or {}
            filename = metadata.get("filename", "")
            district = processor.infer_district(filename)
            logger.info(f"Test: {filename} -> {district}")

        # Ask user if they want to proceed with full processing
        response = input("\nProceed with full processing? (y/n): ")
        # Ignore surrounding whitespace and accept the spelled-out answer.
        if response.strip().lower() in ("y", "yes"):
            processor.process_all_chunks(batch_size=100)
        else:
            logger.info("Processing cancelled by user")

    except Exception as e:
        logger.error(f"Error in main: {e}")
        raise


if __name__ == "__main__":
    main()