Spaces:
Sleeping
Sleeping
File size: 13,624 Bytes
519b145 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 |
"""
Entity extraction utilities for extracting fine codes, procedure names, and resolving pronouns.
"""
import re
from typing import List, Dict, Any, Optional, Tuple
from hue_portal.core.models import Fine, Procedure, Office
def extract_fine_code(text: str) -> Optional[str]:
    """
    Find the first fine code mentioned in the text.

    A fine code is the letter "V" (either case) followed by exactly three
    digits and standing as a whole word, e.g. "V001".

    Args:
        text: Input text.

    Returns:
        The first code found, upper-cased, or None when the text has none.
    """
    first_hit = re.search(r'\bV\d{3}\b', text, re.IGNORECASE)
    if first_hit is None:
        return None
    return first_hit.group(0).upper()
def extract_procedure_name(text: str) -> Optional[str]:
    """
    Extract a procedure name from text by matching against stored procedures.

    A procedure matches when its title appears inside the text, or when the
    text appears inside its title (so a partial title typed by the user still
    resolves). Comparison is case-insensitive and ignores surrounding
    whitespace. The first matching procedure in query order wins.

    Args:
        text: Input text.

    Returns:
        The matching procedure's title as stored, or None if nothing matches
        or the text is empty/blank.
    """
    needle = text.strip().lower() if text else ""
    if not needle:
        # The empty string is a substring of every title, so without this
        # guard blank input would "match" the first procedure in the table.
        return None

    for procedure in Procedure.objects.all():
        title_lower = (procedure.title or "").strip().lower()
        if not title_lower:
            # A blank title is a substring of every text; never match on it.
            continue
        # NOTE: the reverse containment check is intentionally loose, so very
        # short inputs can still match a title that merely contains them.
        if title_lower in needle or needle in title_lower:
            return procedure.title
    return None
def extract_office_name(text: str) -> Optional[str]:
    """
    Extract an office/unit name from text by matching against stored offices.

    An office matches when its unit name appears inside the text, or when the
    text appears inside its unit name (so a partial name typed by the user
    still resolves). Comparison is case-insensitive and ignores surrounding
    whitespace. The first matching office in query order wins.

    Args:
        text: Input text.

    Returns:
        The matching office's unit name as stored, or None if nothing matches
        or the text is empty/blank.
    """
    needle = text.strip().lower() if text else ""
    if not needle:
        # The empty string is a substring of every name, so without this
        # guard blank input would "match" the first office in the table.
        return None

    for office in Office.objects.all():
        name_lower = (office.unit_name or "").strip().lower()
        if not name_lower:
            # A blank name is a substring of every text; never match on it.
            continue
        # NOTE: the reverse containment check is intentionally loose, so very
        # short inputs can still match a name that merely contains them.
        if name_lower in needle or needle in name_lower:
            return office.unit_name
    return None
def extract_reference_pronouns(text: str, context: Optional[List[Dict[str, Any]]] = None) -> List[str]:
    """
    Extract Vietnamese reference pronouns/demonstratives from text.

    Matching is case-insensitive and on whole words only, so a pronoun that
    merely occurs inside a longer word (e.g. "nó" inside "nói") is not
    reported. A multi-word phrase and the shorter pronoun it contains are
    both reported (e.g. "cái đó" and "đó").

    Args:
        text: Input text.
        context: Optional context from recent messages. Currently unused;
            kept so existing callers keep working.

    Returns:
        The pronouns found, in the fixed order of the internal list below
        (not in order of appearance in the text).
    """
    # Vietnamese reference pronouns
    pronouns = [
        "cái đó", "cái này", "cái kia",
        "như vậy", "như thế",
        "thủ tục đó", "thủ tục này",
        "mức phạt đó", "mức phạt này",
        "đơn vị đó", "đơn vị này",
        "nó", "đó", "này", "kia"
    ]
    if not text:
        return []

    text_lower = text.lower()
    # Lookarounds instead of a plain substring test: the pronoun must not be
    # glued to another word character on either side.
    return [
        pronoun
        for pronoun in pronouns
        if re.search(r'(?<!\w)' + re.escape(pronoun) + r'(?!\w)', text_lower)
    ]
def enhance_query_with_context(query: str, recent_messages: List[Dict[str, Any]]) -> str:
    """
    Enhance a query with the legal document code from conversation context.

    Unlike resolve_pronouns(), this does not need a pronoun in the query: the
    most recently mentioned document code is appended whenever the query does
    not already contain it, which helps follow-up questions that no longer
    name the document.

    Only the document code is ever added to the query, so no other entity
    types are collected here (which also avoids database lookups that could
    not influence the result).

    Args:
        query: Current query.
        recent_messages: List of recent messages (oldest first) with role,
            content, intent, entities.

    Returns:
        The query unchanged, or the query followed by a space and the
        document code. The code is appended at most once.
    """
    if not recent_messages:
        return query

    # Walk from the most recent message backwards; the first code found wins.
    document_code = None
    for msg in reversed(recent_messages):
        content = msg.get("content") or ""
        # Within one message, a code parsed from the text takes precedence
        # over the one recorded in its "entities" field.
        document_code = extract_document_code(content) if content else None
        if not document_code:
            document_code = (msg.get("entities") or {}).get("document_code")
        if document_code:
            break

    if not document_code:
        return query

    # The entities field is caller-supplied, so do not assume it is a string.
    document_code = str(document_code)
    if document_code.lower() in query.lower():
        return query
    return " ".join([query, document_code])
# Noun phrases whose demonstrative form ("<noun> đó/này") refers to one
# specific entity type.
_TYPED_REFERENCE_KEYS = {
    "thủ tục": "procedure_name",
    "mức phạt": "fine_name",
    "đơn vị": "office_name",
}

# Preference order when a bare pronoun could refer to any kind of entity.
_GENERIC_REFERENCE_KEYS = ("document_code", "fine_name", "procedure_name", "office_name")

# Keyword fallbacks used to guess an entity from a message's intent.
_INTENT_KEYWORDS = {
    "search_fine": ("fine_name", ["vượt đèn đỏ", "mũ bảo hiểm", "nồng độ cồn", "tốc độ"]),
    "search_procedure": ("procedure_name", ["đăng ký", "thủ tục", "cư trú", "antt", "pccc"]),
}

# Typed phrases are listed first so that "thủ tục đó" is consumed as a whole
# before the bare "đó" alternative can claim its last word.
_REFERENCE_PATTERN = re.compile(
    r'\b(?:(thủ tục|mức phạt|đơn vị) (đó|này)|cái đó|cái này|nó|đó)\b',
    re.IGNORECASE,
)


def _collect_context_entities(recent_messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Gather entities from messages, most recent first; the first value seen per key wins."""
    found: Dict[str, Any] = {}
    for msg in reversed(recent_messages):
        content = msg.get("content") or ""

        # Entities parsed from the message text.
        if content:
            extracted = (
                ("fine_code", extract_fine_code(content)),
                ("procedure_name", extract_procedure_name(content)),
                ("office_name", extract_office_name(content)),
                ("document_code", extract_document_code(content)),
            )
            for key, value in extracted:
                if value and key not in found:
                    found[key] = value

        # Entities already recorded on the message.
        for key, value in (msg.get("entities") or {}).items():
            if value and key not in found:
                found[key] = value

        # Last resort: infer an entity from the intent plus a known keyword.
        inferred = _INTENT_KEYWORDS.get(msg.get("intent") or "")
        if inferred is not None and inferred[0] not in found:
            key, keywords = inferred
            content_lower = content.lower()
            for keyword in keywords:
                if keyword in content_lower:
                    found[key] = keyword
                    break
    return found


def resolve_pronouns(query: str, recent_messages: List[Dict[str, Any]]) -> str:
    """
    Resolve pronouns in query by replacing them with entities from context.

    This is a simpler version that only handles pronoun replacement.
    For comprehensive context enhancement, use enhance_query_with_context().

    Replacement rules (case-insensitive, whole words, applied in one pass so
    inserted entity text is never rescanned):
      - "thủ tục đó/này" -> procedure name, "mức phạt đó/này" -> fine name,
        "đơn vị đó/này" -> office name, when that entity is known.
      - "cái đó", "cái này", "nó", "đó" -> the first known entity among
        document code, fine name, procedure name, office name.
      - A typed phrase ending in "đó" whose own entity is unknown keeps its
        noun and has "đó" replaced by that same generic entity.
      - Anything else is left as written.

    Args:
        query: Current query with pronouns.
        recent_messages: List of recent messages (oldest first) with role,
            content, intent, entities.

    Returns:
        The query with resolvable pronouns replaced; the query unchanged when
        there is no context, no resolvable pronoun, or no known entity.
    """
    if not recent_messages or not query:
        return query
    # Skip the context scan entirely when nothing in the query can be replaced.
    if not _REFERENCE_PATTERN.search(query):
        return query

    entities = _collect_context_entities(recent_messages)
    generic = next(
        (str(entities[key]) for key in _GENERIC_REFERENCE_KEYS if entities.get(key)),
        None,
    )

    def _substitute(match):
        # A callable replacement inserts entity text literally; passing it as
        # a template string would interpret backslashes and group references.
        head = match.group(1)
        if head is None:
            return generic if generic is not None else match.group(0)
        typed_key = _TYPED_REFERENCE_KEYS.get(head.lower())
        typed_value = entities.get(typed_key) if typed_key else None
        if typed_value:
            return str(typed_value)
        if generic is not None and match.group(2).lower() == "đó":
            return head + " " + generic
        return match.group(0)

    return _REFERENCE_PATTERN.sub(_substitute, query)
def extract_document_code(text: str) -> Optional[str]:
    """
    Find a legal document reference in the text (e.g. "thông tư 02", "quyết định 264").

    The text is lower-cased before matching, so the returned reference is
    lower-case. Patterns are tried in a fixed order and the first pattern
    that matches anywhere in the text decides the result, even if a later
    pattern would have matched earlier in the text.

    Args:
        text: Input text.

    Returns:
        The whole matched span (document type together with its number),
        or None if no pattern matches.
    """
    code_patterns = (
        r'\bthông tư\s+(\d+[-\w]*)',
        r'\btt\s+(\d+[-\w]*)',
        r'\bquyết định\s+(\d+[-\w]*)',
        r'\bqd\s+(\d+[-\w]*)',
        r'\bquy định\s+(\d+[-\w]*)',
        r'\b(\d+[-\w]*)\s*[-/]\s*QĐ[-/]TW',
        r'\b(\d+[-\w]*)\s*[-/]\s*TT',
    )
    lowered = text.lower()
    for code_pattern in code_patterns:
        hit = re.search(code_pattern, lowered, re.IGNORECASE)
        if hit is not None:
            # group(0) keeps the document type in front of the number.
            return hit.group(0)
    return None
def extract_all_entities(text: str) -> Dict[str, Any]:
    """
    Run every entity extractor over the text and collect what they find.

    Args:
        text: Input text.

    Returns:
        Dictionary holding only the entities that were found. Possible keys,
        in insertion order: "fine_code", "procedure_name", "office_name",
        "document_code", "pronouns".
    """
    extractors = (
        ("fine_code", extract_fine_code),
        ("procedure_name", extract_procedure_name),
        ("office_name", extract_office_name),
        ("document_code", extract_document_code),
        ("pronouns", extract_reference_pronouns),
    )
    found = {}
    for key, extractor in extractors:
        value = extractor(text)
        # Skip extractors that came back empty (None or an empty list).
        if value:
            found[key] = value
    return found
|