Spaces:

davidtran999
/

hue-portal-backend-v2

Sleeping

App Files Files Community

hue-portal-backend-v2 / backend /hue_portal /chatbot /entity_extraction.py

davidtran999

Push full code from hue-portal-backend folder

519b145 9 days ago

raw

history blame

13.6 kB

	"""
	Entity extraction utilities for extracting fine codes, procedure names, and resolving pronouns.
	"""
	import re
	from typing import List, Dict, Any, Optional, Tuple
	from hue_portal.core.models import Fine, Procedure, Office


	def extract_fine_code(text: str) -> Optional[str]:
	    """
	    Find the first fine code in text.

	    A fine code is the letter V followed by exactly three digits, delimited by
	    word boundaries (V001, V002, ...). Matching ignores case and the result is
	    always upper-cased.

	    Args:
	        text: Input text.

	    Returns:
	        The first fine code found, or None when the text contains none.
	    """
	    first_hit = re.search(r'\bV\d{3}\b', text, re.IGNORECASE)
	    if first_hit is None:
	        return None
	    return first_hit.group(0).upper()


	def extract_procedure_name(text: str) -> Optional[str]:
	    """
	    Extract procedure name from text by matching against database.

	    A procedure matches when its title is contained in the text, or the text
	    is contained in its title (case-insensitive, surrounding whitespace
	    ignored). The first match in the iteration order of
	    Procedure.objects.all() wins.

	    Args:
	        text: Input text.

	    Returns:
	        Procedure title, or None if nothing matches or the text is blank.
	    """
	    needle = (text or "").strip().lower()
	    # The empty string is a substring of every string, so blank text would
	    # otherwise "match" the very first procedure in the table.
	    if not needle:
	        return None

	    for procedure in Procedure.objects.all():
	        title_lower = (procedure.title or "").strip().lower()
	        # Same reasoning in the other direction: a blank title would be
	        # "found" in any text and hide every later, real match.
	        if not title_lower:
	            continue
	        if title_lower in needle or needle in title_lower:
	            return procedure.title

	    return None


	def extract_office_name(text: str) -> Optional[str]:
	    """
	    Extract office/unit name from text by matching against database.

	    An office matches when its unit_name is contained in the text, or the
	    text is contained in its unit_name (case-insensitive, surrounding
	    whitespace ignored). The first match in the iteration order of
	    Office.objects.all() wins.

	    Args:
	        text: Input text.

	    Returns:
	        Office unit name, or None if nothing matches or the text is blank.
	    """
	    needle = (text or "").strip().lower()
	    # The empty string is a substring of every string, so blank text would
	    # otherwise "match" the very first office in the table.
	    if not needle:
	        return None

	    for office in Office.objects.all():
	        name_lower = (office.unit_name or "").strip().lower()
	        # A blank unit name would be "found" in any text; skip it so it
	        # cannot hide a later, real match.
	        if not name_lower:
	            continue
	        if name_lower in needle or needle in name_lower:
	            return office.unit_name

	    return None


	def extract_reference_pronouns(text: str, context: Optional[List[Dict[str, Any]]] = None) -> List[str]:
	    """
	    Extract reference pronouns from text.

	    Pronouns are matched case-insensitively as whole words/phrases, so a
	    pronoun that merely happens to be a prefix of a longer word (for example
	    "nó" inside "nói") is not reported. A multi-word pronoun and the shorter
	    pronoun it ends with are both reported ("cái đó" also yields "đó").

	    Args:
	        text: Input text.
	        context: Optional context from recent messages. Currently unused;
	            kept so existing callers keep working.

	    Returns:
	        List of pronouns found, in the order of the internal pronoun table.
	    """
	    # Vietnamese reference pronouns
	    pronouns = [
	        "cái đó", "cái này", "cái kia",
	        "như vậy", "như thế",
	        "thủ tục đó", "thủ tục này",
	        "mức phạt đó", "mức phạt này",
	        "đơn vị đó", "đơn vị này",
	        "nó", "đó", "này", "kia"
	    ]

	    text_lower = (text or "").lower()
	    if not text_lower:
	        return []

	    # \b is Unicode-aware for str patterns, so Vietnamese letters count as
	    # word characters and the boundaries fall where a reader expects them.
	    return [
	        pronoun
	        for pronoun in pronouns
	        if re.search(r'\b' + re.escape(pronoun) + r'\b', text_lower)
	    ]


	def enhance_query_with_context(query: str, recent_messages: List[Dict[str, Any]]) -> str:
	    """
	    Enhance query with entities from conversation context.
	    This is more comprehensive than resolve_pronouns - it adds context even when query already has keywords.

	    Entities are collected from the most recent message backwards; for each
	    entity key the most recent value wins. Per message the sources are tried
	    in this order: entities extracted from the message content, the message's
	    own "entities" mapping, then keyword hints selected by its "intent".

	    Only the document code is currently appended to the query, and it is
	    appended at most once, when the query does not already contain it.

	    Args:
	        query: Current query.
	        recent_messages: List of recent messages with role, content, intent, entities.

	    Returns:
	        Enhanced query with context entities added, or the query unchanged
	        when there is no usable context.
	    """
	    if not recent_messages:
	        return query

	    fine_keywords = ("vượt đèn đỏ", "mũ bảo hiểm", "nồng độ cồn", "tốc độ")
	    procedure_keywords = ("đăng ký", "thủ tục", "cư trú", "antt", "pccc")

	    # Collect entities from recent messages (reverse order - most recent first)
	    entities_found: Dict[str, Any] = {}

	    for msg in reversed(recent_messages):
	        # A message may carry an explicit None for content/entities.
	        content = msg.get("content") or ""
	        content_lower = content.lower()

	        if content:
	            # Document code first: highest priority for legal queries.
	            content_extractors = (
	                ("document_code", extract_document_code),
	                ("fine_code", extract_fine_code),
	                ("procedure_name", extract_procedure_name),
	                ("office_name", extract_office_name),
	            )
	            for key, extractor in content_extractors:
	                # Skip the (possibly database-backed) extractor once a more
	                # recent message has already supplied this entity.
	                if key in entities_found:
	                    continue
	                value = extractor(content)
	                if value:
	                    entities_found[key] = value

	        # Check entities field; empty values must not shadow older real ones.
	        for key, value in (msg.get("entities") or {}).items():
	            if value and key not in entities_found:
	                entities_found[key] = value

	        # Check intent to infer entity type
	        intent = msg.get("intent", "")
	        if intent == "search_fine" and "fine_name" not in entities_found:
	            keyword = next((k for k in fine_keywords if k in content_lower), None)
	            if keyword:
	                entities_found["fine_name"] = keyword

	        if intent == "search_procedure" and "procedure_name" not in entities_found:
	            keyword = next((k for k in procedure_keywords if k in content_lower), None)
	            if keyword:
	                entities_found["procedure_name"] = keyword

	    # Add the document code from context so follow-up questions such as
	    # "nói rõ hơn về thông tư đó" still search the right document. It is added
	    # exactly once, and only if the query does not already mention it.
	    doc_code = entities_found.get("document_code")
	    if not doc_code:
	        return query

	    doc_code = str(doc_code)
	    if doc_code.lower() in query.lower():
	        return query

	    return " ".join([query, doc_code])


	def resolve_pronouns(query: str, recent_messages: List[Dict[str, Any]]) -> str:
	    """
	    Resolve pronouns in query by replacing them with actual entities from context.
	    This is a simpler version that only handles pronoun replacement.
	    For comprehensive context enhancement, use enhance_query_with_context().

	    Entities are collected from the most recent message backwards; for each
	    entity key the most recent value wins. Replacement then happens in two
	    passes:

	    1. Specific phrases: "thủ tục đó/này" becomes the procedure name and
	       "mức phạt đó/này" becomes the fine name, when those are known.
	    2. Generic pronouns ("cái đó", "cái này", "nó", "đó") become the first
	       available entity among document code, fine name, procedure name and
	       office name, in that priority order.

	    Args:
	        query: Current query with pronouns.
	        recent_messages: List of recent messages with role, content, intent, entities.

	    Returns:
	        Enhanced query with pronouns resolved, or the query unchanged when
	        there is no context, no pronoun, or no entity to substitute.
	    """
	    if not recent_messages:
	        return query

	    # Check for pronouns
	    if not extract_reference_pronouns(query):
	        return query

	    fine_keywords = ("vượt đèn đỏ", "mũ bảo hiểm", "nồng độ cồn", "tốc độ")
	    procedure_keywords = ("đăng ký", "thủ tục", "cư trú", "antt", "pccc")

	    # Look for entities in recent messages (reverse order - most recent first)
	    entities_found: Dict[str, Any] = {}

	    for msg in reversed(recent_messages):
	        # A message may carry an explicit None for content/entities.
	        content = msg.get("content") or ""
	        content_lower = content.lower()

	        if content:
	            content_extractors = (
	                ("fine_code", extract_fine_code),
	                ("procedure_name", extract_procedure_name),
	                ("office_name", extract_office_name),
	                ("document_code", extract_document_code),
	            )
	            for key, extractor in content_extractors:
	                # Skip the (possibly database-backed) extractor once a more
	                # recent message has already supplied this entity.
	                if key in entities_found:
	                    continue
	                value = extractor(content)
	                if value:
	                    entities_found[key] = value

	        # Check entities field; empty values must not shadow older real ones.
	        for key, value in (msg.get("entities") or {}).items():
	            if value and key not in entities_found:
	                entities_found[key] = value

	        # Check intent to infer entity type
	        intent = msg.get("intent", "")
	        if intent == "search_fine" and "fine_name" not in entities_found:
	            keyword = next((k for k in fine_keywords if k in content_lower), None)
	            if keyword:
	                entities_found["fine_name"] = keyword

	        if intent == "search_procedure" and "procedure_name" not in entities_found:
	            keyword = next((k for k in procedure_keywords if k in content_lower), None)
	            if keyword:
	                entities_found["procedure_name"] = keyword

	    def substitute(pattern: str, entity_key: str, current: str) -> str:
	        # A callable replacement inserts the entity text verbatim; a plain
	        # string would have backslashes and group references interpreted.
	        replacement = str(entities_found[entity_key])
	        return re.sub(pattern, lambda _match: replacement, current, flags=re.IGNORECASE)

	    resolved_query = query

	    # Specific phrases go first: the generic pass below would otherwise
	    # consume their trailing "đó" and leave e.g. "thủ tục <entity>".
	    if "procedure_name" in entities_found:
	        resolved_query = substitute(r'\bthủ tục (đó|này)\b', "procedure_name", resolved_query)

	    if "fine_name" in entities_found:
	        resolved_query = substitute(r'\bmức phạt (đó|này)\b', "fine_name", resolved_query)

	    # Replace "cái đó", "cái này", "nó", "đó" with the most relevant entity.
	    # The alternation must use a bare "|": an escaped "\|" only matches a
	    # literal pipe character and therefore never fires on real queries.
	    for entity_key in ("document_code", "fine_name", "procedure_name", "office_name"):
	        if entity_key in entities_found:
	            resolved_query = substitute(r'\b(cái đó|cái này|nó|đó)\b', entity_key, resolved_query)
	            break

	    return resolved_query


	def extract_document_code(text: str) -> Optional[str]:
	    """
	    Extract legal document code from text (e.g., "thông tư 02", "quyết định 264").

	    Patterns are tried in a fixed priority order (keyword forms such as
	    "thông tư 02" before symbolic forms such as "264-QĐ/TW" or
	    "02/2021/TT-BCA"); the first pattern that matches anywhere in the text
	    decides the result.

	    Args:
	        text: Input text.

	    Returns:
	        The full matched span, lower-cased (document type included), or None
	        if not found or the text is empty.
	    """
	    if not text:
	        return None

	    # Patterns for legal document codes
	    patterns = [
	        r'\bthông tư\s+(\d+[-\w]*)',
	        r'\btt\s+(\d+[-\w]*)',
	        r'\bquyết định\s+(\d+[-\w]*)',
	        r'\bqd\s+(\d+[-\w]*)',
	        r'\bquy định\s+(\d+[-\w]*)',
	        # Symbolic forms. The separator and the code suffix are optional or
	        # repeatable, so compact spellings like "264-QĐ/TW" match as well as
	        # spaced ones like "264 - QĐ/TW".
	        r'\b(\d+[-\w]*)\s*[-/]\s*QĐ[-/]TW\b',
	        # Optional "/<year>" segment and "-<issuer>" suffix: "02/2021/TT-BCA".
	        r'\b(\d+(?:/\d+)?[-\w]*)\s*[-/]\s*TT(?:-\w+)*\b',
	    ]

	    text_lower = text.lower()
	    for pattern in patterns:
	        # Return the full match with document type
	        full_match = re.search(pattern, text_lower, re.IGNORECASE)
	        if full_match:
	            return full_match.group(0)

	    return None


	def extract_all_entities(text: str) -> Dict[str, Any]:
	    """
	    Run every entity extractor over text and gather the hits.

	    Args:
	        text: Input text.

	    Returns:
	        Dictionary keyed by entity kind ("fine_code", "procedure_name",
	        "office_name", "document_code", "pronouns"); a key is present only
	        when its extractor returned a truthy value.
	    """
	    # (result key, extractor) pairs, applied in this order.
	    extractor_table = (
	        ("fine_code", extract_fine_code),
	        ("procedure_name", extract_procedure_name),
	        ("office_name", extract_office_name),
	        ("document_code", extract_document_code),
	        ("pronouns", extract_reference_pronouns),
	    )

	    collected = {}
	    for label, extractor in extractor_table:
	        hit = extractor(text)
	        if hit:
	            collected[label] = hit
	    return collected