Spaces:

davidtran999
/

hue-portal-backend-v2

Sleeping

App Files Files Community

davidtran999 committed 9 days ago

Commit

765d69d

verified ·

1 Parent(s): 8604302

Upload backend/core/rag.py with huggingface_hub

Browse files

Files changed (1) hide show

backend/core/rag.py +561 -0

backend/core/rag.py ADDED Viewed

	@@ -0,0 +1,561 @@

+"""
+RAG (Retrieval-Augmented Generation) pipeline for answer generation.
+"""
+import re
+import unicodedata
+from typing import List, Dict, Any, Optional
+from .hybrid_search import hybrid_search
+from .models import Procedure, Fine, Office, Advisory, LegalSection
+from hue_portal.chatbot.chatbot import format_fine_amount
+from hue_portal.chatbot.llm_integration import get_llm_generator
+from hue_portal.chatbot.structured_legal import format_structured_legal_answer
def retrieve_top_k_documents(
    query: str,
    content_type: str,
    top_k: int = 5
) -> List[Any]:
    """
    Fetch the best-matching records for a query through hybrid search.
    Args:
        query: Search query.
        content_type: Type of content ('procedure', 'fine', 'office',
            'advisory' or 'legal'). Any other value yields an empty list.
        top_k: Number of documents to retrieve.
    Returns:
        List of document objects; empty when the content type is unknown
        or when the search itself raises.
    """
    # Each entry pairs a lazy queryset factory with the text fields that
    # hybrid search uses for its exact-match boost. The factories are
    # lambdas so that only the selected model is ever touched.
    sources = {
        'procedure': (
            lambda: Procedure.objects.all(),
            ['title', 'domain', 'conditions', 'dossier'],
        ),
        'fine': (
            lambda: Fine.objects.all(),
            ['name', 'code', 'article', 'decree', 'remedial'],
        ),
        'office': (
            lambda: Office.objects.all(),
            ['unit_name', 'address', 'district', 'service_scope'],
        ),
        'advisory': (
            lambda: Advisory.objects.all(),
            ['title', 'summary'],
        ),
        'legal': (
            lambda: LegalSection.objects.select_related("document").all(),
            ['section_title', 'section_code', 'content'],
        ),
    }
    selected = sources.get(content_type)
    if selected is None:
        return []
    make_queryset, text_fields = selected
    queryset = make_queryset()
    try:
        from .config.hybrid_search_config import get_config
        config = get_config(content_type)
        return hybrid_search(
            queryset,
            query,
            top_k=top_k,
            bm25_weight=config.bm25_weight,
            vector_weight=config.vector_weight,
            min_hybrid_score=config.min_hybrid_score,
            text_fields=text_fields
        )
    except Exception as e:
        # Retrieval is best-effort: report the problem and return nothing.
        print(f"Error in retrieval: {e}")
        return []
def generate_answer_template(
    query: str,
    documents: List[Any],
    content_type: str,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> str:
    """
    Generate answer using LLM (if available) or template-based summarization.
    Args:
        query: Original query.
        documents: Retrieved documents (may be empty or None).
        content_type: Type of content. 'general' always tries the LLM first,
            regardless of ``use_llm``.
        context: Optional conversation context forwarded to the LLM.
        use_llm: Whether to try LLM generation first.
    Returns:
        Generated answer text. Falls back to a fixed apology message when
        there are no documents, otherwise to the per-type template.
    """
    def _invoke_llm(documents_for_prompt: List[Any]) -> Optional[str]:
        """Call the configured LLM provider; return None on any failure."""
        try:
            from hue_portal.chatbot.llm_integration import get_llm_generator
            llm = get_llm_generator()
            if not llm:
                print("[RAG] ⚠️ LLM not available, using template", flush=True)
                return None
            # Logged only once a generator is known to exist.
            print(f"[RAG] Using LLM provider: {llm.provider}", flush=True)
            llm_answer = llm.generate_answer(
                query,
                context=context,
                documents=documents_for_prompt
            )
            if llm_answer:
                print(f"[RAG] ✅ LLM answer generated (length: {len(llm_answer)})", flush=True)
                return llm_answer
            print("[RAG] ⚠️ LLM returned None, using template", flush=True)
        except Exception as exc:
            # Any provider/import failure must degrade to the template path.
            import traceback
            error_trace = traceback.format_exc()
            print(f"[RAG] ❌ LLM generation failed, using template: {exc}", flush=True)
            print(f"[RAG] ❌ Trace: {error_trace}", flush=True)
        return None

    llm_enabled = use_llm or content_type == 'general'
    if llm_enabled:
        llm_answer = _invoke_llm(documents or [])
        if llm_answer:
            return llm_answer
    # If no documents, fall back gracefully
    if not documents:
        if content_type == 'general':
            return (
                f"Tôi chưa có dữ liệu pháp luật liên quan đến '{query}', "
                "nhưng vẫn sẵn sàng trò chuyện hoặc hỗ trợ bạn ở chủ đề khác. "
                "Bạn có thể mô tả cụ thể hơn để tôi giúp tốt hơn nhé!"
            )
        return (
            f"Xin lỗi, tôi không tìm thấy thông tin liên quan đến '{query}' trong cơ sở dữ liệu. "
            "Vui lòng thử lại với từ khóa khác hoặc liên hệ trực tiếp với Công an thành phố Huế để được tư vấn."
        )
    # Fallback to template-based generation
    if content_type == 'procedure':
        return _generate_procedure_answer(query, documents)
    elif content_type == 'fine':
        return _generate_fine_answer(query, documents)
    elif content_type == 'office':
        return _generate_office_answer(query, documents)
    elif content_type == 'advisory':
        return _generate_advisory_answer(query, documents)
    elif content_type == 'legal':
        return _generate_legal_answer(query, documents)
    else:
        return _generate_general_answer(query, documents)
def _generate_procedure_answer(query: str, documents: List[Procedure]) -> str:
    """Render up to five matching procedures as a numbered plain-text list."""
    total = len(documents)
    chunks = [f"Tôi tìm thấy {total} thủ tục liên quan đến '{query}':\n\n"]
    for position, procedure in enumerate(documents[:5], 1):
        chunks.append(f"{position}. {procedure.title}\n")
        if procedure.domain:
            chunks.append(f"   Lĩnh vực: {procedure.domain}\n")
        if procedure.level:
            chunks.append(f"   Cấp: {procedure.level}\n")
        conditions = procedure.conditions
        if conditions:
            # Keep the listing compact: long conditions are cut at 100 chars.
            if len(conditions) > 100:
                conditions = conditions[:100] + "..."
            chunks.append(f"   Điều kiện: {conditions}\n")
        chunks.append("\n")
    if total > 5:
        chunks.append(f"... và {total - 5} thủ tục khác.\n")
    return "".join(chunks)
def _generate_fine_answer(query: str, documents: List[Fine]) -> str:
    """
    Generate answer for fine queries.
    The first document is highlighted as the best match; documents two to
    five follow as a numbered list, and any remainder is summarised in a
    closing line.
    Args:
        query: Original user query, echoed in the header.
        documents: Fine records ordered by relevance (best first).
    Returns:
        Plain-text answer; only the header line when ``documents`` is empty.
    """
    def _detail_lines(fine: Any, indent: str) -> List[str]:
        """Return the optional code / amount / article lines for one fine."""
        details: List[str] = []
        if fine.code:
            details.append(f"{indent}Mã vi phạm: {fine.code}\n")
        # Falsy bounds (None, 0) are passed as None so the helper decides
        # how to phrase an open-ended range.
        fine_amount = format_fine_amount(
            float(fine.min_fine) if fine.min_fine else None,
            float(fine.max_fine) if fine.max_fine else None
        )
        if fine_amount:
            details.append(f"{indent}Mức phạt: {fine_amount}\n")
        if fine.article:
            details.append(f"{indent}Điều luật: {fine.article}\n")
        return details

    count = len(documents)
    parts = [f"Tôi tìm thấy {count} mức phạt liên quan đến '{query}':\n\n"]
    if documents:
        # Highlight best match (first result)
        best_match = documents[0]
        parts.append("Kết quả chính xác nhất:\n")
        parts.append(f"• {best_match.name}\n")
        parts.extend(_detail_lines(best_match, "  "))
        parts.append("\n")
        if count > 1:
            parts.append("Các mức phạt khác:\n")
            # Numbering continues from 2 because the best match is entry 1.
            for i, doc in enumerate(documents[1:5], 2):
                parts.append(f"{i}. {doc.name}\n")
                parts.extend(_detail_lines(doc, "   "))
                parts.append("\n")
    if count > 5:
        parts.append(f"... và {count - 5} mức phạt khác.\n")
    return "".join(parts)
def _generate_office_answer(query: str, documents: List[Office]) -> str:
    """Render up to five matching offices with their contact details."""
    # (label, attribute) pairs in display order; empty values are skipped.
    labelled_fields = (
        ("Địa chỉ", "address"),
        ("Quận/Huyện", "district"),
        ("Điện thoại", "phone"),
        ("Giờ làm việc", "working_hours"),
    )
    total = len(documents)
    pieces = [f"Tôi tìm thấy {total} đơn vị liên quan đến '{query}':\n\n"]
    for rank, office in enumerate(documents[:5], 1):
        pieces.append(f"{rank}. {office.unit_name}\n")
        for label, attribute in labelled_fields:
            value = getattr(office, attribute)
            if value:
                pieces.append(f"   {label}: {value}\n")
        pieces.append("\n")
    if total > 5:
        pieces.append(f"... và {total - 5} đơn vị khác.\n")
    return "".join(pieces)
def _generate_advisory_answer(query: str, documents: List[Advisory]) -> str:
    """Render up to five matching advisories with a shortened summary."""
    total = len(documents)
    pieces = [f"Tôi tìm thấy {total} cảnh báo liên quan đến '{query}':\n\n"]
    for rank, advisory in enumerate(documents[:5], 1):
        pieces.append(f"{rank}. {advisory.title}\n")
        summary = advisory.summary
        if summary:
            # Summaries longer than 150 characters are cut and marked.
            if len(summary) > 150:
                summary = summary[:150] + "..."
            pieces.append(f"   {summary}\n")
        pieces.append("\n")
    if total > 5:
        pieces.append(f"... và {total - 5} cảnh báo khác.\n")
    return "".join(pieces)
def _clean_text(value: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    return re.sub(r"\s+", " ", value).strip() if value else ""
def _summarize_section(
    section: LegalSection,
    max_sentences: int = 3,
    max_chars: int = 600
) -> str:
    """
    Build a short extractive summary from the section's stored content.
    Leading sentences are taken verbatim until either ``max_sentences``
    sentences or ``max_chars`` characters have been collected, so the text
    stays grounded in the source and in its original language.
    Args:
        section: Legal section whose ``content`` is summarised.
        max_sentences: Stop after this many sentences.
        max_chars: Soft character budget; an over-long result is cut at a
            word boundary and suffixed with "...".
    Returns:
        The summary, or an empty string when the section has no content.
    """
    content = _clean_text(section.content)
    if not content:
        return ""
    # Split by sentence boundaries; fall back to the whole text if needed.
    candidates = re.split(r"(?<=[.!?])\s+", content) or [content]
    picked = []
    for candidate in candidates:
        if not candidate:
            continue
        picked.append(candidate)
        if len(picked) >= max_sentences or len(" ".join(picked)) >= max_chars:
            break
    summary = " ".join(picked)
    if len(summary) > max_chars:
        # Drop the partial trailing word left by the hard cut.
        summary = summary[:max_chars].rsplit(" ", 1)[0] + "..."
    return summary.strip()
def _format_citation(section: LegalSection) -> str:
    """Format a source reference: document title, section code, page range.

    The section code and the page range are each included only when
    present; the end page is shown only when it differs from the start.
    """
    text = f"{section.document.title}"
    if section.section_code:
        text = f"{text} – {section.section_code}"
    if section.page_start:
        pages = f" (trang {section.page_start}"
        if section.page_end and section.page_end != section.page_start:
            pages = f"{pages}-{section.page_end}"
        text = f"{text}{pages})"
    return text.strip()
def _build_legal_prefill(documents: List[LegalSection]) -> str:
    """
    Assemble the Vietnamese summary block that is passed to the LLM as a
    prefill, nudging the model toward Vietnamese output.
    Only the first three sections are considered. Sections whose summary
    is empty are skipped but keep their position number.
    Returns:
        Header line plus one numbered line per summarised section, or an
        empty string when ``documents`` is empty.
    """
    if not documents:
        return ""
    entries = []
    for position, section in enumerate(documents[:3], start=1):
        summary = _summarize_section(section, max_sentences=2, max_chars=400)
        citation = _format_citation(section)
        if summary:
            entries.append(f"{position}. {summary} (Nguồn: {citation})")
    return "\n".join(["Bản tóm tắt tiếng Việt từ cơ sở dữ liệu:", *entries])
def _generate_legal_citation_block(documents: List[LegalSection]) -> str:
    """
    Return formatted citation block reused by multiple answer modes.
    For each of the first five sections the block lists the section title
    with its citation, a short summary and a quoted snippet of at most 350
    characters. A closing line reports how many sections were left out.
    Args:
        documents: Legal sections ordered by relevance.
    Returns:
        The citation block, or an empty string when ``documents`` is empty.
    """
    if not documents:
        return ""
    max_snippet_chars = 350
    lines: List[str] = []
    for idx, section in enumerate(documents[:5], start=1):
        summary = _summarize_section(section)
        cleaned = _clean_text(section.content)
        snippet = cleaned[:max_snippet_chars]
        # Only mark the snippet as truncated when text was actually cut;
        # content of exactly 350 characters is quoted in full.
        if len(cleaned) > max_snippet_chars:
            snippet = snippet.rsplit(" ", 1)[0] + "..."
        citation = _format_citation(section)
        lines.append(f"{idx}. {section.section_title or 'Nội dung'} – {citation}")
        if summary:
            lines.append(f"   - Tóm tắt: {summary}")
        if snippet:
            lines.append(f"   - Trích dẫn: \"{snippet}\"")
        lines.append("")
    if len(documents) > 5:
        lines.append(f"... và {len(documents) - 5} trích đoạn khác trong cùng nguồn dữ liệu.")
    return "\n".join(lines).strip()
def _generate_legal_answer(query: str, documents: List[LegalSection]) -> str:
    """Template answer for legal queries: an intro line plus the citation block.

    Returns a "nothing found" message when ``documents`` is empty.
    """
    total = len(documents)
    if not total:
        return (
            f"Tôi chưa tìm thấy trích dẫn pháp lý nào cho '{query}'. "
            "Bạn có thể cung cấp thêm ngữ cảnh để tôi tiếp tục hỗ trợ."
        )
    intro = (
        f"Tôi đã tổng hợp {total} trích đoạn pháp lý liên quan đến '{query}'. "
        "Đây là bản tóm tắt tiếng Việt kèm trích dẫn:"
    )
    return f"{intro}\n\n{_generate_legal_citation_block(documents)}".strip()
def _generate_general_answer(query: str, documents: List[Any]) -> str:
    """Generic fallback: report how many results matched the query."""
    return (
        f"Tôi tìm thấy {len(documents)} kết quả liên quan đến '{query}'. "
        "Vui lòng xem chi tiết bên dưới."
    )
def _strip_accents(value: str) -> str:
    """Return ``value`` with Vietnamese diacritics removed.

    Combining marks are dropped after NFD decomposition. "đ"/"Đ" have no
    canonical decomposition, so they are mapped to "d"/"D" explicitly.
    """
    decomposed = unicodedata.normalize("NFD", value)
    without_marks = "".join(
        char for char in decomposed
        if unicodedata.category(char) != "Mn"
    )
    return without_marks.replace("đ", "d").replace("Đ", "D")


def _contains_markers(
    text_with_accents: str,
    text_without_accents: str,
    markers: List[str]
) -> bool:
    """Return True when any marker occurs in either form of the text.

    Args:
        text_with_accents: Lower-cased text to search.
        text_without_accents: The same text after ``_strip_accents``.
        markers: Phrases to look for; each is lower-cased and also tried
            in its accent-stripped form.
    """
    for marker in markers:
        marker_lower = marker.lower()
        if marker_lower in text_with_accents:
            return True
        if _strip_accents(marker_lower) in text_without_accents:
            return True
    return False


# A number directly followed by a currency unit, matched on lower-cased,
# accent-stripped text: "500.000 dong", "5 trieu dong", "200.000d",
# "1.000.000 vnd". The lookahead keeps "5 dong thoi" (from "Điều 5 đồng
# thời") from being read as an amount.
_MONEY_AMOUNT_RE = re.compile(
    r"\d[\d.,]*\s*(?:(?:nghin|ngan|trieu|ty|ti)\s+)?(?:vnd|dong(?!\s+thoi)|d)\b"
)


def _is_valid_legal_answer(answer: str, documents: List[LegalSection]) -> bool:
    """
    Validate that the LLM answer for legal intent is usable.
    Criteria:
        - Must not contain a denial phrase such as "xin lỗi".
        - Must not state a monetary amount (a number followed by a
          currency unit); the legal sections carry no fine amounts.
        - Must be at least 40 characters long after trimming.
    Currency words on their own do not count as money, because "đ" and
    "đồng" occur in ordinary Vietnamese words ("điều", "hợp đồng").
    Args:
        answer: Candidate answer text.
        documents: Retrieved sections (currently not inspected).
    Returns:
        True when the answer passes every check.
    """
    if not answer:
        return False
    normalized_answer = answer.lower()
    normalized_answer_no_accents = _strip_accents(normalized_answer)
    denial_markers = [
        "xin lỗi",
        "thông tin trong cơ sở dữ liệu chưa đủ",
        "không thể giúp",
        "không tìm thấy thông tin",
        "không có dữ liệu",
    ]
    if _contains_markers(normalized_answer, normalized_answer_no_accents, denial_markers):
        return False
    if _MONEY_AMOUNT_RE.search(normalized_answer_no_accents):
        return False
    if len(answer.strip()) < 40:
        return False
    return True


def rag_pipeline(
    query: str,
    intent: str,
    top_k: int = 5,
    min_confidence: float = 0.3,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> Dict[str, Any]:
    """
    Complete RAG pipeline: retrieval + answer generation.
    Args:
        query: User query.
        intent: Detected intent. Unknown intents are treated as a
            procedure search.
        top_k: Number of documents to retrieve.
        min_confidence: Minimum confidence threshold (accepted for
            interface compatibility; not applied in this function).
        context: Optional conversation context.
        use_llm: Whether to use LLM for answer generation. Casual intents
            ('general_query', 'greeting') always allow the LLM.
    Returns:
        Dictionary with 'answer', 'documents', 'count', 'confidence', 'content_type'.
    """
    # Map intent to content type
    intent_to_type = {
        'search_procedure': 'procedure',
        'search_fine': 'fine',
        'search_office': 'office',
        'search_advisory': 'advisory',
        'search_legal': 'legal',
        'general_query': 'general',
        'greeting': 'general',
    }
    content_type = intent_to_type.get(intent, 'procedure')
    # Retrieve documents
    documents = retrieve_top_k_documents(query, content_type, top_k=top_k)
    # Enable LLM automatically for casual conversation intents
    llm_allowed = use_llm or intent in {"general_query", "greeting"}
    structured_used = False
    answer: Optional[str] = None
    # NOTE(review): the structured legal path runs even when use_llm is
    # False, as in the original code — confirm whether that is intended.
    if intent == "search_legal" and documents:
        try:
            llm = get_llm_generator()
            if llm:
                structured = llm.generate_structured_legal_answer(
                    query,
                    documents,
                    prefill_summary=_build_legal_prefill(documents),
                )
                if structured:
                    structured_answer = format_structured_legal_answer(structured)
                    citation_block = _generate_legal_citation_block(documents)
                    if citation_block:
                        structured_answer = (
                            f"{structured_answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"
                        )
                    answer = structured_answer
                    structured_used = True
        except Exception as exc:
            # A provider failure must not break the request; continue with
            # the regular generation path below.
            print(f"[RAG] ❌ Structured legal generation failed, using fallback: {exc}", flush=True)
            answer = None
            structured_used = False
    if answer is None:
        answer = generate_answer_template(
            query,
            documents,
            content_type,
            context=context,
            use_llm=llm_allowed
        )
    # Fallback when the intent is legal but the LLM answer fails validation
    if (
        intent == "search_legal"
        and documents
        and isinstance(answer, str)
        and not structured_used
    ):
        template_answer = _generate_legal_answer(query, documents)
        if answer == template_answer:
            # Already the template (LLM unavailable): it embeds the
            # citation block, so nothing is appended.
            pass
        elif not _is_valid_legal_answer(answer, documents):
            print("[RAG] ⚠️ Fallback: invalid legal answer detected", flush=True)
            answer = template_answer
        else:
            # Append only the citations, not the whole template answer.
            citation_block = _generate_legal_citation_block(documents)
            if citation_block:
                answer = f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"
    # Calculate confidence (simple: based on number of results and scores)
    confidence = min(1.0, len(documents) / top_k) if top_k > 0 else 0.0
    if documents and hasattr(documents[0], '_hybrid_score'):
        confidence = max(confidence, documents[0]._hybrid_score)
    return {
        'answer': answer,
        'documents': documents,
        'count': len(documents),
        'confidence': confidence,
        'content_type': content_type
    }