Spaces:
Sleeping
Sleeping
"""
DeepEval — weryfikacja Faithfulness (Wierności) dla GrantForge AI poprzez instancję Prawnika (LangGraph).
FAZA 6: LLMOps — automatyczna weryfikacja halucynacji w RAG.
Wymaga `.env` (lub pustego .env i domyślnego zachowania) + zainstalowanego `deepeval`.
Uruchomienie:
    pip install -r requirements-dev.txt
    deepeval test run tests/test_deepeval_rag.py
"""
| import pytest | |
| import os | |
| from dotenv import load_dotenv | |
# DeepEval jest opcjonalną zależnością dla produkcji — graceful import ułatwia CI
| try: | |
| from deepeval import assert_test | |
| from deepeval.test_case import LLMTestCase | |
| from deepeval.metrics import FaithfulnessMetric | |
| DEEPEVAL_AVAILABLE = True | |
| except ImportError: | |
| DEEPEVAL_AVAILABLE = False | |
| from langgraph.graph import StateGraph, START, END | |
| from agents.panel_state import AuditorPanelState | |
| from agents.panel_nodes import ( | |
| prawnik_node, | |
| prawnik_tools_node, | |
| prawnik_evaluator_node, | |
| prawnik_routing, | |
| ) | |
# Load environment variables right away (same approach as test_panel.py).
# The .env file is looked up one directory above this test module.
dotenv_path = os.path.join(os.path.dirname(__file__), "..", ".env")
load_dotenv(dotenv_path)

# Switch LangSmith tracing off so tests without a valid API key avoid HTTP 401.
os.environ.update({"LANGCHAIN_TRACING_V2": "false"})
# ──────────────────────────────────────────────────────────────────────────────
# Narzędzie: Konstrukcja wycinka Grafu tylko dla ewaluacji RAG
# ──────────────────────────────────────────────────────────────────────────────
def create_test_prawnik_graph():
    """Build and compile a sub-graph that contains only the lawyer ("prawnik") path.

    Wiring:
        START -> "prawnik"
        "prawnik" -> "prawnik_tools" or "prawnik_evaluator", chosen by
            ``prawnik_routing`` through the keys "tools" / "evaluate"
        "prawnik_tools" -> "prawnik"
        "prawnik_evaluator" -> END

    Returns:
        The object produced by ``StateGraph.compile()`` for this wiring.
    """
    graph = StateGraph(AuditorPanelState)

    # Nodes are registered in a fixed order: lawyer, its tools, its evaluator.
    for node_name, node_fn in (
        ("prawnik", prawnik_node),
        ("prawnik_tools", prawnik_tools_node),
        ("prawnik_evaluator", prawnik_evaluator_node),
    ):
        graph.add_node(node_name, node_fn)

    # Routing key returned by prawnik_routing -> name of the next node.
    routes = {"tools": "prawnik_tools", "evaluate": "prawnik_evaluator"}

    graph.add_edge(START, "prawnik")
    graph.add_conditional_edges("prawnik", prawnik_routing, routes)
    graph.add_edge("prawnik_tools", "prawnik")
    graph.add_edge("prawnik_evaluator", END)

    return graph.compile()
# Built once at module level so the graph is not recompiled for every test.
app_test = create_test_prawnik_graph()
# ──────────────────────────────────────────────────────────────────────────────
# Model customowy dla DeepEval (np. używamy Gemini zamiast domyślnego OpenAI)
# ──────────────────────────────────────────────────────────────────────────────
if DEEPEVAL_AVAILABLE:
    from deepeval.models.base_model import DeepEvalBaseLLM

    class DeepEvalGemini(DeepEvalBaseLLM):
        """DeepEval judge model backed by Gemini through LangChain.

        Lets DeepEval metrics use a Gemini chat model instead of DeepEval's
        default provider. Defined only when ``deepeval`` could be imported.
        """

        def __init__(self, model_name: str = "gemini-2.0-flash"):
            """Create the underlying LangChain chat model.

            Args:
                model_name: Gemini model identifier; also reported by
                    ``get_model_name``. Defaults to the previously hard-coded
                    "gemini-2.0-flash".
            """
            # Imported lazily so the module can be collected without
            # langchain_google_genai unless a judge is actually constructed.
            from langchain_google_genai import ChatGoogleGenerativeAI

            self._model_name = model_name
            # temperature=0 keeps the judge's verdicts as repeatable as possible.
            self._gemini = ChatGoogleGenerativeAI(model=model_name, temperature=0)

        def load_model(self):
            """Return the wrapped LangChain chat model."""
            return self._gemini

        def generate(self, prompt: str, schema=None, **kwargs):
            """Run the model synchronously on ``prompt``.

            Args:
                prompt: Text prompt produced by DeepEval.
                schema: Optional schema class. Because this method accepts the
                    keyword, DeepEval passes its verdict/claim schemas here and
                    reads attributes from the result, so a plain string would
                    fail. When given, the model is asked for structured output.
                **kwargs: Accepted for interface compatibility; unused.

            Returns:
                The message content when ``schema`` is None, otherwise the
                structured result for ``schema``.
            """
            if schema is not None:
                return self._gemini.with_structured_output(schema).invoke(prompt)
            return self._gemini.invoke(prompt).content

        async def a_generate(self, prompt: str, schema=None, **kwargs):
            """Asynchronous counterpart of ``generate`` with the same contract."""
            if schema is not None:
                return await self._gemini.with_structured_output(schema).ainvoke(prompt)
            reply = await self._gemini.ainvoke(prompt)
            return reply.content

        def get_model_name(self):
            """Return the model identifier this wrapper was created with."""
            return self._model_name
# ──────────────────────────────────────────────────────────────────────────────
# Dane testowe (Live Query Testing)
# ──────────────────────────────────────────────────────────────────────────────
# Live-query test cases. Each entry carries:
#   name    - short identifier of the case,
#   input   - the user question (Polish) that is sent to the graph,
#   program - funding programme name placed in the graph's initial state.
# The question texts are stored as proper UTF-8 Polish; the previously
# mis-decoded characters would have been sent to the model verbatim.
RAG_TEST_CASES = [
    {
        "name": "FENG_Szybka_Sciezka_MSP",
        "input": "Czy moja firma jako duże przedsiębiorstwo może ubiegać się o FENG Szybka ścieżka?",
        "program": "FENG",
    },
    {
        "name": "KPO_Ubezpieczenia",
        "input": "Czy koszty ubezpieczenia samochodów służbowych są kwalifikowalne w KPO?",
        "program": "KPO",
    },
    {
        "name": "DNSH_Maszyny",
        "input": "Jak wykazać zasadę DNSH w projekcie polegającym na zakupie maszyn CNC?",
        "program": "SMART",
    },
]
# ──────────────────────────────────────────────────────────────────────────────
# Testy wierności (Live Execution)
# ──────────────────────────────────────────────────────────────────────────────
@pytest.mark.skipif(
    not DEEPEVAL_AVAILABLE,
    reason="deepeval is not installed; install requirements-dev.txt to run these tests",
)
class TestLiveRAGFaithfulness:
    """Live faithfulness checks: run the lawyer sub-graph and score it with DeepEval.

    Skipped as a whole when ``deepeval`` could not be imported, because the
    metric, the test-case type and the Gemini judge are all unavailable then.
    """

    def setup_method(self):
        """Create the faithfulness metric (acceptance threshold 0.7) before each test.

        Uses pytest's xunit-style ``setup_method`` hook; the nose-style
        ``setup`` name is no longer invoked by current pytest releases.
        """
        custom_gemini = DeepEvalGemini()
        self.faithfulness_metric = FaithfulnessMetric(
            threshold=0.7,
            model=custom_gemini,
            include_reason=True,
        )

    @pytest.mark.parametrize(
        "case_data",
        RAG_TEST_CASES,
        ids=[case["name"] for case in RAG_TEST_CASES],
    )
    def test_faithfulness_live(self, case_data: dict):
        """Answer one question with the live LangGraph tools and assert faithfulness.

        Args:
            case_data: One entry of ``RAG_TEST_CASES``; its "input" and
                "program" keys are read here.
        """
        # 1. Initial graph state for this question.
        initial_state = {
            "project_id": "eval_test",
            "program_name": case_data["program"],
            "content": f"Aplikujemy o projekt. Pytanie upewniające: {case_data['input']}",
            "issues": [],
            "perspectives_summary": {},
            "perspective_scores": [],
            "legal_attempts": 0,
            "legal_queries": [],
            "messages": [],
            "prawnik_done": False,
        }

        # 2. Run the sub-graph (prawnik -> tools -> evaluator).
        final_state = app_test.invoke(initial_state)

        # 3. The evaluated output is whatever the run stored under
        #    perspectives_summary["Prawnik"], stringified as-is.
        prawnik_summary = final_state.get("perspectives_summary", {}).get("Prawnik", {})
        actual_output = str(prawnik_summary)

        # The recorded legal queries stand in for the retrieval context.
        # Items are coerced to str because the context is passed on as a list of strings.
        # NOTE(review): these look like the queries rather than the retrieved
        # passages - confirm this is the context faithfulness should be scored against.
        retrieval_context = [str(query) for query in final_state.get("legal_queries", [])]
        if not retrieval_context:
            retrieval_context = [
                "Brak formalnie pobranego kontekstu. Mogło odpowiedzieć z wiedzy własnej."
            ]

        # 4. Build the DeepEval test case and assert the metric passes.
        test_case = LLMTestCase(
            input=case_data["input"],
            actual_output=actual_output,
            retrieval_context=retrieval_context,
        )
        assert_test(test_case, [self.faithfulness_metric])
class TestAuditStructure:
    """Structure checks of the audit models; no external API is called."""

    def test_audit_output_has_disclaimer(self):
        """An approved audit output exposes an ``ai_disclaimer`` containing "AI"."""
        from agents.auditor import GlobalAuditOutput

        approved_audit = GlobalAuditOutput(
            is_approved=True,
            export_status="ok",
            overall_score=85,
            issues=[],
        )

        # ai_disclaimer is not passed to the constructor; it is read back from the instance.
        disclaimer = approved_audit.ai_disclaimer
        assert "AI" in disclaimer

    def test_human_review_required_logic(self):
        """A rejected output keeps the review flag and score it was built with."""
        from agents.auditor import GlobalAuditOutput, AuditIssue

        sample_issue = AuditIssue(category="Test", severity="high", message="Test issue")
        rejected_audit = GlobalAuditOutput(
            is_approved=False,
            export_status="warning",
            overall_score=65,
            human_review_required=True,
            issues=[sample_issue],
        )

        assert rejected_audit.human_review_required is True
        assert rejected_audit.overall_score == 65