Spaces:
Sleeping
Sleeping
| """ | |
| llm_utils.py — LangChain integration layer for the analytics demo. | |
| Design principle: the LLM is an optional narrator, not a decision-maker. | |
| Every public function must return a valid result even when Ollama is absent. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| # Guard against environments where LangChain is not installed. | |
| # The rule engine in demo.py never imports this at module level for logic, | |
| # so an ImportError here degrades gracefully to the fallback path. | |
| # | |
| # We prefer langchain-ollama (the current, non-deprecated package) and fall | |
| # back to langchain_community.llms.Ollama for environments that only have | |
| # the older package installed. | |
# Import guard: sets LANGCHAIN_AVAILABLE so callers can degrade gracefully
# when LangChain is not installed. The nested try prefers the current
# langchain-ollama package and falls back to the community package.
try:
    from langchain_core.prompts import PromptTemplate
    try:
        from langchain_ollama import OllamaLLM as Ollama  # preferred (langchain-ollama)
    except ImportError:
        from langchain_community.llms import Ollama  # fallback (older install)
    LANGCHAIN_AVAILABLE = True
except ImportError:
    # Neither package present: _try_ollama_summary() short-circuits to None
    # and the rule-based fallback is used instead.
    LANGCHAIN_AVAILABLE = False
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Base URL of the local Ollama daemon (11434 is the port in the URL literal).
OLLAMA_BASE_URL = "http://localhost:11434"
# Model name passed to Ollama; presumably must be pulled locally beforehand.
OLLAMA_MODEL = "llama3.2"
# The system instruction is kept as a constant so it can be audited,
# versioned, and referenced in documentation independently of the template.
SYSTEM_PROMPT = (
    "You are assisting a business analyst. "
    "Explain the following metric validation findings clearly and factually. "
    "Do not speculate on causes. "
    "Do not introduce information not present in the data. "
    "Use a neutral, executive-friendly tone."
)
| # --------------------------------------------------------------------------- | |
| # Internal helpers | |
| # --------------------------------------------------------------------------- | |
def _build_prompt_template() -> "PromptTemplate":
    """
    Build the LangChain PromptTemplate used for summarization.

    A PromptTemplate is preferred over a plain f-string because the single
    injection point ({findings_text}) is explicit, and the template text can
    be tested and versioned apart from the findings serialization logic.
    """
    sections = [
        SYSTEM_PROMPT,
        "Metric Validation Findings:\n{findings_text}",
        "Provide a concise executive summary (3-5 sentences) of the above findings. "
        "Stick strictly to the facts presented.",
    ]
    return PromptTemplate(
        input_variables=["findings_text"],
        template="\n\n".join(sections),
    )
| def _serialize_findings(findings: dict) -> str: | |
| """ | |
| Convert the structured findings dict into a plain-text paragraph suitable | |
| for injection into the LLM prompt. | |
| Why plain text rather than raw JSON: LLMs produce better, more natural | |
| summaries when given prose-style context rather than nested JSON objects. | |
| """ | |
| lines: list[str] = [] | |
| dr = findings["stats"]["date_range"] | |
| lines.append( | |
| f"Dataset covers {dr['actual_rows']} rows from {dr['start']} to {dr['end']}. " | |
| f"Expected {dr['expected_rows']} rows based on the date range " | |
| f"(gap: {dr['row_gap']} row(s))." | |
| ) | |
| rev = findings["stats"]["revenue"] | |
| lines.append( | |
| f"Revenue (USD): mean=${rev['mean']:,.2f}, std=${rev['std']:,.2f}, " | |
| f"total=${rev['total']:,.2f}, missing={rev['missing_count']} value(s)." | |
| ) | |
| ord_ = findings["stats"]["orders"] | |
| lines.append( | |
| f"Orders: mean={ord_['mean']:,.0f}, std={ord_['std']:,.1f}, " | |
| f"total={int(ord_['total']):,}, missing={ord_['missing_count']} value(s)." | |
| ) | |
| issues = findings["issues"] | |
| if issues: | |
| lines.append(f"\nDetected {len(issues)} issue(s):") | |
| for issue in issues: | |
| date_str = f" on {', '.join(issue['dates'])}" if issue["dates"] else "" | |
| lines.append(f" - [{issue['severity']}] {issue['detail']}{date_str}") | |
| else: | |
| lines.append("\nNo data quality issues detected. Dataset appears clean.") | |
| return "\n".join(lines) | |
def _try_ollama_summary(findings_text: str) -> str | None:
    """
    Run the summarization prompt through a local Ollama instance via LangChain.

    Returns the model's summary string, or None on any failure (LangChain
    absent, connection refused, model not found, empty output). None rather
    than an exception keeps all fallback handling in the single caller;
    diagnostics go to stderr so stdout remains a clean report.
    """
    if not LANGCHAIN_AVAILABLE:
        return None
    try:
        model = Ollama(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL, timeout=30)
        # LCEL pipe syntax: preferred over the deprecated LLMChain.
        pipeline = _build_prompt_template() | model
        raw = pipeline.invoke({"findings_text": findings_text})
        # langchain_community.llms.Ollama yields a plain str, while chat-style
        # models yield message objects exposing .content — accept either.
        text = raw.content if hasattr(raw, "content") else str(raw)
        return text.strip() or None
    except Exception as exc:
        print(
            f"[llm_utils] Ollama unavailable ({type(exc).__name__}): {exc}",
            file=sys.stderr,
        )
        return None
| def _rule_based_summary(findings: dict) -> str: | |
| """ | |
| Generate a deterministic plain-text summary from the findings dict. | |
| This is the guaranteed fallback when no LLM is available. Template-driven | |
| text is auditable, predictable, and consistent — qualities analysts require | |
| from a validation tool used in reporting contexts. | |
| """ | |
| dr = findings["stats"]["date_range"] | |
| issues = findings["issues"] | |
| line1 = ( | |
| f"The dataset spans {dr['actual_rows']} rows from {dr['start']} to " | |
| f"{dr['end']} (expected {dr['expected_rows']} calendar days)." | |
| ) | |
| if not issues: | |
| return ( | |
| f" {line1}\n" | |
| " No data quality issues were detected. " | |
| "The dataset appears suitable for reporting." | |
| ) | |
| parts: list[str] = [] | |
| missing_issues = [i for i in issues if i["type"] == "missing_values"] | |
| row_issues = [i for i in issues if i["type"] == "row_count"] | |
| anomaly_issues = [i for i in issues if i["type"] == "anomaly_drop"] | |
| duplicate_issues = [i for i in issues if i["type"] == "duplicate_dates"] | |
| if missing_issues: | |
| cols = sorted({i["column"] for i in missing_issues}) | |
| parts.append(f"Missing values were identified in column(s): {', '.join(cols)}.") | |
| if row_issues: | |
| gap = row_issues[0]["value"] | |
| parts.append(f"A row count gap of {int(gap)} was detected in the date sequence.") | |
| if duplicate_issues: | |
| parts.append( | |
| f"{len(duplicate_issues)} duplicate date(s) were found, " | |
| "which may cause double-counting in aggregations." | |
| ) | |
| if anomaly_issues: | |
| # Report worst single-day drop | |
| worst = min(anomaly_issues, key=lambda x: x["value"]) | |
| parts.append( | |
| f"{len(anomaly_issues)} day-over-day drop(s) exceeding the 20% anomaly " | |
| f"threshold were flagged; the largest was {worst['value']:.1f}% " | |
| f"on {worst['dates'][0]}." | |
| ) | |
| parts.append( | |
| "These findings should be reviewed and resolved before this dataset " | |
| "is used in executive or board-level reporting." | |
| ) | |
| body = " ".join(parts) | |
| return f" {line1}\n {body}" | |
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
def generate_summary(findings: dict) -> str:
    """
    Generate a human-readable summary of the findings dict.

    Tries a local Ollama LLM first; if that yields nothing, falls back to the
    deterministic rule-based summary, so a non-empty string is always returned.

    Args:
        findings: The dict returned by demo.run_checks()

    Returns:
        A formatted summary string including a source label.
    """
    summary = _try_ollama_summary(_serialize_findings(findings))
    if not summary:
        label = " Source: Rule-Based Fallback (Ollama unavailable)\n"
        return label + "\n" + _rule_based_summary(findings)

    label = f" Source: Ollama ({OLLAMA_MODEL})\n"
    # Indent each line of LLM output to match the report's 2-space style.
    body = "\n".join(f" {line}" for line in summary.splitlines())
    return label + "\n" + body