# analytics-bot / llm_utils.py
# (uploaded by Arunvithyasegar — commit 7878c12, "Upload 6 files")
"""
llm_utils.py — LangChain integration layer for the analytics demo.
Design principle: the LLM is an optional narrator, not a decision-maker.
Every public function must return a valid result even when Ollama is absent.
"""
from __future__ import annotations
import sys
# Guard against environments where LangChain is not installed.
# The rule engine in demo.py never imports this at module level for logic,
# so an ImportError here degrades gracefully to the fallback path.
#
# We prefer langchain-ollama (the current, non-deprecated package) and fall
# back to langchain_community.llms.Ollama for environments that only have
# the older package installed.
try:
from langchain_core.prompts import PromptTemplate
try:
from langchain_ollama import OllamaLLM as Ollama # preferred (langchain-ollama)
except ImportError:
from langchain_community.llms import Ollama # fallback (older install)
LANGCHAIN_AVAILABLE = True
except ImportError:
LANGCHAIN_AVAILABLE = False
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Default local Ollama endpoint (standard port); no auth is configured here.
OLLAMA_BASE_URL = "http://localhost:11434"
# Model tag handed to Ollama — presumably must already be pulled locally;
# verify against the deployment setup.
OLLAMA_MODEL = "llama3.2"
# The system instruction is kept as a constant so it can be audited,
# versioned, and referenced in documentation independently of the template.
SYSTEM_PROMPT = (
    "You are assisting a business analyst. "
    "Explain the following metric validation findings clearly and factually. "
    "Do not speculate on causes. "
    "Do not introduce information not present in the data. "
    "Use a neutral, executive-friendly tone."
)
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _build_prompt_template() -> "PromptTemplate":
    """Build the LangChain PromptTemplate used for the findings summary.

    A PromptTemplate (rather than a bare f-string) keeps the variable
    injection point explicit, so the template can be tested and versioned
    independently of the findings serialization logic.
    """
    # Three sections separated by blank lines: system instruction,
    # injected findings, and the task statement.
    sections = (
        SYSTEM_PROMPT,
        "Metric Validation Findings:\n{findings_text}",
        "Provide a concise executive summary (3-5 sentences) of the above "
        "findings. Stick strictly to the facts presented.",
    )
    return PromptTemplate(
        template="\n\n".join(sections),
        input_variables=["findings_text"],
    )
def _serialize_findings(findings: dict) -> str:
"""
Convert the structured findings dict into a plain-text paragraph suitable
for injection into the LLM prompt.
Why plain text rather than raw JSON: LLMs produce better, more natural
summaries when given prose-style context rather than nested JSON objects.
"""
lines: list[str] = []
dr = findings["stats"]["date_range"]
lines.append(
f"Dataset covers {dr['actual_rows']} rows from {dr['start']} to {dr['end']}. "
f"Expected {dr['expected_rows']} rows based on the date range "
f"(gap: {dr['row_gap']} row(s))."
)
rev = findings["stats"]["revenue"]
lines.append(
f"Revenue (USD): mean=${rev['mean']:,.2f}, std=${rev['std']:,.2f}, "
f"total=${rev['total']:,.2f}, missing={rev['missing_count']} value(s)."
)
ord_ = findings["stats"]["orders"]
lines.append(
f"Orders: mean={ord_['mean']:,.0f}, std={ord_['std']:,.1f}, "
f"total={int(ord_['total']):,}, missing={ord_['missing_count']} value(s)."
)
issues = findings["issues"]
if issues:
lines.append(f"\nDetected {len(issues)} issue(s):")
for issue in issues:
date_str = f" on {', '.join(issue['dates'])}" if issue["dates"] else ""
lines.append(f" - [{issue['severity']}] {issue['detail']}{date_str}")
else:
lines.append("\nNo data quality issues detected. Dataset appears clean.")
return "\n".join(lines)
def _try_ollama_summary(findings_text: str) -> str | None:
    """Attempt a local Ollama call via LangChain.

    Returns the summary text on success, or None on any failure
    (connection refused, model not found, etc.). None — rather than an
    exception — is the signal for the caller to activate the deterministic
    fallback, which keeps all error handling in one place. Failures are
    logged to stderr so stdout stays a clean report channel.
    """
    if not LANGCHAIN_AVAILABLE:
        return None
    try:
        llm = Ollama(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL, timeout=30)
        # LCEL pipe syntax, preferred over the deprecated LLMChain.
        pipeline = _build_prompt_template() | llm
        raw = pipeline.invoke({"findings_text": findings_text})
        # Plain-LLM wrappers (langchain_community Ollama) return str, while
        # chat wrappers return an AIMessage carrying .content — accept both.
        text = raw.content if hasattr(raw, "content") else str(raw)
        return text.strip() or None
    except Exception as exc:  # deliberately broad: any failure → fallback
        print(
            f"[llm_utils] Ollama unavailable ({type(exc).__name__}): {exc}",
            file=sys.stderr,
        )
        return None
def _rule_based_summary(findings: dict) -> str:
"""
Generate a deterministic plain-text summary from the findings dict.
This is the guaranteed fallback when no LLM is available. Template-driven
text is auditable, predictable, and consistent — qualities analysts require
from a validation tool used in reporting contexts.
"""
dr = findings["stats"]["date_range"]
issues = findings["issues"]
line1 = (
f"The dataset spans {dr['actual_rows']} rows from {dr['start']} to "
f"{dr['end']} (expected {dr['expected_rows']} calendar days)."
)
if not issues:
return (
f" {line1}\n"
" No data quality issues were detected. "
"The dataset appears suitable for reporting."
)
parts: list[str] = []
missing_issues = [i for i in issues if i["type"] == "missing_values"]
row_issues = [i for i in issues if i["type"] == "row_count"]
anomaly_issues = [i for i in issues if i["type"] == "anomaly_drop"]
duplicate_issues = [i for i in issues if i["type"] == "duplicate_dates"]
if missing_issues:
cols = sorted({i["column"] for i in missing_issues})
parts.append(f"Missing values were identified in column(s): {', '.join(cols)}.")
if row_issues:
gap = row_issues[0]["value"]
parts.append(f"A row count gap of {int(gap)} was detected in the date sequence.")
if duplicate_issues:
parts.append(
f"{len(duplicate_issues)} duplicate date(s) were found, "
"which may cause double-counting in aggregations."
)
if anomaly_issues:
# Report worst single-day drop
worst = min(anomaly_issues, key=lambda x: x["value"])
parts.append(
f"{len(anomaly_issues)} day-over-day drop(s) exceeding the 20% anomaly "
f"threshold were flagged; the largest was {worst['value']:.1f}% "
f"on {worst['dates'][0]}."
)
parts.append(
"These findings should be reviewed and resolved before this dataset "
"is used in executive or board-level reporting."
)
body = " ".join(parts)
return f" {line1}\n {body}"
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def generate_summary(findings: dict) -> str:
    """Produce a human-readable summary of the findings dict.

    Attempts an Ollama (local LLM) call first; falls back to a deterministic
    rule-based summary when Ollama is unavailable. Always returns a
    non-empty string.

    Args:
        findings: The dict returned by demo.run_checks()

    Returns:
        A formatted summary string including a source label.
    """
    narrative = _try_ollama_summary(_serialize_findings(findings))
    if not narrative:
        label = "  Source: Rule-Based Fallback (Ollama unavailable)\n"
        return label + "\n" + _rule_based_summary(findings)
    # Indent each line of LLM output to match the report's 2-space style.
    body = "\n".join(f"  {line}" for line in narrative.splitlines())
    return f"  Source: Ollama ({OLLAMA_MODEL})\n" + "\n" + body