# analytics-bot / llm_utils.py
# (uploaded by Arunvithyasegar — commit 7878c12, "Upload 6 files")
"""
llm_utils.py — LangChain integration layer for the analytics demo.
Design principle: the LLM is an optional narrator, not a decision-maker.
Every public function must return a valid result even when Ollama is absent.
"""
from __future__ import annotations
import sys
# Guard against environments where LangChain is not installed.
# The rule engine in demo.py never imports this at module level for logic,
# so an ImportError here degrades gracefully to the fallback path.
#
# We prefer langchain-ollama (the current, non-deprecated package) and fall
# back to langchain_community.llms.Ollama for environments that only have
# the older package installed.
try:
from langchain_core.prompts import PromptTemplate
try:
from langchain_ollama import OllamaLLM as Ollama # preferred (langchain-ollama)
except ImportError:
from langchain_community.llms import Ollama # fallback (older install)
LANGCHAIN_AVAILABLE = True
except ImportError:
LANGCHAIN_AVAILABLE = False
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Default local Ollama endpoint (standard port); no auth is configured here.
OLLAMA_BASE_URL = "http://localhost:11434"
# Model tag handed to Ollama — presumably must already be pulled locally;
# verify against the deployment setup.
OLLAMA_MODEL = "llama3.2"
# The system instruction is kept as a constant so it can be audited,
# versioned, and referenced in documentation independently of the template.
SYSTEM_PROMPT = (
    "You are assisting a business analyst. "
    "Explain the following metric validation findings clearly and factually. "
    "Do not speculate on causes. "
    "Do not introduce information not present in the data. "
    "Use a neutral, executive-friendly tone."
)
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _build_prompt_template() -> "PromptTemplate":
    """Build the LangChain PromptTemplate used for the findings summary.

    A PromptTemplate (rather than a bare f-string) keeps the variable
    injection point explicit, so the template can be tested and versioned
    independently of the findings serialization logic.
    """
    # Three sections separated by blank lines: system instruction,
    # injected findings, and the task statement.
    sections = (
        SYSTEM_PROMPT,
        "Metric Validation Findings:\n{findings_text}",
        "Provide a concise executive summary (3-5 sentences) of the above "
        "findings. Stick strictly to the facts presented.",
    )
    return PromptTemplate(
        template="\n\n".join(sections),
        input_variables=["findings_text"],
    )
def _serialize_findings(findings: dict) -> str:
"""
Convert the structured findings dict into a plain-text paragraph suitable
for injection into the LLM prompt.
Why plain text rather than raw JSON: LLMs produce better, more natural
summaries when given prose-style context rather than nested JSON objects.
"""
lines: list[str] = []
dr = findings["stats"]["date_range"]
lines.append(
f"Dataset covers {dr['actual_rows']} rows from {dr['start']} to {dr['end']}. "
f"Expected {dr['expected_rows']} rows based on the date range "
f"(gap: {dr['row_gap']} row(s))."
)
rev = findings["stats"]["revenue"]
lines.append(
f"Revenue (USD): mean=${rev['mean']:,.2f}, std=${rev['std']:,.2f}, "
f"total=${rev['total']:,.2f}, missing={rev['missing_count']} value(s)."
)
ord_ = findings["stats"]["orders"]
lines.append(
f"Orders: mean={ord_['mean']:,.0f}, std={ord_['std']:,.1f}, "
f"total={int(ord_['total']):,}, missing={ord_['missing_count']} value(s)."
)
issues = findings["issues"]
if issues:
lines.append(f"\nDetected {len(issues)} issue(s):")
for issue in issues:
date_str = f" on {', '.join(issue['dates'])}" if issue["dates"] else ""
lines.append(f" - [{issue['severity']}] {issue['detail']}{date_str}")
else:
lines.append("\nNo data quality issues detected. Dataset appears clean.")
return "\n".join(lines)
def _try_ollama_summary(findings_text: str) -> str | None:
    """Attempt a local Ollama call via LangChain.

    Returns the summary text on success, or None on any failure
    (connection refused, model not found, etc.). None — rather than an
    exception — is the signal for the caller to activate the deterministic
    fallback, which keeps all error handling in one place. Failures are
    logged to stderr so stdout stays a clean report channel.
    """
    if not LANGCHAIN_AVAILABLE:
        return None
    try:
        llm = Ollama(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL, timeout=30)
        # LCEL pipe syntax, preferred over the deprecated LLMChain.
        pipeline = _build_prompt_template() | llm
        raw = pipeline.invoke({"findings_text": findings_text})
        # Plain-LLM wrappers (langchain_community Ollama) return str, while
        # chat wrappers return an AIMessage carrying .content — accept both.
        text = raw.content if hasattr(raw, "content") else str(raw)
        return text.strip() or None
    except Exception as exc:  # deliberately broad: any failure → fallback
        print(
            f"[llm_utils] Ollama unavailable ({type(exc).__name__}): {exc}",
            file=sys.stderr,
        )
        return None
def _rule_based_summary(findings: dict) -> str:
"""
Generate a deterministic plain-text summary from the findings dict.
This is the guaranteed fallback when no LLM is available. Template-driven
text is auditable, predictable, and consistent — qualities analysts require
from a validation tool used in reporting contexts.
"""
dr = findings["stats"]["date_range"]
issues = findings["issues"]
line1 = (
f"The dataset spans {dr['actual_rows']} rows from {dr['start']} to "
f"{dr['end']} (expected {dr['expected_rows']} calendar days)."
)
if not issues:
return (
f" {line1}\n"
" No data quality issues were detected. "
"The dataset appears suitable for reporting."
)
parts: list[str] = []
missing_issues = [i for i in issues if i["type"] == "missing_values"]
row_issues = [i for i in issues if i["type"] == "row_count"]
anomaly_issues = [i for i in issues if i["type"] == "anomaly_drop"]
duplicate_issues = [i for i in issues if i["type"] == "duplicate_dates"]
if missing_issues:
cols = sorted({i["column"] for i in missing_issues})
parts.append(f"Missing values were identified in column(s): {', '.join(cols)}.")
if row_issues:
gap = row_issues[0]["value"]
parts.append(f"A row count gap of {int(gap)} was detected in the date sequence.")
if duplicate_issues:
parts.append(
f"{len(duplicate_issues)} duplicate date(s) were found, "
"which may cause double-counting in aggregations."
)
if anomaly_issues:
# Report worst single-day drop
worst = min(anomaly_issues, key=lambda x: x["value"])
parts.append(
f"{len(anomaly_issues)} day-over-day drop(s) exceeding the 20% anomaly "
f"threshold were flagged; the largest was {worst['value']:.1f}% "
f"on {worst['dates'][0]}."
)
parts.append(
"These findings should be reviewed and resolved before this dataset "
"is used in executive or board-level reporting."
)
body = " ".join(parts)
return f" {line1}\n {body}"
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def generate_summary(findings: dict) -> str:
    """Produce a human-readable summary of the findings dict.

    Attempts an Ollama (local LLM) call first; falls back to a deterministic
    rule-based summary when Ollama is unavailable. Always returns a
    non-empty string.

    Args:
        findings: The dict returned by demo.run_checks()

    Returns:
        A formatted summary string including a source label.
    """
    narrative = _try_ollama_summary(_serialize_findings(findings))
    if not narrative:
        label = "  Source: Rule-Based Fallback (Ollama unavailable)\n"
        return label + "\n" + _rule_based_summary(findings)
    # Indent each line of LLM output to match the report's 2-space style.
    body = "\n".join(f"  {line}" for line in narrative.splitlines())
    return f"  Source: Ollama ({OLLAMA_MODEL})\n" + "\n" + body