Spaces:

mmoise00
/

askchosmky

Runtime error

App Files Files Community

askchosmky / main.py

mmoise00

prepare hugging face deployment and enable gitguardian

60ffeeb about 2 months ago

raw

history blame contribute delete

20.6 kB

	import argparse
	import asyncio
	import hashlib
	import json
	import logging
	import os
	import re
	import sys
	import time
	from typing import Any, TYPE_CHECKING


	def ensure_project_venv() -> None:
	    """Re-execute the script under the project's ``.venv`` interpreter if needed.

	    Looks for ``.venv/bin/python`` next to this file. When it exists and the
	    running interpreter's ``sys.prefix`` is not that virtualenv, the current
	    process is replaced through ``os.execv`` with the same command-line
	    arguments. Nothing happens when the virtualenv interpreter is absent.

	    A marker environment variable is set before re-executing. If the
	    re-executed interpreter still reports a different prefix, the marker
	    stops a second re-exec, so a broken ``.venv`` cannot cause an endless
	    chain of ``execv`` calls.
	    """
	    reexec_marker = "ASKCHOMSKY_VENV_REEXEC"
	    project_root = os.path.dirname(os.path.abspath(__file__))
	    venv_root = os.path.join(project_root, ".venv")
	    venv_python = os.path.join(venv_root, "bin", "python")

	    if not os.path.exists(venv_python):
	        return

	    if os.path.realpath(sys.prefix) == os.path.realpath(venv_root):
	        # Already inside the project virtualenv; drop the marker so that
	        # child processes started later can make their own decision.
	        os.environ.pop(reexec_marker, None)
	        return

	    if os.environ.get(reexec_marker) == "1":
	        # One re-exec already happened and the prefix still differs: give up
	        # instead of looping forever.
	        return

	    os.environ[reexec_marker] = "1"
	    os.execv(venv_python, [venv_python, *sys.argv])


	# Called before the third-party imports below; presumably so that, when the
	# process is re-executed under the project virtualenv, those imports are
	# resolved by that interpreter.
	ensure_project_venv()


	import numpy as np
	from datasets import load_dataset
	from dotenv import load_dotenv


	# Load settings from a .env file before any of the os.getenv() reads below.
	load_dotenv()


	def configure_logging() -> None:
	    """Set the log level of the root logger and of known noisy dependencies.

	    The level name is read from ``ASKCHOMSKY_LOG_LEVEL`` (for example
	    ``INFO`` or ``DEBUG``) and defaults to ``WARNING`` to keep CLI output
	    short. The name is matched case-insensitively and surrounding whitespace
	    is ignored. Anything that does not resolve to an integer level constant
	    of the ``logging`` module falls back to ``WARNING``, so a typo in the
	    variable cannot make ``setLevel`` raise.
	    """
	    level_name = os.getenv("ASKCHOMSKY_LOG_LEVEL", "WARNING").strip().upper()
	    level = getattr(logging, level_name, None)

	    # The logging module also exposes non-level upper-case attributes (for
	    # example the BASIC_FORMAT string); only accept genuine integer levels.
	    if isinstance(level, bool) or not isinstance(level, int):
	        level = logging.WARNING

	    # Keep root and noisy dependencies aligned with the selected verbosity.
	    noisy_loggers = (
	        "lightrag",
	        "nano-vectordb",
	        "sentence_transformers",
	        "sentence_transformers.SentenceTransformer",
	        "httpx",
	        "openai",
	    )
	    logging.getLogger().setLevel(level)
	    for logger_name in noisy_loggers:
	        logging.getLogger(logger_name).setLevel(level)


	# Apply the configured log level once at import time.
	configure_logging()


	if TYPE_CHECKING:
	    # Annotation-only imports. At runtime these packages are imported
	    # lazily inside the functions that need them, so nothing from
	    # lightrag is loaded while this module is being imported.
	    from lightrag import LightRAG
	    from lightrag import QueryParam
	    from lightrag.llm.openai import openai_complete_if_cache
	    from lightrag.utils import EmbeddingFunc

	# Re-assert the configured level. lightrag is not imported at runtime at
	# this point, so any logger setup it performs on import happens later, when
	# it is first imported lazily.
	configure_logging()


	# Base URL used for both the chat-completion and the embeddings requests.
	OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
	# Model identifiers; both can be overridden through the environment.
	LLM_MODEL = os.getenv("ASKCHOMSKY_LLM_MODEL", "openai/gpt-4o-mini")
	EMBED_MODEL = os.getenv("ASKCHOMSKY_EMBED_MODEL", "openai/text-embedding-3-small")
	# Embedding width handed to LightRAG's EmbeddingFunc.
	# NOTE(review): fixed at 1536 even when ASKCHOMSKY_EMBED_MODEL is overridden;
	# confirm that any override model emits vectors of this size.
	EMBED_DIM = 1536
	# Default directory for the LightRAG store (also the CLI default).
	DEFAULT_WORKING_DIR = "./lightrag_store"
	# Numeric settings are parsed at import time; a non-integer value raises
	# ValueError while the module is loading.
	LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT", "600"))
	MAX_ASYNC_LLM_CALLS = int(os.getenv("MAX_ASYNC", "2"))
	MAX_PARALLEL_INSERT = int(os.getenv("MAX_PARALLEL_INSERT", "2"))
	# Feature switches: enabled only when the variable equals "true"
	# (case-insensitive); any other value disables the feature.
	REWRITE_QUERY_ENABLED = os.getenv("REWRITE_QUERY", "true").lower() == "true"
	VERIFY_CLAIMS_ENABLED = os.getenv("VERIFY_CLAIMS", "true").lower() == "true"
	# Lifetime of cached answers in seconds; a value <= 0 disables the cache.
	QUERY_CACHE_TTL_SECONDS = int(os.getenv("QUERY_CACHE_TTL", "86400"))
	# The cache file always lives under DEFAULT_WORKING_DIR, independent of the
	# working_dir passed to query_rag / --working-dir.
	QUERY_CACHE_PATH = os.path.join(DEFAULT_WORKING_DIR, "query_cache.json")


	# System prompt passed to rag.aquery_llm() by query_rag. The {response_type},
	# {user_prompt} and {context_data} placeholders are not filled anywhere in
	# this file; presumably LightRAG formats them. Confirm against the LightRAG
	# version in use.
	CITATION_SYSTEM_PROMPT = """You are a retrieval-grounded assistant.

	Use only the provided context data.
	If context is insufficient, say: I do not have enough information to answer from the retrieved corpus.

	Citation rules:
	1) Every factual claim must include at least one citation marker like [1].
	2) Do not invent citation IDs.
	3) Keep citation IDs consistent with the provided references.

	Response style: {response_type}
	User preference: {user_prompt}

	Context:
	{context_data}
	"""


	def get_langfuse_client():
	    """Build a Langfuse client from the environment, or return None.

	    Keys come from ``LANGFUSE_PUBLIC_KEY`` / ``LANGFUSE_SECRET_KEY``; the
	    endpoint from ``LANGFUSE_BASE_URL``, then ``LANGFUSE_HOST``, then the
	    Langfuse cloud URL. Missing keys or the ``pk-lf-...`` / ``sk-lf-...``
	    placeholder values yield None without importing ``langfuse``. A failed
	    auth check or any exception during setup prints a message and also
	    yields None.
	    """

	    def _env(name: str) -> str:
	        # Read a variable and peel off surrounding whitespace and quotes.
	        return os.getenv(name, "").strip().strip('"').strip("'")

	    public_key = _env("LANGFUSE_PUBLIC_KEY")
	    secret_key = _env("LANGFUSE_SECRET_KEY")
	    base_url = (
	        _env("LANGFUSE_BASE_URL")
	        or _env("LANGFUSE_HOST")
	        or "https://cloud.langfuse.com"
	    )

	    public_usable = bool(public_key) and not public_key.startswith("pk-lf-...")
	    secret_usable = bool(secret_key) and not secret_key.startswith("sk-lf-...")
	    if not (public_usable and secret_usable):
	        return None

	    try:
	        from langfuse import Langfuse

	        debug_enabled = os.getenv("LANGFUSE_DEBUG", "false").lower() == "true"
	        client = Langfuse(
	            public_key=public_key,
	            secret_key=secret_key,
	            base_url=base_url,
	            debug=debug_enabled,
	        )

	        if client.auth_check():
	            return client

	        print("Langfuse auth check failed. Verify keys and LANGFUSE_BASE_URL.")
	        return None
	    except Exception as exc:
	        print(f"Langfuse disabled: {exc}")
	        return None


	def configure_langfuse() -> bool:
	    """Return True when a Langfuse client can be created (legacy bool helper)."""
	    client = get_langfuse_client()
	    return client is not None


	# ---------------------------------------------------------------------------
	# API-based embeddings (OpenRouter / OpenAI-compatible)
	# ---------------------------------------------------------------------------


	def _get_api_key() -> str:
	api_key = os.getenv("openrouter_key") or os.getenv("OPENAI_API_KEY", "")
	if not api_key:
	raise ValueError("Missing openrouter_key or OPENAI_API_KEY in .env")
	return api_key


	def _api_embed_single(text: str) -> list[float]:
	    """Request the embedding of one text from the ``/embeddings`` endpoint.

	    Sends ``text`` with the configured ``EMBED_MODEL`` to
	    ``OPENROUTER_BASE_URL`` using a 30-second timeout and returns the
	    ``embedding`` field of the first item in the response's ``data`` list.
	    HTTP error statuses are raised by ``raise_for_status``.
	    """
	    import httpx

	    request_headers = {
	        "Authorization": f"Bearer {_get_api_key()}",
	        "Content-Type": "application/json",
	    }
	    request_body = {"input": text, "model": EMBED_MODEL}
	    endpoint = OPENROUTER_BASE_URL + "/embeddings"

	    with httpx.Client(timeout=30.0) as session:
	        response = session.post(
	            endpoint, json=request_body, headers=request_headers
	        )
	        response.raise_for_status()
	        body = response.json()

	    return body["data"][0]["embedding"]


	def embed_texts(texts: list[str]) -> np.ndarray:
	    """Embed ``texts`` one by one and L2-normalise every row.

	    Args:
	        texts: Strings to embed; each one triggers a separate API request.

	    Returns:
	        A float32 array with one row per input text. Rows are scaled to unit
	        length; all-zero rows are returned unchanged. An empty input yields
	        an array of shape ``(0, EMBED_DIM)``.
	    """
	    if not texts:
	        # np.array([]) is one-dimensional, so the axis=1 norm below would
	        # raise; hand back an empty matrix of the configured width instead.
	        return np.empty((0, EMBED_DIM), dtype=np.float32)

	    vectors = np.array(
	        [_api_embed_single(text) for text in texts], dtype=np.float32
	    )
	    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
	    # Avoid dividing by zero for all-zero embeddings.
	    norms[norms == 0] = 1.0
	    return vectors / norms


	async def embedding_func(texts: list[str]) -> np.ndarray:
	    """Async wrapper that runs the blocking ``embed_texts`` in a worker thread."""
	    vectors = await asyncio.to_thread(embed_texts, texts)
	    return vectors


	# ---------------------------------------------------------------------------
	# Query result cache (disk-based, TTL-evicted)
	# ---------------------------------------------------------------------------


	def _load_query_cache() -> dict[str, dict[str, Any]]:
	    """Read the on-disk query cache.

	    Returns:
	        The JSON object stored at ``QUERY_CACHE_PATH``. An empty dict is
	        returned when the file is missing, unreadable, not valid JSON, or
	        holds something other than a JSON object, so callers can always use
	        dict methods on the result.
	    """
	    try:
	        with open(QUERY_CACHE_PATH, "r", encoding="utf-8") as f:
	            loaded = json.load(f)
	    except (OSError, ValueError):
	        # OSError covers a missing or unreadable file; ValueError covers
	        # json.JSONDecodeError and UnicodeDecodeError.
	        return {}

	    return loaded if isinstance(loaded, dict) else {}


	def _save_query_cache(cache: dict[str, dict[str, Any]]) -> None:
	    """Write ``cache`` to ``QUERY_CACHE_PATH`` as JSON.

	    The data is written to a temporary file in the same directory and then
	    moved into place with ``os.replace``, so an interrupted write cannot
	    leave a truncated cache file behind.

	    Raises:
	        OSError: if the directory or file cannot be written.
	    """
	    cache_dir = os.path.dirname(QUERY_CACHE_PATH)
	    if cache_dir:
	        # os.makedirs("") raises, so only create a directory when the path
	        # actually has one.
	        os.makedirs(cache_dir, exist_ok=True)

	    tmp_path = f"{QUERY_CACHE_PATH}.{os.getpid()}.tmp"
	    try:
	        with open(tmp_path, "w", encoding="utf-8") as f:
	            json.dump(cache, f)
	        os.replace(tmp_path, QUERY_CACHE_PATH)
	    except BaseException:
	        # Do not leave the temporary file around when writing failed.
	        try:
	            os.remove(tmp_path)
	        except OSError:
	            pass
	        raise


	def _cache_key(question: str, mode: str) -> str:
	raw = f"{question.strip().lower()}\|{mode}"
	return hashlib.sha256(raw.encode()).hexdigest()


	def get_cached_answer(question: str, mode: str) -> str | None:
	    """Look up a non-expired cached answer.

	    Args:
	        question: The user question; matched after strip and lower-casing.
	        mode: The query mode the answer was produced with.

	    Returns:
	        The cached answer, or None when caching is disabled
	        (``QUERY_CACHE_TTL_SECONDS <= 0``), no usable entry exists, or the
	        entry is older than the TTL. Expired entries are removed from the
	        cache file on a best-effort basis.
	    """
	    if QUERY_CACHE_TTL_SECONDS <= 0:
	        return None

	    key = _cache_key(question, mode)
	    cache = _load_query_cache()
	    entry = cache.get(key)
	    if not isinstance(entry, dict):
	        # Missing or malformed entry.
	        return None

	    stored_at = entry.get("ts", 0)
	    if isinstance(stored_at, bool) or not isinstance(stored_at, (int, float)):
	        # Treat an unusable timestamp like the missing-timestamp default.
	        stored_at = 0

	    if time.time() - stored_at > QUERY_CACHE_TTL_SECONDS:
	        del cache[key]
	        try:
	            _save_query_cache(cache)
	        except OSError as exc:
	            # Eviction is housekeeping; a read-only disk must not fail the query.
	            logging.getLogger(__name__).warning(
	                "Could not evict expired query cache entry: %s", exc
	            )
	        return None

	    return entry.get("answer")


	def cache_answer(question: str, mode: str, answer: str) -> None:
	    """Store ``answer`` in the on-disk cache under the question/mode key.

	    Does nothing when caching is disabled (``QUERY_CACHE_TTL_SECONDS <= 0``).
	    Entries that are already expired or malformed are dropped while writing,
	    so the cache file does not grow without bound. A failure to write the
	    file is logged and otherwise ignored: the answer has already been
	    computed and should still reach the caller.
	    """
	    if QUERY_CACHE_TTL_SECONDS <= 0:
	        return

	    now = time.time()

	    def _is_fresh(entry: object) -> bool:
	        # An entry is kept only if it is a dict with a numeric, unexpired "ts".
	        if not isinstance(entry, dict):
	            return False
	        stored_at = entry.get("ts")
	        if isinstance(stored_at, bool) or not isinstance(stored_at, (int, float)):
	            return False
	        return now - stored_at <= QUERY_CACHE_TTL_SECONDS

	    cache = {
	        key: entry
	        for key, entry in _load_query_cache().items()
	        if _is_fresh(entry)
	    }
	    cache[_cache_key(question, mode)] = {"answer": answer, "ts": now}

	    try:
	        _save_query_cache(cache)
	    except OSError as exc:
	        logging.getLogger(__name__).warning(
	            "Could not write query cache: %s", exc
	        )


	async def llm_model_func(
	    prompt,
	    system_prompt=None,
	    history_messages=None,
	    keyword_extraction=False,
	    **kwargs,
	) -> str:
	    """Send a completion request for ``LLM_MODEL`` via the OpenRouter base URL.

	    Delegates to LightRAG's ``openai_complete_if_cache`` with the configured
	    timeout; extra keyword arguments are forwarded unchanged. A missing
	    ``history_messages`` is replaced by an empty list.

	    Raises:
	        ValueError: if the ``openrouter_key`` environment variable is unset
	            or empty.
	    """
	    # Deferred import: keeps this dependency out of module import time.
	    from lightrag.llm.openai import openai_complete_if_cache

	    openrouter_key = os.getenv("openrouter_key")
	    if not openrouter_key:
	        raise ValueError("Missing openrouter_key in .env")

	    messages = [] if history_messages is None else history_messages

	    completion = await openai_complete_if_cache(
	        model=LLM_MODEL,
	        prompt=prompt,
	        system_prompt=system_prompt,
	        history_messages=messages,
	        api_key=openrouter_key,
	        base_url=OPENROUTER_BASE_URL,
	        timeout=LLM_TIMEOUT_SECONDS,
	        keyword_extraction=keyword_extraction,
	        **kwargs,
	    )
	    return completion


	async def initialize_rag(working_dir: str = DEFAULT_WORKING_DIR) -> "LightRAG":
	    """Create a LightRAG instance on ``working_dir`` and initialise its storages.

	    Args:
	        working_dir: Directory for the LightRAG state; created if missing.

	    Returns:
	        A LightRAG instance whose ``initialize_storages()`` has completed.
	        The caller is responsible for calling ``finalize_storages()``.
	    """
	    # Lazy imports: the LightRAG stack is loaded only when RAG is needed.
	    from lightrag import LightRAG
	    from lightrag.utils import EmbeddingFunc

	    # The module-level comment next to the TYPE_CHECKING imports says that
	    # LightRAG configures its own logger during import and that the level is
	    # re-applied here; do so now that the import has actually happened.
	    configure_logging()

	    os.makedirs(working_dir, exist_ok=True)

	    rag = LightRAG(
	        working_dir=working_dir,
	        llm_model_func=llm_model_func,
	        llm_model_name=LLM_MODEL,
	        default_llm_timeout=LLM_TIMEOUT_SECONDS,
	        llm_model_max_async=MAX_ASYNC_LLM_CALLS,
	        max_parallel_insert=MAX_PARALLEL_INSERT,
	        embedding_func=EmbeddingFunc(
	            embedding_dim=EMBED_DIM,
	            max_token_size=8192,
	            model_name=EMBED_MODEL,
	            func=embedding_func,
	        ),
	    )

	    await rag.initialize_storages()
	    return rag


	def load_corpus_texts(limit: int) -> list[str]:
	    """Load up to ``limit`` rows of the ``mmoise00/chomsky-corpus`` train split.

	    Each row is rendered as a ``Title:`` line, a ``Date:`` line, a blank line
	    and the content. Missing titles become ``Untitled``; missing dates and
	    content become empty strings.
	    """

	    def _render(row) -> str:
	        # Build the text block that is indexed for one dataset row.
	        title = row.get("article_title") or "Untitled"
	        date = row.get("article_date") or ""
	        content = row.get("content") or ""
	        return f"Title: {title}\nDate: {date}\n\n{content}"

	    dataset = load_dataset("mmoise00/chomsky-corpus", split="train")
	    row_count = min(limit, len(dataset))
	    return [_render(row) for row in dataset.select(range(row_count))]


	async def ingest_corpus(
	    doc_limit: int = 200, working_dir: str = DEFAULT_WORKING_DIR
	) -> int:
	    """Index up to ``doc_limit`` corpus documents into the LightRAG store.

	    Returns the number of documents handed to ``rag.ainsert``. The store's
	    ``finalize_storages()`` is awaited once initialisation has succeeded,
	    whether or not loading or inserting fails afterwards.
	    """
	    # If initialisation itself fails there is nothing to finalise.
	    rag = await initialize_rag(working_dir)
	    try:
	        documents = load_corpus_texts(doc_limit)
	        await rag.ainsert(documents)
	        return len(documents)
	    finally:
	        await rag.finalize_storages()


	async def query_rag(
	    question: str,
	    mode: str = "hybrid",
	    working_dir: str = DEFAULT_WORKING_DIR,
	) -> str:
	    """Answer ``question`` from the LightRAG store, with citations.

	    Flow: return a cached answer when one exists; otherwise optionally
	    rewrite the question for retrieval, query LightRAG with up to three
	    attempts that widen the retrieval parameters, append a citation marker
	    and a ``Sources:`` list when references were returned, optionally run an
	    LLM claim-verification pass, cache the final text and return it.

	    Args:
	        question: The user's question.
	        mode: LightRAG query mode for the first attempts; the last attempt
	            falls back to ``"mix"`` when a different mode was requested.
	        working_dir: Directory holding the LightRAG state.

	    Returns:
	        The answer text, or a fixed "not enough information" sentence when
	        no attempt produced any answer text.
	    """
	    no_answer_message = (
	        "I do not have enough information to answer from the retrieved corpus."
	    )
	    log = logging.getLogger(__name__)

	    def _looks_like_no_answer(answer: str) -> bool:
	        """Detect refusal / no-context replies by their known phrases."""
	        text = answer.lower()
	        return (
	            "[no-context]" in text
	            or "i do not have enough information to answer" in text
	            or "sorry, i'm not able to provide an answer" in text
	        )

	    def _response_to_text(response: object) -> str:
	        """Coerce an LLM response (string or object with ``content``) to str."""
	        if isinstance(response, str):
	            return response
	        content = getattr(response, "content", None)
	        if content is not None:
	            return str(content)
	        return str(response)

	    def _has_citation_marker(text: str) -> bool:
	        """True when the text contains a marker such as ``[1]``."""
	        return bool(re.search(r"\[\d+\]", text))

	    def _extract_json_object(text: str) -> dict[str, Any] | None:
	        """Parse the outermost ``{...}`` span of ``text`` as a JSON object."""
	        match = re.search(r"\{[\s\S]*\}", text)
	        if not match:
	            return None
	        try:
	            parsed = json.loads(match.group(0))
	        except json.JSONDecodeError:
	            return None
	        return parsed if isinstance(parsed, dict) else None

	    def _extract_data_items(
	        raw_result: dict[str, Any], key: str
	    ) -> list[dict[str, Any]]:
	        """Return the dict items of ``raw_result["data"][key]``, else ``[]``."""
	        data = raw_result.get("data")
	        if not isinstance(data, dict):
	            # Covers a missing "data" key as well as an explicit None.
	            return []
	        items = data.get(key)
	        if isinstance(items, list):
	            return [item for item in items if isinstance(item, dict)]
	        return []

	    def _extract_references(raw_result: dict[str, Any]) -> list[dict[str, str]]:
	        return _extract_data_items(raw_result, "references")

	    def _extract_chunks(raw_result: dict[str, Any]) -> list[dict[str, Any]]:
	        return _extract_data_items(raw_result, "chunks")

	    def _extract_llm_text(raw_result: dict[str, Any]) -> str:
	        """Return ``raw_result["llm_response"]["content"]`` as text, or ``""``."""
	        llm_response = raw_result.get("llm_response")
	        if not isinstance(llm_response, dict):
	            return ""
	        content = llm_response.get("content")
	        return "" if content is None else str(content)

	    def _render_references(references: list[dict[str, str]]) -> str:
	        """Format references as a ``Sources:`` list, one ``[id] path`` per line."""
	        if not references:
	            return ""
	        lines: list[str] = ["Sources:"]
	        for ref in references:
	            ref_id = str(ref.get("reference_id", "")).strip()
	            file_path = str(ref.get("file_path", "")).strip() or "unknown"
	            if ref_id:
	                lines.append(f"[{ref_id}] {file_path}")
	        return "\n".join(lines)

	    def _enforce_citation_answer(
	        answer: str, references: list[dict[str, str]]
	    ) -> str:
	        """Make sure an answer with references has a marker and a source list."""
	        if not references:
	            return answer

	        rendered_references = _render_references(references)
	        safe_answer = answer.strip()
	        if not _has_citation_marker(safe_answer):
	            first_ref = str(references[0].get("reference_id", "1")).strip() or "1"
	            safe_answer = f"{safe_answer}\n\nPrimary support [{first_ref}]."

	        if rendered_references and "Sources:" not in safe_answer:
	            safe_answer = f"{safe_answer}\n\n{rendered_references}"
	        return safe_answer

	    async def _rewrite_query_for_retrieval(original_question: str) -> str:
	        """Ask the LLM for a retrieval-friendly rewrite; fall back to the input."""
	        if not REWRITE_QUERY_ENABLED:
	            return original_question

	        rewrite_prompt = (
	            "Rewrite this question for retrieval over a Noam Chomsky corpus. "
	            "Preserve intent and named entities. Return one line only, no extra text.\n\n"
	            f"Question: {original_question}"
	        )

	        try:
	            rewritten = await llm_model_func(
	                rewrite_prompt,
	                system_prompt="You are a retrieval query rewriter.",
	                history_messages=[],
	            )
	        except Exception as exc:
	            # Rewriting is an optimisation only; never fail the query over it.
	            log.warning("Query rewrite failed; using the original question: %s", exc)
	            return original_question

	        # An empty reply has no first line; handle it explicitly instead of
	        # relying on an IndexError being swallowed.
	        reply_lines = _response_to_text(rewritten).strip().splitlines()
	        candidate = reply_lines[0].strip() if reply_lines else ""
	        if not candidate:
	            return original_question
	        return candidate[:600]

	    def _dynamic_query_param(
	        selected_mode: str,
	        original_question: str,
	        rewritten_question: str,
	        *,
	        retry_level: int = 0,
	    ) -> "QueryParam":
	        """Build a QueryParam whose retrieval sizes grow with query complexity."""
	        # At module level QueryParam is imported only under TYPE_CHECKING, so
	        # it has to be imported here to exist at runtime.
	        from lightrag import QueryParam

	        base_top_k = int(os.getenv("TOP_K", "40"))
	        base_chunk_top_k = int(os.getenv("CHUNK_TOP_K", "20"))

	        text = f"{original_question} {rewritten_question}".lower()
	        token_count = len(re.findall(r"\w+", rewritten_question))

	        top_k = base_top_k
	        chunk_top_k = base_chunk_top_k

	        # Widen retrieval for long, comparative, historical or explanatory
	        # questions (keyword matches are substring checks).
	        if token_count > 18:
	            top_k += 15
	            chunk_top_k += 15
	        if any(k in text for k in ("compare", "versus", "difference", "contrast")):
	            top_k += 20
	            chunk_top_k += 20
	        if any(k in text for k in ("timeline", "history", "evolution", "over time")):
	            top_k += 20
	            chunk_top_k += 20
	        if any(k in text for k in ("why", "how", "explain", "analyze")):
	            top_k += 10
	            chunk_top_k += 10

	        # Retries enforce a floor on both sizes.
	        if retry_level == 1:
	            top_k = max(top_k, 80)
	            chunk_top_k = max(chunk_top_k, 80)
	        elif retry_level >= 2:
	            top_k = max(top_k, 100)
	            chunk_top_k = max(chunk_top_k, 100)

	        rerank_default = os.getenv("RERANK_BY_DEFAULT", "false").lower() == "true"
	        # Reranking is only ever enabled on the first attempt.
	        enable_rerank = rerank_default and retry_level == 0

	        return QueryParam(
	            mode=selected_mode,
	            top_k=top_k,
	            chunk_top_k=chunk_top_k,
	            enable_rerank=enable_rerank,
	            include_references=True,
	            response_type="Multiple Paragraphs",
	        )

	    async def _verify_claims(
	        answer_text: str,
	        chunks: list[dict[str, Any]],
	    ) -> str:
	        """Return a warning suffix listing unsupported claims, or ``""``."""
	        if not VERIFY_CLAIMS_ENABLED or not answer_text.strip() or not chunks:
	            return ""

	        # Only the first eight chunks, 700 characters each, are used as evidence.
	        evidence_lines: list[str] = []
	        for chunk in chunks[:8]:
	            ref_id = str(chunk.get("reference_id", "?")).strip() or "?"
	            content = str(chunk.get("content", "")).strip().replace("\n", " ")
	            if content:
	                evidence_lines.append(f"[{ref_id}] {content[:700]}")

	        if not evidence_lines:
	            return ""

	        verifier_prompt = (
	            "Verify claims in the answer using ONLY the provided evidence snippets. "
	            "Return strict JSON with keys: verdict, unsupported_claims, notes. "
	            "verdict must be one of supported|partially_supported|unsupported.\n\n"
	            f"Answer:\n{answer_text}\n\n"
	            f"Evidence:\n{os.linesep.join(evidence_lines)}"
	        )

	        try:
	            verifier_response = await llm_model_func(
	                verifier_prompt,
	                system_prompt="You are a strict factual verifier.",
	                history_messages=[],
	            )
	            verifier_text = _response_to_text(verifier_response)
	            verifier_json = _extract_json_object(verifier_text)
	            if not verifier_json:
	                return ""

	            verdict = str(verifier_json.get("verdict", "")).strip().lower()
	            unsupported_claims = verifier_json.get("unsupported_claims", [])
	            if verdict in {"supported", ""} or not isinstance(unsupported_claims, list):
	                return ""

	            cleaned_claims = [
	                str(c).strip() for c in unsupported_claims if str(c).strip()
	            ][:5]
	            if not cleaned_claims:
	                return ""

	            joined = "\n".join(f"- {claim}" for claim in cleaned_claims)
	            return (
	                "\n\nClaim verification: some claims could not be fully supported by retrieved evidence."
	                f"\n{joined}"
	            )
	        except Exception as exc:
	            # Verification is advisory; a failure must not fail the query.
	            log.warning("Claim verification failed; skipping it: %s", exc)
	            return ""

	    cached = get_cached_answer(question, mode)
	    if cached is not None:
	        return cached

	    rag = None
	    try:
	        rag = await initialize_rag(working_dir)

	        rewritten_question = await _rewrite_query_for_retrieval(question)
	        selected_result: dict[str, Any] | None = None

	        # Two attempts in the requested mode, then (unless already "mix") a
	        # final attempt in "mix" mode.
	        attempt_modes = [mode, mode, "mix"] if mode != "mix" else ["mix", "mix"]

	        for retry_level, attempt_mode in enumerate(attempt_modes):
	            param = _dynamic_query_param(
	                attempt_mode,
	                question,
	                rewritten_question,
	                retry_level=retry_level,
	            )
	            result = await rag.aquery_llm(
	                rewritten_question,
	                param=param,
	                system_prompt=CITATION_SYSTEM_PROMPT,
	            )
	            answer_text = _extract_llm_text(result)
	            selected_result = result
	            if answer_text and not _looks_like_no_answer(answer_text):
	                break

	        if selected_result is None:
	            return no_answer_message

	        answer_text = _extract_llm_text(selected_result)
	        if not answer_text.strip():
	            # Every attempt came back empty: decorating an empty string with
	            # citations would yield a reply made of sources only, so return
	            # the plain no-answer sentence (and do not cache it).
	            return no_answer_message

	        references = _extract_references(selected_result)
	        chunks = _extract_chunks(selected_result)

	        answer_with_citations = _enforce_citation_answer(answer_text, references)
	        verification_summary = await _verify_claims(answer_with_citations, chunks)

	        final_answer = f"{answer_with_citations}{verification_summary}".strip()
	        cache_answer(question, mode, final_answer)
	        return final_answer
	    finally:
	        if rag is not None:
	            await rag.finalize_storages()


	def parse_args() -> argparse.Namespace:
	    """Parse the command line: --ingest, --query, --mode, --doc-limit, --working-dir."""
	    cli = argparse.ArgumentParser(description="LightRAG over the Chomsky corpus")

	    # (flag, add_argument keyword arguments), in the order shown by --help.
	    option_specs = [
	        (
	            "--ingest",
	            {"action": "store_true", "help": "Index dataset into LightRAG"},
	        ),
	        ("--query", {"type": str, "help": "Question to ask"}),
	        (
	            "--mode",
	            {
	                "type": str,
	                "default": "hybrid",
	                "choices": ["naive", "local", "global", "hybrid", "mix"],
	                "help": "LightRAG query mode",
	            },
	        ),
	        (
	            "--doc-limit",
	            {"type": int, "default": 200, "help": "How many docs to index"},
	        ),
	        (
	            "--working-dir",
	            {
	                "type": str,
	                "default": DEFAULT_WORKING_DIR,
	                "help": "Directory where LightRAG stores graph/vector state",
	            },
	        ),
	    ]
	    for flag, settings in option_specs:
	        cli.add_argument(flag, **settings)

	    return cli.parse_args()


	async def run_cli(args: argparse.Namespace) -> None:
	    """Run the actions selected on the command line.

	    Ingestion runs first when requested, then the query; with neither
	    option a usage hint is printed.
	    """
	    if not (args.ingest or args.query):
	        print("Nothing to do. Use --ingest and/or --query.")
	        return

	    if args.ingest:
	        indexed = await ingest_corpus(
	            doc_limit=args.doc_limit, working_dir=args.working_dir
	        )
	        print(f"Indexed {indexed} documents into LightRAG store: {args.working_dir}")

	    if args.query:
	        reply = await query_rag(
	            args.query, mode=args.mode, working_dir=args.working_dir
	        )
	        print(f"\nQ: {args.query}")
	        print(f"\nA: {reply}")


	if __name__ == "__main__":
	    # Script entry point: parse the command line, then drive the async CLI.
	    cli_args = parse_args()
	    asyncio.run(run_cli(cli_args))