# IBAhive-RAG / api.py
from __future__ import annotations
import logging
import os
import sys
import time
import traceback
logger = logging.getLogger(__name__)
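# Make the modules under app/ (generation, retrieval, evaluation) importable as
# top-level names, and locate the built React frontend in dist/.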
ROOT = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(ROOT, "app"))
DIST = os.path.join(ROOT, "dist")
from flask import Flask, jsonify, request, send_from_directory
from flask_cors import CORS
from generation import clarification_reply
_CLARIFICATION_BLOCK = clarification_reply()
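# The clarification block is rendered once at import time and embedded verbatim
# in the system prompt below, so the model can copy it exactly when it refuses.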
GENERATION_SYSTEM_PROMPT = f"""You are IBAHive, a concise assistant for IBA Karachi students.
The user's message includes numbered passages under "Context". Those passages are the ONLY source you may use for factual claims. Do not use outside knowledge to fill gaps.
**Before you write the answer (think step by step internally — do not print "Step 1" or analysis headings in the reply):**
1. Identify what the user is asking (one short phrase).
2. Ask yourself: do the numbered passages contain **specific, on-topic information** that **directly** answers that request? Mere keyword overlap (e.g. a random word from an unrelated lab) is **not** enough — the passages must actually address the question. For **exam-prep or "what is tested"** questions: if passages name the **same course or module** (code, title, syllabus, outcomes), summarize what they say about content or assessment—even when past-paper topics are not spelled out. Do not dismiss as irrelevant solely because the phrasing differs.
3. **Electives, course comparisons, "difference between" questions:** If the passages name or describe any of the courses or topics in the question, **you must answer from those passages** (compare, contrast, or summarize). Do **not** use the clarification block below just because the answer requires synthesis.
**Program announcements (BS CS):** If Context includes a program announcement for the year the user asked about (e.g. title or heading with **2023–24**, **2023-2024**, etc.) and a **CS Electives Offered** (or similar) list with **CSE/MIS** codes and titles, **you must list or summarize those electives from the passages**. Do **not** use the "no detail in IBAHive's materials" fallback when that list appears in any numbered passage—even if filenames also mention other years.
4. **If YES** (passages substantively address the request, including partial or multi-topic answers) — Answer in 3–6 bullets or 2–4 short sentences. **Echo the question's key phrases** (tasks, roles, topics, and named entities—e.g. TA, responsibilities, workload, IBA, course codes) in your bullets wherever the passages support those points; do not replace them with distant synonyms only. If the question names a place, school, or program (e.g. **IBA**), **use that exact name at least once** in the answer when the passages are about that context. You may start a bullet with a short phrase that restates what they asked (e.g. "Main TA responsibilities include…" / "Expected workload:"). Mirror the user's wording where natural. Use only facts supported by the passages. Do not say "based on the context" or "according to the documents". Do not invent course names, policies, numbers, or dates.
5. **If NO** — only when the question is off-topic for IBA academics **or** the passages are **wholly** unrelated (no shared subject with the question) — your **entire** reply must be **exactly** the following text. Copy it verbatim. Do not add a greeting, apology, or extra lines before or after:
{_CLARIFICATION_BLOCK}
**Hard rules:** Do not produce ER diagrams, code, or long essays from weak or unrelated snippets. Do not reply with vague chatbot lines like "please provide more details" unless you are using the exact block above. **Do not** use the block above when the passages clearly discuss the topic asked — including elective outlines, catalog text, or program announcements.
**Multi-part questions:** If the user asks more than one thing, answer **each** part from **any** numbered passage that supports it. Read the **whole** Context before deciding something is missing—do not claim IBAHive lacks a fact if it appears in any passage.
**Coverage gaps (important):** Use the short fallback only when **no** passage supports that sub-question: say you **do not have that detail in IBAHive's materials** and suggest **IBA-facing** resources (handbook, syllabus, LMS, instructor). **Never** tell users to read OpenAI, ChatGPT, Groq, Hugging Face, or other **vendor documentation**, and never answer meta-questions about "the assistant" or the AI — those are out of scope; the user should ask an IBA academics question instead. **Never** use this fallback for elective-list or program-announcement questions when Context already contains **CS Electives Offered** (or equivalent) with course codes. **Never** write what "the passages", "the context", or "excerpts" include or omit (forbidden: "The passages say X but do not include Y…"). State supported facts **directly** as normal bullet points—no document-audit or retrieval meta-commentary.
""".strip()
app = Flask(__name__, static_folder=DIST, static_url_path="")
CORS(app)
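# CORS is open so a separately served frontend (e.g. a local dev server) can
# call the API; the production build in dist/ is served by this app directly.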
def _evaluation_to_json_safe(eval_result: dict) -> tuple[float, float, list, list]:
"""Plain dicts / floats so jsonify never hits numpy or other non-JSON types."""
faith = eval_result["faithfulness"]
rel = eval_result["answer_relevancy"]
faith_score = float(faith["score"])
rel_score = float(rel["score"])
claims_out = []
for c in faith.get("claims") or []:
if not isinstance(c, dict):
continue
claims_out.append(
{
"claim": str(c.get("claim", "")),
"verdict": str(c.get("verdict", "")),
"reason": str(c.get("reason", "")),
}
)
gen_q_out = []
for s in rel.get("similarities") or []:
if not isinstance(s, dict):
continue
gen_q_out.append(
{
"question": str(s.get("generated_question", "")),
"similarity": float(s.get("similarity", 0.0)),
}
)
return faith_score, rel_score, claims_out, gen_q_out
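# Illustrative /ask request body; every field except "query" is optional and
# falls back to the defaults parsed in ask(). Values below are only an example:
#   {
#     "query": "What electives are offered for BSCS?",
#     "retrieval_mode": "Hybrid + Reranking",
#     "namespace": "fixed",
#     "top_k": 5,
#     "run_eval": true
#   }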
@app.route("/ask", methods=["POST"])
def ask():
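    # Retrieval/generation/evaluation are imported lazily so their heavy
    # dependencies are only loaded on the first /ask call, not at module import.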
from generation import GENERATION_MODEL, generate
from retrieval import (
RETRIEVAL_MODE_BM25,
RETRIEVAL_MODE_HYBRID,
RETRIEVAL_MODE_SEMANTIC,
retrieve,
)
from evaluation import EMBED_MODEL, JUDGE_MODEL, evaluate_response
data = request.get_json(silent=True) or {}
query = (data.get("query") or "").strip()
if not query:
return jsonify({"error": "query is required"}), 400
mode = data.get("retrieval_mode", "Hybrid + Reranking")
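    # Accept both the UI's display labels and short API keys; anything
    # unrecognised falls back to hybrid retrieval.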
rmode = {
"Hybrid + Reranking": RETRIEVAL_MODE_HYBRID,
"Semantic Only": RETRIEVAL_MODE_SEMANTIC,
"BM25 Only": RETRIEVAL_MODE_BM25,
"hybrid": RETRIEVAL_MODE_HYBRID,
"semantic": RETRIEVAL_MODE_SEMANTIC,
"bm25": RETRIEVAL_MODE_BM25,
}.get(mode, RETRIEVAL_MODE_HYBRID)
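    # Pick the Pinecone namespace for the chunking strategy; anything other than
    # the recursive variants uses the fixed-size chunks.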
ns_raw = data.get("namespace", "fixed")
if ns_raw in ("recursive", "recursive-chunks"):
pinecone_ns = "recursive-chunks"
else:
pinecone_ns = "fixed-chunks"
top_k = int(data.get("top_k", 5))
bm25_k = int(data.get("bm25_k", 10))
semantic_k = int(data.get("semantic_k", 10))
run_eval = bool(data.get("run_eval", True))
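    # Per-request model overrides; empty or missing values fall back to the
    # module-level defaults.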
gen_model = (data.get("generation_model") or "").strip() or GENERATION_MODEL
judge_model = (data.get("judge_model") or "").strip() or JUDGE_MODEL
embed_eval = (data.get("eval_embed_model") or "").strip() or EMBED_MODEL
temperature = float(data.get("temperature", 0.1))
max_tokens = int(data.get("max_tokens", 512))
try:
t0 = time.time()
chunks = retrieve(
query,
top_k=top_k,
bm25_k=bm25_k,
semantic_k=semantic_k,
pinecone_namespace=pinecone_ns,
retrieval_mode=rmode,
)
t_retrieval = time.time() - t0
t0 = time.time()
gen = generate(
query,
chunks,
system_prompt=GENERATION_SYSTEM_PROMPT,
model=gen_model,
max_tokens=max_tokens,
temperature=temperature,
)
t_generation = time.time() - t0
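        # Evaluation is skipped for fallback or refusal / non-evaluable answers:
        # judging faithfulness for a reply that deliberately did not use the
        # context would not be meaningful.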
is_fallback = bool(gen.get("is_fallback", False))
skip_eval = is_fallback or bool(gen.get("skip_evaluation", False))
eval_result = None
t_evaluation = 0.0
eval_status = "skipped"
eval_error: str | None = None
eval_skipped_reason: str | None = None
if not run_eval:
eval_skipped_reason = "run_eval_false"
elif is_fallback:
eval_skipped_reason = "generation_fallback"
elif gen.get("skip_evaluation"):
eval_skipped_reason = "answer_refusal_or_non_evaluable"
if run_eval and not skip_eval:
t0 = time.time()
try:
eval_result = evaluate_response(
question=query,
context=gen["context"],
answer=gen["answer"],
judge_model=judge_model,
embed_model=embed_eval,
)
eval_status = "ok"
except Exception as e:
eval_result = None
eval_status = "error"
eval_error = str(e)
traceback.print_exc()
logger.exception("evaluate_response failed (UI showed 0 scores; see eval_error in JSON)")
t_evaluation = time.time() - t0
faith_score = rel_score = 0.0
claims: list = []
gen_questions: list = []
if eval_result:
faith_score, rel_score, claims, gen_questions = _evaluation_to_json_safe(eval_result)
# Hide retrieval metadata when generation failed or answer is refusal / non-evaluable.
omit_chunks_sources = is_fallback or bool(gen.get("skip_evaluation", False))
if omit_chunks_sources:
response_chunks = []
sources_out: list = []
faith_out = None
rel_out = None
t_eval_out = None
claims_out = []
gen_q_out = []
else:
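            # Lightweight preview of each retrieved chunk: first 300 characters,
            # source filename, and the reranker score (falling back to the RRF
            # fusion score when no cross-encoder score is present).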
response_chunks = [
{
"text": (c.get("text") or "")[:300],
"source": (c.get("metadata") or {}).get("filename", "")
or (c.get("metadata") or {}).get("source", ""),
"score": float(c.get("cross_score", c.get("rrf_score", 0.0))),
}
for c in chunks
]
sources_out = gen.get("sources", [])
            faith_out = faith_score
            rel_out = rel_score
            claims_out = claims
            gen_q_out = gen_questions
            t_eval_out = t_evaluation
return jsonify(
{
"answer": gen["answer"],
"sources": sources_out,
"chunks": response_chunks,
"faith_score": faith_out,
"rel_score": rel_out,
"t_retrieval": t_retrieval,
"t_generation": t_generation,
"t_evaluation": t_eval_out,
"claims": claims_out,
"gen_questions": gen_q_out,
"eval_status": eval_status,
"eval_error": eval_error,
"eval_skipped_reason": eval_skipped_reason,
}
)
    except Exception as e:
        logger.exception("unhandled error in /ask")
        return jsonify({"error": str(e), "detail": str(e)}), 500
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def spa(path: str):
"""Serve React build; SPA fallback to index.html."""
if path:
candidate = os.path.join(DIST, path)
if os.path.isfile(candidate):
return send_from_directory(DIST, path)
index_path = os.path.join(DIST, "index.html")
if not os.path.isfile(index_path):
return (
jsonify(
{
"error": "React build not found",
"hint": "Run npm run build so dist/index.html exists in the project root (next to api.py).",
}
),
503,
)
return send_from_directory(DIST, "index.html")
if __name__ == "__main__":
port = int(os.environ.get("PORT", "7860"))
app.run(host="0.0.0.0", port=port, debug=os.environ.get("FLASK_DEBUG") == "1")
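# Local run (illustrative): `python api.py`, then open http://localhost:7860/
# or POST JSON to /ask as sketched above; set PORT to change the port and
# FLASK_DEBUG=1 to enable debug mode.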